2019-07-30
2019-07-30 本文已影响0人
超哥__
#! /usr/bin/env python
# # -*- coding: utf-8 -*-
import datetime
import gzip
import hashlib
import json
import logging
from lxml import etree
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import threadpool
import time
# Default text encoding name; nothing in the visible code reads it.
defencode = 'utf-8'
# Replace ssl's default HTTPS context factory with the unverified one, so
# HTTPS requests made through urllib skip certificate verification.
# NOTE(review): this is process-wide and removes protection against
# man-in-the-middle attacks for every HTTPS request.
ssl._create_default_https_context = ssl._create_unverified_context
if sys.version_info[0] == 2:
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import urllib2 as urllib_
elif sys.version_info[0] == 3:
from http.server import BaseHTTPRequestHandler, HTTPServer
import urllib.request as urllib_
# Configure the root logger: records at INFO and above are appended to
# 'serv.log', each with timestamp, source path, line number, level and message.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
def FLog(msg):
    """Record ``msg`` in the log and echo it to stdout with a timestamp.

    The message is sent to the root logger at INFO level first. The console
    line is the current local time formatted with ``%c``, a tab, then ``msg``.
    """
    logging.info(msg)
    stamp = datetime.datetime.now().strftime('%c')
    print(stamp + '\t' + msg)
class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that returns 301/302/303 responses instead of following them.

    urllib calls ``http_error_<code>(req, fp, code, msg, headers)`` on the
    handler instance, where ``fp`` is the response object. Returning ``fp``
    makes ``opener.open()`` hand that redirect response back to the caller,
    who can then read its status code and body.

    Only 301, 302 and 303 are overridden here; any other status keeps the
    base-class behaviour.
    """

    def http_error_302(self, req, fp, code, msg, headers):
        """Return the redirect response ``fp`` untouched.

        Args:
            req: The request that produced the redirect.
            fp: The response object carrying the redirect status.
            code: Numeric HTTP status code.
            msg: HTTP reason phrase.
            headers: Response headers.

        Returns:
            ``fp``, so the redirect is not followed.
        """
        return fp

    # 301 and 303 are treated exactly like 302.
    http_error_301 = http_error_303 = http_error_302
def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Send one HTTP(S) request without following redirects.

    Args:
        url: Absolute URL to request.
        headers: Dict of request headers; defaults to a minimal ``User-Agent``.
        postdata: Request body handed to ``Request(data=...)``; ``None``
            sends no body.
        proxy: Proxy address used for both http and https, or ``None`` for
            a direct connection.

    Returns:
        ``(status_code, body)`` when the request completes, with a
        gzip-encoded body already decompressed. Any exception raised while
        building, sending or decoding the request is reported as
        ``(600, None)``.
    """
    # Local import keeps this block self-contained; io.BytesIO exists on both
    # Python 2 and Python 3 and is the right buffer type for raw body bytes.
    import io

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        handlers = [NoRedirection]
        if proxy is not None:
            handlers.append(urllib_.ProxyHandler({
                'http': proxy,
                'https': proxy,
            }))
        opener = urllib_.build_opener(*handlers)
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        try:
            data = resp.read()
            status = resp.code
            encoding = resp.headers.get('Content-Encoding') or ''
        finally:
            # Always release the connection, even if reading fails.
            resp.close()
        if encoding.strip().lower() == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(data))
            try:
                data = gz.read()
            finally:
                gz.close()
        return status, data
    except Exception as exc:
        # Best-effort contract: callers only ever see a status/body pair.
        # Logged at DEBUG so a long crawl does not flood the log file.
        logging.debug('httpRequest failed for %s: %r', url, exc)
        return 600, None
def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Fetch ``url`` with ``httpRequest`` and write the body to ``path``.

    Nothing is written unless the status code is 200 and a body was returned.
    The arguments other than ``path`` are forwarded to ``httpRequest``.
    """
    status, body = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    if status != 200 or body is None:
        return
    with open(path, 'wb') as out:
        out.write(body)
proxy = None  # e.g. '127.0.0.1:1080'; switch to a proxy once this IP gets banned
if __name__ == '__main__':
    # Walk a fixed range of article ids. Every article that exists and has
    # content is saved under ./qiushi/<id>/ as the raw JSON reply ('data')
    # plus, when present, the file referenced by its 'high_url' field.
    rootdir = './qiushi'
    for i in range(110006543, 130000000):
        itemdir = rootdir + '/%d' % i
        print('handle %d' % i)
        if os.path.exists(itemdir):
            # Already fetched on an earlier run.
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % i
        code, data = httpRequest(url=url, proxy=proxy, headers={
            # Advertise gzip only: httpRequest does not decode deflate bodies.
            'Accept-Encoding': 'gzip',
            'User-Agent': 'Mozilla/5.0'
        })
        if code != 200:
            continue
        try:
            jdata = json.loads(data)
            article = jdata['article']
            article['content']  # probe only: skip replies that carry no content
        except Exception:
            # Not JSON, or not shaped like an article reply: skip this id.
            continue
        os.makedirs(itemdir)
        contentfile = itemdir + '/data'
        with open(contentfile, 'wb') as f:
            f.write(data)
        print('handle %d done' % i)
        if 'high_url' not in article:
            continue
        high_url = article['high_url']
        filename = os.path.basename(high_url) if high_url else ''
        if not filename:
            # Null/empty URL, or one ending in '/': there is no file name to
            # save under, and opening the directory itself would raise.
            continue
        highurlfile = itemdir + '/' + filename
        # Use the same proxy as the article request so a banned IP is not
        # used for the download.
        downFile(high_url, highurlfile, proxy=proxy)
#! /usr/bin/env python
# # -*- coding: utf-8 -*-
import datetime
import gzip
import hashlib
import json
import logging
from lxml import etree
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import threadpool
import time
# Default text encoding name; nothing in the visible code reads it.
defencode = 'utf-8'
# HTTPS relies on SSL certificates, and Python sometimes fails to validate
# them. Swapping in the unverified context factory skips certificate
# verification so HTTPS responses can still be fetched.
# NOTE(review): this is process-wide and removes protection against
# man-in-the-middle attacks for every HTTPS request.
ssl._create_default_https_context = ssl._create_unverified_context
# 因为python2和python3的urllib库方法有所区别
if sys.version_info[0] == 2:
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import urllib2 as urllib_
elif sys.version_info[0] == 3:
from http.server import BaseHTTPRequestHandler, HTTPServer
import urllib.request as urllib_
# Logging setup: records at INFO and above are appended to 'serv.log', each
# with timestamp, source path, line number, level and message.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
def FLog(msg):
    """Write ``msg`` to the log, then print it to stdout behind a timestamp.

    Logging happens first, at INFO level on the root logger. The printed line
    is the current local time in ``%c`` format, a tab character, then ``msg``.
    """
    logging.info(msg)
    now_text = datetime.datetime.now().strftime('%c')
    print(now_text + '\t' + msg)
# Used to switch off automatic redirect following.
class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that returns 301/302/303 responses instead of following them.

    urllib calls ``http_error_<code>(req, fp, code, msg, headers)`` on the
    handler instance, where ``fp`` is the response object. Returning ``fp``
    makes ``opener.open()`` hand that redirect response back to the caller,
    who can then read its status code and body.

    Only 301, 302 and 303 are overridden here; any other status keeps the
    base-class behaviour.
    """

    def http_error_302(self, req, fp, code, msg, headers):
        """Return the redirect response ``fp`` untouched.

        Args:
            req: The request that produced the redirect.
            fp: The response object carrying the redirect status.
            code: Numeric HTTP status code.
            msg: HTTP reason phrase.
            headers: Response headers.

        Returns:
            ``fp``, so the redirect is not followed.
        """
        return fp

    # 301 and 303 are treated exactly like 302.
    http_error_301 = http_error_303 = http_error_302
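'''
Layout of an HTTP/HTTPS request and response.

Request
  Three ways to pass parameters:
    1. In the URL query string:
       http://www.baidu.com/omn/20190810/20190810A0ND3I00.html?a=1&b=2&aaa=
    2. In a request header, e.g. "User: lichao"
    3. In the request body

  GET example (parameters travel in the query string):
    GET /omn/20190810/20190810A0ND3I00.html?usr=lichao&pass=lihao HTTP/1.1(\r\n)
    User-Agent: ...(\r\n)
    Cookie: asl=1; sa=1(\r\n)
    (\r\n)

  POST example (parameters travel in the body):
    POST /omn/20190810/20190810A0ND3I00.html HTTP/1.1(\r\n)
    User-Agent: ...(\r\n)
    (\r\n)
    body.....

Response
    HTTP/1.1 200 OK(\r\n)
    Header1: value1(\r\n)
    Header2: value2(\r\n)
    Set-Cookie: JSESSIONID=aaghlajalggajlsjkdklflkjas(\r\n)
    ...
    (\r\n)
    body.....

Every header line ends with \r\n, and one empty line (a lone \r\n) separates
the headers from the body.
'''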
def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Send one HTTP(S) request without following redirects.

    Args:
        url: Absolute URL to request.
        headers: Dict of request headers; defaults to a minimal ``User-Agent``.
        postdata: Request body handed to ``Request(data=...)``; ``None``
            sends no body.
        proxy: Proxy address used for both http and https, or ``None`` for
            a direct connection.

    Returns:
        ``(status_code, body)`` when the request completes, with a
        gzip-encoded body already decompressed. Any exception raised while
        building, sending or decoding the request is reported as
        ``(600, None)``.
    """
    # Local import keeps this block self-contained; io.BytesIO exists on both
    # Python 2 and Python 3 and is the right buffer type for raw body bytes.
    import io

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Use urllib_.build_opener() with no handlers instead if redirects
        # should be followed automatically.
        handlers = [NoRedirection]
        if proxy is not None:
            handlers.append(urllib_.ProxyHandler({
                'http': proxy,
                'https': proxy,
            }))
        opener = urllib_.build_opener(*handlers)
        # Build the request object and send it.
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        try:
            # Read the body and capture status/encoding while the response is open.
            data = resp.read()
            status = resp.code
            encoding = resp.headers.get('Content-Encoding') or ''
        finally:
            # Always release the connection, even if reading fails.
            resp.close()
        if encoding.strip().lower() == 'gzip':
            # The response header says the body is gzip-compressed: inflate it.
            gz = gzip.GzipFile(fileobj=io.BytesIO(data))
            try:
                data = gz.read()
            finally:
                gz.close()
        return status, data
    except Exception as exc:
        # Best-effort contract: callers only ever see a status/body pair.
        # Logged at DEBUG so a long crawl does not flood the log file.
        logging.debug('httpRequest failed for %s: %r', url, exc)
        return 600, None
def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Download ``url`` through ``httpRequest`` and store the body at ``path``.

    The file is only created when the status code is 200 and a body came back;
    in every other case the function returns without touching ``path``.
    """
    status, payload = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    if status != 200 or payload is None:
        return
    with open(path, 'wb') as target:
        target.write(payload)
proxy = None  # e.g. '127.0.0.1:1080'; switch to a proxy once this IP gets banned
if __name__ == '__main__':
    # Walk a fixed range of article ids. Every article that exists and has
    # content is saved under ./qiushi/<id>/ as the raw JSON reply ('data')
    # plus, when present, the file referenced by its 'high_url' field.
    rootdir = './qiushi'
    for i in range(110006540, 130000000):
        itemdir = rootdir + '/%d' % i
        print('handle %d' % i)
        if os.path.exists(itemdir):
            # Already fetched on an earlier run.
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % i
        code, data = httpRequest(url=url, proxy=proxy, headers={
            # Advertise gzip only: httpRequest does not decode deflate bodies.
            'Accept-Encoding': 'gzip',
            'User-Agent': 'Mozilla/5.0'
        })
        if code != 200:
            continue
        try:
            jdata = json.loads(data)
            article = jdata['article']
            article['content']  # read /article/content from the JSON; probe only
        except Exception:
            # Not JSON, or not shaped like an article reply: skip this id.
            continue
        os.makedirs(itemdir)
        contentfile = itemdir + '/data'
        with open(contentfile, 'wb') as f:
            f.write(data)
        print('handle %d done' % i)
        if 'high_url' not in article:
            continue
        high_url = article['high_url']
        filename = os.path.basename(high_url) if high_url else ''
        if not filename:
            # Null/empty URL, or one ending in '/': there is no file name to
            # save under, and opening the directory itself would raise.
            continue
        highurlfile = itemdir + '/' + filename
        # Use the same proxy as the article request so a banned IP is not
        # used for the download.
        downFile(high_url, highurlfile, proxy=proxy)
        # Example paths:
        #   itemdir:     ./qiushi/110006543  (created by os.makedirs(itemdir))
        #   highurlfile: ./qiushi/110006543/1.img