2019-07-30

2019-07-30  本文已影响0人  超哥__
#! /usr/bin/env python
# # -*- coding: utf-8 -*-

import datetime
import gzip
import hashlib
import json
import logging
from lxml import etree
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import threadpool
import time


# Default character encoding used by the script (name kept as-is).
defencode = 'utf-8'
# Globally disable HTTPS certificate verification so self-signed or otherwise
# unverifiable certs don't abort the crawl.
ssl._create_default_https_context = ssl._create_unverified_context

if sys.version_info[0] == 2:
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
    import urllib2 as urllib_
elif sys.version_info[0] == 3:
    from http.server import BaseHTTPRequestHandler, HTTPServer
    import urllib.request as urllib_


# Append INFO-and-above records to serv.log, with timestamp, file location,
# level and message in each line.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')


def FLog(msg):
    """Record *msg* in the log file and echo it to stdout with a timestamp."""
    logging.info(msg)
    stamp = datetime.datetime.now().strftime('%c')
    print('%s\t%s' % (stamp, msg))


class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that suppresses automatic 301/302/303 following.

    Bug fixed: the original methods omitted ``self``, shifting every
    parameter one position — the value returned as ``code`` was actually the
    response object that arrived in the ``fp`` slot.  The signatures below
    restore the standard ``HTTPRedirectHandler`` parameter order; returning
    the response unchanged keeps the observable behavior: ``open()`` hands
    back the raw 3xx response instead of following the redirect, and callers
    then see a non-200 ``resp.code``.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        # Return the raw 3xx response so the opener does not follow it.
        return fp

    def http_error_302(self, req, fp, code, msg, headers):
        return fp

    def http_error_303(self, req, fp, code, msg, headers):
        return fp


def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Fetch *url* and return ``(status_code, body_bytes)``.

    Redirects are not followed (``NoRedirection``), so a 3xx comes back
    as-is.  A gzip-encoded body is transparently decompressed.  On any
    failure the sentinel ``(600, None)`` is returned, matching what callers
    already test for.

    :param headers: optional dict of request headers (default: UA only).
    :param postdata: optional bytes body; presence makes this a POST.
    :param proxy: optional ``'host:port'`` used for both http and https.
    """
    # Local import: gzip needs a *bytes* buffer; the module-level py2
    # ``StringIO`` does not exist on Python 3 and is wrong for bytes anyway.
    import io

    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
    try:
        if proxy is None:
            opener = urllib_.build_opener(NoRedirection)
        else:
            opener = urllib_.build_opener(NoRedirection, urllib_.ProxyHandler({
                "http": proxy,
                "https": proxy,
            }))
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        data = resp.read()
        if resp.headers.get('content-encoding') == 'gzip':
            # Server honored our Accept-Encoding: decompress the body.
            gz = gzip.GzipFile(fileobj=io.BytesIO(data))
            data = gz.read()
            gz.close()
        return resp.code, data
    except Exception:
        # Keep the best-effort contract (callers only check the code), but
        # record why the request failed instead of swallowing it silently.
        logging.exception('httpRequest failed for %s', url)
        return 600, None


def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Download *url* and write the body to *path*; failures write nothing."""
    status, body = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    if status != 200 or body is None:
        return
    with open(path, 'wb') as out:
        out.write(body)


proxy = None  # e.g. '127.0.0.1:1080' -- switch to a proxy if this IP gets banned

if __name__ == '__main__':
    # Crawl qiushibaike articles by sequential id; each article's JSON (and
    # its video when present) is saved under ./qiushi/<id>/.
    rootdir = './qiushi'
    for article_id in range(110006543, 130000000):
        itemdir = '%s/%d' % (rootdir, article_id)
        print('handle %d' % article_id)
        if os.path.exists(itemdir):
            # Already fetched on a previous run.
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % article_id
        req_headers = {
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/5.0'
        }
        code, data = httpRequest(url=url, proxy=proxy, headers=req_headers)
        if code != 200:
            continue
        try:
            jdata = json.loads(data)
            jdata['article']['content']  # probe: must be a valid article JSON
        except Exception:
            continue
        os.makedirs(itemdir)
        with open(itemdir + '/data', 'wb') as f:
            f.write(data)
        print('handle %d done' % article_id)
        article = jdata['article']
        if 'high_url' not in article:
            continue
        high_url = article['high_url']
        downFile(high_url, itemdir + '/' + os.path.basename(high_url))


#! /usr/bin/env python
# # -*- coding: utf-8 -*-

import datetime
import gzip
import hashlib
import json
import logging
from lxml import etree
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import threadpool
import time


# Default character encoding used by the script (name kept as-is).
defencode = 'utf-8'

# HTTPS uses SSL certificates; Python sometimes cannot verify them. Disabling
# verification globally lets the crawler still fetch HTTPS responses.
ssl._create_default_https_context = ssl._create_unverified_context

# python2 and python3 expose the urllib machinery under different module paths
if sys.version_info[0] == 2:
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
    import urllib2 as urllib_
elif sys.version_info[0] == 3:
    from http.server import BaseHTTPRequestHandler, HTTPServer
    import urllib.request as urllib_

# Logging setup: append INFO-and-above records to serv.log, with timestamp,
# file location, level and message in each line.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')


def FLog(msg):
    """Log *msg* at INFO level and print it prefixed with the local time."""
    logging.info(msg)
    now = datetime.datetime.now()
    print(now.strftime('%c') + '\t' + msg)

# Prevents the opener from automatically following redirects.
class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that suppresses automatic 301/302/303 following.

    Bug fixed: the original methods omitted ``self``, shifting every
    parameter one position — the value returned as ``code`` was actually the
    response object that arrived in the ``fp`` slot.  The signatures below
    restore the standard ``HTTPRedirectHandler`` parameter order; returning
    the response unchanged keeps the observable behavior: ``open()`` hands
    back the raw 3xx response instead of following the redirect, and callers
    then see a non-200 ``resp.code``.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        # Return the raw 3xx response so the opener does not follow it.
        return fp

    def http_error_302(self, req, fp, code, msg, headers):
        return fp

    def http_error_303(self, req, fp, code, msg, headers):
        return fp

'''
Anatomy of an HTTP/HTTPS exchange:

Request side:

Three ways to pass parameters
1. Query string: http://www.baidu.com/omn/20190810/20190810A0ND3I00.html?a=1&b=2&aaa=
2. Header field, e.g. User:lichao
3. Request body

GET /omn/20190810/20190810A0ND3I00.html?usr=lichao&pass=lihao HTTP/1.1(\r\n)
User-Agent: ...(\r\n)
Cookie:"asl=1,sa=1"
(\r\n\r\n)

POST /omn/20190810/20190810A0ND3I00.html HTTP/1.1(\r\n)
User-Agent: ...(\r\n)
(\r\n\r\n)
body.....

Response side:
HTTP/1.1 200 OK(\r\n)
Header1:value1(\r\n)
Header2:value2
Cookie:"JSESSIONID=aaghlajalggajlsjkdklflkjas"
...
(\r\n\r\n)
body.....
'''

def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Fetch *url* and return ``(status_code, body_bytes)``.

    Redirects are not followed (``NoRedirection`` is installed in the
    opener), so 3xx responses come back as-is; pass a plain
    ``build_opener()`` instead if automatic redirects are wanted.  A
    gzip-encoded body is transparently decompressed.  On any failure the
    sentinel ``(600, None)`` is returned, matching what callers test for.

    :param headers: optional dict of request headers (default: UA only).
    :param postdata: optional bytes body; presence makes this a POST.
    :param proxy: optional ``'host:port'`` used for both http and https.
    """
    # Local import: gzip needs a *bytes* buffer; the module-level py2
    # ``StringIO`` does not exist on Python 3 and is wrong for bytes anyway.
    import io

    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
    try:
        if proxy is None:
            opener = urllib_.build_opener(NoRedirection)
        else:
            opener = urllib_.build_opener(NoRedirection, urllib_.ProxyHandler({
                "http": proxy,
                "https": proxy,
            }))
        # Build and fire the HTTP request; read() pulls the full body.
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        data = resp.read()
        if resp.headers.get('content-encoding') == 'gzip':
            # Response header says the body is gzip-compressed: inflate it.
            gz = gzip.GzipFile(fileobj=io.BytesIO(data))
            data = gz.read()
            gz.close()
        return resp.code, data
    except Exception:
        # Keep the best-effort contract (callers only check the code), but
        # record why the request failed instead of swallowing it silently.
        logging.exception('httpRequest failed for %s', url)
        return 600, None


def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Fetch *url* via httpRequest and persist the body at *path*.

    Nothing is written when the request fails (non-200 or empty body).
    """
    result = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    status, payload = result
    if status == 200 and payload is not None:
        with open(path, 'wb') as sink:
            sink.write(payload)

proxy = None  # e.g. '127.0.0.1:1080' -- switch to a proxy if this IP gets banned

if __name__ == '__main__':
    # Crawl qiushibaike articles by sequential id; each article's JSON (and
    # its video when present) is saved under ./qiushi/<id>/.
    rootdir = './qiushi'
    for i in range(110006540, 130000000):
        itemdir = rootdir + '/%d' % i
        print('handle %d' % i)
        if os.path.exists(itemdir):
            # Already downloaded on a previous run -- skip.
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % i
        code, data = httpRequest(url=url, proxy=proxy, headers={
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/5.0'
        })
        if code != 200:
            continue
        try:
            jdata = json.loads(data)
            jdata['article']['content'] # probe the json /article/content value -- must exist
        except Exception as e:
            continue
        os.makedirs(itemdir) 
        contentfile = itemdir + '/data'
        with open(contentfile, 'wb') as f:
            f.write(data)
        print('handle %d done' % i)
        if 'high_url' not in jdata['article']:
            # No attached video for this article.
            continue
        high_url = jdata['article']['high_url']
        highurlfile = itemdir + '/' + os.path.basename(high_url)
        downFile(high_url, highurlfile)
        # itemdir: ./qiushi/110006543   (created by os.makedirs above)
        # highurlfile: ./qiushi/110006543/1.img
上一篇下一篇

猜你喜欢

热点阅读