爬虫

正则>>糗事百科文字

2018-09-27  本文已影响0人  郭祺迦

import re

import urllib.request

import urllib.parse

def main():
    """Prompt for a page range and scrape every page in it.

    Reads the first and last page numbers from standard input, then for each
    page in that inclusive range builds the request, fetches it, and passes
    the response to ``get_data`` for extraction.

    Raises:
        ValueError: If either page number entered is not a valid integer.
    """
    url = 'https://www.qiushibaike.com/textnew/page/'
    start = int(input("开始页码:"))
    end = int(input("结束页码:"))

    # Inclusive page range; when start > end the range is empty and nothing
    # is fetched.
    for page in range(start, end + 1):
        request = get_request(url, page)
        # Close each HTTP response as soon as its page has been processed so
        # connections are not left open across iterations.
        with get_response(request) as response:
            get_data(response)

def get_request(url, page):
    """Build the request object for a single listing page.

    Args:
        url: Base URL that ends right before the page number.
        page: Page number; its string form is appended to ``url``.

    Returns:
        A ``urllib.request.Request`` for the page URL that carries a desktop
        Chrome ``User-Agent`` header.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    )
    page_url = url + str(page)
    return urllib.request.Request(page_url, headers={'User-Agent': user_agent})

def get_response(request):
    """Open ``request`` and return the resulting response object.

    Args:
        request: A ``urllib.request.Request`` (or URL string) to open.

    Returns:
        Whatever ``urllib.request.urlopen`` returns for the request; the
        caller is responsible for reading and closing it.
    """
    return urllib.request.urlopen(request)

def get_data(response):
    """Extract the post texts from one page and append them to ``qiutu.html``.

    The response body is decoded as UTF-8 and searched for the first
    ``<span>`` inside every ``<div class="content">`` element. Each captured
    text is appended to ``qiutu.html`` in the current working directory,
    followed by a newline.

    Args:
        response: Object whose ``read()`` returns the page as UTF-8 bytes.

    Returns:
        None. The extracted text is only written to disk.
    """
    html = response.read().decode("utf8")

    # re.S lets "." span newlines, since a post body covers several lines.
    pattern = re.compile(r'<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    matches = pattern.findall(html)

    # Nothing matched: leave the output file untouched (and uncreated).
    if not matches:
        return

    # Open the output file once per page instead of once per match.
    with open('qiutu.html', 'a', encoding="utf-8") as fp:
        fp.writelines(match + '\n' for match in matches)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':

    main()

上一篇下一篇

猜你喜欢

热点阅读