糗事百科在线解析之xpath应用

2019-06-02  本文已影响0人  hcc_9bf4

解析网站:https://www.qiushibaike.com/text/
解析用户和内容

python代码:

import urllib.request
import urllib.parse
from lxml import etree
import time
import json

# Shared accumulator: content_parse() appends one dict per parsed post here,
# and main() serialises the whole list to JSON.
item_list = list()
def handle_request(url, page):
    """Build the request object for a single listing page.

    Args:
        url: URL template containing a ``{}`` placeholder for the page number.
        page: Value substituted into the placeholder via ``str.format``.

    Returns:
        A ``urllib.request.Request`` for the formatted URL carrying a desktop
        browser ``User-Agent`` header. Nothing is sent over the network here.
    """
    page_url = url.format(page)
    browser_headers = dict([
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'),
    ])
    return urllib.request.Request(url=page_url, headers=browser_headers)
def content_parse(content):
    """Extract the author and text of every post on a page into ``item_list``.

    The page is parsed with ``etree.HTML`` and each ``div`` whose ``id``
    starts with ``qiushi_tag`` is treated as one post. For every post a dict
    with the keys ``'作者'`` (author, a string) and ``'内容'`` (content, a list
    of text fragments) is appended to the module-level ``item_list``.

    Args:
        content: HTML source of one listing page, as text.

    Returns:
        None. Results are accumulated in ``item_list``.
    """
    tree = etree.HTML(content)
    if tree is None:
        # NOTE(review): etree.HTML appears to yield None for empty input on
        # some lxml versions - confirm; either way there is nothing to parse.
        return

    for post in tree.xpath('//div[starts-with(@id,"qiushi_tag")]'):
        author_nodes = post.xpath('.//div[@class="author clearfix"]//h2/text()')
        # Indexing [0] unconditionally raised IndexError for a post without an
        # author heading, aborting the whole run; fall back to an empty name.
        author = author_nodes[0].strip() if author_nodes else ''

        fragments = post.xpath('.//div[@class="content"]/span/text()')
        # Trim each text node and drop blank ones so the saved JSON holds only
        # the text, not surrounding whitespace from the page markup.
        cleaned = [text.strip() for text in fragments if text.strip()]

        item_list.append({
            '作者': author,
            '内容': cleaned,
        })

def main():
    """Download a user-chosen range of pages and save the parsed posts.

    Prompts for a start and an end page number (both inclusive), fetches each
    page, feeds it to ``content_parse`` and dumps the accumulated
    ``item_list`` as JSON into ``baikeduanzi.txt`` in the current directory.

    Raises:
        ValueError: If either prompt is answered with a non-integer.
        urllib.error.URLError: If a page cannot be fetched.
    """
    url = 'https://www.qiushibaike.com/text/page/{}/'
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    for page in range(start_page, end_page + 1):
        # Build the request object for this page.
        request = handle_request(url, page)
        print('开始下载%s页' % page)
        # Fetch the page; the context manager closes the HTTP response, which
        # the original one-liner left open.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        # Parse the page; results accumulate in item_list.
        content_parse(content)
        print('结束下载%s页' % page)

        # The whole list is rewritten after every page, so pages already
        # downloaded are on disk even if a later request fails.
        string = json.dumps(item_list, ensure_ascii=False)
        with open('baikeduanzi.txt', 'w', encoding='utf8') as fp:
            fp.write(string)

        # Pause between requests only; no need to wait after the last page.
        if page < end_page:
            time.sleep(2)

# Script entry point: run the scraper, then tell the user where the output is.
if __name__ == '__main__':
    main()
    for message in ("下载完成...", "请查看当前路径下的baikeduanzi.txt文件"):
        print(message)

解析后:


上一篇 | 下一篇

猜你喜欢

热点阅读