正则尝试

2018-11-14  本文已影响0人  dongger

主要收获

import requests
import re
import json
# Listing pages 1 through 13 of the text section, one URL per page.
urls = [
    "https://www.qiushibaike.com/text/page/{}/".format(page_number)
    for page_number in range(1, 14)
]
def get_one_page(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to ``str`` (``Response.text``). The HTTP
        status code is not checked, so the body of an error page is returned
        as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Without an explicit timeout requests waits indefinitely, so a single
    # unresponsive server would stall the whole run.
    response = requests.get(url, timeout=timeout)
    return response.text
# Raw string so that ``\d`` reaches the regex engine verbatim instead of being
# read as an (invalid) string escape, which current Python versions warn about.
# Compiled once at import time rather than on every call.
_ITEM_PATTERN = re.compile(
    r'<h2>(.*?)</h2>.*?</div>.*?<span>(.*?)</span>.*?</div>.*?number.*?>(\d+)</i>',
    re.S,  # let "." span newlines; each entry is spread over several lines
)


def parse_one_page(html):
    """Extract the entries found in one page of HTML.

    The pattern captures three groups per entry: the text inside ``<h2>``,
    the text inside the next bare ``<span>`` that follows a ``</div>``, and
    the run of digits that directly precedes ``</i>`` after the word
    ``number``.

    Args:
        html: Page markup as a string.

    Yields:
        One dict per match with the keys ``"title"`` (whitespace-stripped),
        ``"article"`` (whitespace-stripped, ``<br/>`` turned into newlines)
        and ``"funny"`` (the digits, kept as a string).
    """
    for title, article, funny in _ITEM_PATTERN.findall(html):
        yield {
            "title": title.strip(),
            "article": article.strip().replace('<br/>', '\n'),
            "funny": funny,
        }
def write_to_file(content):
    """Append ``content`` to ``re_douban.txt`` as one JSON document per line.

    Non-ASCII characters are written as-is (UTF-8) rather than as ``\\uXXXX``
    escapes.
    """
    with open('re_douban.txt', 'a', encoding='utf-8') as out_file:
        serialized = json.dumps(content, ensure_ascii=False)
        out_file.write(serialized + '\n')

def main():
    """Fetch every URL in ``urls``, parse it and append each entry to the output file."""
    for page_url in urls:
        page_html = get_one_page(page_url)
        for record in parse_one_page(page_html):
            write_to_file(record)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
上一篇 下一篇

猜你喜欢

热点阅读