简单使用re模块爬取糗事百科文字

2019-01-31  本文已影响0人  徒手說梦话
# 爬取糗事百科文字
import re
import requests

def data_capture(url, timeout=10):
    """Download one listing page and print the text of every post found on it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the script
            indefinitely.

    Returns:
        None. Each extracted post is printed on its own line with ``<br/>``
        tags removed and surrounding whitespace stripped.
    """
    headers = {
        # Browser-like User-Agent string; note the product token must be
        # spelled "Mozilla/5.0" in full.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    text = response.text
    # re.S (== re.DOTALL) lets ".*?" cross the newlines between the content
    # <div> and the <span> that holds the post text.
    contents = re.findall(
        r'<div class="content">.*?<span>(.*?)</span>', text, re.S
    )
    for content in contents:
        # "<br/>" is a plain literal, so str.replace is enough here.
        content = content.replace('<br/>', '')
        print(content.strip())  # drop leading/trailing spaces and newlines

def spider(max_pages=1):
    """Crawl the text listing pages and print the posts on each of them.

    Args:
        max_pages: How many listing pages to fetch, counting up from page 1.
            The default of 1 fetches only the first page.

    Returns:
        None. Output is produced by ``data_capture`` for every page visited.
    """
    url_template = 'https://www.qiushibaike.com/text/page/{}/'
    for page in range(1, max_pages + 1):
        data_capture(url_template.format(page))

# Run the crawl only when this file is executed as a script, so importing
# the module does not trigger network requests.
if __name__ == "__main__":
    spider()
上一篇 下一篇

猜你喜欢

热点阅读