# Regex example: scraping text posts from Qiushibaike (糗事百科)
import re
import urllib.request
import urllib.parse
def main():
    """Interactively scrape a range of Qiushibaike text-listing pages.

    Prompts for a start and an end page number (both inclusive). For every
    page in that range it builds the request, fetches it, and passes the
    response to ``get_data``, which extracts the text and stores it.

    Raises:
        ValueError: if either prompt is answered with a non-integer.
    """
    url = 'https://www.qiushibaike.com/textnew/page/'
    start = int(input("开始页码:"))
    end = int(input("结束页码:"))
    # When start > end the range is empty and nothing is fetched.
    for page in range(start, end + 1):
        # Build the request object for this page.
        request = get_request(url, page)
        # Fetch the page; the ``with`` block closes the HTTP response once
        # the page has been processed so connections are not leaked while
        # looping over many pages.
        with get_response(request) as response:
            # Extract the text via regex and store it.
            get_data(response)
def get_request(url, page):
    """Build the HTTP request for a single listing page.

    Args:
        url: Base URL; the page number is appended to it verbatim, so it
            should already end with the separator it needs (e.g. ``/``).
        page: Page number; converted with ``str()`` before being appended.

    Returns:
        A ``urllib.request.Request`` for ``url + str(page)`` that carries a
        desktop-browser ``User-Agent`` header.
    """
    # Build the page URL without rebinding the caller-visible parameter.
    page_url = url + str(page)
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    return urllib.request.Request(page_url, headers=headers)
def get_response(request, timeout=None):
    """Open *request* and return the response object.

    Args:
        request: A ``urllib.request.Request`` (or URL string) to open.
        timeout: Optional timeout in seconds for the blocking network
            operations. When omitted, ``urlopen`` is called exactly as
            before, i.e. with its own default timeout behaviour.

    Returns:
        The object returned by ``urllib.request.urlopen``. The caller is
        responsible for closing it (it can be used as a context manager).

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on failure.
    """
    if timeout is None:
        # Keep the historical call shape so existing callers see no change.
        return urllib.request.urlopen(request)
    return urllib.request.urlopen(request, timeout=timeout)
# Matches the first <span> inside each <div class="content"> block; re.S lets
# "." span newlines because the markup is spread over several lines.
_CONTENT_PATTERN = re.compile(r'<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)


def get_data(response):
    """Extract the post texts from a page and append them to ``qiutu.html``.

    Args:
        response: Any object whose ``read()`` returns the page as UTF-8
            encoded bytes (e.g. the result of ``urllib.request.urlopen``).

    Returns:
        The list of captured texts in page order; empty when nothing
        matched. Each text is also appended to ``qiutu.html`` (UTF-8),
        followed by a newline.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    html = response.read().decode("utf8")
    matches = _CONTENT_PATTERN.findall(html)
    # Open the output file once per page rather than once per match, and
    # only when there is something to write so that a page without matches
    # does not create an empty file.
    if matches:
        with open('qiutu.html', 'a', encoding="utf-8") as fp:
            fp.writelines(text + '\n' for text in matches)
    return matches
if __name__ == '__main__':
    # Start the interactive scraper only when this file is run as a script,
    # not when it is imported as a module.
    main()