Scraping Qiushibaike (糗事百科) data with requests + bs4
2017-07-07
CaesarsTesla
import requests
from bs4 import BeautifulSoup
import time

# Request headers sent with every page fetch
headers = {}
headers['Accept-Encoding'] = 'gzip, deflate'
headers['Referer'] = 'https://www.qiushibaike.com/'
headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0.2 Safari/602.3.12'
headers['Accept-Language'] = 'zh-cn'

def save_file(content):
    # Append one joke to qsbk.txt
    with open('qsbk.txt', 'a', encoding='utf-8') as file:
        file.writelines(content)

i = 1  # page numbers on the site start at 1
while True:
    url = 'https://www.qiushibaike.com/8hr/page/' + str(i) + '/?s=4986156'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html5lib')
    # Each joke sits in a div.content; its text is in the first span
    results = soup.find_all('div', class_='content')
    for result in results:
        span = result.select('span')
        print(span[0].text + '\n' + '\n')
        save_file(span[0].text + '\n')
    i += 1
    time.sleep(4)  # wait 4 seconds before fetching the next page
Here I have the script automatically fetch the next page of data every 4 seconds and save it, and the final result looks like this. (Of course, you shouldn't really do it this way.)
[Screenshot of the scraped output: WechatIMG85.jpeg]
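Since the script above runs an unbounded while True loop, a bounded variant may be safer to reproduce. The following is a minimal sketch, not part of the original post: it assumes a hypothetical MAX_PAGES cap and the same qsbk.txt output file, fetches a fixed number of pages, skips a page whose request fails, and keeps the 4-second pause between requests.

import time

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.3.12 '
                  '(KHTML, like Gecko) Version/10.0.2 Safari/602.3.12',
    'Referer': 'https://www.qiushibaike.com/',
}
MAX_PAGES = 5  # hypothetical cap instead of an endless loop

def scrape_page(page):
    # Fetch one listing page and return the joke texts found in div.content spans
    url = 'https://www.qiushibaike.com/8hr/page/' + str(page) + '/?s=4986156'
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html5lib')
    return [div.select('span')[0].text
            for div in soup.find_all('div', class_='content')
            if div.select('span')]

with open('qsbk.txt', 'a', encoding='utf-8') as out:
    for page in range(1, MAX_PAGES + 1):
        try:
            for joke in scrape_page(page):
                print(joke + '\n')
                out.write(joke + '\n')
        except requests.RequestException as err:
            print('page', page, 'failed:', err)  # skip a bad page instead of crashing
        time.sleep(4)  # stay polite between page fetches

Calling raise_for_status() and catching requests.RequestException means a temporary failure on one page is reported and skipped rather than crashing the run and losing what was already written to qsbk.txt.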