制作一个小爬虫,并将抓取到的资料导出为 xlsx 文件

2019-08-07  本文已影响0人  远方不会远

import requests,json
from bs4 import BeautifulSoup
# Crawl the article listing service page by page and collect the details of
# every listed article into `tt`.
# NOTE: getnewsDetail() must already be defined when this loop runs.

# Article page URL template; filled with the `id` and `op` fields of a listing entry.
url = ('https://www.pintu360.com/a{}.html?{}')
# One detail dict per scraped article.
tt = []
for i in range(1, 621):  # listing pages 1 through 620
    # Form parameters of the POST request; only `pageNumber` varies per iteration.
    payload = {'fnName': 'getArticleList', 'type': 'recommend', 'id': '0', 'pageNumber': i, 'duration': 'quarter'}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.post('https://www.pintu360.com/service/ajax_article_service.php', data=payload, timeout=10)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    r.raise_for_status()
    # Decode the JSON body; it is iterated below as a sequence of article entries.
    jd = json.loads(r.text)
    for ed in jd:
        # Build the article page URL from the listing entry.
        newurl = url.format(ed['id'], ed['op'])
        # Fetch and parse the article page itself.
        rents = getnewsDetail(newurl)
        # The image URL comes from the listing entry, not from the article page.
        rents['imgUrl'] = ed['imgUrl']
        tt.append(rents)

图片.png

def getnewsDetail(newurl):
    """Fetch one article page and extract its main fields.

    Args:
        newurl: URL of the article page to download.

    Returns:
        A dict with the keys 'title', 'time', 'author' and 'note' (strings)
        and 'article' (the elements matched by '.article-content .text p',
        without the last one).

    Raises:
        IndexError: if one of the expected elements is not found in the page.
        requests.RequestException: if the download fails or times out.
    """
    result = {}
    # A timeout keeps a single unresponsive page from blocking the caller forever.
    response = requests.get(newurl, timeout=10)
    # Force UTF-8 decoding instead of relying on the guessed encoding.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    result['title'] = soup.select('.title')[0].text
    result['time'] = soup.select('.article-date span')[0].text
    # Remove only the leading "文/" label. str.strip('文/') would strip those
    # characters from BOTH ends and could eat the end of an author name.
    result['author'] = _drop_label(soup.select('.article-author')[0].contents[0], '文/')
    # Same reasoning for the leading "摘要:" label of the summary.
    result['note'] = _drop_label(soup.select('#note')[0].text, '摘要:')
    # All matched paragraphs except the last one.
    result['article'] = soup.select('.article-content .text p')[:-1]
    return result


def _drop_label(text, label):
    """Return `text` trimmed of surrounding whitespace and of a leading `label`."""
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


图片.png

import pandas
# Turn the list of per-article dicts into a table (one row per article).
df = pandas.DataFrame(data=tt)
# Bare expression: in a notebook cell this displays the table.
df


图片.png

# Write the collected table to an Excel workbook in the working directory.
df.to_excel(excel_writer='news.xlsx')

图片.png

另一个方法

from time import sleep

# Sleep: pause execution for two seconds.
sleep(2)

图片.png
上一篇下一篇

猜你喜欢

热点阅读