# Scrape information from news web pages (Sina news).
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json
# URL template for Sina's comment-count API; "{}" is filled with the news id.
# BUG FIX: the original string had a stray backslash before "h_size"
# ("...t_size=3&\h_size=10"), which put a literal "\h" into the query
# string and corrupted the parameter.
commentURL = ('http://comment.sina.com.cn/page/info?version=1&'
              'format=json&channel=gn&newsid=comos-{}&group=0&compress=0&'
              'ie=gbk&oe=gbk&page=1&page_size=3&t_size=3&h_size=10')
def getCommentCounts(newsurl):
    """Return the total comment count for a Sina news article.

    Args:
        newsurl: article URL containing the news id in the form
            ".../doc-i<ID>.shtml".

    Returns:
        int: total comment count reported by the comment API
        (result.count.total in the JSON response).

    Raises:
        ValueError: if no news id can be extracted from the URL
            (the original code raised AttributeError on ``None.group``).
    """
    # Escape the dot so ".shtml" is matched literally.
    m = re.search(r'doc-i(.*)\.shtml', newsurl)
    if m is None:
        raise ValueError('cannot extract news id from URL: %s' % newsurl)
    newsid = m.group(1)
    # Fill the news id into the comment-API URL template.
    api_url = commentURL.format(newsid)
    res = requests.get(api_url)
    data = json.loads(res.text)
    return data['result']['count']['total']
def getNewsDetail(newsurl):
    """Fetch a Sina news page and extract its metadata and body text.

    Args:
        newsurl: URL of the news article page.

    Returns:
        dict with keys: title (str), date (datetime), source,
        article (str), author (str), comments (int).
    """
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Article title.
    result['title'] = soup.select('.main-title')[0].text
    # Publication time, e.g. "2019年02月21日 10:30".
    strdate = soup.select('.date')[0].text.strip()
    result['date'] = datetime.strptime(strdate, '%Y年%m月%d日 %H:%M')
    # News source: first anchor inside the date/source bar.
    result['source'] = soup.select('.date-source a')[0].contents[0]
    # Body paragraphs; the last <p> is the editor line, so drop it.
    # BUG FIX: the original called .text on the *list* returned by
    # select(...)[:-1], which raises AttributeError — join the texts.
    result['article'] = ' '.join(
        p.text.strip() for p in soup.select('.article p')[:-1])
    # Author: remove the "责任编辑:" prefix explicitly.
    # BUG FIX: str.strip('责任编辑:') treats its argument as a *set of
    # characters* and can strip unrelated characters from both ends.
    author_text = soup.select('.show_author')[0].text.strip()
    if author_text.startswith('责任编辑:'):
        author_text = author_text[len('责任编辑:'):]
    result['author'] = author_text
    # Comment count via the comment API.
    result['comments'] = getCommentCounts(newsurl)
    return result
def getNewsUrl(url):
    """Fetch a Sina roll-news API page and scrape every listed article.

    Args:
        url: roll-news API URL returning JSON with a result.data list,
            each entry carrying a 'url' key.

    Returns:
        list: one getNewsDetail() dict per article on the page.
    """
    # BUG FIX: the original initialised a dict ({}) and then called
    # .append() on it, which raises AttributeError — use a list.
    newsDetails = []
    res = requests.get(url)
    res.encoding = 'utf-8'
    entries = json.loads(res.text)['result']['data']
    for entry in entries:
        newsDetails.append(getNewsDetail(entry['url']))
    return newsDetails
# BUG FIX: the original guard compared two string literals
# ("__name__" == "__main__"), which is always False, so the script
# body never executed.  Also removed a stray backslash that corrupted
# the "page=2" query parameter in the roll-API URL.
if __name__ == "__main__":
    # For parts 1-2 of the tutorial, a single article can be tested with:
    #   news = ('https://news.sina.com.cn/c/xl/2019-02-21/'
    #           'doc-ihrfqzka7703333.shtml')
    #   getCommentCounts(news)
    newsurl = ('https://feed.sina.com.cn/api/roll/get?pageid=121&'
               'lid=1356&num=20&versionNumber=1.2.4&page=2&encode=utf-8')
    getNewsUrl(newsurl)
# Based on the video course:
# https://study.163.com/course/courseMain.htm?courseId=1003285002