抽取新闻相关信息

2018-08-31  本文已影响0人  测试媛617
import requests
from bs4 import BeautifulSoup

# Download the Sina news article page.
# The timeout keeps the script from blocking forever on an unresponsive
# server; raise_for_status() aborts on an HTTP error instead of letting an
# error page be parsed as if it were the article.
res = requests.get(
    'http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml',
    timeout=10,
)
res.raise_for_status()
# Force res.text to be decoded as UTF-8.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")

# Get the news title: the text of the first element matching ".main-title".
# Raises IndexError if the page contains no such element.
title = soup.select('.main-title')[0].text
print(title)
import requests
from bs4 import BeautifulSoup

# Download the Sina news article page.
# The timeout keeps the script from blocking forever on an unresponsive
# server; raise_for_status() aborts on an HTTP error instead of letting an
# error page be parsed as if it were the article.
res = requests.get(
    'http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml',
    timeout=10,
)
res.raise_for_status()
# Force res.text to be decoded as UTF-8.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")

# Get the publication time: the text of the second child node of the first
# ".date-source" element.
# NOTE(review): presumably contents[1] is the date <span>; this depends on
# the page's exact markup (including whitespace nodes) -- confirm.
# NOTE: the name `time` would shadow the stdlib `time` module if that module
# were ever imported in this script.
time = soup.select('.date-source')[0].contents[1].text
print(time)
import requests
from bs4 import BeautifulSoup

# Download the Sina news article page.
# The timeout keeps the script from blocking forever on an unresponsive
# server; raise_for_status() aborts on an HTTP error instead of letting an
# error page be parsed as if it were the article.
res = requests.get(
    'http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml',
    timeout=10,
)
res.raise_for_status()
# Force res.text to be decoded as UTF-8.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")

# Collect the article body: every <p> under the element with id "article"
# except the last one, whitespace-stripped and joined with '@'.
# NOTE(review): presumably the last <p> is an editor credit rather than
# article text -- confirm against the page markup.
paragraphs = soup.select('#article p')[:-1]
print('@'.join(p.text.strip() for p in paragraphs))
import requests
import json

# Query the Sina comment endpoint for this article; `format=json` in the
# query string asks for a JSON response.
# The timeout keeps the script from blocking forever on an unresponsive
# server; raise_for_status() aborts on an HTTP error instead of feeding an
# error body to the JSON parser.
res_comment = requests.get('http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-hikcahf2103336&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1', timeout=10)
res_comment.raise_for_status()
# Force res_comment.text to be decoded as UTF-8.
res_comment.encoding = 'utf-8'
# Parse the JSON response body into Python objects.
jd = json.loads(res_comment.text)
# Get the comment count. Raises KeyError if the response lacks this path.
comment_num = jd['result']['count']['show']
print(comment_num)
def extract_newsid(url):
    """Return the news id embedded in a Sina article URL.

    The id is the last path segment of *url* with a leading ``doc-i`` and a
    trailing ``.shtml`` removed, e.g. ``.../doc-ihikcahf2103336.shtml`` gives
    ``hikcahf2103336``. Either marker is removed only when actually present.

    ``str.lstrip``/``str.rstrip`` are deliberately not used: they strip any
    run of the given *characters*, not a literal prefix/suffix, so an id that
    begins with one of ``d o c - i`` or ends with one of ``. s h t m l`` would
    be silently truncated.
    """
    prefix = 'doc-i'
    suffix = '.shtml'
    filename = url.split('/')[-1]
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)]
    if filename.startswith(prefix):
        filename = filename[len(prefix):]
    return filename


newsurl = 'http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml'
newsid = extract_newsid(newsurl)
print(newsid)
上一篇 | 下一篇

猜你喜欢

热点阅读