抽取新闻相关信息
2018-08-31 本文已影响0人
测试媛617
- 抽取新闻标题
import requests
from bs4 import BeautifulSoup
# Download the article page. The explicit timeout keeps the script from
# hanging forever if the server never answers.
res = requests.get('http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml', timeout=10)
# Stop on an HTTP error status instead of parsing an error page.
res.raise_for_status()
# Decode the body as UTF-8 rather than relying on the guessed encoding.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")
# Get the news title: the text of the first element matching the
# ".main-title" CSS selector (IndexError if nothing matches).
title = soup.select('.main-title')[0].text
print(title)
- 抽取新闻时间
import requests
from bs4 import BeautifulSoup
# Download the article page. The explicit timeout keeps the script from
# hanging forever if the server never answers.
res = requests.get('http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml', timeout=10)
# Stop on an HTTP error status instead of parsing an error page.
res.raise_for_status()
# Decode the body as UTF-8 rather than relying on the guessed encoding.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")
# Get the publication time: the text of the second child node of the first
# ".date-source" element.
# NOTE(review): index 1 presumably skips a leading whitespace text node so
# that it lands on the date element -- confirm against the page markup.
time = soup.select('.date-source')[0].contents[1].text
print(time)
- 抽取新闻内文
import requests
from bs4 import BeautifulSoup
# Download the article page. The explicit timeout keeps the script from
# hanging forever if the server never answers.
res = requests.get('http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml', timeout=10)
# Stop on an HTTP error status instead of parsing an error page.
res.raise_for_status()
# Decode the body as UTF-8 rather than relying on the guessed encoding.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")
# Collect every <p> inside the element with id "article", dropping the last
# one.
# NOTE(review): the final paragraph is presumably an editor credit rather
# than article text -- confirm against the page markup.
paragraphs = soup.select('#article p')[:-1]
# Join the stripped paragraph texts with '@' as a visible separator.
print('@'.join(p.text.strip() for p in paragraphs))
- 抽取新闻评论数
import requests
import json

# Query the comment service for this article. The explicit timeout keeps the
# script from hanging forever if the server never answers.
res_comment = requests.get('http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-hikcahf2103336&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1', timeout=10)
# Stop on an HTTP error status instead of trying to parse an error body.
res_comment.raise_for_status()
res_comment.encoding = 'utf-8'
# Parse the response body as JSON.
jd = json.loads(res_comment.text)
# Get the comment count from result -> count -> show.
comment_num = jd['result']['count']['show']
print(comment_num)
- 抽取新闻id
def extract_newsid(url):
    """Return the news id embedded in a Sina article URL.

    The id is the last path segment of *url* with the literal ``doc-i``
    prefix and the literal ``.shtml`` suffix removed. Either affix is only
    removed when it is actually present.

    ``str.lstrip``/``str.rstrip`` are deliberately not used: they strip any
    run of the given *characters*, not the exact substring, and would eat
    into ids that start with one of ``d o c - i`` or end with one of
    ``. s h t m l``.
    """
    prefix = 'doc-i'
    suffix = '.shtml'
    filename = url.split('/')[-1]
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)]
    if filename.startswith(prefix):
        filename = filename[len(prefix):]
    return filename


newsurl = 'http://news.sina.com.cn/c/gat/2018-08-29/doc-ihikcahf2103336.shtml'
newsid = extract_newsid(newsurl)
print(newsid)