parse_baidu_m_news
2022-09-01 本文已影响0人
是东东
import json

from lxml import etree

# Extract the result list that the page embeds as JSON inside a <script> tag
# whose id contains "atom-data-", then pull the display fields out of each
# entry.  `response` must already be defined by the surrounding code
# (presumably an HTTP response object exposing `.content` bytes -- confirm).
text = response.content.decode('utf-8')
tree = etree.HTML(text)
# etree.HTML() may hand back None for an unparsable/empty document.
script_parts = (
    tree.xpath('//script[contains(@id,"atom-data-")]/text()')
    if tree is not None
    else []
)
# NOTE(review): joining assumes a single matching script tag; several matches
# would concatenate into invalid JSON -- confirm against real pages.
script = ''.join(script_parts)
print(script)

# A missing script tag yields an empty string, which json.loads() rejects;
# treat that case as "no results" instead of crashing.
oo = json.loads(script) if script.strip() else {}
# `or {}` / `or []` also cover keys that are present but hold null.
details = (oo.get('data') or {}).get('list') or []

for detail in details:
    rank = detail.get('index')
    # First non-empty candidate wins; `params` may be absent or null.
    url = (
        detail.get('titleurl')
        or detail.get('url')
        or (detail.get('params') or {}).get('originUrl')
    )
    img_url = detail.get('img') or detail.get('imgsrcurl')

    # Normalise missing/null text to '' so the parsing and replace() calls
    # below never receive None.
    title = detail.get('title') or ''
    desc = detail.get('abstract') or ''

    # Collect the text of every <em> element found in the title and the
    # abstract, de-duplicated (resulting order is unspecified).
    keyword = set()
    for fragment in (title, desc):
        if not fragment.strip():
            # Nothing to parse; lxml cannot build a tree from an empty string.
            continue
        fragment_tree = etree.HTML(fragment)
        if fragment_tree is not None:
            keyword.update(fragment_tree.xpath('//em/text()'))
    keyword = list(keyword)

    # Strip the <em> markup now that the keywords have been captured.
    title = title.replace('<em>', '').replace('</em>', '')
    desc = desc.replace('<em>', '').replace('</em>', '')

    press_time = detail.get('posttime')
    subsitename = detail.get('subsitename')