parse_baidu_m_news

2022-09-01  本文已影响0人  是东东
import json

from lxml import etree


def _em_keywords(fragment):
    """Return the text of every <em> element found in an HTML fragment.

    Returns an empty list when the fragment is None, empty or
    whitespace-only, because etree.HTML cannot parse such input.
    """
    if not fragment or not fragment.strip():
        return []
    root = etree.HTML(fragment)
    if root is None:
        return []
    return root.xpath('//em/text()')


def _strip_em(fragment):
    """Remove literal <em>/</em> markers from a string.

    Falsy values (None, '') are returned unchanged so a missing field
    stays missing instead of raising AttributeError.
    """
    if not fragment:
        return fragment
    return fragment.replace('<em>', '').replace('</em>', '')


# `response` is not defined in this snippet; it must be provided by the
# surrounding code (presumably a requests-style response -- confirm).
text = response.content.decode('utf-8')
tree = etree.HTML(text)

# The result list is embedded as JSON inside a <script id="atom-data-..."> tag.
script = ''.join(tree.xpath('//script[contains(@id,"atom-data-")]/text()'))
print(script)

# No matching script tag yields '', which json.loads would reject.
oo = json.loads(script) if script.strip() else {}

# `or {}` / `or []` also cover keys that are present but null.
details = (oo.get('data') or {}).get('list') or []
for detail in details:
    rank = detail.get('index')
    url = (
        detail.get('titleurl')
        or detail.get('url')
        or (detail.get('params') or {}).get('originUrl')
    )
    img_url = detail.get('img') or detail.get('imgsrcurl')
    title = detail.get('title')
    desc = detail.get('abstract')

    # Keywords are the <em>-wrapped spans in the title and abstract.
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (list(set(...)) was not).
    keyword = list(dict.fromkeys(_em_keywords(title) + _em_keywords(desc)))

    title = _strip_em(title)
    desc = _strip_em(desc)
    press_time = detail.get('posttime')
    subsitename = detail.get('subsitename')
上一篇下一篇

猜你喜欢

热点阅读