parse_baidu_m_news

import json

from lxml import etree


def _em_texts(fragment):
    """Return the text nodes found inside ``<em>`` elements of *fragment*.

    A missing, empty or whitespace-only fragment yields an empty list
    instead of being handed to the HTML parser.
    """
    if not fragment or not fragment.strip():
        return []
    try:
        root = etree.HTML(fragment)
    except etree.XMLSyntaxError:
        # NOTE(review): defensive -- the parser may reject a fragment it
        # considers an empty document; confirm against the installed lxml.
        return []
    if root is None:
        return []
    return root.xpath('//em/text()')


def _strip_em(fragment):
    """Return *fragment* with the literal ``<em>`` / ``</em>`` tags removed.

    Falsy values (``None``, ``''``) are returned unchanged.
    """
    if not fragment:
        return fragment
    return fragment.replace('<em>', '').replace('</em>', '')


# `response` is not defined in this snippet; the surrounding code must supply
# an object whose `.content` is the raw page bytes.
text = response.content.decode('utf-8')
tree = etree.HTML(text)

# The result list is embedded as JSON in <script id="...atom-data-..."> tags.
script = ''
if tree is not None:
    script = ''.join(tree.xpath('//script[contains(@id,"atom-data-")]/text()'))
print(script)  # debug dump of the raw embedded JSON

# No matching <script> leaves `script` empty; treat that as "no results"
# rather than letting json.loads fail on an empty string.
oo = json.loads(script) if script.strip() else {}

# `or {}` / `or []` also cover keys that are present but null in the JSON,
# which a plain `.get(key, {})` default does not.
details = (oo.get('data') or {}).get('list') or []
for detail in details:
    rank = detail.get('index')
    # First non-empty link wins: titleurl, then url, then params.originUrl.
    url = (
        detail.get('titleurl')
        or detail.get('url')
        or (detail.get('params') or {}).get('originUrl')
    )
    img_url = detail.get('img') or detail.get('imgsrcurl')
    title = detail.get('title')
    desc = detail.get('abstract')

    # Keywords are the <em>-wrapped terms in the title and the abstract.
    keywords1 = _em_texts(title)
    keywords2 = _em_texts(desc)
    # De-duplicate while keeping first-seen order (a set would reorder
    # the keywords arbitrarily between runs).
    keyword = list(dict.fromkeys([*keywords1, *keywords2]))

    # Strip the highlight markup now that the keywords have been extracted.
    title = _strip_em(title)
    desc = _strip_em(desc)

    press_time = detail.get('posttime')
    subsitename = detail.get('subsitename')

你可能感兴趣的:(parse_baidu_m_news)