# Spider:爬取电影天堂部分信息

from urllib.parse import urljoin

import requests
from lxml import etree

# Fetch the listing index page and collect the links it contains.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early instead of parsing an HTTP error page.
res = requests.get(
    'https://www.ygdy8.net/html/gndy/china/index.html',
    timeout=10,
)
res.raise_for_status()

# Force GBK decoding of the body instead of relying on the encoding that
# requests guesses from the response headers.
res.encoding = 'gbk'
con = res.text

html = etree.HTML(con)

# Take the href of the second <a> inside every table of the content
# container; these are the links followed one by one further down the script.
hrefs = html.xpath('//div[@class="co_content8"]//table//a[2]/@href')

# Labels that introduce the fields extracted from a detail page.
# \u3000 is the ideographic (full-width) space used as padding in the labels.
_NAME_MARK = '◎片\u3000\u3000名'
_YEAR_MARK = '◎年\u3000\u3000代'
_ACTORS_MARK = '◎主\u3000\u3000演'
# Every field label starts with this symbol, so it also marks the end of the
# multi-line cast list.
_FIELD_PREFIX = '◎'


def _parse_movie_info(texts):
    """Extract title, year and cast from the text nodes of a detail page.

    Args:
        texts: list of strings, one per text node of the info paragraph.

    Returns:
        A ``(name, year, actors)`` tuple. ``name`` and ``year`` are strings
        (empty when the corresponding label is absent) and ``actors`` is a
        list of non-empty actor strings (empty when no cast label is found).
    """
    # Start from empty values on every call so that a page lacking a field
    # never inherits the value parsed from a previous page.
    name = ''
    year = ''
    actors = []

    for index, text in enumerate(texts):
        if _NAME_MARK in text:
            name = text.replace(_NAME_MARK, '').strip()

        if _YEAR_MARK in text:
            year = text.replace(_YEAR_MARK, '').strip()

        if _ACTORS_MARK in text:
            # The first actor shares the line with the label; the remaining
            # ones follow as separate text nodes until the next field label.
            actors = [text.replace(_ACTORS_MARK, '').strip()]
            for following in texts[index + 1:]:
                if _FIELD_PREFIX in following:
                    break
                actors.append(following.strip())

    # Whitespace-only text nodes would otherwise show up as empty entries
    # (and as ",," in the joined string).
    actors = [actor for actor in actors if actor]
    return name, year, actors


data_list = []
for href in hrefs:
    # urljoin copes with root-relative ("/html/..."), relative and absolute
    # hrefs, where plain concatenation would produce "//" or a broken URL.
    detail_url = urljoin('https://www.ygdy8.net/', href)
    # Bound the wait so one stalled page cannot hang the whole crawl.
    detail_res = requests.get(detail_url, timeout=10)
    detail_res.encoding = 'gbk'
    detail_con = detail_res.text

    detail_html = etree.HTML(detail_con)

    # Not every page is guaranteed to have an image inside the Zoom
    # container; fall back to an empty string instead of raising IndexError.
    imgs = detail_html.xpath('//div[@id="Zoom"]//img[1]/@src')
    img = imgs[0] if imgs else ''
    texts = detail_html.xpath('//div[@id="Zoom"]//p[1]/text()')

    name, year, actors = _parse_movie_info(texts)

    data_dict = {
        'img': img,
        'name': name,
        'year': year,
        'actors': ','.join(actors),
    }

    data_list.append(data_dict)

# 你可能感兴趣的:(技术信息,大数据)