Getting Started with Python Web Scraping: A Case Study
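A short end-to-end case study: a scraper for the movie site 电影天堂 (dy2018.com) that walks each category listing, follows the pagination links, and extracts the title, rating, dates, genre, synopsis, and download links from every detail page.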


Source code
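The script relies on two third-party packages, lxml and fake_useragent; assuming a standard Python 3 setup, they can be installed with pip install lxml fake-useragent. Everything else comes from the standard library.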

"""
Version 1.1.0
Author lkk
Email [email protected]
date 2018-11-25 18:39
DESC 电影天堂
"""
# https://www.dy2018.com/
from urllib import request
import time
from lxml import html
from fake_useragent import UserAgent


def target_data(url):
    """Fetch a page with a random User-Agent and parse it into an lxml document."""
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,  # rotate the UA string on every request
    }
    start_url = request.Request(url, headers=headers)
    response = request.urlopen(start_url)
    data = response.read()
    # dy2018.com serves GB2312-encoded pages; skip any undecodable bytes
    data_info = data.decode('gb2312', 'ignore')
    docs = html.fromstring(data_info)
    return docs


def core(url):
    """Collect the detail-page links from one listing page."""
    docs = target_data(url)
    links = docs.xpath("//tr[2]/td[2]/b/a[@class='ulink'][2]/@href")
    return links


def get_data(url):
    """Visit every detail page linked from a listing page and print the extracted fields."""
    list_urls = core(url)
    base_url = 'https://www.dy2018.com'
    for i in list_urls:
        last_url = base_url + i
        docs = target_data(last_url)
        name = docs.xpath("//div[@class='co_area2']/div[@class='title_all']/h1/text()")
        score = docs.xpath("//div[@class='position']/span[1]")
        date = docs.xpath("//div[@class='position']/span[@class='updatetime']/text()")
        # positional p[...] indexes are fragile and may shift between detail pages
        play_date = docs.xpath("//div[@id='Zoom']/p[9]/text()")
        classify = docs.xpath("//div[@id='Zoom']/p[6]/text()")
        abstract = docs.xpath("//div[@id='Zoom']/p[31]/text()")
        download_links = docs.xpath("//table[1]//tr/td/anchor/a/@href")
        score_text = score[0].xpath("string(.)") if score else ''
        print(name, score_text, date, play_date, classify, abstract, download_links, last_url)
        # TODO persist the data (see the persistence sketch after the script)


if __name__ == '__main__':
    urls = ['https://www.dy2018.com/3/', 'https://www.dy2018.com/2/',
            'https://www.dy2018.com/0/', 'https://www.dy2018.com/1/',
            'https://www.dy2018.com/4/',  'https://www.dy2018.com/8/',
            'https://www.dy2018.com/5/',  'https://www.dy2018.com/7/',
            'https://www.dy2018.com/15/', 'https://www.dy2018.com/14/',
            'https://www.dy2018.com/html/tv/hytv/index.html', 'https://www.dy2018.com/html/tv/oumeitv/index.html',
            'https://www.dy2018.com/html/tv/rihantv/index.html', 'https://www.dy2018.com/html/zongyi2013/index.html',
            'https://www.dy2018.com/html/dongman/index.html',
           ]
    base_url = 'https://www.dy2018.com'
    for url in urls:
        print(url)
        get_data(url)
        docs = target_data(url)
        while True:
            try:
                # follow the "下一页" (next page) link until the listing runs out
                next_page = docs.xpath("//div[@class='x']/p/a[text()='下一页']/@href")
                if not next_page:
                    break
                next_url = base_url + next_page[0]
                print(next_url)
                time.sleep(5)  # throttle requests to go easy on the server
                get_data(next_url)
                docs = target_data(next_url)
            except Exception as e:
                print(e)
                break
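The TODO in get_data() leaves persistence open. Below is a minimal sketch of that step using the standard-library sqlite3 module; the movies.db file name, the movies table, and the save_movie() helper are illustrative assumptions, not part of the original script.

import sqlite3

DB_PATH = 'movies.db'  # hypothetical local database file


def init_db():
    """Create the movies table on first run."""
    conn = sqlite3.connect(DB_PATH)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS movies (
            name TEXT,
            score TEXT,
            update_date TEXT,
            play_date TEXT,
            classify TEXT,
            abstract TEXT,
            download_links TEXT,
            url TEXT PRIMARY KEY
        )
    """)
    conn.commit()
    conn.close()


def save_movie(name, score, date, play_date, classify, abstract, download_links, url):
    """Insert or update one scraped record; list fields are joined into plain strings."""
    conn = sqlite3.connect(DB_PATH)
    conn.execute(
        "INSERT OR REPLACE INTO movies VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
        (' '.join(name), score, ' '.join(date), ' '.join(play_date),
         ' '.join(classify), ' '.join(abstract), '\n'.join(download_links), url),
    )
    conn.commit()
    conn.close()

With this in place, call init_db() once at startup and add save_movie(name, score_text, date, play_date, classify, abstract, download_links, last_url) right after the print(...) line in get_data().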

 
