Python 3: Scraping Download Links from a Movie Site

Without further ado, the code is below. The comments explain the details; the site's DOM structure is for you to analyze yourself (the key assumptions the selectors make are sketched after the listing).


    # -*- coding: utf-8 -*-

    __author__ = 'fengxin'

    import logging
    import urllib3
    from bs4 import BeautifulSoup
    import xlwt

    # Logging setup
    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(filename='pyTestHtty.log', level=logging.DEBUG, format=LOG_FORMAT)

    # Maps movie title -> detail-page URL
    move_url_name = {}

    # Proxy server URL
    proxy_url = 'http://proxy.baibianip.com:8000'

    # Excel workbook and worksheet (via xlwt)
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')

    # Request headers copied from an ordinary browser session
    headers={'Accept': '',
             'Accept-Encoding': 'gzip, deflate',
             'Accept-Language': 'zh-CN,zh;q=0.9',
             'Cache-Control': 'max-age=0',
             'Connection': 'close',
             'Cookie': '',
             'Host': 'www.dytt8.net',
             'If-Modified-Since': 'Fri, 12 Oct 2018 03:22:06 GMT',
             'If-None-Match': '',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}


    # Build a proxy manager (the service behind proxy_url may be a paid one).
    # Note: a new ProxyManager is created for every request; wasteful, but fine for a crawl this size.
    def get_proxy():
        return urllib3.ProxyManager(proxy_url)


    # URL of the nth page of the movie list
    def get_move_page_url(page):
        return 'http://www.dytt8.net/html/gndy/dyzz/list_23_'+str(page)+'.html'


    # Fetch a page through the proxy and return it as a BeautifulSoup object (None on failure)
    def get_move_page_html(page_url):
        try:
            proxy = get_proxy()
            result = proxy.request('GET', page_url, headers=headers)
            if result.status == 200:
                logging.info("get success url:%s", page_url)
                # the site serves gb2312-encoded pages
                return BeautifulSoup(result.data.decode('gb2312', 'ignore'), "html.parser")
            else:
                logging.error("get error url:%s,error code is:%s", page_url, str(result.status))
                return None
        except Exception:
            logging.error("get error url:%s", page_url, exc_info=True)
            return None


    # Extract movie titles and detail-page URLs from a list page
    def get_move_url_and_desc(soup):
        if soup is not None:
            domain = "http://www.dytt8.net"
            tables = soup.find_all('table', 'tbspan')
            for table in tables:
                # the sixth <td> holds the title; the first <a> holds the relative detail link
                move_url_name[table.find_all('td')[5].text] = domain + table.a.get('href')


    # Extract the download URLs (magnet + ftp) from a movie detail page
    def get_move_down_load_url(soup):
        # magnet link URL
        magnet_url = ''
        # ftp download URL
        down_url = ''
        try:
            # the #Zoom block holds two links: the magnet link first, then the ftp link
            a_tag_list = soup.find_all(id='Zoom')[0].find_all('a')
            magnet_url = a_tag_list[0].get('href')
            down_url = a_tag_list[1].get('href')
        except Exception:
            logging.error("getMoveDownloadUrl error", exc_info=True)
        return magnet_url, down_url


    # Entry point
    if __name__ == '__main__':
        page_num = 181
        # crawl list pages 1..180, collecting every movie's detail-page URL
        for now_page_num in range(1, page_num):
            url = get_move_page_url(now_page_num)
            get_move_url_and_desc(get_move_page_html(url))
        row = 0
        print(len(move_url_name))
        # visit each detail page and write the title plus its links to the worksheet
        for key in move_url_name:
            url = get_move_down_load_url(get_move_page_html(move_url_name[key]))
            print(key)
            worksheet.write(row, 0, label=key)
            print(url[0])
            worksheet.write(row, 1, label=url[0])
            print(url[1])
            worksheet.write(row, 2, label=url[1])
            row += 1
        workbook.save(r'move.xls')
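
The selectors above encode a few assumptions about the site's markup: each movie on a list page sits in a `<table class="tbspan">` whose sixth `<td>` holds the title and whose first `<a>` holds the relative detail link, and each detail page has a `#Zoom` block containing a magnet link followed by an ftp link. The following self-contained sketch runs the same list-page selectors against a hand-written fragment (the fragment is a reconstruction for illustration, not markup captured from dytt8.net):

    from bs4 import BeautifulSoup

    # hand-written stand-in for one movie entry on a list page
    sample_list_page = """
    <table class="tbspan">
      <tr><td></td><td></td><td></td></tr>
      <tr><td><a href="/html/gndy/dyzz/20181012/demo.html">link</a></td></tr>
      <tr><td></td><td>Demo Movie 2018</td></tr>
    </table>
    """

    soup = BeautifulSoup(sample_list_page, "html.parser")
    table = soup.find_all('table', 'tbspan')[0]
    print(table.find_all('td')[5].text)  # sixth <td>  -> 'Demo Movie 2018'
    print(table.a.get('href'))           # first <a>   -> '/html/gndy/dyzz/20181012/demo.html'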

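If you don't have a proxy service to point `proxy_url` at, `get_proxy()` can be swapped for a direct connection pool. A minimal sketch, assuming www.dytt8.net is reachable from your network without a proxy (`urllib3.PoolManager` exposes the same `request()` interface that `get_move_page_html()` relies on):

    def get_proxy():
        # direct connection instead of a proxy; PoolManager responses expose
        # the same .status and .data attributes used in get_move_page_html()
        return urllib3.PoolManager()

With either variant in place, a single detail page can be spot-checked before launching the full 180-page crawl. The URL below is a hypothetical placeholder; substitute a real detail link taken from a list page:

    detail_soup = get_move_page_html('http://www.dytt8.net/html/gndy/dyzz/20181012/demo.html')  # hypothetical URL
    print(get_move_down_load_url(detail_soup))  # -> (magnet_url, ftp_url)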
