# Without further ado — the code is below; read the comments and inspect the site's DOM structure yourself.
# -*- coding: utf-8 -*-
__author__ = 'fengxin'
import logging
import urllib3
from bs4 import BeautifulSoup
import xlwt
# Logging: timestamped level+message lines written to pyTestHtty.log at DEBUG level.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(filename='pyTestHtty.log', level=logging.DEBUG, format=LOG_FORMAT)
# Shared accumulator: movie title -> detail-page URL (filled by get_move_url_and_desc,
# consumed by the __main__ loop).
move_url_name = {}
# Proxy endpoint used for all HTTP requests (NOTE(review): looks like a paid proxy service).
proxy_url = 'http://proxy.baibianip.com:8000'
# Excel output: one workbook with a single sheet; rows are written in the __main__ loop.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')
# Browser-like request headers copied from a real Chrome session so the site
# serves normal pages (Host is pinned to www.dytt8.net).
headers={'Accept': '',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'close',
'Cookie': '',
'Host': 'www.dytt8.net',
'If-Modified-Since': 'Fri, 12 Oct 2018 03:22:06 GMT',
'If-None-Match': '',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
def get_proxy():
    """Build a urllib3 ProxyManager that routes every request through proxy_url.

    NOTE(review): the proxy endpoint appears to be a paid service — confirm.
    """
    manager = urllib3.ProxyManager(proxy_url)
    return manager
def get_move_page_url(page):
    """Return the URL of list page number *page* of the dytt8.net movie index."""
    return 'http://www.dytt8.net/html/gndy/dyzz/list_23_{0}.html'.format(page)
# Fetch a page through the proxy and parse it into BeautifulSoup.
def get_move_page_html(page_url):
    """Fetch *page_url* via the proxy and return it parsed as BeautifulSoup.

    Returns None on a non-200 status or on any request/parse error (both are
    logged). The site declares gb2312, but its pages frequently contain
    characters only valid in GBK (a backward-compatible superset of gb2312),
    so decode as 'gbk' to avoid silently dropping them via errors='ignore'.
    """
    try:
        proxy = get_proxy()
        result = proxy.request('GET', page_url, headers=headers)
        if result.status == 200:
            logging.info("get success url:%s", page_url)
            # 'gbk' ⊇ 'gb2312'; 'ignore' still guards against truly invalid bytes.
            return BeautifulSoup(result.data.decode('gbk', 'ignore'), "html.parser")
        else:
            logging.error("get error url:%s,error code is:%s", page_url, str(result.status))
            return None
    except Exception:
        # Top-level boundary for this request: log with traceback, signal failure.
        logging.error("get error url:%s", page_url, exc_info=True)
        return None
# Collect movie titles and detail-page URLs from a list page.
def get_move_url_and_desc(soup):
    """Scan a list-page soup and record title -> absolute detail URL.

    Each movie sits in a <table class="tbspan">; the 6th <td> holds the title
    and the first <a> holds the relative detail link. Results are stored in
    the module-level dict move_url_name. A None soup (failed fetch) is a no-op.
    """
    if soup is not None:
        domain = "http://www.dytt8.net"
        tables = soup.find_all('table', 'tbspan')
        for table in tables:
            # find_all (not the deprecated findAll alias) — consistent with the rest of the file.
            move_url_name[table.find_all('td')[5].text] = domain + table.a.get('href')
# Pull the download links out of a movie detail page.
def get_move_down_load_url(soup):
    """Return (magnet_url, ftp_url) extracted from a detail-page soup.

    The links live inside the element with id 'Zoom': the first <a> is taken
    as the magnet link, the second as the ftp link. Either value stays '' if
    it cannot be extracted; any failure is logged with a traceback.
    """
    magnet_link = ''
    ftp_link = ''
    try:
        anchors = soup.find_all(id='Zoom')[0].find_all('a')
        # Assign sequentially so a page with only a magnet link still yields it.
        magnet_link = anchors[0].get('href')
        ftp_link = anchors[1].get('href')
    except Exception:
        logging.error("getMoveDownloadUrl error", exc_info=True)
    return magnet_link, ftp_link
# Entry point: crawl list pages, then each detail page, and dump to Excel.
if __name__ == '__main__':
    # Phase 1: walk list pages 1..180, filling move_url_name with title -> detail URL.
    last_page = 181
    for page_num in range(1, last_page):
        list_url = get_move_page_url(page_num)
        get_move_url_and_desc(get_move_page_html(list_url))
    print(len(move_url_name))
    # Phase 2: visit each detail page and write one row per movie:
    # col 0 = title, col 1 = magnet link, col 2 = ftp link.
    for row, (title, detail_url) in enumerate(move_url_name.items()):
        links = get_move_down_load_url(get_move_page_html(detail_url))
        print(title)
        worksheet.write(row, 0, label=title)
        print(links[0])
        worksheet.write(row, 1, label=links[0])
        print(links[1])
        worksheet.write(row, 2, label=links[1])
    workbook.save(r'move.xls')