前段时间,Python老师让我们做一个程序,想来想去,还是做最能代表Python的爬虫比较好
加上最近一直在电影天堂上下载电影,于是就决定做这个电影天堂微型客户端了
界面不是自己手写的,而是用PyQt5的Qt Designer设计的
很明显的,左边一个列表放目录,右边是详情,上方是菜单栏
最终生成.py文件,开始修改
下面代码是自动生成的(我小改了下)
# UI layout
def setupUi(self, MainWindow):
    """Create every widget, menu and action of the main window.

    The catalogue list sits on the left, the poster in the middle, the
    caption/value label pairs on the right, and the menu bar on top.
    Widgets are created in a fixed sequence so that their sibling order
    under the central widget stays the same.

    Args:
        MainWindow: The window object that receives the central widget,
            menu bar, status bar and actions.
    """
    MainWindow.setObjectName("MainWindow")
    MainWindow.resize(1102, 832)

    central = QtWidgets.QWidget(MainWindow)
    central.setObjectName("centralwidget")
    self.centralwidget = central

    def place(attr, widget_cls, x, y, width, height):
        """Create a child of the central widget and bind it to self.<attr>."""
        widget = widget_cls(central)
        widget.setGeometry(QtCore.QRect(x, y, width, height))
        widget.setObjectName(attr)
        setattr(self, attr, widget)
        return widget

    def caption(attr, y):
        """Fixed-size caption label in the left detail column."""
        return place(attr, QtWidgets.QLabel, 710, y, 81, 51)

    def value(attr, y, height=51):
        """Initially empty, word-wrapping value label in the right detail column."""
        label = place(attr, QtWidgets.QLabel, 810, y, 241, height)
        label.setText("")
        label.setWordWrap(True)
        return label

    # Catalogue list on the left-hand side.
    place("listView", QtWidgets.QListView, 0, 0, 271, 731)

    caption("label1", 10)
    value("label1_is", 10)

    # Poster area, showing a placeholder picture until a movie is selected.
    poster = place("images", QtWidgets.QLabel, 280, 10, 391, 621)
    poster.setText("")
    poster.setPixmap(QtGui.QPixmap("1.png"))
    poster.setScaledContents(True)

    caption("label2", 50)
    value("label2_is", 50)

    for attr, y in (("label3", 90), ("label4", 130), ("label5", 170),
                    ("label6", 210), ("label7", 250), ("label8", 290)):
        caption(attr, y)
    place("label9", QtWidgets.QLabel, 280, 630, 81, 51)

    place("btn_download", QtWidgets.QPushButton, 480, 740, 131, 41)

    for attr, y in (("label3_is", 90), ("label4_is", 130), ("label5_is", 170),
                    ("label6_is", 210), ("label7_is", 250)):
        value(attr, y)

    # The last value label is much taller and top-aligned.
    tall_value = value("label8_is", 310, height=311)
    tall_value.setAlignment(
        QtCore.Qt.AlignLeading | QtCore.Qt.AlignLeft | QtCore.Qt.AlignTop)

    # Paging controls underneath the list.
    place("btn_up", QtWidgets.QPushButton, 10, 740, 93, 28)
    place("btn_down", QtWidgets.QPushButton, 170, 740, 93, 28)
    page_label = place("label_page", QtWidgets.QLabel, 110, 746, 51, 20)
    page_label.setAlignment(QtCore.Qt.AlignCenter)

    place("textBrowser", QtWidgets.QTextBrowser, 350, 640, 731, 81)
    MainWindow.setCentralWidget(central)

    # Menu bar with its two drop-down menus.
    menubar = QtWidgets.QMenuBar(MainWindow)
    menubar.setGeometry(QtCore.QRect(0, 0, 1102, 26))
    menubar.setObjectName("menubar")
    self.menubar = menubar
    for attr in ("menu1", "menu2"):
        menu = QtWidgets.QMenu(menubar)
        menu.setObjectName(attr)
        setattr(self, attr, menu)
    MainWindow.setMenuBar(menubar)

    self.statusbar = QtWidgets.QStatusBar(MainWindow)
    self.statusbar.setObjectName("statusbar")
    MainWindow.setStatusBar(self.statusbar)

    # Every action is owned by the main window; not all of them are put in a menu.
    for attr in ("actiongu", "action2", "action3", "guanyu", "xieyi",
                 "quexian", "zuixin", "zonghe", "guonei", "oumei", "rihan",
                 "shanchu", "dakai", "exit"):
        action = QtWidgets.QAction(MainWindow)
        action.setObjectName(attr)
        setattr(self, attr, action)

    # None marks a separator between groups of entries.
    for entry in (self.zuixin, self.zonghe, self.guonei, self.oumei,
                  self.rihan, None, self.shanchu, self.dakai, None, self.exit):
        if entry is None:
            self.menu1.addSeparator()
        else:
            self.menu1.addAction(entry)
    for entry in (self.guanyu, self.xieyi, self.quexian):
        self.menu2.addAction(entry)

    menubar.addAction(self.menu1.menuAction())
    menubar.addAction(self.menu2.menuAction())

    self.retranslateUi(MainWindow)
    QtCore.QMetaObject.connectSlotsByName(MainWindow)
# UI texts
def retranslateUi(self, MainWindow):
    """Set every user-visible text of the window, then call setDataUpdate.

    Args:
        MainWindow: The window whose title is set; it is also passed on to
            ``self.setDataUpdate``.
    """
    translate = QtCore.QCoreApplication.translate

    def tr(text):
        """Translate *text* in the "MainWindow" context."""
        return translate("MainWindow", text)

    MainWindow.setWindowTitle(tr("电影天堂微型客户端Python"))

    # Captions, buttons and the page indicator.
    for widget, text in (
            (self.label1, "片 名"),
            (self.label2, "年 代"),
            (self.label3, "产 地"),
            (self.label4, "类 别"),
            (self.label5, "豆瓣评分"),
            (self.label6, "片 长"),
            (self.label7, "导 演"),
            (self.label8, "主 演"),
            (self.label9, "简 介"),
            (self.btn_download, "下载"),
            (self.btn_up, "上一页"),
            (self.btn_down, "下一页"),
            (self.label_page, "1"),
    ):
        widget.setText(tr(text))

    self.textBrowser.setHtml(tr("\n\n"))

    self.menu1.setTitle(tr("电影"))
    self.menu2.setTitle(tr("关于"))

    # Menu actions.
    for action, text in (
            (self.actiongu, "1"),
            (self.action2, "2"),
            (self.action3, "3"),
            (self.guanyu, "关于"),
            (self.xieyi, "协议"),
            (self.quexian, "软件缺陷"),
            (self.zuixin, "最新电影"),
            (self.zonghe, "综合电影"),
            (self.guonei, "国内电影"),
            (self.oumei, "欧美电影"),
            (self.rihan, "日韩电影"),
            (self.shanchu, "删除缓存"),
            (self.dakai, "打开缓存目录"),
            (self.exit, "退出软件"),
    ):
        action.setText(tr(text))

    # Set up the widget bindings and initialise the UI data.
    self.setDataUpdate(MainWindow)
设计思路
PS:先上变量
以URL开头的5个常量分别是电影天堂(ygdy8.net)各类电影目录列表页的链接格式,通过format填入page即可访问第page页的数据
1.获取目录及链接放到两个列表变量中,使其对应
2.列表单击方法
3.按钮单击方法
4.获取目录后更新列表方法
5.获取当前选择的电影类型(返回URL)
# List-page URL templates; str.format() fills in the page number.
URL_NEW = "https://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"  # latest movies
URL_COMPREHENSIVE = "https://www.ygdy8.net/html/gndy/jddy/list_63_{}.html"  # comprehensive
URL_DOMESTIC = "https://www.ygdy8.net/html/gndy/china/list_4_{}.html"  # domestic
URL_EUROPE_AND_AMERICA = "https://www.ygdy8.net/html/gndy/oumei/list_7_{}.html"  # Europe and America
URL_JAPAN_AND_KOREA = "https://www.ygdy8.net/html/gndy/rihan/list_6_{}.html"  # Japan and Korea

movie = dict()  # details of the selected movie (a dict, not a tuple)
catalog = list()  # titles shown on the current catalogue page
link = list()  # detail-page links, index-aligned with catalog
type = 1  # movie category: 1=latest 2=comprehensive 3=domestic 4=Europe/America 5=Japan/Korea
page = 1  # current page number
# Crawl one catalogue page and store the titles and links in self.catalog / self.link.
# The "latest" and "comprehensive" list pages need no filtering, but on the
# domestic, Europe/America and Japan/Korea pages each movie link is preceded by
# a junk link that has to be skipped.
def getCatalog(self, url):
    """Fetch a catalogue page and refresh ``self.catalog`` and ``self.link``.

    When the request fails the error is printed and both lists are left
    untouched. Both lists are replaced together only after the whole page
    has been parsed, so they always stay index-aligned.

    Args:
        url: Fully formatted URL of the catalogue page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}  # pose as a browser
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Only network-level failures are expected here; report them
        # instead of swallowing every exception silently.
        print("爬取失败:" + str(e))
        return

    # The site is served as gb2312; undecodable bytes are dropped.
    html = r.content.decode('gb2312', 'ignore')
    soup = BeautifulSoup(html, 'lxml')

    # All <a> nodes below the <b> nodes inside .co_content8, in page order.
    anchors = [a for b in soup.select('.co_content8 b') for a in b.select('a')]
    if self.type not in (1, 2):
        # Every first anchor of a pair is the junk link: keep the second one.
        anchors = anchors[1::2]

    titles = []
    links = []
    for a in anchors:
        titles.append(getTitle(a.string))
        links.append('https://www.ygdy8.net' + a['href'])  # absolute detail-page link

    self.catalog = titles
    self.link = links
getMovie是详情页爬虫,代码在下方;initMovie属于初始化界面数据的代码,本文没有贴出
# List click handler
def listClick(self, qModelIndex):
    """Load the details of the clicked catalogue entry and show them.

    Nothing happens when the index does not address an entry of
    ``self.link`` (a negative row would otherwise silently select the last
    link) or when ``getMovie`` returns ``None``; the previously shown movie
    is kept in both cases.

    Args:
        qModelIndex: Index object of the clicked row; only ``row()`` is used.
    """
    row = qModelIndex.row()
    if not 0 <= row < len(self.link):
        return

    movie = getMovie(self.link[row])
    if movie is None:
        return

    self.movie = movie
    self.initMovie()
里面很多代码在下方贴,最终可运行Py文件见下方链接
# Button click handler
def btnClick(self, btn):
    """React to the paging buttons and the download button.

    "下一页" / "上一页" move one page forward / backward and reload the
    catalogue. The page number never drops below 1, so clicking "上一页"
    on the first page does nothing. "下载" only shows an information box.

    Args:
        btn: The clicked button; its ``text()`` selects the action.
    """
    label = btn.text()

    if label == "下载":
        # NOTE(review): MainWindow is not a parameter here; presumably a
        # module-level window object — confirm.
        QMessageBox.information(MainWindow, "下载", "所有的下载链接没爬出来")
        return

    if label == "下一页":
        target_page = self.page + 1
    elif label == "上一页":
        target_page = self.page - 1
    else:
        return

    if target_page < 1:
        # There is no list page before page 1.
        return

    self.page = target_page
    self.getCatalog(self.getMovieType(self.type).format(self.page))
    self.label_page.setText(str(self.page))
    self.updateListCatalog()
重新获取目录变量并赋值给列表
# Refresh the catalogue list
def updateListCatalog(self):
    """Show the current contents of ``self.catalog`` in the list view."""
    catalog_model = QStringListModel(self.catalog)
    self.listView.setModel(catalog_model)
这里是根据type变量的值,来控制你爬取的是哪个类型的电影目录
# Return the list-page URL template for a movie category
def getMovieType(self, type):
    """Map a category code to its URL template.

    Args:
        type: Category code, 1 to 5.

    Returns:
        The matching ``URL_*`` template, or ``None`` for any other code.
    """
    templates = {
        1: self.URL_NEW,
        2: self.URL_COMPREHENSIVE,
        3: self.URL_DOMESTIC,
        4: self.URL_EUROPE_AND_AMERICA,
        5: self.URL_JAPAN_AND_KOREA,
    }
    return templates.get(type)
# Fetch a movie detail page while posing as a browser and parse it into a dict
def getMovie(url):
    """Download and parse one movie detail page.

    Detail pages are not formatted uniformly, so missing elements are
    tolerated: a missing title or poster becomes an empty string instead of
    raising ``IndexError``, and the synopsis is captured even when other
    fields are absent from the page.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A dict that always has the keys ``title``, ``cover`` and
        ``download_url`` (a list), plus whichever of ``year``, ``country``,
        ``category``, ``douban_rating``, ``duration``, ``director``,
        ``actors`` (a list) and ``profile`` were found on the page.
        ``None`` when the page could not be fetched.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}  # pose as a browser
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print("爬取失败:" + str(e))
        return None

    # The site is served as gb2312; undecodable bytes are dropped.
    yuanma = r.content.decode('gb2312', 'ignore')
    html = etree.HTML(yuanma)
    movie = {}

    titles = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
    movie['title'] = titles[0] if titles else ''

    zoom_nodes = html.xpath("//div[@id='Zoom']")
    if zoom_nodes:
        zoomE = zoom_nodes[0]
        imgs = zoomE.xpath(".//img/@src")
        infos = zoomE.xpath(".//text()")
    else:
        imgs = []
        infos = []
    movie['cover'] = imgs[0] if imgs else ''  # first image is the poster

    # Single-line fields: (line prefix on the page, key in the result).
    simple_fields = (
        ("◎年 代", 'year'),
        ("◎产 地", 'country'),
        ("◎类 别", 'category'),
        ("◎豆瓣评分", 'douban_rating'),
        ("◎片 长", 'duration'),
        ("◎导 演", 'director'),
    )

    for index, info in enumerate(infos):
        for prefix, key in simple_fields:
            if info.startswith(prefix):
                movie[key] = info.replace(prefix, "").strip()
                break
        else:
            if info.startswith("◎主 演"):
                # The cast spans several lines: collect them until the next
                # section starts (not every page has a tag section).
                actors = [info.replace("◎主 演", "").strip()]
                for following in infos[index + 1:]:
                    actor = following.strip()
                    if actor.startswith(("◎标 签", "◎简 介")):
                        break
                    actors.append(actor)
                movie['actors'] = actors
            elif info.startswith("◎简 介"):
                # The synopsis is the first non-blank line after the marker
                # and before the download section. This no longer depends
                # on how many other fields were found.
                for following in infos[index + 1:]:
                    paragraph = following.strip()
                    if paragraph.startswith("【下载地址】"):
                        break
                    if paragraph:
                        movie['profile'] = paragraph
                        break

    # Download links
    movie['download_url'] = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    return movie
# Download a picture into the cache directory so it is not fetched a second time
def getImage(image_url, image_name, root='D://img//'):
    """Cache the picture at *image_url* as ``<root><image_name>.jpg``.

    An already cached file is never downloaded again. Failures are reported
    on stdout and never raised, because caching is best-effort.

    Args:
        image_url: URL of the picture.
        image_name: File name without extension.
        root: Cache directory, including the trailing separator.
    """
    image_path = root + image_name + ".jpg"
    try:
        # Also creates missing parent directories; fine if it already exists.
        os.makedirs(root, exist_ok=True)
        if os.path.exists(image_path):
            print("文件已存在")
            return
        # The timeout keeps a stalled server from blocking forever.
        s = requests.get(image_url, timeout=10)
        # Do not cache an HTTP error page as if it were a picture.
        s.raise_for_status()
        with open(image_path, "wb") as f:
            f.write(s.content)
    except (OSError, requests.RequestException) as e:
        print("爬取失败:" + str(e))
# Menu click handler
def menuClick(self, q):
    """Dispatch a triggered menu action by its text.

    The five movie-category entries switch ``self.type``, go back to page 1
    and reload the catalogue. The remaining entries show message boxes,
    open the cache directory or quit the program. Unknown texts are
    ignored.

    Args:
        q: The triggered menu action; its ``text()`` selects the behaviour.
    """
    text = q.text()

    # Menu text -> movie category code, as understood by getMovieType().
    movie_types = {
        "最新电影": 1,
        "综合电影": 2,
        "国内电影": 3,
        "欧美电影": 4,
        "日韩电影": 5,
    }

    # NOTE(review): MainWindow below is not a parameter; presumably a
    # module-level window object — confirm.
    if text in movie_types:
        self.type = movie_types[text]
        self.page = 1
        self.getCatalog(self.getMovieType(self.type).format(self.page))
        self.label_page.setText(str(self.page))
        self.updateListCatalog()
    elif text == "关于":
        QMessageBox.about(MainWindow, "关于本微型客户端", ""
                          "本站严禁提供任何带色 情,违法内容的影片!"
                          "欢迎大家监督,有问题可发邮件到ygkf88#gmail.com(请将#换成@),"
                          "本站所有资源来源于网友交流,"
                          "只供网络测试、请在24小时内删除所下内容,"
                          "开始清理无版权的内容,"
                          "请大家支持正版到影院观看或购买正版CD!"
                          "")
    elif text == "协议":
        QMessageBox.about(MainWindow, "协议", "本软件数据来源于电影天堂(ygdy8.net)。用于学习,请勿用于商用")
    elif text == "软件缺陷":
        QMessageBox.about(MainWindow, "软件缺陷", "本软件仅用于Python实践,软件缺陷如下\n"
                          "1.代码没优化,如获取目录时多处代码冗余,又如本行代码下的方法\n"
                          "2.加载缓慢,爬取下载链接爬不出来\n"
                          "3.主演那,本来设计的时候觉得应该能放得下,结果放不下,懒得改控件了\n"
                          "4.没有跳转页面,只能一页页加载,不写的里有同上\n"
                          "5.详情页爬虫有点问题,电影天堂详情页的格式不太统一,导致加载有些界面会卡死\n"
                          "6.详情页爬虫本来发现问题后,想做两个的,一个xpath加载详情页数据,一个"
                          "BeautifulSoup方式加载,懒得写BeautifulSoup式了"
                          "")
    elif text == "删除缓存":
        QMessageBox.about(MainWindow, "删除缓存", "这里功能没有写,大致是两种方法,1,遍历文件夹文件,循环删除至"
                          "空文件夹,删不删文件夹视情况;2.直接删除目录,再次打开时直接新建"
                          "缓存目录")
    elif text == "打开缓存目录":
        openResourceManager()
    elif text == "退出软件":
        sys.exit()
基本重点代码以上,还有初始化数据的代码,及一部分工具方法没放上去
最终结果:
下载链接:链接: https://pan.baidu.com/s/14omsJMpqo1Gl6MAx7_S7hg 提取码: 821o