import urllib.request
import bs4
import re
import time
from multiprocessing import Pool
class getLink(object):
    def __init__(self, url):
        self.url = url

    def main(self):
        # "w" mode truncates the file on open, so no separate truncate() call is needed
        with open("down.txt", "w", encoding='utf-8') as downFile:
            i = 0
            page = 1
            for urlSingle in self.url:
                result = self.getResult(urlSingle)
                print("Page %d" % page)
                downFile.write("Page %d\n" % page)
                page += 1
                for rs in result:
                    pid, Name = self.getInfo(rs)
                    DownUrl0, DownUrl1 = self.getDownUrl(pid)
                    i += 1
                    print("*******************************************")
                    print("Crawling item %d, movie title: %s" % (i, Name))
                    downFile.write("--------")
                    downFile.write("No.%d %s\n" % (i, Name))
                    downFile.write("English with Chinese subtitles: " + DownUrl0 + "\n")
                    downFile.write("Chinese-English dual subtitles: " + DownUrl1 + "\n")
    def getResult(self, url):
        # Send a browser-like User-Agent so the site does not reject the request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        }
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request).read().decode('utf-8')
        # Parse the listing page; each "main_top" block holds one movie's name and pid link
        bs = bs4.BeautifulSoup(response, "lxml")
        result = bs.find_all(class_="main_top")
        return result
    def getInfo(self, result):
        # The movie title is the text of the link
        Name = result.find('a').getText()
        # The detail-page href embeds the pid
        href = result.find('a').get('href')
        # Extract the pid from the href
        str1 = href.split('.')
        str2 = str1[2].split('/')
        pid = str2[4]
        return pid, Name
    def getDownUrl(self, pid):
        # linkn=0 and linkn=1 select the two subtitle versions of the download link
        DownUrl0 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=0"
        DownUrl1 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=1"
        return DownUrl0, DownUrl1
if __name__ == '__main__':
    pool = Pool(4)
    url = []
    for i in range(1467):
        url.append("http://www.dexiazai.cc" + "/plus/list.php?tid=50&PageNo=" + str(i))
    Link = getLink(url)
    # main() already walks every page itself, so hand the whole job to one worker.
    # The original pool.map_async(Link.main()) would call main() immediately and then
    # fail, because map_async expects a function plus an iterable of arguments.
    pool.apply_async(Link.main)
    pool.close()
    pool.join()
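
Pool(4) cannot actually speed anything up while all pages are crawled inside a single main() call. Below is a minimal sketch of a per-page variant, assuming the getLink class and imports above and the same page-URL scheme; crawl_page is a hypothetical helper that scrapes one listing page, and pool.map spreads the pages across the four worker processes.

def crawl_page(page_url):
    # Hypothetical helper: scrape a single listing page and return its movies
    link = getLink([page_url])
    movies = []
    for rs in link.getResult(page_url):
        pid, name = link.getInfo(rs)
        url0, url1 = link.getDownUrl(pid)
        movies.append((name, url0, url1))
    return movies

if __name__ == '__main__':
    # A handful of pages for illustration; the full site has many more
    pages = ["http://www.dexiazai.cc/plus/list.php?tid=50&PageNo=" + str(i)
             for i in range(1, 5)]
    with Pool(4) as pool:
        # Each page is fetched and parsed in its own worker process
        for page_movies in pool.map(crawl_page, pages):
            for name, url0, url1 in page_movies:
                print(name, url0, url1)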