Python crawler for De下载站 (dexiazai.cc)

Python code for crawling De下载站. Because no proxy was set up, the crawler got blocked after roughly 800 pages; proxy support still needs to be added.
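
On that blocking note: urllib can route all requests through a proxy via ProxyHandler. A minimal sketch, assuming a placeholder proxy address (127.0.0.1:8080 stands in for a real HTTP proxy):

import urllib.request

# Placeholder proxy address; substitute a working HTTP proxy here
proxy = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
})
opener = urllib.request.build_opener(proxy)
# After install_opener, every urllib.request.urlopen() call goes through the proxy
urllib.request.install_opener(opener)

Throttling with time.sleep() between page requests would likewise reduce the chance of a ban. The full script:
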
import urllib.request
import bs4
class getLink(object):
    def __init__(self, url):
        self.url = url

    def main(self):
        # "w" mode already truncates the file, so no explicit truncate() is needed
        downFile = open("down.txt", "w", encoding='utf-8')
        i = 0
        page = 1
        for urlSingle in self.url:
            result = self.getResult(urlSingle)
            print("Page %d" % page)
            downFile.write("Page %d\n" % page)
            page += 1
            for rs in result:
                pid, Name = self.getInfo(rs)
                DownUrl0, DownUrl1 = self.getDownUrl(pid)
                i += 1
                print("*******************************************")
                print("Crawling item %d, movie title: %s" % (i, Name))
                downFile.write("--------")
                downFile.write("No.%d %s\n" % (i, Name))
                downFile.write("English audio, Chinese subtitles: " + DownUrl0 + "\n")
                downFile.write("Chinese-English dual subtitles: " + DownUrl1 + "\n")
        downFile.close()

    def getResult(self, url):
        # Send a browser-like User-Agent so the site serves the normal page
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        }
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request).read().decode('utf-8')
        # Parse the listing page; each movie entry sits in a "main_top" block
        bs = bs4.BeautifulSoup(response, "lxml")
        result = bs.find_all(class_="main_top")
        return result

    def getInfo(self, result):
        # The movie title is the text of the entry's first <a> tag
        Name = result.find('a').getText()

        # The detail-page href encodes the pid; extract it positionally by
        # splitting on '.' and '/', which assumes the site's URL layout
        href = result.find('a').get('href')
        str1 = href.split('.')
        str2 = str1[2].split('/')
        pid = str2[4]
        return pid, Name

    def getDownUrl(self, pid):
        # linkn=0 and linkn=1 are the two download links the site exposes per movie
        DownUrl0 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=0"
        DownUrl1 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=1"
        return DownUrl0, DownUrl1

if __name__ == '__main__':
    # Listing pages of category tid=50
    url = ["http://www.dexiazai.cc/plus/list.php?tid=50&PageNo=" + str(i)
           for i in range(1467)]
    Link = getLink(url)
    # The original pool.map_async(Link.main()) call was broken: map_async needs
    # a callable plus an iterable, and calling Link.main() ran the whole crawl
    # immediately anyway. Crawl sequentially; a parallel sketch follows below.
    Link.main()
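
For the multiprocessing route the original aimed at, the work has to be expressed as a function of one page URL so a Pool has something to map over. A minimal sketch under that assumption; crawl_page and its return shape are illustrative, not part of the original script:

import urllib.request
import bs4
from multiprocessing import Pool

def crawl_page(url):
    # Fetch one listing page and return the (href, title) pairs found on it
    headers = {"User-Agent": "Mozilla/5.0"}
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read().decode('utf-8')
    bs = bs4.BeautifulSoup(html, "lxml")
    return [(block.find('a').get('href'), block.find('a').getText())
            for block in bs.find_all(class_="main_top")]

if __name__ == '__main__':
    urls = ["http://www.dexiazai.cc/plus/list.php?tid=50&PageNo=%d" % i
            for i in range(1467)]
    with Pool(4) as pool:
        # map() blocks until all pages are fetched; results keep page order
        for page_no, pairs in enumerate(pool.map(crawl_page, urls), start=1):
            for href, title in pairs:
                print(page_no, title, href)

Note that each worker process must be able to import crawl_page at module level, which is why it cannot stay a bound method writing to one shared file handle.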
