Scraping a Baidu Cloud movie site with Python multithreading

import queue
import threading
import requests
import re
from lxml import etree
import time
import random
import json

# URLs that have already been crawled
urlList = []

# Queue of URLs waiting to be crawled
urlsData = queue.Queue()

# Failure count per URL
urlError = {}

# Browser-like request headers
header = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

class Counter(threading.Thread):
    # @summary: Initialize the worker thread.
    # @param lock: lock object shared by all workers.
    # @param threadName: thread name.
    # @param requests: the requests module (stored for compatibility; the
    #                  module-level import is what is actually used).
    # @param url: source URL to crawl.
    def __init__(self, lock, threadName, requests, url):

        print(threadName + ' run..')
        super(Counter, self).__init__(name=threadName)
        self.lock = lock
        self.requests = requests
        self.url = url

    def _data_get(self):

        # Start the task
        try:
            # Fetch the source page
            html = requests.get(self.url, headers=header)
            text = html.content.decode('utf-8')
            rs = etree.HTML(html.content)
            # Extract the Baidu pan share link from the page
            url = re.findall(r'href="(https://pan.baidu.com/s/.*?|http://pan.baidu.com/s/.*?)"', text)
            # Extract the Baidu pan extraction code ("密码" = password)
            password = re.findall(r'密码(:|;|: )(\w{0,4})', text)
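            # Illustrative example (hypothetical page text): for
            #   '<a href="https://pan.baidu.com/s/1abcDEF">link</a> 密码:x1y2'
            # `url` would be ['https://pan.baidu.com/s/1abcDEF'] and
            # `password` would be [(':', 'x1y2')] -- a (separator, code)
            # tuple, which is why password[0][1] is taken below.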
            name = rs.xpath('//h1/text()')
            # Take the first captured extraction code, if any
            try:
                password = password[0][1]
            except IndexError:
                password = ''
            # Fetch the movie poster from Douban via a Baidu site search
            # (search for the parsed page title, not the thread name)
            try:
                url1 = "http://www.baidu.com/s?"
                html = requests.get(url1, params={
                    'wd': "site:movie.douban.com {}".format(name[0])
                })
                select = etree.HTML(html.content)
                a = select.xpath('//h3[@class="t"]/a/@href')
                html = requests.get(a[0])
                select = etree.HTML(html.content)
                ase = select.xpath('//img/@src')
                img = ase[0]
            except BaseException as e:
                print(self.url, 'failed to fetch the Douban movie poster')
                img = ''
            # Submit the scraped record to the local collection endpoint
            print(name[0])
            rr = requests.post('http://localhost/basic/index.php?r=bian/update', {
                'password': password,
                'url': url[0],
                'img': img,
                'source_url': self.url,
                'name': name[0]
            })
            print(rr.content)

        except BaseException as e:
            # Record the failure and re-queue the URL for up to 3 attempts
            if self.url in urlError:
                urlError[self.url] = urlError[self.url] + 1
            else:
                urlError[self.url] = 1
            if urlError[self.url] < 3:
                urlsData.put(self.url)
            print('failed to parse the Baidu pan address', self.url, 'failures:', urlError[self.url], e)
            print('tasks remaining:', urlsData.qsize())


    def run(self):
        # The shared lock serializes _data_get, so pages are processed one
        # at a time even though many threads may be alive.
        self.lock.acquire()
        try:
            self._data_get()
        finally:
            # Always free the lock and the semaphore slot, even on failure,
            # so the main loop can keep starting new workers
            self.lock.release()
            threadmax.release()
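
# Note: one Thread per URL bounded by a semaphore works, but the same
# pattern is usually written with a thread pool. A minimal sketch (not
# used by this script; crawl() is an illustrative stand-in for the
# _data_get logic):
#
#   from concurrent.futures import ThreadPoolExecutor
#
#   def crawl(url):
#       ...  # fetch and parse one page, as _data_get does
#
#   with ThreadPoolExecutor(max_workers=100) as pool:
#       while not urlsData.empty():
#           pool.submit(crawl, urlsData.get())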

if __name__ == '__main__':
    # At most 100 crawler threads alive at once
    threadmax = threading.BoundedSemaphore(100)
    lock = threading.Lock()
    i = 0
    try:
        # Collect all task URLs in the main thread
        for index1 in range(20):
            index = 1038 - index1
            html = requests.get('http://www.xiexingeini.com/page/{}'.format(index), headers=header)
            html = etree.HTML(html.content)
            # All task URLs on this listing page
            urls = html.xpath('//header/h2[@class="entry-title"]/a/@href')
            for url in urls:
                urlsData.put(url)
            print('URLs collected so far:', urlsData.qsize())
        print('total tasks:', urlsData.qsize())

        # Consume the queue, spawning one worker thread per URL
        # (get() blocks here once the queue is empty)
        while True:
            uu = urlsData.get()
            i += 1
            threadmax.acquire()
            try:
                Counter(lock, "thread-" + str(i), requests, uu).start()
            except BaseException as e:
                # The thread never ran, so give its semaphore slot back
                threadmax.release()
                print(e)
                # Re-queue the URL and retry later
                urlsData.put(uu)
                if str(e) == "can't start new thread":
                    print('failed to start a new thread')
                    time.sleep(180)
                else:
                    print(uu, 'error')
    except BaseException as e:
        print('url error')
        print(e)
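
# Note: the consumer loop above never exits on its own, because get()
# blocks once the queue is empty. A common idiom (a sketch only, not what
# this script does) pairs task_done() with join() so the main thread can
# wait for all work to finish:
#
#   # in each worker, after one URL is fully handled:
#   urlsData.task_done()
#
#   # in the main thread, after every URL has been put():
#   urlsData.join()   # returns once each item is marked done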

# A small aside on how queue.Queue behaves (q1 below is assumed to be a
# queue.Queue() instance):
#
# # Put data into the queue
# q1.put('a')
# q1.put('b')
#
# # Inspect the queue's contents
# print(q1.queue)
#
# # Take data out of the queue: first in, first out
# print(q1.get())
# print(q1.queue)
# print(q1.qsize())
# print(q1.get())
# When the queue is empty, get() has nothing to return and blocks.
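#
# A timeout avoids blocking forever (a sketch; queue.Empty is raised when
# nothing arrives in time):
#
#   try:
#       item = q1.get(timeout=5)
#   except queue.Empty:
#       print('queue stayed empty for 5 seconds')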
