关于python多线程的爬虫的一些实践

#coding=utf-8
#!/usr/bin/python

# 从html文档中提取出url,经过检验后加入待抓取队列,应该是producer
#       url是否重复
#       url是否符合模式
# 从待抓取队列中get url,然后去抓取,consumer
# -*- coding: utf-8 -*-

import threading
import time
import sys
import urllib.request
import chardet
import html.parser
import urllib.parse
import re

class MyHtmlParser(html.parser.HTMLParser):
    """HTML parser stub that carries the raw document it is meant to parse.

    No handler methods (``handle_starttag`` etc.) are overridden yet, so
    feeding data to this parser currently produces no output; it only
    stores the document for later use.
    """

    def __init__(self, data):
        # Idiomatic super() call instead of the explicit
        # html.parser.HTMLParser.__init__(self) form.
        super().__init__()
        # data: raw HTML text associated with this parser instance.
        self.data = data



class Producer(threading.Thread):
    """Producer thread: intended to extract URLs from fetched pages,
    validate them (dedup / pattern match), and put them on the shared
    to-crawl queue.

    The production logic is not implemented yet; ``run`` is a no-op
    placeholder so the thread starts and exits cleanly.
    """

    def __init__(self, name, queue):
        # Hand the thread name to Thread itself; Thread.name is the
        # supported way to label a thread.
        super().__init__(name=name)
        self.queue = queue  # shared queue.Queue of URLs to crawl

    def run(self):
        # TODO: extract candidate URLs, check for duplicates and pattern
        # match, then self.queue.put(url) for each accepted one.
        for _ in range(50):
            pass



class Consumer(threading.Thread):
    """Consumer thread: takes ONE URL off the shared queue, downloads the
    page, and writes the HTML to a local file named after the host.

    NOTE(review): run() handles a single URL and then the thread exits;
    presumably a loop is intended eventually — confirm with the author.
    """

    def __init__(self, name, queue):
        super().__init__(name=name)
        self.queue = queue  # shared queue.Queue of URLs to crawl
        # Browser-like request header template so sites are less likely
        # to reject the request as a bot.
        self.headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                        'Accept-Language':'zh-CN,zh;q=0.8',
                        'Connection':'keep-alive',
                        'Upgrade-Insecure-Requests':'1',
                        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}

    def run(self):
        """Fetch one queued URL and persist its HTML to ./<host>."""
        url = self.queue.get()
        host = urllib.parse.urlparse(url).netloc
        # Copy the shared template instead of mutating self.headers in
        # place, so one request's Host value cannot leak into another.
        headers = dict(self.headers, host=host)
        req = urllib.request.Request(url, headers=headers)

        with urllib.request.urlopen(req) as f:
            raw = f.read()
            # Use the charset the server declared in Content-Type; many
            # Chinese sites serve GBK, so a blind utf-8 decode would
            # raise UnicodeDecodeError.  errors='replace' keeps a bad
            # byte from killing the whole download.
            charset = f.headers.get_content_charset() or 'utf-8'
        html_str = raw.decode(charset, errors='replace')

        # Save the page under a file named after the host.
        with open('./' + host, 'w', encoding='utf-8') as output:
            output.write(html_str)






def main():
    """Seed a shared URL queue, start one producer and one consumer
    thread, and wait for both to finish.
    """
    import queue
    to_crawl_queue = queue.Queue()
    # A few start pages to crawl.
    to_crawl_queue.put('http://www.cma.gov.cn/')
    to_crawl_queue.put('http://www.sina.com.cn/')
    to_crawl_queue.put('https://www.baidu.com/')

    producer = Producer('xiaomachaoshou', to_crawl_queue)
    consumer = Consumer('wo', to_crawl_queue)
    producer.start()
    consumer.start()
    # Join both workers so main() returns only after all scheduled work
    # is done (previously the threads were fire-and-forget).
    producer.join()
    consumer.join()



if __name__ == '__main__':
    # Match each double-quoted ".html" reference individually.  The
    # previous pattern '\\".*html\\"' was greedy: on input containing two
    # quoted filenames it matched from the first quote to the very last,
    # returning one merged hit instead of two.  [^"]* cannot cross a
    # closing quote, so every quoted filename is captured separately.
    p = r'"[^"]*\.html"'
    real_p = re.compile(p)
    input_string = 'nishi sm yisi me "hello.html" hello world"cradle.html"'
    match_list = real_p.findall(input_string)
    print(match_list)

    # main()

    try:
        sys.exit(0)
    except SystemExit:
        # sys.exit raises SystemExit; catching it lets us print a clean
        # farewell message instead of terminating silently.
        print('The mini_spider has exited gracefully')

你可能感兴趣的:(python-爬虫)