Python: a CSDN blog crawler
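A multi-threaded CSDN blog spider: worker threads pull URLs from a shared queue, download each article page, save the article text under a local blog/ directory, and push any newly discovered article links back onto the queue until the whole blog has been crawled.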

 


# encoding:utf-8
__author__ = 'Sun'

import re
import urllib.request
import queue
import threading
import os


# shared state across all worker threads
queue = queue.Queue()  # note: rebinds the name 'queue', shadowing the module
visited = set()        # URLs already queued, to avoid crawling twice
cnt = 0                # count of pages crawled so far
class CsdnBlogSpider(threading.Thread):

    # one class-level lock shared by all workers (the original created a
    # separate Lock per thread, which would not actually synchronize anything)
    lock = threading.Lock()

    def __init__(self, queue, opener, blog_name):
        threading.Thread.__init__(self)
        self.queue = queue          # shared URL queue
        self.opener = opener        # urllib opener with browser-like headers
        self.blog_name = blog_name  # CSDN user name whose blog is crawled
    def htmltotxt(self, data, fout):
        # Extract the article body and write it as plain text. The original
        # regex lost its HTML tags during page extraction; the CSDN article
        # container div below is a plausible reconstruction.
        html = data.decode('utf-8')
        content = re.findall('<div id="article_content"[^>]*>(.*?)</div>', html, re.S)
        reg = re.compile('<[^>]*>')
        if content:
            # strip the remaining tags and write the text (the original wrote
            # the file object itself, which was a bug)
            fout.write(reg.sub('', content[0]).encode('utf-8'))

    def save_data(self, data, filename):
        if not os.path.exists('blog'):
            blog_path = os.path.join(os.path.abspath('.'), 'blog')
            os.mkdir(blog_path)
        # page titles can contain characters that are illegal in file names
        filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
        try:
            # the original hard-coded the file name; use the page title instead
            with open('./blog/' + filename + '.txt', 'wb') as fout:
                self.htmltotxt(data, fout)
        except IOError as e:
            print(e)

    def find_title(self, data):
        # Partially lost in extraction; a plausible reconstruction that pulls
        # the text between the <title> tags.
        data = data.decode('utf-8')
        begin = data.find(r'<title>')
        end = data.find(r'</title>')
        if begin == -1 or end == -1:
            return 'untitled'
        return data[begin + len('<title>'):end].strip()

    def run(self):
        global cnt
        while True:
            url = self.queue.get()
            self.lock.acquire()
            cnt += 1
            print(str(cnt) + '--->' + url)
            self.lock.release()
            try:
                res = self.opener.open(url, timeout=1000)
            except Exception as e:
                if hasattr(e, 'reason'):
                    print('reason:', e.reason)
                elif hasattr(e, 'code'):
                    print('error code:', e.code)
                cnt -= 1
                self.queue.task_done()
                continue
            else:
                data = res.read()
                title = self.find_title(data)
                self.save_data(data, title)
                data = data.decode('utf-8')
                # article links look like /<blog_name>/article/details/<id>
                blog_urls = re.compile('/' + self.blog_name + '/article/details/' + r'\d*')
                for url in blog_urls.findall(data):
                    url = 'http://blog.csdn.net' + url
                    if url not in visited:
                        self.queue.put(url)
                        visited |= {url}
                        # print('queued ---> ' + url)
            # task_done() must be called once per get() so that queue.join()
            # in init() can unblock once every URL has been handled
            self.queue.task_done()


def init(name, number=10):  # number: how many worker threads to start
    global cnt
    global visited
    # blog_name = input('enter the blog name: ')
    # thread_num = input('enter the number of threads: ')
    blog_name = name.lower()
    th_num = int(number)
    url = 'http://blog.csdn.net/' + blog_name + '/'
    # send browser-like headers so CSDN serves the page
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    headers = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko')
    ]
    urllib.request.install_opener(opener)
    opener.addheaders = headers
    queue.put(url)
    visited |= {url}
    cnt = 0
    for i in range(th_num):
        t = CsdnBlogSpider(queue, opener, blog_name)
        t.daemon = True  # daemon workers exit when the main thread does
        t.start()
    queue.join()  # block until every queued URL has been marked done
    print('--------end!!!-----')
    print('total crawled: ' + str(cnt))


if __name__ == '__main__':
    init("xiaoxiaoCYG")
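The whole script hinges on one pattern: queue.Queue plus daemon worker threads, with exactly one task_done() per get() so that queue.join() returns only after every queued URL has been processed. A minimal standalone sketch of that pattern (worker and job_queue are illustrative names, not from the original):

import queue
import threading

job_queue = queue.Queue()

def worker():
    while True:
        item = job_queue.get()     # blocks until a job arrives
        print('processing', item)  # stand-in for fetch/parse/save
        job_queue.task_done()      # exactly one task_done() per get()

for _ in range(3):                 # a small pool, like th_num in init()
    threading.Thread(target=worker, daemon=True).start()

for n in range(10):
    job_queue.put(n)

job_queue.join()                   # unblocks once all ten jobs are done
print('all jobs finished')

Because the workers are daemon threads, they die with the main thread after join() returns, which is why the spider never needs an explicit shutdown signal.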

 
