[Python] Crawler: scraping blog posts from CSDN

# -*- coding: utf-8 -*-
import html
import os
import queue
import re
import threading
import urllib.error
import urllib.request

'''author fzuim'''

BlogSet = set()  # article links already queued, used to skip duplicates

class CsdnBlogSpider:
    def __init__(self, blogname, myqueue):
        self.blogname = blogname
        self.pageindex = 1
        self.myqueue = myqueue
        self.blogurl = 'https://blog.csdn.net/' + blogname
        self.getfinish = False

    def GetPageCode(self, PageIndex):
        pageurl = self.blogurl + '/article/list/' + str(PageIndex)
        try:
            req = urllib.request.Request(pageurl)
            req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            res = urllib.request.urlopen(req)
            Code = res.read().decode('utf-8')
            return Code
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print('Failed to fetch the blog page:', e.reason)
            return None
    
    def GetBlogList(self):
        global BlogSet
        Code = self.GetPageCode(self.pageindex)
        if Code is None:
            self.getfinish = True
            return
        # '空空如也' is the literal "nothing here" text CSDN shows on an
        # empty list page; seeing it means the last page has been passed
        IsNone = re.search(u'空空如也', Code)
        if IsNone:
            self.getfinish = True
            return

        # Collect all article links on the current page
        rule = '/' + self.blogname + '/article/details/' + '\\d*'
        # The regex below would additionally match [帝都的凛冬], an article
        # written by yoyo_liyy, so the plain rule above is used instead:
        # rule = '<div.*?class="article-list">.*?href="(.*?)".*?>'
        items = re.findall(rule, Code)
        for item in items:
            if item not in BlogSet:
                BlogSet.add(item)
                self.myqueue.put('https://blog.csdn.net' + item)
        self.pageindex += 1

    def SaveHtmlData(self):
        while True:
            url = self.myqueue.get()
            if url is None:
                # Sentinel from Start(): no more links are coming
                self.myqueue.task_done()
                break
            try:
                req = urllib.request.Request(url)
                req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
                res = urllib.request.urlopen(req)
                Data = res.read().decode('utf-8')
                # Parse the article type ("原创" etc.) and the blog title
                title_re = '<span.*?class="article-type.*?>(.*?)</span>'
                title_re += '.*?<h1.*?class="title-article">(.*?)</h1>'
                pattern = re.compile(title_re, re.S)
                items = re.findall(pattern, Data)
                for item in items:
                    title = '[' + item[0] + '] ' + html.unescape(item[1])
                    # Replace characters that are illegal in filenames,
                    # otherwise creating the file fails
                    title = title.replace('\\', '-').replace('/', '-').replace(':', '-').replace('*', '-').\
                        replace('?', '-').replace('"', '-').replace('<', '-').replace('>', '-').replace('|', '-')
                    # Save the page as an .html file
                    if not os.path.exists('blog'):
                        blog_path = os.path.join(os.path.abspath('.'), 'blog')
                        os.mkdir(blog_path)
                    try:
                        with open('./blog/' + title.strip() + '.html', 'wb') as fout:
                            fout.write(Data.encode('utf-8'))
                    except IOError as e:
                        print(e)
            except urllib.error.URLError as e:
                if hasattr(e, 'reason'):
                    print('Failed to crawl the article:', e.reason)
            self.myqueue.task_done()

    def Start(self):
        self.getfinish = False
        thread = threading.Thread(target=self.SaveHtmlData)
        thread.start()
        while not self.getfinish:
            self.GetBlogList()
        self.myqueue.put(None)  # sentinel so the worker thread can exit
        thread.join()
        print('Done crawling...')

if __name__ == "__main__":
    blogname = input('Enter the CSDN blog name to crawl: ')
    myqueue = queue.Queue()
    spider = CsdnBlogSpider(blogname, myqueue)
    spider.Start()
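The script splits the work producer-consumer style: the main thread's GetBlogList puts article URLs onto a queue.Queue while SaveHtmlData drains it from a worker thread. A minimal, self-contained sketch of the shutdown handshake this pattern needs (a None sentinel paired with task_done()/join(); all names here are illustrative, not part of the script):

import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()
        if item is None:   # sentinel: the producer is finished
            q.task_done()
            break
        print('processing', item)
        q.task_done()

t = threading.Thread(target=worker)
t.start()
for i in range(3):
    q.put(i)
q.put(None)   # tell the worker to exit
q.join()      # returns once task_done() has been called for every put()
t.join()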
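The nine chained replace() calls strip the characters Windows forbids in filenames. A single re.sub does the same job; safe_filename below is an illustrative helper, not part of the original script:

import re

def safe_filename(title):
    # Collapse every character that is illegal in a Windows filename into '-'
    return re.sub(r'[\\/:*?"<>|]', '-', title).strip()

print(safe_filename('[原创] a/b: c*?'))   # -> [原创] a-b- c--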
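The hand-written title_re regex breaks as soon as CSDN changes its markup. If a dependency-free fallback is acceptable, the stdlib html.parser can extract the page <title> instead; a sketch under that assumption, with TitleParser and the sample string being purely illustrative:

from html.parser import HTMLParser

class TitleParser(HTMLParser):
    # Accumulates the text found inside the <title> tag
    def __init__(self):
        super().__init__()
        self.in_title = False
        self.title = ''

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.in_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data

p = TitleParser()
p.feed('<html><head><title>Demo post - CSDN Blog</title></head></html>')
print(p.title)   # -> Demo post - CSDN Blog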

Sample run: [screenshot 1] [screenshot 2]
