Scraping QZone (QQ空间) blog posts with Python

Author: 华亮

Please credit the source when reposting: http://blog.csdn.net/cedricporter


A typhoon is blowing outside. I wrote a crawler for Renren photo albums this afternoon, and in the evening, with nothing better to do, I wrote this one for QZone blogs. By default it only crawls the blog of the QQ number you give it; you can fill in a number in main.py, or add a loop to handle a whole list of accounts (a sketch of that follows the main.py listing below). Images inside posts are ignored for now, though downloading them as well would be easy (see the sketch at the end of the post). I'll polish it when I have time.


# -*- coding: utf-8 -*-
# Filename: main.py
# Author: 华亮
#

from QQ import QQ

if __name__ == '__main__':
    # First argument is the QQ number, second is the output file name
    QQ.DownloadBlog('414112390', 'blog.txt')
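
As mentioned above, crawling several accounts is just a loop around the same call. A minimal sketch; the extra QQ numbers below are made-up placeholders:

# Sketch: one output file per account; the numbers are placeholders.
for number in ['414112390', '10001', '10002']:
    QQ.DownloadBlog(number, 'blog_%s.txt' % number)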



# -*- coding: utf-8 -*-
# Filename: QQ.py
# Author: 华亮
#

import urllib2
import re
from HTMLParser import HTMLParser


# Parses a QZone blog-list page and collects (title, url) pairs
class QQBlogList(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        # State flags tracking where we are inside div.bloglist > ul > li > a
        self.in_key_div = False
        self.in_ul = False
        self.in_li = False
        self.in_a = False
        self.blogList = []   # collected (title, url) pairs
        self.lasturl = ''
    
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # Track the nesting div.bloglist > ul > li > a; the <a> inside a
        # list item carries the article title and its URL.
        if tag == 'div' and 'class' in attrs and attrs['class'] == 'bloglist':
            self.in_key_div = True
        elif self.in_key_div:
            if tag == 'ul':
                self.in_ul = True
            elif self.in_ul and tag == 'li':
                self.in_li = True
            elif self.in_li and tag == 'a' and 'href' in attrs:
                self.in_a = True
                self.lasturl = attrs['href']
    
    def handle_data(self, data):
        if self.in_a:
            self.blogList.append((data, self.lasturl))
    
    def handle_endtag(self, tag):
        if self.in_key_div and tag == 'div':
            self.in_key_div = False
        elif self.in_ul and tag == 'ul':
            self.in_ul = False
        elif self.in_li and tag == 'li':
            self.in_li = False
        elif self.in_a and tag == 'a':
            self.in_a = False
            
         
            
class QQ:
    '''
    QQ
        Author: 华亮
        Purpose: downloads the posts from a QZone blog
    '''
        
    @staticmethod      
    def DownloadBlog(qq, filename = None):
        print 'Start'
        blogurl = 'http://qz.qq.com/%s/bloglist?page=0' % qq
        QQ.__Download(blogurl, filename)           
        print 'End'
    
    @staticmethod
    def __Download(starturl, filename):
        url = starturl

        cookieHandler = urllib2.HTTPCookieProcessor()  # keep session cookies across requests
        opener = urllib2.build_opener(cookieHandler)

        # Collect (title, url) pairs from every page of the blog list
        blogList = []
        while True:
            req = urllib2.Request(url)
            result = opener.open(req)
            text = result.read()

            qq = QQBlogList()
            qq.feed(text)
            qq.close()
            blogList.extend(qq.blogList)

            # Follow the "下一页" (next page) link if there is one. The href
            # part of this pattern is a reconstruction: the original markup
            # was lost when the post was published.
            nextpagePattern = re.compile(r'<a href="([^"]+)"[^>]*>下一页</a>')
            nextpage = nextpagePattern.search(text)
            if nextpage:
                url = nextpage.group(1)
            else:
                break
          
        if not filename:
            filename = 'blog.txt'
        outfile = open(filename, 'w')
        
        # Download each article and append it to the output file. The div
        # pattern and the <br/> below are reconstructions: the HTML inside
        # these strings was lost when the post was published.
        blogContentPattern = re.compile(r'<div class="blogDetail">(.*?)</div>', re.S)
        for title, url in blogList:
            print 'Downloading', title
            req = urllib2.Request(url)
            result = opener.open(req)
            outfile.write('\n' + title + '\n')
            ret = blogContentPattern.search(result.read())
            if ret:
                outfile.write(ret.group(1).replace('<br/>', '\n'))
        outfile.close()
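
To check what markup QQBlogList expects, you can feed it a hand-written snippet. The HTML below is my stand-in for a bloglist page, not a capture of the real markup:

# Standalone check of the list parser (run next to QQ.py). The HTML is a
# hand-written stand-in for the real bloglist markup, which may differ.
from QQ import QQBlogList

sample = '''<div class="bloglist">
  <ul>
    <li><a href="http://qz.qq.com/414112390/blog/1">First post</a></li>
    <li><a href="http://qz.qq.com/414112390/blog/2">Second post</a></li>
  </ul>
</div>'''

parser = QQBlogList()
parser.feed(sample)
parser.close()
print parser.blogList   # [('First post', '...blog/1'), ('Second post', '...blog/2')]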



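As for the images I skipped: pulling them down is just a matter of extracting the src attributes from each article body and fetching them one by one. A rough sketch, assuming the article HTML is in a string named content; the img pattern and the file naming here are my own guesses, not tested against real pages:

import os
import re
import urllib

# Sketch: save every image referenced in an article body under ./images/.
# 'content' holds the article HTML; the pattern is a guess at the markup.
def download_images(content, outdir = 'images'):
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    for i, src in enumerate(re.findall(r'<img[^>]+src="([^"]+)"', content)):
        ext = os.path.splitext(src)[1] or '.jpg'
        urllib.urlretrieve(src, os.path.join(outdir, '%d%s' % (i, ext)))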