Scraping QQ Zone (QQ空间) blog posts with Python
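
The Python 2 script below pulls a QQ Zone user's blog index from the get_abs CGI endpoint, unwraps the JSONP response, then fetches each entry through blog_output_data, gunzips the body, extracts the article text from the blogDetailDiv node with lxml, and writes it to local files.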

# -*- coding: utf-8 -*-



from HttpRequestModule import * 
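
# HttpRequestModule is a local helper module (not shown in this post); it is
# assumed to provide doGet(url), a simple GET wrapper -- see the sketch at the
# end of this post.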

import os
import json
import traceback

import codecs
from lxml import etree
import StringIO
import gzip
import urllib2
from urllib2 import URLError

import sys
reload(sys)
# Python 2 hack: make UTF-8 the default codec so implicit str/unicode
# conversions don't raise UnicodeDecodeError.
sys.setdefaultencoding('utf-8')

def write_file(file_name, file_data, encoding):
    """Write file_data into the dump directory using the given encoding."""
    if len(file_data) == 0:
        print "file_data is empty"
        return
    file_dir = r"D:\fs\test_data\qqzone"
    file_path = os.path.join(file_dir, file_name)
    print file_path
    with codecs.open(file_path, "w", encoding) as f:
        f.write(file_data)


def decodeJson(json_string):
    """Parse a JSON string; return None on failure."""
    try:
        return json.loads(json_string)
    except (TypeError, ValueError) as err:
        print('TypeError or ValueError: {0}'.format(err))
    except Exception:
        print(traceback.format_exc())
    return None



def getUserBlogList():
    """Fetch the blog index via the get_abs CGI and return its 'list' entries."""
    blog_list = []
    # hostUin and g_tk are account/session specific; replace them with your own.
    diary_url = ('http://b1.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=859226880'
                 '&blogType=0&cateName=&cateHex=&statYear=2015&reqInfo=7&pos=0'
                 '&num=15&sortType=0&absType=0&source=0&rand=0.6346770680975169'
                 '&ref=qzone&g_tk=1611717761&verbose=1')
    data = doGet(diary_url)
    data_len = len(data)
    if data_len == 0:
        print "data len is 0"
        return blog_list
    # The response is JSONP: drop the fixed-length callback prefix and the
    # trailing ");" (see strip_jsonp() below for a more defensive variant).
    data_json = data[10:data_len - 2]
    #write_file('bloglist.txt', data_json, 'utf-8')
    decode_json = decodeJson(data_json.decode("gbk"))
    if decode_json is None:
        print "decode_json is None"
        return []
    if decode_json['code'] != 0:
        print "server response code is {0}".format(decode_json['code'])
        return []
    data = decode_json['data']
    if data['totalNum'] <= 0:
        print "server response totalNum is {0}".format(data['totalNum'])
        return []
    blog_list = data['list']
    return blog_list
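
# The fixed slice above (data[10:data_len-2]) assumes a 10-character callback
# prefix (e.g. "_Callback(") and a trailing ");". A more defensive way to
# unwrap the JSONP -- an illustrative sketch, not called anywhere in this
# script:
import re

def strip_jsonp(text):
    # Capture the payload inside "<callbackName>( ... )", tolerating an
    # optional trailing ";" and surrounding whitespace.
    m = re.match(r'^\s*\w+\((.*)\)\s*;?\s*$', text, re.S)
    return m.group(1) if m else text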

def getUserBlog(uin, blogid):
    """Fetch one blog entry page via the blog_output_data CGI."""
    url = ('http://b1.qzone.qq.com/cgi-bin/blognew/blog_output_data?uin=%(uin)s'
           '&blogid=%(blogid)s&styledm=ctc.qzonestyle.gtimg.cn&imgdm=ctc.qzs.qq.com'
           '&bdm=b.qzone.qq.com&mode=2&numperpage=15&timestamp=1437033537'
           '&dprefix=&inCharset=gb2312&outCharset=gb2312&ref=qzone'
           ) % {'uin': uin, 'blogid': blogid}

    # Browser-like headers; Referer points at the blog canvas page.
    my_headers = {
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": "http://ctc.qzs.qq.com/qzone/newblog/blogcanvas.html"
    }
    request = urllib2.Request(url, headers=my_headers)
    try:
        response = urllib2.urlopen(request)
    except URLError as e:
        if hasattr(e, 'code'):
            print("The server couldn't fulfill the request. errorcode: {0}".format(e.code))
        elif hasattr(e, 'reason'):
            print("We failed to reach a server. reason: {0}".format(e.reason))
    else:
        page = response.read()
        return page

    return ""

def getText(elem):
    """Concatenate all text nodes under elem, each stripped of whitespace."""
    rc = []
    for node in elem.itertext():
        rc.append(node.strip())
    return ''.join(rc)
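
# Example: for a node parsed from "<div>a <b>b</b> c</div>", getText()
# returns "abc" -- every text fragment is stripped before joining.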

def gzdecode(data):
    """Decompress a gzip-encoded response body."""
    compressedstream = StringIO.StringIO(data)
    gziper = gzip.GzipFile(fileobj=compressedstream)
    data2 = gziper.read()   # read the decompressed data
    return data2
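
# Note: gzdecode() assumes the body really is gzip data. A more robust version
# would first check the Content-Encoding response header (e.g.
# response.info().get('Content-Encoding') == 'gzip' with urllib2).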
    
def test(blogid):
    print blogid
    blog_data = getUserBlog('859226880', blogid)
    # The body comes back gzip-compressed (we asked for it via Accept-Encoding).
    blog_data = gzdecode(blog_data)
    #write_file(blogid + '.html', blog_data, 'utf-8')
    #return
    # The page encoding varies, so try UTF-8 first and fall back to GBK.
    try:
        content = blog_data.decode('utf-8')
        tree = etree.HTML(content)
        node = tree.xpath("//div[@id='blogDetailDiv']")[0]
        tgt_data = getText(node)
        print "*" * 30
        print tgt_data
        write_file(blogid + '.txt', tgt_data, 'gbk')
        return
    except Exception as ex:
        print "utf-8 decode/parse failed:", ex
        try:
            content = blog_data.decode('gbk')
            tree = etree.HTML(content)
            node = tree.xpath("//div[@id='blogDetailDiv']")[0]
            tgt_data = getText(node)
            print "_" * 30
            print tgt_data
            write_file(blogid + '.txt', tgt_data, 'utf-8')
        except Exception as ex:
            print "gbk decode/parse failed:", ex
    
 
def main():
    print "main"
    test("1288281044")
    #return
    blog_list = getUserBlogList()
    for blog_item in blog_list:
        blogId = blog_item['blogId']
        print blogId
        test(str(blogId))
    


if __name__ == '__main__':
    main()
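
HttpRequestModule is the author's own helper and isn't included in the post. Judging from how the script uses it, doGet(url) is a thin GET wrapper that returns the response body (the code above treats an empty string as failure). A minimal sketch under that assumption, not the original module:

import urllib2

def doGet(url):
    # Hypothetical stand-in for HttpRequestModule.doGet: issue a plain GET
    # and return the raw body; return "" on failure, as the callers expect.
    try:
        return urllib2.urlopen(url.strip()).read()
    except urllib2.URLError as e:
        print('doGet failed: {0}'.format(e))
        return ""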

