# -*- coding: utf-8 -*-
from HttpRequestModule import *
import os
import json
import traceback
import codecs
from lxml import etree
import StringIO, gzip
import sys
# Python 2 only: reload(sys) re-exposes sys.setdefaultencoding (it is removed
# from the sys namespace at interpreter startup) so the process-wide default
# codec used for implicit str/unicode conversion can be changed.
# NOTE(review): the codec name literal below is padded with spaces -- confirm
# it should read 'utf-8'.
reload(sys)
sys.setdefaultencoding( ' utf-8 ')
def write_file(file_name, file_data, encoding,
               file_dir=r"D:\fs\test_data\qqzone"):
    """Write ``file_data`` to ``file_name`` inside ``file_dir``.

    Nothing is written when ``file_data`` is empty; a notice is printed
    instead.

    Args:
        file_name: Name of the file to create, relative to ``file_dir``.
        file_data: Text to write.
        encoding: Codec name handed to ``codecs.open``.
        file_dir: Target directory. Defaults to the output directory this
            script has always used, so existing callers are unaffected.
    """
    if not file_data:
        print("file_data is zero")
        return
    file_path = os.path.join(file_dir, file_name)
    print(file_path)
    # codecs.open encodes on write, so callers pass text plus a codec name.
    with codecs.open(file_path, "w", encoding) as f:
        f.write(file_data)
def decodeJson(json_string):
    """Parse ``json_string`` as JSON.

    Args:
        json_string: The JSON document to decode.

    Returns:
        The decoded object, or ``None`` when decoding fails. Failures are
        printed, never raised.
    """
    try:
        return json.loads(json_string)
    except (TypeError, ValueError) as err:
        print('TypeError or ValueError:{0}'.format(err))
    except Exception:
        # Best effort: report anything unexpected and fall through to None.
        print(traceback.format_exc())
    return None
def getUserBlogList():
    """Fetch the blog index for the hard-coded Qzone account.

    Returns:
        The ``list`` entry of the response's ``data`` object, or an empty
        list when the request yields nothing, the payload cannot be
        decoded, the server reports a non-zero ``code``, or ``totalNum``
        is not positive.
    """
    # Built as one literal so no stray newlines end up in the request URL.
    diary_url = (
        "http://b1.qzone.qq.com/cgi-bin/blognew/get_abs"
        "?hostUin=859226880&blogType=0&cateName=&cateHex=&statYear=2015"
        "&reqInfo=7&pos=0&num=15&sortType=0&absType=0&source=0"
        "&rand=0.6346770680975169&ref=qzone&g_tk=1611717761&verbose=1"
    )
    data = doGet(diary_url)
    if not data:
        print("data len is 0")
        return []
    # Drop the first 10 and last 2 characters around the JSON body
    # (presumably a JSONP wrapper such as `_Callback(` ... `);` -- confirm).
    data_json = data[10:-2]
    decode_json = decodeJson(data_json.decode("gbk"))
    if decode_json is None:
        print("decode_json is None")
        return []
    if decode_json['code'] != 0:
        # format() instead of "+": the code is numeric, and str + int raises.
        print("server response code is {0}".format(decode_json['code']))
        return []
    payload = decode_json['data']
    if payload['totalNum'] <= 0:
        print("server response totalnum is {0}".format(payload['totalNum']))
        return []
    return payload['list']
def getUserBlog(uin, blogid):
    """Download the raw page of blog ``blogid`` owned by account ``uin``.

    The request sends ``Accept-Encoding: gzip,deflate,sdch``; the body is
    returned exactly as read, without decompression.

    Args:
        uin: Account number substituted into the request URL.
        blogid: Blog identifier substituted into the request URL.

    Returns:
        The response body, or ``""`` when the request fails. The failure
        is printed, not raised.
    """
    # "&timestamp" restores a parameter that had been corrupted into
    # "\xd7tamp" by an HTML-entity conversion of "&times".
    url = (
        "http://b1.qzone.qq.com/cgi-bin/blognew/blog_output_data"
        "?uin=%(uin)s&blogid=%(blogid)s"
        "&styledm=ctc.qzonestyle.gtimg.cn&imgdm=ctc.qzs.qq.com"
        "&bdm=b.qzone.qq.com&mode=2&numperpage=15&timestamp=1437033537"
        "&dprefix=&inCharset=gb2312&outCharset=gb2312&ref=qzone"
    ) % {'uin': uin, 'blogid': blogid}
    my_headers = {
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/38.0.2125.104 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;"
                  "q=0.9,image/webp,*/*;q=0.8",
        "Referer": "http://ctc.qzs.qq.com/qzone/newblog/blogcanvas.html",
    }
    request = urllib2.Request(url, headers=my_headers)
    try:
        response = urllib2.urlopen(request)
    except URLError as e:
        if hasattr(e, 'code'):
            print("The server couldn't fulfill the request. "
                  "errorcode:{0}".format(e.code))
        elif hasattr(e, 'reason'):
            print('We failed to reach a server. reason:{0}'.format(e.reason))
        return ""
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
def getText(elem):
    """Return the text of ``elem`` and its descendants as one string.

    Every text fragment yielded by ``elem.itertext()`` is stripped of
    surrounding whitespace and the fragments are concatenated without a
    separator.

    Args:
        elem: An element exposing ``itertext()``.
    """
    return ''.join(fragment.strip() for fragment in elem.itertext())
def gzdecode(data):
    """Decompress a gzip-encoded byte string.

    Args:
        data: The complete gzip stream as a byte string.

    Returns:
        The decompressed bytes.
    """
    import io  # local import: io.BytesIO exists on Python 2.6+ and 3

    gzip_file = gzip.GzipFile(fileobj=io.BytesIO(data))
    try:
        # Read the decompressed data.
        return gzip_file.read()
    finally:
        gzip_file.close()
def test(blogid):
    """Download one blog post, extract its text and save it as a .txt file.

    The page is gunzipped, then decoded as UTF-8; if anything in that
    attempt fails it is retried as GBK. The text of the
    ``blogDetailDiv`` element is printed and written to ``<blogid>.txt``.
    Failures are printed, never raised.

    Args:
        blogid: Blog identifier as a string (also used as the file name).
    """
    print(blogid)
    blog_data = getUserBlog('859226880', blogid)
    if not blog_data:
        # getUserBlog returns "" on a failed request; nothing to decode.
        print("blog_data is empty")
        return
    blog_data = gzdecode(blog_data)

    # (source codec, banner character, failure tag, output codec)
    # NOTE(review): the output codec is the opposite of the source codec,
    # exactly as before -- confirm this is intended.
    attempts = (
        ('utf-8', '*', '111', 'gbk'),
        ('gbk', '_', '222', 'utf-8'),
    )
    for source_codec, banner, tag, output_codec in attempts:
        try:
            content = blog_data.decode(source_codec)
            tree = etree.HTML(content)
            node = tree.xpath("//div[@id='blogDetailDiv']")[0]
            tgt_data = getText(node)
            print(banner * 30)
            print(tgt_data)
            write_file(blogid + '.txt', tgt_data, output_codec)
            return
        except Exception as ex:
            # Best effort: report and fall through to the next codec.
            print("{0} {1} : {2}".format(tag, type(ex).__name__, ex))
def main():
    """Save one fixed blog post, then every post in the account's blog list."""
    print("main")
    test("1288281044")
    for blog_item in getUserBlogList():
        blogId = blog_item['blogId']
        print(blogId)
        test(str(blogId))


# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
from HttpRequestModule import *
import os
import json
import traceback
import codecs
from lxml import etree
import StringIO, gzip
import sys
# Python 2 only: reload(sys) re-exposes sys.setdefaultencoding (it is removed
# from the sys namespace at interpreter startup) so the process-wide default
# codec used for implicit str/unicode conversion can be changed.
# NOTE(review): the codec name literal below is padded with spaces -- confirm
# it should read 'utf-8'.
reload(sys)
sys.setdefaultencoding( ' utf-8 ')
def write_file(file_name, file_data, encoding,
               file_dir=r"D:\fs\test_data\qqzone"):
    """Write ``file_data`` to ``file_name`` inside ``file_dir``.

    Nothing is written when ``file_data`` is empty; a notice is printed
    instead.

    Args:
        file_name: Name of the file to create, relative to ``file_dir``.
        file_data: Text to write.
        encoding: Codec name handed to ``codecs.open``.
        file_dir: Target directory. Defaults to the output directory this
            script has always used, so existing callers are unaffected.
    """
    if not file_data:
        print("file_data is zero")
        return
    file_path = os.path.join(file_dir, file_name)
    print(file_path)
    # codecs.open encodes on write, so callers pass text plus a codec name.
    with codecs.open(file_path, "w", encoding) as f:
        f.write(file_data)
def decodeJson(json_string):
    """Parse ``json_string`` as JSON.

    Args:
        json_string: The JSON document to decode.

    Returns:
        The decoded object, or ``None`` when decoding fails. Failures are
        printed, never raised.
    """
    try:
        return json.loads(json_string)
    except (TypeError, ValueError) as err:
        print('TypeError or ValueError:{0}'.format(err))
    except Exception:
        # Best effort: report anything unexpected and fall through to None.
        print(traceback.format_exc())
    return None
def getUserBlogList():
    """Fetch the blog index for the hard-coded Qzone account.

    Returns:
        The ``list`` entry of the response's ``data`` object, or an empty
        list when the request yields nothing, the payload cannot be
        decoded, the server reports a non-zero ``code``, or ``totalNum``
        is not positive.
    """
    # Built as one literal so no stray newlines end up in the request URL.
    diary_url = (
        "http://b1.qzone.qq.com/cgi-bin/blognew/get_abs"
        "?hostUin=859226880&blogType=0&cateName=&cateHex=&statYear=2015"
        "&reqInfo=7&pos=0&num=15&sortType=0&absType=0&source=0"
        "&rand=0.6346770680975169&ref=qzone&g_tk=1611717761&verbose=1"
    )
    data = doGet(diary_url)
    if not data:
        print("data len is 0")
        return []
    # Drop the first 10 and last 2 characters around the JSON body
    # (presumably a JSONP wrapper such as `_Callback(` ... `);` -- confirm).
    data_json = data[10:-2]
    decode_json = decodeJson(data_json.decode("gbk"))
    if decode_json is None:
        print("decode_json is None")
        return []
    if decode_json['code'] != 0:
        # format() instead of "+": the code is numeric, and str + int raises.
        print("server response code is {0}".format(decode_json['code']))
        return []
    payload = decode_json['data']
    if payload['totalNum'] <= 0:
        print("server response totalnum is {0}".format(payload['totalNum']))
        return []
    return payload['list']
def getUserBlog(uin, blogid):
    """Download the raw page of blog ``blogid`` owned by account ``uin``.

    The request sends ``Accept-Encoding: gzip,deflate,sdch``; the body is
    returned exactly as read, without decompression.

    Args:
        uin: Account number substituted into the request URL.
        blogid: Blog identifier substituted into the request URL.

    Returns:
        The response body, or ``""`` when the request fails. The failure
        is printed, not raised.
    """
    # "&timestamp" restores a parameter that had been corrupted into
    # "\xd7tamp" by an HTML-entity conversion of "&times".
    url = (
        "http://b1.qzone.qq.com/cgi-bin/blognew/blog_output_data"
        "?uin=%(uin)s&blogid=%(blogid)s"
        "&styledm=ctc.qzonestyle.gtimg.cn&imgdm=ctc.qzs.qq.com"
        "&bdm=b.qzone.qq.com&mode=2&numperpage=15&timestamp=1437033537"
        "&dprefix=&inCharset=gb2312&outCharset=gb2312&ref=qzone"
    ) % {'uin': uin, 'blogid': blogid}
    my_headers = {
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/38.0.2125.104 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;"
                  "q=0.9,image/webp,*/*;q=0.8",
        "Referer": "http://ctc.qzs.qq.com/qzone/newblog/blogcanvas.html",
    }
    request = urllib2.Request(url, headers=my_headers)
    try:
        response = urllib2.urlopen(request)
    except URLError as e:
        if hasattr(e, 'code'):
            print("The server couldn't fulfill the request. "
                  "errorcode:{0}".format(e.code))
        elif hasattr(e, 'reason'):
            print('We failed to reach a server. reason:{0}'.format(e.reason))
        return ""
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
def getText(elem):
    """Return the text of ``elem`` and its descendants as one string.

    Every text fragment yielded by ``elem.itertext()`` is stripped of
    surrounding whitespace and the fragments are concatenated without a
    separator.

    Args:
        elem: An element exposing ``itertext()``.
    """
    return ''.join(fragment.strip() for fragment in elem.itertext())
def gzdecode(data):
    """Decompress a gzip-encoded byte string.

    Args:
        data: The complete gzip stream as a byte string.

    Returns:
        The decompressed bytes.
    """
    import io  # local import: io.BytesIO exists on Python 2.6+ and 3

    gzip_file = gzip.GzipFile(fileobj=io.BytesIO(data))
    try:
        # Read the decompressed data.
        return gzip_file.read()
    finally:
        gzip_file.close()
def test(blogid):
    """Download one blog post, extract its text and save it as a .txt file.

    The page is gunzipped, then decoded as UTF-8; if anything in that
    attempt fails it is retried as GBK. The text of the
    ``blogDetailDiv`` element is printed and written to ``<blogid>.txt``.
    Failures are printed, never raised.

    Args:
        blogid: Blog identifier as a string (also used as the file name).
    """
    print(blogid)
    blog_data = getUserBlog('859226880', blogid)
    if not blog_data:
        # getUserBlog returns "" on a failed request; nothing to decode.
        print("blog_data is empty")
        return
    blog_data = gzdecode(blog_data)

    # (source codec, banner character, failure tag, output codec)
    # NOTE(review): the output codec is the opposite of the source codec,
    # exactly as before -- confirm this is intended.
    attempts = (
        ('utf-8', '*', '111', 'gbk'),
        ('gbk', '_', '222', 'utf-8'),
    )
    for source_codec, banner, tag, output_codec in attempts:
        try:
            content = blog_data.decode(source_codec)
            tree = etree.HTML(content)
            node = tree.xpath("//div[@id='blogDetailDiv']")[0]
            tgt_data = getText(node)
            print(banner * 30)
            print(tgt_data)
            write_file(blogid + '.txt', tgt_data, output_codec)
            return
        except Exception as ex:
            # Best effort: report and fall through to the next codec.
            print("{0} {1} : {2}".format(tag, type(ex).__name__, ex))
def main():
    """Save one fixed blog post, then every post in the account's blog list."""
    print("main")
    test("1288281044")
    for blog_item in getUserBlogList():
        blogId = blog_item['blogId']
        print(blogId)
        test(str(blogId))


# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()