python抓取搜索微信

#coding:utf-8
import urllib
import re
from urllib import quote
import HTMLParser
import time

def decodeHtml(inhtml):
    """Return *inhtml* with HTML character references decoded.

    Named and numeric references such as ``&amp;`` or ``&#39;`` are replaced
    by the characters they stand for; text without references comes back
    unchanged.

    ``HTMLParser.unescape`` was deprecated in Python 3.4 and removed in 3.9,
    so ``html.unescape`` is preferred whenever it can be imported. On
    Python 2 the original parser-method call is used as before.
    """
    try:
        from html import unescape  # Python 3.4+
    except ImportError:
        # Python 2: fall back to the call the original code relied on.
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape
    return unescape(inhtml)

def strip_tags(html):
    """Return the text content of *html* with the markup removed.

    Surrounding whitespace is stripped first, then the fragment is run
    through the standard-library HTML parser and only the pieces it reports
    as character data are joined together. Tags and character references
    (``&amp;`` and friends) are therefore not part of the result.
    """
    try:
        from html.parser import HTMLParser  # Python 3
    except ImportError:
        from HTMLParser import HTMLParser  # Python 2

    try:
        # Python 3.4+ would otherwise fold character references into the
        # text; turning that off keeps the output the same as under the
        # Python 2 parser this code was written for.
        parser = HTMLParser(convert_charrefs=False)
    except TypeError:
        # Older parsers have no such option and already behave that way.
        parser = HTMLParser()

    pieces = []
    # Route every run of character data straight into the list.
    parser.handle_data = pieces.append
    # A bare strip() already removes newlines, so one call is enough.
    parser.feed(html.strip())
    parser.close()
    return "".join(pieces)

def saveImage(count,url):
    """Download *url* and store the response body as ``Images/<count>.jpg``.

    count -- integer used for the file name (formatted with ``%d``).
    url   -- address of the image to fetch.

    The ``Images`` directory must already exist; it is not created here.
    The download is completed before the target file is opened, so a failed
    request does not leave an empty file behind, and both the response and
    the file are closed even when an error occurs.
    """
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(url)
    try:
        data = response.read()
    finally:
        # try/finally rather than `with`: the Python 2 response object is
        # not a context manager.
        response.close()

    with open('Images/%d.jpg' % count, 'wb') as f:
        f.write(data)

def timeSwap(timeStamp):
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` local-time string.

    *timeStamp* may be anything ``int()`` accepts, e.g. a string of digits.
    """
    seconds = int(timeStamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))

def getWeixinInfo(keyword):
    """Search Sogou Weixin for *keyword*, print each hit and save its image.

    The URL-quoted keyword is printed first. Then, for every result, the
    link, title, image URL, summary and formatted timestamp are printed and
    the image is stored through ``saveImage`` under a 1-based index.

    The five extracted lists are walked in lockstep and iteration stops at
    the shortest one, so a page where the patterns match unevenly yields
    fewer results instead of raising ``IndexError``.

    NOTE(review): this function arrived with its statements collapsed onto
    one line and with the HTML fragments inside three regex literals
    stripped out. The closing tags and the image wrapper element used below
    are reconstructions; verify them against the live page markup.
    """
    try:
        from urllib.request import urlopen  # Python 3
        from urllib.parse import quote
    except ImportError:
        from urllib import quote, urlopen  # Python 2

    encoded = quote(keyword)
    print(encoded)
    url = "http://weixin.sogou.com/weixin?type=2&query="+encoded+"&ie=utf8&_ast=1404888960&_asf=null&w=01029901&cid=null"

    response = urlopen(url)
    try:
        webcontent = response.read()
    finally:
        response.close()
    if not isinstance(webcontent, str):
        # Python 3 hands back bytes, which the str patterns below cannot
        # search. Assumes the page is UTF-8 -- TODO confirm.
        webcontent = webcontent.decode("utf-8", "replace")

    # NOTE(review): the trailing "</a>" was lost in extraction -- confirm.
    title = re.findall(r'id="\w+_title_\d+">(.*?)</a>', webcontent)
    # NOTE(review): the trailing "</p>" was lost in extraction -- confirm.
    summary = re.findall(r'id="\w+_summary_\d+">(.*?)</p>', webcontent)
    timeStamp = re.findall(r"vrTimeHandle552write\('(.*?)'\)", webcontent)
    # NOTE(review): only "[\w\W]+?" survived from this pattern; the wrapper
    # element and its class name are a guess -- confirm.
    imagesInfo = re.findall(r'<div class="img_box2">[\w\W]+?</div>', webcontent)

    link, imageSrc = [], []
    for imgHtml in imagesInfo:
        link += re.findall(r'href="(.*?)"', imgHtml)
        imageSrc += re.findall(r'src="(.*?)"', imgHtml)

    records = zip(link, title, imageSrc, summary, timeStamp)
    for index, (href, heading, src, blurb, stamp) in enumerate(records, 1):
        print("save the %s image" % index)
        print(decodeHtml(href))
        print(strip_tags(heading))
        print(decodeHtml(src))
        print(strip_tags(blurb))
        print(timeSwap(stamp))
        saveImage(index, decodeHtml(src))


getWeixinInfo("世界杯")

你可能感兴趣的:(python抓取搜索微信)