A few days after I started, my boss asked me to crawl some Taobao transaction records and take a first look at them, so I wrote a Python crawler. I split the crawl into two steps: the first step collects the product links. The code is as follows:
#-*- coding:utf-8 -*-
import BeautifulSoup
import urllib2
import json
import cookielib

class MyParser:
    def __init__(self, seedurl, destpath, stop_file_path):
        self.seedurl = seedurl
        self.stop_file_path = stop_file_path
        stop_file = open(stop_file_path, "rb")
        splits = stop_file.readline().split("\t")
        self.no_0 = splits[0]   # value from the stop file: the s= page offset, initially 0
        self.no_1 = splits[1]   # index of the current item on the page
        self.no_2 = splits[2]   # index of the current record of that item
        self.destpath = destpath

    def run(self):
        print self.no_0
        # 44 items per result page, 5 pages in total; page through via the s= offset
        while int(self.no_0) < 5 * 44:
            url = self.seedurl + str(self.no_0)   # build the page URL from the seed plus the current offset
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
            req = urllib2.Request(url=url, headers=headers)
            content = urllib2.urlopen(req).read()
            contentsoup = BeautifulSoup.BeautifulSoup(content)
            items = contentsoup.findAll("div", {"class": "col title"})
            out_file = open(self.destpath, "a+")
            for item in items:
                print item.find("a")["href"]
                out_file.write(item.find("a")["href"] + "\n")
                out_file.flush()
            out_file.close()
            self.no_0 = int(self.no_0) + 44
        print "ok"

def run():
    seedurl = "http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s="
    item_stop_file = "e://item_stop_file"
    record_stop_file = "s://record_stop_file"
    outFile = "e://out"
    myParser = MyParser(seedurl, outFile, item_stop_file)
    myParser.run()

if __name__ == "__main__":
    run()
    print "done!"
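The paging here works by appending the s= offset to the seed URL: each result page holds 44 items and the script walks through 5 pages. A minimal sketch of the URLs it requests (same seed URL and constants as above, printed only for illustration):

import urllib2  # not needed for the sketch itself, just mirroring the script above

seedurl = "http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s="
# s= offsets are 0, 44, 88, 132, 176 (44 items per page, 5 pages in total)
for offset in range(0, 5 * 44, 44):
    print seedurl + str(offset)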
The second step then uses the file of links crawled above and fetches the transaction records of each product. The code is as follows:
#-*- coding:utf-8 -*-
'''
Created on 2014-07-23
@author: sj
'''
import re
import BeautifulSoup
import os
import urllib2

class MyParser:
    def __init__(self, item_path_file, stop_file, out_file):
        self.item_path_file = item_path_file
        self.stop_file = stop_file
        self.out_file = out_file
        stop_object = open(self.stop_file, "rb")
        splits = stop_object.readline().split("\t")
        stop_object.close()
        self.item = splits[0]     # index of the current item link in item_path_file
        self.page = splits[1]     # current bid_page of that item
        self.record = splits[2]   # current record on that page
        self.tag = 0

    def run(self):
        print self.item
        print self.page
        print self.record
        item_object = open(self.item_path_file, "rb")
        num_items = len(item_object.readlines())
        item_object.close()
        item_object = open(self.item_path_file, "rb")
        # resume from the item index stored in the stop file
        for line in item_object.readlines()[int(self.item):num_items]:
            try:
                if re.search("tmall", line):
                    # Tmall pages use a different transaction-record format, so skip Tmall links
                    stop_object = open(self.stop_file, "rb")
                    item_new = stop_object.readline().split("\t")[0]
                    stop_object.close()
                    stop_object = open(self.stop_file, "wb")
                    stop_object.write(item_new + "\t" + "0" + "\t" + "0" + "\n")
                    stop_object.flush()
                    stop_object.close()
                    continue
                print line
                headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                req = urllib2.Request(url=line, headers=headers)
                content = urllib2.urlopen(req, timeout=3).read()
                contentSoup = BeautifulSoup.BeautifulSoup(content)
                # the "data-api" attribute of this button holds the URL of the transaction-record endpoint
                data_api = contentSoup.find("button", {"id": "J_listBuyerOnView"})["data-api"]
                parameters = data_api.split("?")[1]
                stop_object = open(self.stop_file, "rb")
                bid_page = stop_object.readline().split("\t")[1]
                stop_object.close()
                page_size = int(parameters.split("&")[2].split("=")[1])
                while int(bid_page) < int(page_size):
                    print "bid_page has not reached page_size yet..."
                    print bid_page
                    if self.tag == 1:
                        data_api = data_api.replace("bid_page=" + str(bid_page), "bid_page=" + str(int(bid_page) + 1))
                    else:
                        data_api = data_api.replace("bid_page=1", "bid_page=" + str(int(bid_page) + 1))
                    data_url = data_api + "&ua=006tpOWUuXBidH1MRWQZ0InIldyJ0J3AibxJg%3D%3D%7CtaBkcTQxVFHEsbQxBFEEIfY%3D%7CtJFV4sbweFGpcSkNye3Y7ckNKV7GLmae5976Lfo%3D%7Cs6aDR2N2MzZTVsO2szYjpsOmAwbil4KX4tei15LXgpeSh%2FLHQmax%7Csqcy9kFUkBUANfF0sJQ9VOM7Y%2BeTZUGWQQ%3D%3D%7CsSTgxOA3%7CsIVB9vM3Mvbj1pPGAmcSJ0KGk6bDxgJ3EpdTRnMWE9eihwLVAg%3D%3D%7Cv%2Fo%2Bia0L%2FGqyyuwU7KUtCc3o3Vic%2BZzJDVhtOA3aDQ%3D%3D%7CvusvmLyYXOuOy%2B4qrzpfm85L3jpvq767rmp%2Fau8rbjvsKC3pzektWB04vWq9%7Cvfj9%2BDw5%2FdgcCUxZnaj9iEw5XJitafw4LViP&t=1406097091097&callback=Hub.data.records_reload"
                    req = urllib2.Request(url=data_url, headers=headers)
                    datacontent = urllib2.urlopen(req, timeout=3).read()
                    datacontent = datacontent.decode("gbk").encode("utf-8")
                    self.deal(datacontent)
                    bid_page = int(bid_page) + 1
                    # remember the page just finished so a restart can resume from here
                    stop_object = open(self.stop_file, "wb")
                    stop_object.write(str(self.item) + "\t" + str(bid_page) + "\t" + "0")
                    stop_object.flush()
                    stop_object.close()
                    self.tag = 1
                    print self.item
                if int(bid_page) >= page_size:
                    print "reached page_size, save the next item's index with page 0 and record 0"
                    stop_object = open(self.stop_file, "wb")
                    stop_object.write(str(int(self.item) + 1) + "\t0\t0\n")
                    stop_object.close()
                    self.item = int(self.item) + 1
            except Exception as e:
                # on a timeout just skip this link and go on with the next one
                if "timed out" in str(e):
                    continue

    def deal(self, content):
        # strip the JSONP callback wrapper and keep only the HTML payload between the outer quotes
        ls = [m.start() for m in re.finditer("\"", content)]
        content = content[(ls[0] + 1):ls[-3]]
        contentSoup = BeautifulSoup.BeautifulSoup(content)
        recordshtml = contentSoup.find("tbody")
        if recordshtml == None:
            return
        recordshtml = recordshtml.findAll("tr")
        for record in recordshtml:
            cols = record.findAll("td")
            if len(cols) != 5:
                continue
            name = cols[0].text
            price_em = cols[1].findAll("em")
            price = price_em[-1].text
            num = cols[2].text
            time = cols[3].text
            type = cols[4].text
            line = name + "\t" + price + "\t" + num + "\t" + time + "\t" + type + "\n"
            print line
            out_object = open(self.out_file, "a+")
            out_object.write(line)
            out_object.flush()
            out_object.close()
        print "ok"

def run():
    item_path_file = "e:/item_path_file"
    stop_file = "e://stop_file"
    out_file = "e://records_file"
    parser = MyParser(item_path_file, stop_file, out_file)
    parser.run()

if __name__ == "__main__":
    run()
    print "done~"
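Each record that deal() writes is one tab-separated line of buyer name, price, quantity, time and type, so the output can be read back with a plain split. A minimal sketch, using the same e://records_file path as above:

with open("e://records_file", "rb") as record_file:
    for row in record_file:
        # fields in the same order as deal() writes them
        buyer, price, num, pay_time, record_type = row.rstrip("\n").split("\t")
        print buyer, price, num, pay_time, record_type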
Here item_path_file is the file of product links crawled in step one, and stop_file records how far the crawl has progressed (strictly speaking you could do without it). The program above does not keep a separate file of links whose data failed to be crawled.
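The stop file itself is just a single line of three tab-separated counters (item index in item_path_file, current bid_page, current record), all starting at 0. A minimal sketch of creating it before the first run, using the same e://stop_file path as above:

with open("e://stop_file", "wb") as stop_object:
    # item index, bid_page, record -- all zero before anything has been crawled
    stop_object.write("0\t0\t0\n")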
Note that the link file may contain Tmall items, but Tmall's transaction records use a different format from Taobao's, so Tmall links are simply filtered out here.
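The second script does this check inline with re.search("tmall", line); the link file could also be cleaned up front with the same test. A small sketch, where e://out is the link file from step one and e://out_taobao_only is a hypothetical filtered copy:

import re

with open("e://out", "rb") as src, open("e://out_taobao_only", "wb") as dst:
    for line in src:
        if not re.search("tmall", line):   # keep only Taobao links
            dst.write(line)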
Compared with my earlier crawls, the improvement this time:
Using try/except. I did not use it before, so every time a request timed out I had to stop the program by hand, restart it, and resume from the breakpoint. With try/except, a timeout simply skips the current link, which removes a lot of manual work (a small sketch of the pattern follows below).
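A minimal sketch of that pattern, not the exact code above: urllib2 signals a timeout with socket.timeout or urllib2.URLError, so catching those around the request lets the loop move straight on to the next link:

import socket
import urllib2

def fetch(url, headers, timeout=3):
    # returns the page body, or None if the request timed out or otherwise failed
    try:
        req = urllib2.Request(url=url, headers=headers)
        return urllib2.urlopen(req, timeout=timeout).read()
    except (socket.timeout, urllib2.URLError) as e:
        print "skipping %s: %s" % (url.strip(), e)
        return None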
I later learned that everything here was hand-rolled crawling, and that the Scrapy framework makes this kind of job somewhat simpler.
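For comparison, a hypothetical Scrapy version of step one might look roughly like this (the spider name and CSS selector are assumptions, not tested against the real page):

import scrapy

class TaobaoLinkSpider(scrapy.Spider):
    name = "taobao_links"
    # same seed URL as in step one; Scrapy handles scheduling, retries and throttling itself
    start_urls = ["http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s=0"]

    def parse(self, response):
        # assumed selector mirroring findAll("div", {"class": "col title"}) above
        for href in response.css("div.col.title a::attr(href)").extract():
            yield {"url": href}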