# Scrape Taobao product listings. A previous version used selenium + PhantomJS,
# which was slow; this spider uses the Scrapy framework for the same task and,
# likewise, stores the results in MongoDB.
import scrapy
import re
import pymongo
from taobao.items import TaobaoItem
class WeisuenSpider(scrapy.Spider):
    """Spider that crawls Taobao search results for the query "女装"
    (women's clothing) and stores name/price/link of each product both
    as a ``TaobaoItem`` and as a document in MongoDB (``taobao.nvz``).
    """

    name = 'taobao_'
    # Search URL for the url-encoded query "女装"; pagination is done by
    # appending the `s=<offset>` parameter in start_requests().
    start_url = "https://s.taobao.com/search?q=%E5%A5%B3%E8%A3%85&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8&initiative_id=staobaoz_20180309&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2C48"
    detail_urls = []
    data = []
    # NOTE(review): opening the MongoDB connection at class-definition time
    # couples the spider to a running local mongod; a Scrapy item pipeline
    # would be the idiomatic place for persistence.
    client = pymongo.MongoClient("localhost", 27017)
    db = client.taobao.nvz  # collection `nvz` of database `taobao`

    def start_requests(self):
        """Yield one GET request per result page (100 pages)."""
        for page in range(100):
            # Taobao shows at most 44 items per page; `s` is the offset.
            url = self.start_url + '&s=' + str(page * 44)
            # Plain Request: this is an ordinary GET, not a form submission
            # (the original used FormRequest, which is meant for forms).
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract (title, detail_url, price) triples from the JSON that
        Taobao embeds in <script> blocks, then yield one item per product
        and mirror it into MongoDB.
        """
        scripts = response.xpath('//script/text()').extract()
        # The result data lives in a JS variable, not in the DOM, so we
        # regex the stringified script text instead of using selectors.
        pat = '"raw_title":"(.*?)","pic_url".*?,"detail_url":"(.*?)","view_price":"(.*?)"'
        records = re.findall(pat, str(scripts))  # list of 3-tuples
        for title, raw_link, price in records:
            weburl = self.url_decode(temp=raw_link)  # rebuild a usable URL
            item = TaobaoItem()
            item['name'] = title
            item['price'] = price
            item['link'] = weburl
            # BUG FIX: the Mongo key was misspelled 'lick' in the original;
            # store it as 'link' so documents match the item fields.
            doc = {'name': title, 'price': price, 'link': weburl}
            # insert_one() replaces Collection.insert(), which was
            # deprecated in pymongo 3 and removed in pymongo 4.
            self.db.insert_one(doc)
            yield item

    def url_decode(self, temp):
        """Rebuild a clickable detail URL from the escaped fragment found
        in the page JSON.

        The scraped value contains literal escape sequences such as
        ``\\u003d`` ('=') and ``\\u0026`` ('&').  The original strategy is
        preserved: drop every escape sequence wholesale (assumed to be
        exactly 7 characters each, e.g. ``\\u003d`` — TODO confirm), then
        re-insert the '=' after 'id', the '&' before 'ns=', the '&' before
        'abbucket=', and finally prefix the scheme.
        """
        # Strip each 7-character escaped sequence like '\\u003d'.
        while '\\' in temp:
            start = temp.find('\\')
            temp = temp.replace(temp[start:start + 7], '')
        # Restore 'id=' (the '=' was part of a removed escape).
        i = temp.find('id')
        temp = temp[:i + 2] + '=' + temp[i + 2:]
        # Restore the '&ns=' separator.
        i = temp.find('ns')
        temp = temp[:i] + '&' + 'ns=' + temp[i + 2:]
        # Restore the '&abbucket=' separator and add the scheme.
        i = temp.find('abbucket')
        temp = 'https:' + temp[:i] + '&' + 'abbucket=' + temp[i + 8:]
        return temp