taobao.py — the main spider (crawling) code
# -*- coding: utf-8 -*-
import scrapy
import json
import re
from ..items import TaobaoItem
class TaobaoSpider(scrapy.Spider):
    """Spider that scrapes Taobao search results for a fixed query.

    The search term is URL-encoded into the request URL ("三星" / Samsung);
    the ``s`` query parameter is the zero-based offset of the first result,
    paginating in steps of 44 items per page.
    """
    name = 'taobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.taobao.com']
    # Example result-page URL:
    # https://s.taobao.com/search?q=%E4%B8%89%E6%98%9F&s=88

    # Number of results per search page (offset step for the `s` parameter).
    PAGE_SIZE = 44

    def parse(self, response):
        """Schedule requests for result pages 2..10 of the search."""
        # q= is the searched product name; this spider uses "三星" (Samsung).
        for page in range(2, 11):
            # Use a separate name instead of reassigning the loop variable.
            offset = page * self.PAGE_SIZE
            next_url = 'https://s.taobao.com/search?q=%E4%B8%89%E6%98%9F&s={}'.format(offset)
            yield scrapy.Request(next_url, callback=self.parse_xq)

    def parse_xq(self, response):
        """Extract product fields from the JSON embedded in the result page.

        Taobao renders results from a JSON blob inside the HTML, so fields
        are pulled out with regexes rather than XPath.
        """
        html_str = response.text
        titles = re.findall(r'"raw_title":"(.*?)"', html_str)
        prices = re.findall(r'"view_price":"(.*?)"', html_str)
        sales_nums = re.findall(r'"view_sales":"(.*?)"', html_str)
        image_urls = re.findall(r'"pic_url":"(.*?)"', html_str)
        # Use the spider's logger instead of bare print() for debug output.
        self.logger.debug('parsed %d titles from %s', len(titles), response.url)
        # zip() stops at the shortest list, so a page where one regex matched
        # fewer times cannot raise IndexError the way parallel index access
        # (titles[i], prices[i], ...) would.
        for title, price, sales_num, image_url in zip(
                titles, prices, sales_nums, image_urls):
            # Create a fresh item per result: the original mutated and
            # re-yielded a single TaobaoItem instance, which can corrupt
            # items still being processed downstream.
            item = TaobaoItem()
            item['title'] = title
            item['price'] = price
            item['sales_num'] = sales_num
            item['image_url'] = image_url
            yield item
items.py — the item definition code
class TaobaoItem(scrapy.Item):
    """Container for one scraped Taobao product listing."""
    title = scrapy.Field()
    price = scrapy.Field()
    sales_num = scrapy.Field()
    image_url = scrapy.Field()

    # Simplified SQL-storage helper used by the MySQL pipeline.
    def get_insert_sql(self):
        """Return an ``(sql, params)`` pair for inserting this item into tb_test."""
        columns = ('title', 'price', 'sales_num', 'image_url')
        sql = 'insert into tb_test(title,price,sales_num,image_url) values (%s,%s,%s,%s)'
        values = tuple(self[col] for col in columns)
        return (sql, values)
pipelines.py — the pipeline code
class MysqlProjectPipeline(object):
    """Pipeline that persists each scraped item to MySQL via its insert SQL."""

    def process_item(self, item, spider):
        """Execute the item's INSERT statement, then pass the item on.

        Returning the item is required by the Scrapy pipeline contract so
        that any later pipelines in ITEM_PIPELINES still receive it; the
        original returned None, silently dropping items downstream.
        """
        (insert_sql, data) = item.get_insert_sql()
        # NOTE(review): a new MysqlHelper (and presumably a new DB
        # connection) is created per item — consider creating it once in
        # open_spider() instead. Left unchanged to preserve behavior.
        myhelper = MysqlHelper()
        myhelper.execute_modify_sql(insert_sql, data)
        return item  # bug fix: pipelines must return the item
For the remaining settings, see the companion article on basic Scrapy configuration.