Here we crawl only Taobao's search-result (list) pages, extracting each item's link, title, price, monthly sales figure, and image URL. If you need the detail pages as well, extend the spider yourself.
taobao_wang is the name of the run file I created myself; it simply launches the spider of the same name:
from scrapy.cmdline import execute
execute('scrapy crawl taobao_wang'.split())
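An alternative the post does not use: Scrapy can also run a spider in-process via CrawlerProcess, which picks up the project settings. A sketch, assuming it is run from the project root:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Resolve the spider by its name through the project settings
process = CrawlerProcess(get_project_settings())
process.crawl('taobao_wang')
process.start()   # blocks until the crawl finishes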
# -*- coding: utf-8 -*-
import scrapy, re
from urllib import parse            # parse.quote is used below
from ..piaot import *               # custom helper package (provides pa())
from ..items import taobao_Mysql    # custom item class


class TaobaoWangSpider(scrapy.Spider):
    name = 'taobao_wang'
    allowed_domains = ['taobao.com']
    # start_urls = ['http://taobao.com/']

    # Use start_requests() as the entry point instead of start_urls
    def start_requests(self):
        a = input('Enter a search keyword: ')
        b = int(input('How many pages: '))
        # URL-encode the keyword
        name = parse.quote(a)
        # Loop over the requested pages
        for i in range(b):
            # Build the search URL; each result page holds 48 items, so s = page * 48
            url = 'https://s.taobao.com/search?q={}&s={}'.format(name, i * 48)
            # Request headers
            form = {
                'User-Agent': pa(),  # pa() from piaot returns a User-Agent string
            }
            # Send the GET request, parsed by self.parse
            req = scrapy.Request(url=url, callback=self.parse, headers=form)
            yield req

    # Pull the fields out of the response
    def parse(self, response):
        # response.text is the page HTML (UTF-8 by default); the product data
        # is embedded in it as a JSON blob, so plain regexes can extract it
        html = response.text
        name = re.findall('"title":"(.*?)"', html)
        jiag = re.findall('"price":"(.*?)"', html)
        xiaosho_shul = re.findall('"month_sales":"(.*?)"', html)
        pic_url = re.findall('"pic_url":"(.*?)"', html)
        dz_url = re.findall('"url":"(.*?)"', html)
        # Walk the parallel lists and emit one item per product
        for i in range(len(name)):
            # Create a fresh taobao_Mysql item each time, so every yielded
            # item is independent of the previous one
            mysql = taobao_Mysql()
            mysql['name'] = name[i]
            mysql['jiag'] = jiag[i]                   # price
            mysql['xiaosho_shul'] = xiaosho_shul[i]   # monthly sales
            mysql['img_url'] = 'http:' + pic_url[i]
            mysql['dz_url'] = dz_url[i]               # product link
            # Hand the item to the pipeline
            yield mysql
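Two details of the spider are worth seeing in isolation: the keyword is URL-encoded into a paginated search URL (Taobao's s parameter steps by 48 per page), and re.findall pulls parallel lists out of the JSON blob embedded in the page. The fragment below is a made-up stand-in for that blob, not real Taobao output:

import re
from urllib import parse

# Build the search URL for page 0 of a query, exactly as start_requests does
url = 'https://s.taobao.com/search?q={}&s={}'.format(parse.quote('手机'), 0 * 48)
print(url)  # https://s.taobao.com/search?q=%E6%89%8B%E6%9C%BA&s=0

# Hypothetical fragment of the embedded JSON
html = '{"title":"example phone","price":"99.00","month_sales":"123"}'
print(re.findall('"title":"(.*?)"', html))        # ['example phone']
print(re.findall('"price":"(.*?)"', html))        # ['99.00']
print(re.findall('"month_sales":"(.*?)"', html))  # ['123']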
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TaobaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


# Custom item class used by the spider
class taobao_Mysql(scrapy.Item):
    # one Field per value we scrape
    name = scrapy.Field()            # title
    jiag = scrapy.Field()            # price
    xiaosho_shul = scrapy.Field()    # monthly sales
    img_url = scrapy.Field()         # image URL
    dz_url = scrapy.Field()          # product link
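A scrapy.Item such as taobao_Mysql behaves like a dict whose keys are restricted to the declared Fields. A quick sketch (the import path assumes the project package is named taobao, as the pipeline setting below suggests):

from taobao.items import taobao_Mysql

item = taobao_Mysql()
item['name'] = 'example phone'
item['jiag'] = '99.00'
print(dict(item))    # {'name': 'example phone', 'jiag': '99.00'}
# item['foo'] = 1 would raise KeyError: only declared Fields may be set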
In settings.py, import the custom package. All it is used for is supplying a request header, so you can just as well write the header by hand instead of importing anything (a sketch of what pa() might look like follows this settings block).

from piaot import *

# Whether to obey robots.txt -- this must be changed to False,
# otherwise Scrapy will refuse to crawl the site
ROBOTSTXT_OBEY = False

# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': pa(),  # pa() from the piaot package returns a User-Agent string
}

# Register the pipeline class and its priority
ITEM_PIPELINES = {
    # 'taobao.pipelines.TaobaoPipeline': 300,
    'taobao.pipelines.taobao_mysql': 300,
}
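The piaot package itself is never shown in the post; everything here uses only its pa() helper, which returns a User-Agent string. A minimal sketch of such a helper, assuming nothing about the original beyond that behavior (the UA strings are placeholders):

# piaot.py -- hypothetical stand-in for the author's helper package
import random

# Placeholder User-Agent pool; substitute real browser UA strings
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/537.36',
]

def pa():
    # Return a random User-Agent so requests do not all look identical
    return random.choice(USER_AGENTS)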
# -*- coding: utf-8 -*-
import pymysql

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TaobaoPipeline(object):
    def process_item(self, item, spider):
        return item


class taobao_mysql(object):
    def process_item(self, item, spider):
        # The product link arrives with escaped characters and is unusable
        # until \u003d and \u0026 are replaced with '=' and '&'
        dz = item['dz_url']
        dz_url = dz.replace('\\u003d', '=').replace('\\u0026', '&')
        # Parameterized INSERT, so quotes in a title cannot break the SQL
        sql = "insert into xq_4 values(NULL, %s, %s, %s, %s, %s)"
        # Open the database connection (opened per item here for simplicity;
        # open_spider/close_spider would be the place for a shared connection)
        db = pymysql.connect(host='192.168.43.128', user='root',
                             password='123456', database='xq', charset='utf8')
        # Create a cursor object with cursor()
        cursor = db.cursor()
        # execute() returns the number of affected rows
        rows = cursor.execute(sql, (item['name'], item['jiag'],
                                    item['xiaosho_shul'], item['img_url'], dz_url))
        print("Rows inserted: %s" % rows)
        # Commit the transaction and close the connection
        db.commit()
        db.close()
        return item
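The INSERT above supplies NULL plus five values, so it assumes a table xq_4 with an auto-increment key and five text columns. The post never shows the schema; the sketch below is a guess that matches the statement, with hypothetical column names and sizes, run once through pymysql as a setup script:

import pymysql

# Assumed schema for xq_4; only the column count is implied by the INSERT
ddl = """
CREATE TABLE IF NOT EXISTS xq_4 (
    id INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(255),
    jiag VARCHAR(32),
    xiaosho_shul VARCHAR(32),
    img_url VARCHAR(512),
    dz_url VARCHAR(1024)
) DEFAULT CHARSET=utf8
"""

db = pymysql.connect(host='192.168.43.128', user='root',
                     password='123456', database='xq', charset='utf8')
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()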