- Spider code
import scrapy
from scrapy import Request
import re
from JingDong.items import JingdongItem


class ExampleSpider(scrapy.Spider):
    '''Scrapes Python book listings from JD.com search results.'''
    name = 'jingdong'

    def start_requests(self):
        url_str = 'https://search.jd.com/Search?keyword=python'
        # meta['page'] tells the Selenium middleware how to render this request
        yield Request(url=url_str, callback=self.parse, dont_filter=True, meta={'page': '1'})

    def parse(self, response):
        # Each search page lists up to 60 goods
        for i in range(1, 61):
            good_url = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[1]/a/@href'.format(i)).extract()
            good_names = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[3]/a//text()'.format(i)).extract()
            price = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[2]//text()'.format(i)).extract()
            sales_volumn = 0  # placeholder: sales volume is not exposed on the search page
            # Ad items ('广告' means 'advertisement') keep their comment count in a different div
            if response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/span/text()'.format(i)).extract_first() != '广告':
                comments = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[5]/strong//text()'.format(i)).extract()
            else:
                comments = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[2]/strong//text()'.format(i)).extract()
            post = response.xpath('//li[@class="gl-item"][{}]//a/img/@src'.format(i)).extract()
            price_new = self.get_price(price)
            good_name = self.get_name(good_names)
            comment = self.get_comments(comments)
            item = JingdongItem()
            item['good_url'] = ''.join(good_url)
            item['post'] = ''.join(post)
            item['price'] = price_new
            item['sales_volumn'] = sales_volumn
            item['goods_name'] = good_name
            item['comments'] = comment
            yield item
        # Re-request the same URL; for page >= 2 the Selenium middleware clicks
        # the next-page button, so dont_filter=True is required
        yield Request(url=response.url, callback=self.parse, dont_filter=True, meta={'page': '2'})

    def get_price(self, pri):
        '''Pull the first decimal number out of the joined price fragments.'''
        price_old = ''.join(pri)
        match = re.search(r'[0-9]+\.[0-9]+', price_old)
        if match:
            return float(match.group())
        return 0

    def get_name(self, name):
        '''The name node sometimes splits into several text fragments.'''
        if len(name) > 3:
            return name[1] + name[2]
        return ''.join(name)

    def get_comments(self, comms):
        '''Normalize comment counts such as '5万+' ('万' = 10,000) to an int.'''
        if len(comms) >= 1:
            comms = comms[0]
            match = re.search(r'[0-9]+', comms)
            if not match:
                return 0
            count = int(match.group())
            if '万' in comms:
                count *= 10000
            return count
        return 0
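For the middleware and pipeline below to run at all, they have to be registered in the project's settings.py. A minimal sketch, assuming the project package is named JingDong (as the `from JingDong.items import JingdongItem` import above suggests) and the classes live in middlewares.py and pipelines.py:

# settings.py -- the module paths are assumptions based on the
# JingDong package name used in the spider's import
ROBOTSTXT_OBEY = False  # commonly disabled when rendering pages through Selenium

DOWNLOADER_MIDDLEWARES = {
    'JingDong.middlewares.SeleniumMiddlewares': 543,
}

ITEM_PIPELINES = {
    'JingDong.pipelines.JingdongPipeline': 300,
}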
- Middleware
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FOptions
import time
from scrapy.http import HtmlResponse


class SeleniumMiddlewares(object):
    '''Renders JD search pages in Firefox so lazy-loaded goods actually appear.'''

    def __init__(self):
        self.options = FOptions()
        # self.options.add_argument("-headless")
        self.browser = webdriver.Firefox(executable_path="/home/hello/Downloads/geckodriver",
                                         firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 1:
            # First page: load the URL, then scroll down in steps so the
            # lazy-loaded items below the fold are rendered
            self.browser.get(request.url)
            time.sleep(5)
            for i in range(1, 8):
                self.browser.execute_script("window.scrollTo(0,{})".format(i * 1000))
                time.sleep(2)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
        if int(request.meta['page']) == 2:
            # Subsequent pages: click the next-page button ('下一页' means
            # 'next page'), then scroll again to trigger lazy loading
            self.browser.get(request.url)
            self.browser.implicitly_wait(10)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
            next_page = self.browser.find_element_by_xpath('//em[contains(text(),"下一页")]')
            next_page.click()
            time.sleep(3)
            for i in range(1, 8):
                self.browser.execute_script("window.scrollTo(0,{})".format(i * 1000))
                time.sleep(2)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
        # Short-circuit the download: hand the rendered page back to the spider
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding="utf-8", request=request)
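One thing the middleware never does is quit Firefox, so the browser is left running after the crawl ends. A sketch of a cleanup hook using Scrapy's spider_closed signal; the two methods below would be added to the SeleniumMiddlewares class above (from_crawler and the signals API are standard Scrapy, the wiring is my assumption):

from scrapy import signals

# Sketch: add these methods to SeleniumMiddlewares so Firefox is
# shut down when the spider finishes.
@classmethod
def from_crawler(cls, crawler):
    middleware = cls()
    crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
    return middleware

def spider_closed(self, spider):
    self.browser.quit()  # close the browser and the geckodriver process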
- Pipelines
import pymysql


class JingdongPipeline(object):

    def open_spider(self, spider):
        # Database connection settings; in a real project these would
        # usually be read from Scrapy's settings instead of hard-coded
        host = '47.75.81.75'   # remote MySQL host address
        port = 3306            # the port is an int, not a string
        user = 'user_name'
        password = 'password'
        dbname = 'database_name'
        dbcharset = 'utf8'
        self.conn = pymysql.Connect(host=host, port=port, user=user,
                                    password=password, db=dbname, charset=dbcharset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Insert into the blogs_jingdong table; the columns mirror the
        # JingdongItem fields. Parameterized queries avoid SQL injection.
        sql = ('insert into blogs_jingdong'
               '(good_url, goods_name, price, sales_volumn, comments, post) '
               'values (%s, %s, %s, %s, %s, %s)')
        try:
            self.cursor.execute(sql, (item['good_url'], item['goods_name'],
                                      item['price'], item['sales_volumn'],
                                      item['comments'], item['post']))
            self.conn.commit()
            print('#' * 10 + ' saved successfully')
        except Exception as e:
            print('*' * 10 + ' insert failed')
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
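The pipeline assumes the blogs_jingdong table already exists. A hypothetical one-off setup script; the column names and types are assumptions chosen to match the JingdongItem fields used in process_item above:

import pymysql

# One-off table setup (hypothetical column types)
conn = pymysql.Connect(host='47.75.81.75', port=3306, user='user_name',
                       password='password', db='database_name', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS blogs_jingdong (
            id INT AUTO_INCREMENT PRIMARY KEY,
            good_url VARCHAR(512),
            goods_name VARCHAR(255),
            price FLOAT,
            sales_volumn INT,
            comments INT,
            post VARCHAR(512)
        ) DEFAULT CHARSET=utf8
    ''')
conn.commit()
conn.close()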
- Items
import scrapy


class JingdongItem(scrapy.Item):
    good_url = scrapy.Field()      # product detail page URL
    goods_name = scrapy.Field()    # book title
    price = scrapy.Field()         # price as a float
    sales_volumn = scrapy.Field()  # placeholder, always 0
    comments = scrapy.Field()      # comment count as an int
    post = scrapy.Field()          # cover image URL
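With everything registered, the crawl is started from the project root with `scrapy crawl jingdong` (the name defined on the spider). Each stored row corresponds to one yielded item, roughly of this shape (the values below are made up for illustration):

{'good_url': '//item.jd.com/12345678.html',
 'goods_name': 'Python编程 从入门到实践',
 'price': 62.0,
 'sales_volumn': 0,
 'comments': 50000,
 'post': '//img14.360buyimg.com/n7/example.jpg'}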
- Stored results