Scraping JD.com product listings with Scrapy + Selenium (simulated browser)

  1. Spider code
import scrapy
from scrapy import Request
import re
from JingDong.items import JingdongItem

class ExampleSpider(scrapy.Spider):
    '''Python book listings from JD.com search'''
    name = 'jingdong'

    def start_requests(self):
        url_str = 'https://search.jd.com/Search?keyword=python'
        # page=1 tells the Selenium middleware to render the first search page
        yield Request(url=url_str, callback=self.parse, dont_filter=True, meta={'page': '1'})

    def parse(self, response):
        for i in range(1, 61):  # JD renders up to 60 products per search page
            good_url = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[1]/a/@href'.format(i)).extract()
            good_names = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[3]/a//text()'.format(i)).extract()
            price = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[2]//text()'.format(i)).extract()
            sales_volumn = 0
            # .extract() returns a list, so join it to text before comparing;
            # ad items ("广告") keep their comment count in a different div
            ad_tag = ''.join(response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/span//text()'.format(i)).extract())
            if ad_tag != '广告':
                comments = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[5]/strong//text()'.format(i)).extract()
            else:
                comments = response.xpath('//*[@id="J_goodsList"]/ul/li[{}]/div/div[2]/strong//text()'.format(i)).extract()
            post = response.xpath('//li[@class="gl-item"][{}]//a/img/@src'.format(i)).extract()
            price_new = self.get_price(price)
            good_name = self.get_name(good_names)
            comment = self.get_comments(comments)
            item = JingdongItem()
            item['good_url'] = ''.join(good_url)
            item['post'] = ''.join(post)
            item['price'] = price_new
            item['sales_volumn'] = sales_volumn
            item['goods_name'] = good_name
            item['comments'] = comment
            yield item
        # Re-request the same URL with page=2; the Selenium middleware clicks
        # the "下一页" (next page) button before returning the rendered HTML
        yield Request(url=response.url, callback=self.parse, dont_filter=True, meta={'page': '2'})

    def get_price(self, pri):
        price_old = ''.join(pri)
        match = re.search(r'[0-9]+\.[0-9]+', price_old)
        if match:
            return float(match.group())
        else:
            return 0

    def get_name(self, name):
        # Highlighted titles split into several text nodes; keep the main two
        if len(name) > 3:
            return name[1] + name[2]
        else:
            return ''.join(name)

    def get_comments(self, comms):
        if len(comms) >= 1:
            comms = comms[0]
            if '万' in comms:
                # e.g. "1.2万+" -> 12000; match the decimal part as well
                return int(float(re.search(r'[0-9.]+', comms).group()) * 10000)
            else:
                return int(re.search(r'[0-9]+', comms).group())
        else:
            return 0
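For the spider to use the Selenium middleware and the MySQL pipeline shown below, both must be enabled in settings.py. A minimal sketch, assuming the default Scrapy project layout where the classes live in JingDong/middlewares.py and JingDong/pipelines.py:

# settings.py (sketch; module paths are assumptions based on the project name)
ROBOTSTXT_OBEY = False  # JD's robots.txt would otherwise block the crawl

DOWNLOADER_MIDDLEWARES = {
    'JingDong.middlewares.SeleniumMiddlewares': 543,
}

ITEM_PIPELINES = {
    'JingDong.pipelines.CsdnPipeline': 300,
}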
  2. Middleware
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FOptions
import time
from scrapy.http import HtmlResponse


class SeleniumMiddlewares(object):

    def __init__(self):
        self.options = FOptions()
        # self.options.add_argument("-headless")  # uncomment to run Firefox headless
        self.browser = webdriver.Firefox(executable_path="/home/hello/Downloads/geckodriver",
                                         firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 1:
            self.browser.get(request.url)
            time.sleep(5)
            # Scroll down in steps so JD's lazy-loaded items render
            for i in range(1, 8):
                self.browser.execute_script("window.scrollTo(0,{})".format(i * 1000))
                time.sleep(2)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)

        if int(request.meta['page']) == 2:
            self.browser.get(request.url)
            self.browser.implicitly_wait(10)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
            # Click the "下一页" (next page) button, then scroll the new page
            next_page = self.browser.find_element_by_xpath('//em[contains(text(),"下一页")]')
            next_page.click()
            time.sleep(3)
            for i in range(1, 8):
                self.browser.execute_script("window.scrollTo(0,{})".format(i * 1000))
                time.sleep(2)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
        # Hand the Selenium-rendered HTML back to Scrapy as the response
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding="utf-8", request=request)
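One gap in the middleware above: the Firefox instance is never closed when the crawl finishes. A minimal cleanup sketch using Scrapy's spider_closed signal (these two methods would be added to SeleniumMiddlewares):

from scrapy import signals

class SeleniumMiddlewares(object):
    # ... __init__ and process_request as above ...

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the middleware; hook the close signal here
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.browser.quit()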
  3. Pipelines
import pymysql

class CsdnPipeline(object):

    def open_spider(self, spider):
        # Connect to the database once, when the spider starts
        # self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123456', db='movie', charset='utf8')
        host = '47.75.81.75'   # remote MySQL host
        port = 3306            # the port is an int, not a quoted string
        user = 'user_name'
        password = 'password'
        dbname = 'database_name'
        dbcharset = 'utf8'
        self.conn = pymysql.Connect(host=host, port=port, user=user, password=password, db=dbname, charset=dbcharset)

    def process_item(self, item, spider):
        # Insert into the blogs_jingdong table; columns mirror the JingdongItem fields
        sql = ('insert into blogs_jingdong(good_url, goods_name, price, sales_volumn, comments, post) '
               'values(%s, %s, %s, %s, %s, %s)')
        params = (item['good_url'], item['goods_name'], item['price'],
                  item['sales_volumn'], item['comments'], item['post'])
        self.cursor = self.conn.cursor()
        try:
            # Parameterized execute() lets pymysql handle quoting and escaping
            self.cursor.execute(sql, params)
            print('#' * 10 + ' saved successfully')
            self.conn.commit()
        except Exception as e:
            print('*' * 10 + ' error while saving')
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
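The pipeline assumes the blogs_jingdong table already exists. A one-off creation script, as a sketch (column names follow the insert above; the types and lengths are assumptions):

import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS blogs_jingdong (
    id INT AUTO_INCREMENT PRIMARY KEY,
    good_url VARCHAR(512),      -- product page URL
    goods_name VARCHAR(255),    -- product title
    price FLOAT,                -- parsed price, 0 if not found
    sales_volumn INT,           -- placeholder, always 0 in this spider
    comments INT,               -- parsed comment count
    post VARCHAR(512)           -- product image URL
) DEFAULT CHARSET=utf8
"""

conn = pymysql.Connect(host='47.75.81.75', port=3306, user='user_name',
                       password='password', db='database_name', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()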
  4. Items
import scrapy


class JingdongItem(scrapy.Item):

    good_url = scrapy.Field()      # product page URL
    goods_name = scrapy.Field()    # product title
    price = scrapy.Field()         # parsed price
    sales_volumn = scrapy.Field()  # placeholder, not yet extracted
    comments = scrapy.Field()      # comment count
    post = scrapy.Field()          # product image URL
  5. Stored results
    (Screenshot: the scraped product records as stored in MySQL.)
