爬虫练手:爬取淘宝某一类目商品信息

爬取目标:淘宝下某一类目商品的标题、链接、原价、优惠促销价格、评论数等信息(也可进一步爬取详细评论信息)。

源代码

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Taobao01Item(scrapy.Item):
    """Container for the fields scraped for a single Taobao product."""
    # define the fields for your item here like:
    # Product title
    title = scrapy.Field()
    # Product page URL
    link = scrapy.Field()
    # Product price (original / list price)
    price = scrapy.Field()
    # Promotional (discounted) price
    price_now = scrapy.Field()
    # Number of comments / reviews
    comment = scrapy.Field()

爬虫文件 tb01.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from taobao01.items import Taobao01Item
import urllib.request

class Tb01Spider(scrapy.Spider):
    """Crawl Taobao search results for one keyword and scrape each product.

    Request chain:

    1. ``parse`` builds the search-result page URLs for ``keyword``.
    2. ``goodlist`` extracts every product id ("nid") from a result page.
    3. ``good`` scrapes title, link and price from the product page.
    4. ``comment_count`` reads the JSONP review counter and emits the item.

    ``keyword`` and ``page_count`` are plain class attributes, so they can be
    overridden from the command line with Scrapy spider arguments, e.g.
    ``scrapy crawl tb01 -a keyword=... -a page_count=5``.

    The ``price_now`` item field is not populated by this spider.
    """

    name = "tb01"
    allowed_domains = ["taobao.com"]
    start_urls = ['http://www.taobao.com/']

    # Search keyword and number of result pages to request.
    keyword = '零食'
    page_count = 2
    # The search URL's "s" parameter is an item offset; the original code
    # advanced it by 44 per page.
    PAGE_SIZE = 44

    # Placeholder stored when no price element is found on the page.
    # NOTE(review): this is a made-up value inherited from the original code;
    # consider replacing it with None once downstream consumers allow it.
    DEFAULT_PRICE = "100"

    NID_RE = re.compile(r'"nid":"(.*?)"')
    # The counter endpoint answers with JSONP such as jsonp100({"count":123}).
    # Parentheses and braces must be escaped to be matched literally.
    COMMENT_COUNT_RE = re.compile(
        r'jsonp100\(\s*\{\s*"count"\s*:\s*(\d+)\s*\}\s*\)'
    )

    TITLE_XPATHS = (
        "//h3[@class='tb-main-title']/text()",
        "//h1[@data-spm='1000983']/text()",
    )
    PRICE_XPATHS = (
        "//em[@class='tb-rmb-num']/text()",
    )

    @staticmethod
    def _first_text(response, xpaths, default=None):
        """Return the first text node matched by any of *xpaths*, else *default*."""
        for xpath in xpaths:
            found = response.xpath(xpath).extract()
            if found:
                return found[0]
        return default

    def parse(self, response):
        """Yield one request per search-result page for ``keyword``."""
        # Spider arguments arrive as strings, hence the int() conversion.
        for page in range(int(self.page_count)):
            url = ("https://s.taobao.com/search?q=" + self.keyword
                   + "&ie=utf8&s=" + str(page * self.PAGE_SIZE))
            print("要爬取的url是:" + url)
            yield Request(url=url, callback=self.goodlist)

    def goodlist(self, response):
        """Yield a product-page request for every "nid" found in the page."""
        # response.text decodes with the encoding Scrapy detected instead of
        # assuming UTF-8.
        for nid in self.NID_RE.findall(response.text):
            url = "https://item.taobao.com/item.htm?id=" + str(nid)
            yield Request(url=url, callback=self.good, meta={"id": nid})

    def good(self, response):
        """Scrape title, link and price, then request the comment counter.

        The item is passed along in ``meta`` and emitted by
        ``comment_count`` (or ``comment_failed`` if that request fails), so
        the counter is fetched through Scrapy's downloader rather than with a
        blocking call inside the callback.
        """
        nid = response.meta["id"]

        title = self._first_text(response, self.TITLE_XPATHS)
        if title is None:
            # Neither known title layout matched; the page is not a product
            # page we can parse, so no item is produced for it.
            self.logger.warning(
                "No product title found on %s; skipping", response.url
            )
            return

        item = Taobao01Item()
        item["title"] = title
        item["link"] = response.url
        item["price"] = self._first_text(
            response, self.PRICE_XPATHS, default=self.DEFAULT_PRICE
        )

        comment_url = (
            "https://rate.taobao.com/detailCount.do?callback=jsonp100&itemId="
            + str(nid)
        )
        yield Request(
            url=comment_url,
            callback=self.comment_count,
            errback=self.comment_failed,
            meta={"item": item},
        )

    def comment_count(self, response):
        """Parse the JSONP comment counter and emit the finished item."""
        item = response.meta["item"]
        commentdata = response.body.decode("utf-8", "ignore")

        match = self.COMMENT_COUNT_RE.search(commentdata)
        # Stored as the matched digit string; None when the payload does not
        # have the expected shape.
        item["comment"] = match.group(1) if match else None

        print("商品url是:" + item["link"])
        print("商品价格是:" + item["price"])
        print("商品评论url是:" + response.url)
        print("返回的评论字符串是:" + commentdata)
        print("")

        yield item

    def comment_failed(self, failure):
        """Emit the item without a comment count when the counter request fails."""
        item = failure.request.meta["item"]
        self.logger.warning(
            "Comment count request failed for %s: %s",
            item["link"], failure.value,
        )
        item["comment"] = None
        yield item

pipelines.py

将爬取到的数据插入到数据库:略,可参考博文
http://www.jianshu.com/p/164f3fda2d1c

(本文未完待续)

你可能感兴趣的:(爬虫练手:爬取淘宝某一类目商品信息)