The practice code is listed below.
Main spider:
# _*_ coding:utf-8 _*_
import scrapy
from scrapy.selector import Selector
from lxml import etree
from ScrapyTest.items import ScrapytestItem


class testSpider(scrapy.Spider):
    # spider name
    name = "itcast"
    # allowed domains
    # allowed_domains = ["http://www.itcast.cn/"]
    # URL to start crawling from
    start_urls = (
        "http://www.hs-bianma.com/search.php?ser=%E5%8F%A3%E7%BA%A2&ai=1",
    )

    def parse(self, response):
        # print(response.body)
        # First grab every row of the lipstick result list
        items = response.xpath("/html/body/div[2]/div")
        print(items)
        # Use relative XPaths on each row to pull out the details of one entry,
        # use replace() to strip the unwanted text,
        # and yield the data to items.py
        for item in items:
            hscode_xpath = './/div[1]/text()'  # renamed from "str" to avoid shadowing the builtin
            m = item.xpath(hscode_xpath).extract_first()
            n = item.xpath('.//div[2]').extract_first()
            # a = response.xpath('/html/body/div[2]/div/div[2]').extract()[11]
            b = item.xpath(".//div[3]/a[1]/@href").extract_first()
            c = item.xpath(".//div[3]/a[2]/@href").extract_first()
            # Strip line breaks and the "商品描述" header text from the extracted markup
            a1 = n.replace('\n', '')
            a1 = a1.replace('商品描述', '')
            # print(m)
            # print(a1)
            # print(b)
            # print(c)
            item = ScrapytestItem()
            item['hscode'] = m
            item['商品描述'] = a1
            item['归类实例'] = b
            item['详情'] = c
            yield item
        # for m in mlist:
        #     test_list = {}
        #     print(test_list)
        #     test_list['m'] = m
        #     print(m)

        # Check whether a next page exists; if it does, hand it to myParse
        next_page = response.xpath("//*[@id='main']/div/div[3]/a[1]/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            print(next_page)
            yield scrapy.Request(next_page, callback=self.myParse)

    def myParse(self, response):
        # print(response.text)
        second_items = response.xpath(".//*[@id='main']/div")
        # print(second_items)
        for second_item in second_items:
            second_hscode = second_item.xpath(".//div[1]").extract_first()
            # Strip line breaks and the "hscode" label text from the extracted markup
            second_hscode = second_hscode.replace('\n', '').replace('hscode', '')
            second_description = second_item.xpath(".//div[2]/text()").extract_first()
            second_details = second_item.xpath(".//div[3]/a/@href").extract_first()
            print(second_hscode)
            print(second_description)
            print(second_details)
            item = ScrapytestItem()
            item['second_hscode'] = second_hscode
            item['二级商业描述'] = second_description
            item['二级详情'] = second_details
            yield item
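A note on the cleanup step: the chained replace() calls in parse() and myParse() strip line breaks and label text from the raw HTML that extract_first() returns. A shorter alternative is to remove all tags with w3lib, which is installed as a Scrapy dependency. The sketch below is only an illustration of that idea, reusing the same .//div[2] cell as the spider; the helper name clean_description is made up here.

from w3lib.html import remove_tags

def clean_description(row):
    # row is one selector from response.xpath("/html/body/div[2]/div")
    raw = row.xpath('.//div[2]').extract_first()
    if raw is None:
        return None
    # remove_tags() drops every HTML tag and keeps only the inner text
    text = remove_tags(raw)
    return text.replace('\n', '').replace('商品描述', '').strip()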
settings.py:
# -*- coding: utf-8 -*-

# Scrapy settings for ScrapyTest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ScrapyTest'

SPIDER_MODULES = ['ScrapyTest.spiders']
NEWSPIDER_MODULE = 'ScrapyTest.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Pretend to be a Firefox browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'

# Export encoding set to gbk; utf-8 did not display correctly here
FEED_EXPORT_ENCODING = 'gbk'

# Export a csv file
FEED_URI = u'file:///F://douban.csv'
FEED_FORMAT = 'csv'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapytestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    hscode = scrapy.Field()
    商品描述 = scrapy.Field()
    归类实例 = scrapy.Field()
    详情 = scrapy.Field()
    second_hscode = scrapy.Field()
    二级商业描述 = scrapy.Field()
    二级详情 = scrapy.Field()
start.py:
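start.py is just a small launcher so the spider can be started from the IDE instead of the command line. Its contents are not shown here; a minimal sketch using scrapy.cmdline, assuming the spider name "itcast" from above, would be:

# Launcher sketch (assumed, not taken from the original project):
# equivalent to running "scrapy crawl itcast" in a terminal.
from scrapy import cmdline

cmdline.execute("scrapy crawl itcast".split())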
pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapytestPipeline(object):
    def process_item(self, item, spider):
        return item


class FilePipeLine(object):
    def __init__(self):
        # Write the items out as a plain text file
        self.file = open('f:/hjd', 'wb')

    def process_item(self, item, spider):
        # One tab-separated line per item; the keys must match the fields defined in items.py
        line = "%s\t%s\t%s\t%s\n" % (
            item['hscode'], item['商品描述'], item['归类实例'], item['详情'])
        self.file.write(line.encode("utf-8"))
        return item
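As the comment at the top of pipelines.py says, a pipeline only runs after it is registered in ITEM_PIPELINES. A minimal sketch of the settings.py entry that would enable FilePipeLine (the priority value 300 is an arbitrary choice; lower numbers run earlier):

# In settings.py: register the pipeline so Scrapy actually calls it
ITEM_PIPELINES = {
    'ScrapyTest.pipelines.FilePipeLine': 300,
}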