scrapy startproject scrapy_carhome
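The spider file itself can be generated the same way as in the later projects; a likely command, assuming the domain used in allowed_domains below:
scrapy genspider carhome car.autohome.com.cn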
(1)settings.py
BOT_NAME = 'scrapy_carhome'
SPIDER_MODULES = ['scrapy_carhome.spiders']
NEWSPIDER_MODULE = 'scrapy_carhome.spiders'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
(2)items.py
import scrapy
class ScrapyCarhomeItem(scrapy.Item):
    pass
(3)pipelines.py
from itemadapter import ItemAdapter
class ScrapyCarhomePipeline:
    def process_item(self, item, spider):
        return item
(4)carhome.py
import scrapy
class CarhomeSpider(scrapy.Spider):
    name = 'carhome'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/price/brand-15.html']

    def parse(self, response):
        print("===================================================")
        # text() selects the text content of the <a> tags
        names = response.xpath('//div[@class="main-class"]/a/text()')
        prices = response.xpath('//div[@class="main-lever"]//span/span/text()')
        for i in range(len(names)):
            name = names[i].extract()
            price = prices[i].extract()
            print(name, price)
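With the settings, item, pipeline, and spider in place, the crawl is started from the project directory; the spider is addressed by its name attribute:
scrapy crawl carhome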
scrapy startproject scrapy_post_fanyi
scrapy genspider fanyi_post fanyi.baidu.com
(1)settings.py
BOT_NAME = 'scrapy_post_fanyi'
SPIDER_MODULES = ['scrapy_post_fanyi.spiders']
NEWSPIDER_MODULE = 'scrapy_post_fanyi.spiders'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
(2)pipelines.py
import pymongo
import json
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class ScrapyPostFanyiPipeline:
    def process_item(self, item, spider):
        return item
class MongoPipeline:
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # @classmethod marks this as a class method; without the decorator it would be an instance method
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        return item
class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        # Assumes the item defines an 'id' field to deduplicate on
        adapter = ItemAdapter(item)
        if adapter['id'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            self.ids_seen.add(adapter['id'])
            return item
class JsonWriterPipeline:
    def open_spider(self, spider):
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(ItemAdapter(item).asdict()) + "\n"
        self.file.write(line)
        return item
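MongoPipeline, DuplicatesPipeline, and JsonWriterPipeline are shown here as templates (the fanyi spider below only prints the response and does not yield items); they only run once registered in ITEM_PIPELINES, and MongoPipeline reads its connection settings through from_crawler. A minimal sketch of the extra settings.py entries, assuming a local MongoDB instance; the URI, database name, and priority values are placeholders:
ITEM_PIPELINES = {
    'scrapy_post_fanyi.pipelines.ScrapyPostFanyiPipeline': 300,
    'scrapy_post_fanyi.pipelines.MongoPipeline': 400,
    'scrapy_post_fanyi.pipelines.DuplicatesPipeline': 500,
    'scrapy_post_fanyi.pipelines.JsonWriterPipeline': 600,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'items'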
(3)fanyi_post.py
import scrapy
import json
class FanyiPostSpider(scrapy.Spider):
    name = 'fanyi_post'

    def start_requests(self):
        url = 'http://fanyi.baidu.com/sug'
        data = {'kw': 'final'}
        # FormRequest sends the data as a form-encoded POST body
        yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse_second)

    def parse_second(self, response):
        content = response.text
        print(content)
        obj = json.loads(content)
        print(obj)

    # A POST request without parameters is meaningless, so start_urls and parse are not used here
    # allowed_domains = ['fanyi.baidu.com']
    # start_urls = ['http://fanyi.baidu.com/']
    # def parse(self, response):
    #     pass
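As before, the spider is run by name; since start_requests issues the POST directly, no start_urls entry is needed:
scrapy crawl fanyi_post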
scrapy startproject scrapy_dangdang
scrapy genspider dangdang category.dangdang.com
(1)settings.py
BOT_NAME = 'scrapy_dangdang'
SPIDER_MODULES = ['scrapy_dangdang.spiders']
NEWSPIDER_MODULE = 'scrapy_dangdang.spiders'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
# Configure the item pipelines
ITEM_PIPELINES = {
    # Multiple pipelines can be enabled; priorities range from 1 to 1000, and smaller values run first
    'scrapy_dangdang.pipelines.ScrapyDangdangPipeline': 300,
    'scrapy_dangdang.pipelines.DangDangDownloadPipeline': 301
}
(2)pipelines.py
from urllib import request
from itemadapter import ItemAdapter
import uuid

# Pipelines must be enabled in settings.py before they run
class ScrapyDangdangPipeline:
    # Runs before the spider starts
    def open_spider(self, spider):
        self.f = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider
    def process_item(self, item, spider):
        self.f.write(str(item))
        return item

    # Runs after the spider finishes
    def close_spider(self, spider):
        self.f.close()

# To enable multiple pipelines: define the pipeline class, then register it in settings.py
class DangDangDownloadPipeline:
    def process_item(self, item, spider):
        url = 'https:' + item.get('img')
        filename = './books/' + str(uuid.uuid4()).replace('-', '') + '.jpg'
        request.urlretrieve(url=url, filename=filename)
        return item
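urlretrieve does not create missing directories, so the ./books folder must exist before the first image is written. One way to guarantee that is an open_spider hook on DangDangDownloadPipeline; a minimal sketch (os.makedirs with exist_ok=True is a no-op if the folder already exists):
import os

class DangDangDownloadPipeline:
    def open_spider(self, spider):
        # Create the image output directory if it does not exist yet
        os.makedirs('./books', exist_ok=True)

    # process_item stays the same as above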
(3)items.py
import scrapy
class ScrapyDangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Fields to scrape: image, name, and price
    img = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
(4)dangdang.py
import scrapy
from scrapy_dangdang.items import ScrapyDangdangItem
class DangdangSpider(scrapy.Spider):
    name = 'dangdang'
    # allowed_domains = ['category.dangdang.com']
    base_url = 'http://category.dangdang.com/pg'
    page = 1
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']

    def parse(self, response):
        # The pipelines store the data; the item defines the data structure
        lis = response.xpath('//ul[@id="component_59"]/li')
        for li in lis:
            # Prefer the data-original attribute (lazy-loaded images); fall back to src
            img = li.xpath('.//img/@data-original').extract_first()
            if not img:
                img = li.xpath('.//img/@src').extract_first()
            name = li.xpath('.//img/@alt').extract_first().strip()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            book = ScrapyDangdangItem(img=img, name=name, price=price)
            # Hand each book item over to the pipelines
            yield book
        # Follow the next listing page (stop after page 2)
        if self.page < 2:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=url, callback=self.parse)
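The crawl is started the same way; with the page counter above it stops after the second listing page:
scrapy crawl dangdang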