# Create a new Scrapy project. "项目名称" is a placeholder for the project name;
# the example code in these notes imports from a project named `tieba_demo`.
scrapy startproject 项目名称
# Generate a spider skeleton. Pass a bare domain rather than a full URL: the
# argument is used to fill in the spider's `allowed_domains`, which expects
# domain names, not URLs.
scrapy genspider demo_spider tieba.baidu.com
在项目根目录(与 scrapy.cfg 同级)下创建一个 begin.py 文件,写入以下代码:
from scrapy import cmdline

# Start the crawl from Python; equivalent to running `scrapy crawl tieba_spider`
# on the command line. The name after `crawl` must equal the spider's `name`
# attribute -- the spider in these notes declares name = 'tieba_spider'.
cmdline.execute(['scrapy', 'crawl', 'tieba_spider'])
# Scrapy project settings (these assignments belong in the project's settings.py).

# Browser-like User-Agent string sent with the crawler's requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'

# Do not consult the site's robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Default headers attached to requests.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enabled item pipelines, as {dotted path to the pipeline class: order}.
# The path must name a class that really exists in pipelines.py; whenever the
# pipeline class (or the project package) is renamed, update this entry too.
ITEM_PIPELINES = {
    'tieba_demo.pipelines.Save_Image_Pipelines': 300,
}
spider.py
import scrapy
from tieba_demo.items import TiebaDemoItem
class TiebaSpiderSpider(scrapy.Spider):
    """Collect image URLs from the threads of one Baidu Tieba forum.

    The crawl runs in three stages: ``parse`` schedules the forum's list
    pages, ``get_data`` follows every thread linked from a list page, and
    ``get_pic_url`` extracts the image URLs of a thread into a
    ``TiebaDemoItem``.
    """

    name = 'tieba_spider'
    # Bare domain names only. A full URL here never matches a request's host,
    # which previously forced every request to use dont_filter=True to get
    # past the offsite filter.
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/f?ie=utf-8&kw=%E6%9D%A8%E8%B6%85%E8%B6%8A']
    # Template for list-page URLs; ``pn`` is filled in by ``parse``
    # (presumably the offset of the first thread on the page -- list pages are
    # requested in steps of 50).
    url = "https://tieba.baidu.com/f?kw=%E6%9D%A8%E8%B6%85%E8%B6%8A&ie=utf-8&pn={}"

    def parse(self, response):
        """Yield one request per list page (``pn`` = 0, 50, ..., 250)."""
        for page in range(0, 251, 50):
            # Without dont_filter the duplicate filter stays active, so a URL
            # that is scheduled more than once is only fetched once.
            yield scrapy.Request(self.url.format(page), callback=self.get_data)

    def get_data(self, response):
        """Yield one request per thread linked from a list page."""
        thread_hrefs = response.xpath(
            "//div[@class='threadlist_title pull_left j_th_tit ']/a/@href"
        ).extract()
        for href in thread_hrefs:
            # urljoin resolves site-relative hrefs against the page URL and
            # leaves already-absolute links untouched.
            yield scrapy.Request(response.urljoin(href), callback=self.get_pic_url)

    def get_pic_url(self, response):
        """Yield an item holding the image URLs found in one thread page."""
        image_srcs = response.xpath("//img[@class='BDE_Image']/@src").extract()
        # Build a fresh item per thread. Request.meta is meant for plain
        # request metadata; reusing it as the item leaked downloader keys
        # into the output and shared one object across all requests.
        item = TiebaDemoItem()
        item["pic_url"] = [response.urljoin(src) for src in image_srcs]
        yield item  # handed to the item pipelines
items.py
import scrapy
class TiebaDemoItem(scrapy.Item):
    """Item that carries scraped image links from the spider to the pipeline."""

    # Image link(s) passed to the pipeline through the item.
    pic_url = scrapy.Field()
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
# Override the image pipeline's storage hooks so that the image links carried
# by each item are downloaded and stored under a name derived from their URL.
class Save_Image_Pipelines(ImagesPipeline):
    """Image pipeline that downloads every URL listed in ``item["pic_url"]``."""

    def get_media_requests(self, item, info):
        """Yield one download request per image URL carried by *item*.

        An item without a ``pic_url`` value (or with an empty one) simply
        produces no requests instead of raising ``KeyError``.
        """
        for url in item.get("pic_url") or []:
            yield Request(url)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path, relative to ``IMAGES_STORE``, for an image.

        The name is the last segment of the URL *path*, so query strings and
        fragments never end up in the file name and the original extension is
        kept. If the URL path has no final segment, a SHA-1 hex digest of the
        full URL is used instead.

        ``item`` is accepted as a keyword-only argument because current
        Scrapy versions pass it to this hook; it is not used here.
        """
        import hashlib
        import posixpath
        from urllib.parse import urlparse

        filename = posixpath.basename(urlparse(request.url).path)
        if not filename:
            # e.g. a URL ending in "/": fall back to a stable, unique name.
            filename = hashlib.sha1(request.url.encode("utf-8")).hexdigest() + ".jpg"
        return filename
settings.py
# Directory in which the image pipeline stores downloaded images;
# change this value to store them somewhere else.
IMAGES_STORE = "./爬取图片"