Terminal command to save the scraped data as JSON:
scrapy crawl <spider_name> -o <output_file>.json -s FEED_EXPORT_ENCODING=utf-8
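For the mokoko spider shown next, that command could look like this (the output file name is just an example):

scrapy crawl mokoko -o moko.json -s FEED_EXPORT_ENCODING=utf-8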
import scrapy
# items.py lives one package level above the spiders folder,
# so use .. to import it from the parent package
from ..items import MokoItem


class MokokoSpider(scrapy.Spider):
    name = 'mokoko'
    allowed_domains = ['moko.cc']
    # start_urls is usually the first thing to customize
    start_urls = ['http://www.moko.cc/channels/post/153/1.html']

    def parse(self, response):
        # print(response.text)
        ul_list = response.xpath('//ul[@class="post small-post"]')
        print(ul_list)
        for ul in ul_list:
            # initialize an item object
            item = MokoItem()
            # xpath() always returns a list-like SelectorList;
            # each element is a scrapy Selector, which can be
            # iterated over or queried again with xpath()
            title = ul.xpath('.//div[@class="cover"]/@cover-text')
            # print(title)
            # print(type(title))
            # extract() turns the selector into a list of strings; [0] takes the first match
            title = title.extract()[0]
            # print(title)
            # once extracted, the result is a plain list: it can be iterated,
            # but xpath() can no longer be called on it
            # print(type(title))
            clicknum = ul.xpath('.//li[last()]/span/text()').extract()[0]
            imgsrc = ul.xpath('.//img/@src2').extract()[0]
            item['title'] = title
            item['imgsrc'] = imgsrc
            item['clicknum'] = clicknum
            yield item
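For reference, a minimal items.py matching the fields this spider fills could look like the sketch below (an assumption based on the field names above, not the project's actual file):

# items.py -- assumed sketch declaring the fields used by MokokoSpider
import scrapy

class MokoItem(scrapy.Item):
    title = scrapy.Field()     # cover text of the post
    imgsrc = scrapy.Field()    # lazy-loaded image URL taken from @src2
    clicknum = scrapy.Field()  # view count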
# Project: parse the Douyu API's JSON response, extract the image links
# for downloading, and save the scraped data as JSON from the terminal
import scrapy
from ..items import DoudouyuItem
import json


class TupianSpider(scrapy.Spider):
    name = 'tupian'
    allowed_domains = ['api.douyucdn.cn']
    start_urls = ['http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=']

    def parse(self, response):
        jsobj = json.loads(response.text)
        for src in jsobj["data"]:
            item = DoudouyuItem()
            src = src["room_src"]
            # print(src)
            item['src'] = [src]
            yield item
        # print('fetching the first page')
        for x in range(0, 500, 20):
            url = 'http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=' + str(x)
            yield scrapy.Request(url=url, callback=self.parse)
To download the images only, uncomment ITEM_PIPELINES in settings.py and configure it as follows:
ITEM_PIPELINES = {
    # 'doudouyu.pipelines.DoudouyuPipeline': 300,
    # built-in pipeline dedicated to downloading images
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'tutuqian'
IMAGES_URLS_FIELD = 'src'
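Note that IMAGES_URLS_FIELD must name an item field holding a list of image URLs, which is why the spider wraps src in a list. A minimal items.py sketch matching that field (an assumption, not the project's actual file):

# items.py -- assumed sketch; ImagesPipeline reads its URL list from the field named by IMAGES_URLS_FIELD
import scrapy

class DoudouyuItem(scrapy.Item):
    src = scrapy.Field()  # a list containing one image URL per item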
Project: download the images from http://pic.netbian.com/4kmeishi/
# -*- coding: utf-8 -*-
import scrapy
from ..items import ImagenetItem


class ImageSpider(scrapy.Spider):
    name = 'image'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kmeishi/']

    def parse(self, response):
        img_list = response.xpath('//ul[@class="clearfix"]/li/a/img/@src')
        print(img_list)
        for img in img_list:
            item = ImagenetItem()
            src = 'http://pic.netbian.com/' + img.extract()
            # print(src)
            item['src'] = [src]
            yield item
        next_url = response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            print('*****************************')
            url = 'http://pic.netbian.com/' + next_url[0]
            yield scrapy.Request(url=url, callback=self.parse)
            # print(next_url)
ITEM_PIPELINES = {
    # 'imagenet.pipelines.ImagenetPipeline': 300,
    # the key must be the full path of the built-in ImagesPipeline class
    'scrapy.pipelines.images.ImagesPipeline': 1
}
# directory where the downloaded images are stored
IMAGES_STORE = 'imagessss'
IMAGES_URLS_FIELD = 'src'
scrapy crawl <spider_name> -o <spider_name>.csv
scrapy crawl <spider_name> -o <spider_name>.xml
scrapy crawl <spider_name> -o <spider_name>.json -s FEED_EXPORT_ENCODING=utf-8
Project: save everything the thread starter posted in a Tieba thread to a TXT file
# -*- coding: utf-8 -*-
import scrapy
# from ..items import XiaoshuospiderItem


class ZhigengniaoSpider(scrapy.Spider):
    name = 'zhigengniao'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/p/5815118868?pn=']

    def parse(self, response):
        info_list = response.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')
        for info in info_list:
            name_list = info.xpath('.//ul[@class="p_author"]/li/a/text()').extract()
            for name in name_list:
                if name == '乔深沉':
                    content_list = info.xpath('.//div[@class="p_content "]/cc/div/text()')
                    for con in content_list:
                        # item = XiaoshuospiderItem()
                        con = con.extract()
                        # item['con'] = con
                        # yield item
                        with open('xiaoshuo.txt', 'a', encoding='utf-8') as f:
                            f.write(con)
                            f.write('\n')
        next_url = response.xpath('//li[@class="l_pager pager_theme_5 pb_list_pager"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url = 'https://tieba.baidu.com' + next_url[0]
            yield scrapy.Request(url=url, callback=self.parse)
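If you preferred the commented-out item route over writing xiaoshuo.txt directly, the matching items.py would only need the con field. A minimal sketch under that assumption:

# items.py -- assumed sketch for the commented-out XiaoshuospiderItem usage above
import scrapy

class XiaoshuospiderItem(scrapy.Item):
    con = scrapy.Field()  # one paragraph of the thread starter's post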
Project: save both the novel files and their cover images locally
# -*- coding: utf-8 -*-
import scrapy
from ..items import QishuItem


class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['qisuu.la']
    start_urls = ['https://www.qisuu.la/']

    def parse(self, response):
        # print(response.text)
        # collect the links of all novel categories
        type_list = response.xpath('//div[@class="nav"]/a/@href').extract()
        # print(type_list)
        # the first entry is the home page, so drop it
        del type_list[0]
        # print the current url
        # print(response.url)
        for type in type_list:
            # build the url of each category page
            # inside this method response.url is the start_url
            url = response.url + type[1:]
            print(url)
            yield scrapy.Request(url=url, callback=self.get_content_with_type_url)

    # find the novels that belong to each category
    def get_content_with_type_url(self, response):
        # print(response.text)
        # collect the detail-page links of every novel on the first page of the category
        book_list = response.xpath('//div[@class="listBox"]/ul/li/a/@href').extract()
        print('************************')
        # print(book_list)
        for book in book_list:
            # inside this method response.url is https://www.qisuu.la/soft/sort0(x)/
            url = 'https://www.qisuu.la' + book
            yield scrapy.Request(url=url, callback=self.get_detail_with_book_url)

    # scrape the details of each book
    def get_detail_with_book_url(self, response):
        # extract_first: extract the list and return its first element (or the default)
        item = QishuItem()
        # novel title
        name = response.xpath('//div[@class="detail_right"]/h1/text()').extract_first('')
        info_list = response.xpath('//div[@class="detail_right"]/ul/li/text()').extract()
        # cover image url to download
        imageurl = response.xpath('//div[@class="detail_pic"]/img/@src').extract_first('')
        imageurl = 'https://www.qisuu.la' + imageurl
        # download url of the novel file
        downloadurl = response.xpath('//div[@class="showDown"]/ul/li[3]/script').extract_first('').split(',')[1].strip("'")
        print(downloadurl)
        print(imageurl)
        item['imageurl'] = [imageurl]
        item['downloadurl'] = [downloadurl]
        item['name'] = name
        clicknum = info_list[0]
        item['clicknum'] = clicknum
        filesize = info_list[1]
        item['filesize'] = filesize
        booktype = info_list[2]
        item['booktype'] = booktype
        updatetime = info_list[3]
        item['updatetime'] = updatetime
        bookstatus = info_list[4]
        item['bookstatus'] = bookstatus
        bookauthor = info_list[5]
        item['bookauthor'] = bookauthor
        print('//////////////////////////////////////////')
        print(info_list)
        yield item
ITEM_PIPELINES = {
    'qishu.pipelines.QishuPipeline': 300,
    # image download pipeline
    'scrapy.pipelines.images.ImagesPipeline': 1,
    # file (text) download pipeline
    'scrapy.pipelines.files.FilesPipeline': 2
}
IMAGES_STORE = 'file/image'
IMAGES_URLS_FIELD = 'imageurl'
FILES_STORE = 'file/book'
FILES_URLS_FIELD = 'downloadurl'
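A minimal items.py sketch for this project (assumed; the field names are taken from the spider above). The two URL fields hold lists because ImagesPipeline and FilesPipeline both expect a list of URLs:

# items.py -- assumed sketch, declaring the fields filled by get_detail_with_book_url
import scrapy

class QishuItem(scrapy.Item):
    name = scrapy.Field()
    imageurl = scrapy.Field()     # list: consumed by ImagesPipeline
    downloadurl = scrapy.Field()  # list: consumed by FilesPipeline
    clicknum = scrapy.Field()
    filesize = scrapy.Field()
    booktype = scrapy.Field()
    updatetime = scrapy.Field()
    bookstatus = scrapy.Field()
    bookauthor = scrapy.Field()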
Project: save the files accordingly, and make the JSON output well formed
# -*- coding: utf-8 -*-
import scrapy
import re
from ..items import HongxiuxiuItem


class XiuxiuSpider(scrapy.Spider):
    name = 'xiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?gender=2&catId=-1']

    def parse(self, response):
        type_list = response.xpath('//ul[@type="category"]/li/a/@href').extract()
        del type_list[0]
        for type in type_list:
            url = 'https://www.hongxiu.com' + type
            split = re.compile(r'.*?catId=(.*?)&.*?', re.S)
            catId = re.findall(split, url)
            print(catId)
            yield scrapy.Request(url=url, meta={'type': catId[0]}, callback=self.get_content_with_type_url)

    def get_content_with_type_url(self, response):
        catId = response.meta['type']
        for page_num in range(1, 11):
            url = 'https://www.hongxiu.com/all?pageNum=' + str(page_num) + '&pageSize=10&gender=2&catId=' + catId + '&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0'
            print(url)
            yield scrapy.Request(url=url, callback=self.get_book_with_url)

    def get_book_with_url(self, response):
        detail_list = response.xpath('//div[@class="book-info"]/h3/a/@href').extract()
        for book in detail_list:
            url = 'https://www.hongxiu.com' + book
            print('********************************************************')
            print(url)
            yield scrapy.Request(url=url, callback=self.get_detail_with_url)

    def get_detail_with_url(self, response):
        type = response.xpath('//div[@class="crumbs-nav center1020"]/span/a[2]/text()').extract_first('')
        print(type)
        name = response.xpath('//div[@class="book-info"]/h1/em/text()').extract_first('')
        print(name)
        author = response.xpath('//div[@class="book-info"]/h1/a/text()').extract_first('')
        print(author)
        total = response.xpath('//p[@class="total"]/span/text()').extract_first('') + response.xpath('//p[@class="total"]/em/text()').extract_first('')
        print(total)
        love = response.xpath('//p[@class="total"]/span[2]/text()').extract_first('') + response.xpath('//p[@class="total"]/em[2]/text()').extract_first('')
        print(love)
        cilk = response.xpath('//p[@class="total"]/span[3]/text()').extract_first('') + response.xpath('//p[@class="total"]/em[3]/text()').extract_first('')
        print(cilk)
        introduce = response.xpath('//p[@class="intro"]/text()').extract_first('')
        print(introduce)
        url = 'https:' + response.xpath('//div[@class="book-img"]//img/@src').extract_first('')
        url = url.replace('\r', '')
        print(url)
        item = HongxiuxiuItem()
        item['type'] = type
        item['name'] = name
        item['author'] = author
        item['total'] = total
        item['love'] = love
        item['cilk'] = cilk
        item['introduce'] = introduce
        item['url'] = [url]
        yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import os
import json


class HongxiuxiuPipeline(object):
    def __init__(self):
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        # open the JSON document; it is closed in close_spider
        self.file.write('{"book_list": [')

    def process_item(self, item, spider):
        res = dict(item)
        str = json.dumps(res, ensure_ascii=False)
        self.file.write(str)
        self.file.write(',\n')
        return item

    def close_spider(self, spider):
        # remove the trailing '\n' and ',' left after the last record,
        # then close the JSON array and object
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.write(']}')
        self.file.close()
ITEM_PIPELINES = {
    'hongxiuxiu.pipelines.HongxiuxiuPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'TUTUTUTUTU'
IMAGES_URLS_FIELD = 'url'
Project: download the images, using each image section's title as the name of the folder the downloaded images are stored in
# -*- coding: utf-8 -*-
import scrapy
from ..items import SucaiItem


class TubiaoSpider(scrapy.Spider):
    name = 'tubiao'
    allowed_domains = ['sc.chinaz.com']
    start_urls = ['http://sc.chinaz.com/']

    def parse(self, response):
        icon_url = response.xpath('//li[@class="nos"]/a[3]/@href').extract_first('')
        full_url = 'http://sc.chinaz.com' + icon_url
        yield scrapy.Request(url=full_url, callback=self.parse_icon_url)

    def parse_icon_url(self, response):
        a_list = response.xpath('//ul[@class="pngblock imgload"]/li/span/a')
        for a in a_list:
            href = a.xpath('@href').extract_first('')
            title = a.xpath('text()').extract_first('')
            print(title)
            # meta carries data along to the next callback
            yield scrapy.Request(url=href, meta={'title': title}, callback=self.get_detail_with_url)

    def get_detail_with_url(self, response):
        title = response.meta['title']
        img_list = response.xpath('//div[@class="png_sl"]/div/img/@src').extract()
        for img in img_list:
            item = SucaiItem()
            item['title'] = title
            item['img'] = [img]
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
# the built-in pipeline that downloads images
from scrapy.pipelines.images import ImagesPipeline


# the built-in pipeline already knows how to download images;
# by inheriting from it our pipeline gets that ability too
class SucaiPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # print('pipeline method called')
        # this method is called once per item: every item yielded by the
        # spider goes to the engine, the engine hands it to the pipeline,
        # and the pipeline's built-in methods then run one after another
        yield scrapy.Request(url=item['img'][0], meta={'item': item})

    # the pipeline provides a series of built-in hooks that run in order
    def file_path(self, request, response=None, info=None):
        print('/8*/*/*/*965327-**/-*-/-*/*/*/***/*/**/**/*/')
        item = request.meta['item']
        print(item['title'])
        print(item['img'])
        # store each image under <section title>/<file name taken from the url>
        image_name = item['img'][0].split('/')[-1]
        path = '%s/%s' % (item['title'], image_name)
        return path
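ImagesPipeline also exposes an item_completed hook that runs after the downloads finish. If you wanted the stored path written back onto the item, a minimal sketch could look like this (image_path would be a hypothetical extra field, not part of the original SucaiItem):

# assumed sketch extending the idea above: record where each image ended up
from scrapy.pipelines.images import ImagesPipeline

class SucaiPathPipeline(ImagesPipeline):
    # get_media_requests / file_path would be the same as in SucaiPipeline above

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples; info['path'] is relative to IMAGES_STORE
        stored = [img_info['path'] for ok, img_info in results if ok]
        if stored:
            item['image_path'] = stored[0]  # hypothetical extra field on SucaiItem
        return item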
DOWNLOAD_DELAY = 0.3  # uncomment this setting
ITEM_PIPELINES = {
    'sucai.pipelines.SucaiPipeline': 300,
}
IMAGES_STORE = 'imagesssssss'
Download the 4K landscape images
# -*- coding: utf-8 -*-
import scrapy
from ..items import TuwangItem


class BianSpider(scrapy.Spider):
    name = 'bian'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kfengjing/']

    def parse(self, response):
        img_list = response.xpath('//ul[@class="clearfix"]/li//img/@src').extract()
        for img in img_list:
            url = 'http://pic.netbian.com' + img
            print(url)
            item = TuwangItem()
            item['url'] = [url]
            yield item
        next_url = response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url = 'http://pic.netbian.com' + next_url[0]
            yield scrapy.Request(url=url, callback=self.parse)
ITEM_PIPELINES = {
    'tuwang.pipelines.TuwangPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'tutupian'
IMAGES_URLS_FIELD = 'url'
Project: scrape the novels and save them as JSON in the pipeline
# -*- coding: utf-8 -*-
import scrapy
from ..items import XiaoshuoItem


class XiaoxioashuoSpider(scrapy.Spider):
    name = 'xiaoxiaoshuo'
    allowed_domains = ['readnovel.com']
    start_urls = ['https://www.readnovel.com/']

    def parse(self, response):
        # book_list = response.xpath('//div[@class="book-info"]')
        book_list = response.css('.book-info')
        print(book_list)
        for book in book_list:
            # novel title
            name = book.xpath('.//h4/a/@title').extract_first('')
            if len(name) == 0:
                name = book.xpath('.//h3/a/@title').extract_first('')
            des = book.xpath('.//p/text()').extract_first('')
            author = book.xpath('.//div[@class="state-box cf"]/a/text()').extract_first('')
            type = book.xpath('.//div[@class="state-box cf"]/i/text()').extract_first('')
            item = XiaoshuoItem()
            item['name'] = name
            item['des'] = des
            item['author'] = author
            item['type'] = type
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# codecs opens the file with an explicit encoding, which avoids mojibake
import codecs
import json
import os


class XiaoshuoPipeline(object):
    def __init__(self):
        # w: write  r: read
        # w+: read/write, creates the file if it does not exist
        # r+: read/write, raises an error if the file does not exist
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        self.file.write('{"list": [')

    # keep this method whenever data should be written to disk or to a database
    def process_item(self, item, spider):
        # convert the item object into a dict
        res = dict(item)
        # dumps turns the dict into a JSON string; ensure_ascii=False keeps
        # non-ASCII characters readable instead of escaping them.
        # A dict cannot be written to a file directly, so it has to be
        # serialized to a string first.
        str = json.dumps(res, ensure_ascii=False)
        # write the record to the file
        self.file.write(str)
        self.file.write(',\n')
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        # delete the last two characters of the file (the trailing ',\n'):
        # seek(-1, os.SEEK_END) moves one position back from the end of the file,
        # truncate() cuts everything after that position
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.write(']}')
        self.file.close()
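For reference, the finished book.json then has this overall shape (schematic only; field values elided):

{"list": [
{"name": "...", "des": "...", "author": "...", "type": "..."},
{"name": "...", "des": "...", "author": "...", "type": "..."}
]}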
# uncomment ITEM_PIPELINES in settings.py
ITEM_PIPELINES = {
    'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}
# -*- coding: utf-8 -*-
import scrapy
from ..items import HonghongxiuxiuItem


class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/finish?gender=2&catId=-1']

    def parse(self, response):
        li_list = response.xpath('//div[@class="right-book-list"]/ul/li')
        for li in li_list:
            # the src attribute is protocol-relative (//...), so prepend 'https:'
            img = 'https:' + li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            print(img)
            name = li.xpath('.//div[@class="book-info"]/h3/a/text()').extract_first('')
            print(name)
            author = li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            print(author)
            intro = li.xpath('.//p[@class="intro"]/text()').extract_first('')
            print(intro)
            item = HonghongxiuxiuItem()
            item['img'] = img
            item['name'] = name
            item['author'] = author
            item['intro'] = intro
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3


class HonghongxiuxiuPipeline(object):
    def process_item(self, item, spider):
        return item


class HongXiuDBPipeline(object):
    def open_spider(self, spider):
        self.connect = sqlite3.connect('hongxiuDB')
        self.cursor = self.connect.cursor()
        self.cursor.execute('create table if not exists bookTable(name text,author text,img text, intro text)')
        self.connect.commit()

    def process_item(self, item, spider):
        self.cursor.execute('insert into bookTable (name,author,img,intro) VALUES ("{}","{}","{}","{}")'.format(item['name'], item['author'], item['img'], item['intro']))
        self.connect.commit()
        # return the item so the next pipeline (priority 300) still receives it
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
ITEM_PIPELINES = {
    'honghongxiuxiu.pipelines.HonghongxiuxiuPipeline': 300,
    'honghongxiuxiu.pipelines.HongXiuDBPipeline': 1
}
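A design note on HongXiuDBPipeline: formatting values straight into the SQL string breaks as soon as a field contains a double quote, and sqlite3 supports parameter binding for exactly this case. A sketch of the same insert inside process_item (an alternative, not the original code):

# assumed alternative for the insert in HongXiuDBPipeline.process_item, using sqlite3 placeholders
self.cursor.execute(
    'insert into bookTable (name, author, img, intro) values (?, ?, ?, ?)',
    (item['name'], item['author'], item['img'], item['intro'])
)
self.connect.commit()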
# -*- coding: utf-8 -*-
import scrapy
# example list page:
# https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
from ..items import JobItem


class JavaSpider(scrapy.Spider):
    name = 'zhiye'
    allowed_domains = ['search.51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?',
                  'https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,1.html?']

    def parse(self, response):
        # 1. extract every record on the current page and store it in an item
        # 2. find the link to the next page and request it
        div_list = response.xpath('//div[@id="resultList"]/div[@class="el"]')
        for div in div_list:
            # contains(): matches as long as the attribute contains the value
            jobname = div.xpath('.//p[contains(@class,"t1")]/span/a/@title').extract_first('')
            print(jobname)
            # company name
            companyname = div.xpath('.//span[@class="t2"]/a/@title').extract_first('')
            print(companyname)
            cityname = div.xpath('.//span[@class="t3"]/text()').extract_first('')
            print(cityname)
            salary = div.xpath('.//span[@class="t4"]/text()').extract_first('')
            print(salary)
            # normalize the salary text into a min/max monthly salary in units of 10k yuan
            min_salary = 0
            max_salary = 0
            if u'年' in salary:
                money = salary.split('万')[0].split('-')
                min_salary = float(money[0]) / 12
                min_salary = '%.1f' % min_salary
                max_salary = '%.1f' % (float(money[1]) / 12)
            elif u'万' in salary:
                money = salary.split('万')[0].split('-')
                min_salary = money[0]
                max_salary = money[1]
            elif u'千' in salary:
                money = salary.split('千')[0]
                if '-' in money:
                    min_salary = float(money.split('-')[0]) * 0.1
                    max_salary = float(money.split('-')[1]) * 0.1
                else:
                    min_salary = 0
                    max_salary = float(money) * 0.1
            elif u'日' in salary:
                money = salary.split('元')
                min_salary = 0
                max_salary = int(money[0]) * 30 / 10000
            else:
                min_salary = 0
                max_salary = 0
            date = div.xpath('.//span[@class="t5"]/text()').extract_first('')
            item = JobItem()
            item['jobname'] = jobname
            item['companyname'] = companyname
            item['cityname'] = cityname
            item['min_salary'] = min_salary
            item['max_salary'] = max_salary
            item['date'] = date
            yield item
        next_url = response.xpath('//li[@class="bk"]/a[text()="下一页"]/@href').extract()
        print('**********************************************')
        print(next_url)
        if len(next_url) != 0:
            print(next_url[0])
            yield scrapy.Request(url=next_url[0], callback=self.parse)
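To see what the salary normalization produces, here is one worked example (the input format is assumed from the branches above; the value is illustrative):

# illustrative walk-through of the '万/年' branch (assumed input format)
salary = '15-20万/年'
money = salary.split('万')[0].split('-')      # ['15', '20']
min_salary = '%.1f' % (float(money[0]) / 12)  # '1.2'  -> 10k yuan per month
max_salary = '%.1f' % (float(money[1]) / 12)  # '1.7'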
# -*- coding: utf-8 -*-
import scrapy
from ..items import DianyingItem


class TiantangSpider(scrapy.Spider):
    name = 'tiantang'
    allowed_domains = ['ygdy8.net']
    start_urls = ['http://www.ygdy8.net/html/gndy/index.html']

    def parse(self, response):
        detail_list = response.xpath('//div[@class="co_area2"]//tr')
        for detail in detail_list:
            url = 'http://www.ygdy8.net' + detail.xpath('.//td[1]/a[2]/@href').extract_first('')
            print(url)
            yield scrapy.Request(url=url, callback=self.detail_info)

    def detail_info(self, response):
        title = response.xpath('//div[@class="title_all"]//font/text()').extract_first('')
        href = response.xpath('//tr[@style="WORD-WRAP : break-word"]/a/@href').extract_first('')
        print(href)
        item = DianyingItem()
        item['title'] = title
        item['href'] = href
        yield item
import pymysql


class DianyingPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(host='localhost', user='root', password='666666', db='movie', port=3306)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        self.cursor.execute('insert into movieTable(title,href) VALUES ("{}","{}")'.format(item['title'], item['href']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()


ITEM_PIPELINES = {
    'dianying.pipelines.DianyingPipeline': 300,
}
# -*- coding: utf-8 -*-
import scrapy
from ..items import HongxiuItem


class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?catId=30008']

    def parse(self, response):
        li_list = response.xpath('//div[@class="right-book-list"]/ul/li')
        print(li_list)
        for li in li_list:
            img = 'https:' + li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            title = li.xpath('.//div[@class="book-img"]/a/img/@alt').extract_first('')
            author = li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            intro = li.xpath('.//div[@class="book-info"]/p[@class="intro"]/text()').extract_first('')
            item = HongxiuItem()
            item["img"] = [img]
            item["title"] = title
            item["author"] = author
            item["intro"] = intro
            yield item
import scrapy
import codecs
import json
from scrapy.pipelines.images import ImagesPipeline


class HongxiuPipeline(object):
    def __init__(self):
        self.file = codecs.open(filename='hongxiu.json', mode='w+', encoding='utf-8')

    def process_item(self, item, spider):
        res = dict(item)
        str = json.dumps(res, ensure_ascii=False)
        self.file.write(str)
        self.file.write('\n')
        return item


class HongXiuDownloadPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        url = item['img'][0]
        yield scrapy.Request(url=url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        bookname = item['title']
        path = bookname + '.jpg'
        return path


ITEM_PIPELINES = {
    'hongxiu.pipelines.HongxiuPipeline': 300,
    'hongxiu.pipelines.HongXiuDownloadPipeline': 1
}
IMAGES_STORE = 'imgggg'
# -*- coding: utf-8 -*-
import scrapy
import re


class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html',
                  'https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,1.html']

    def parse(self, response):
        total_page = response.xpath('//div[@class="p_in"]/span[1]/text()').extract_first('')
        print(total_page)
        # use a regular expression to pull the digits out of the page-count text
        res = re.compile(r'\d+')
        # findall returns a list of matches; take the first one
        result = re.findall(res, total_page)[0]
        print(result)
        # url of the page that was just requested
        url = response.url
        print(url)
        if 'java' in url:
            for page in range(1, int(result) + 1):
                java_url = 'https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,{}.html'.format(page)
                yield scrapy.Request(url=java_url, callback=self.get_detail_with_page)
        else:
            for page in range(1, int(result) + 1):
                python_url = 'https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,{}.html'.format(page)
                yield scrapy.Request(url=python_url, callback=self.get_detail_with_page)

    def get_detail_with_page(self, response):
        print(response.url)
from scrapy.http.response.html import HtmlResponse
from scrapy import signals


class taobaospidermiddleware(object):
    def process_request(self, request, spider):
        if spider.name == 'shishang':
            spider.driver.get(request.url)
            spider.driver.implicitly_wait(10)
            response = HtmlResponse(url=spider.driver.current_url,
                                    request=request,
                                    body=spider.driver.page_source,
                                    encoding='utf-8')
            return response
# -*- coding: utf-8 -*-
import scrapy
from ..items import TaobaoItem
from selenium import webdriver


class ShishangSpider(scrapy.Spider):
    name = 'shishang'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.taobao.com/search?q=%E6%97%B6%E5%B0%9A%E7%94%B7%E9%9E%8B&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0']

    def __init__(self):
        self.driver = webdriver.PhantomJS()

    def parse(self, response):
        content_list = response.xpath('//div[@class="ctx-box J_MouseEneterLeave J_IconMoreNew"]')
        for x in content_list:
            name = x.xpath('.//div[@class="row row-2 title"]/a').xpath('string(.)').extract()[0].strip('\n').replace(' ', '').strip('\n')
            price = x.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first('')
            dian_name = x.xpath('.//div[@class="shop"]/a/span[2]/text()').extract_first('')
            item = TaobaoItem()
            item['name'] = name
            item['price'] = price
            item['dian_name'] = dian_name
            yield item
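One loose end in this spider: the PhantomJS driver created in __init__ is never shut down. Scrapy spiders have a closed() hook that runs when the spider finishes, so a minimal sketch (an assumed addition to ShishangSpider, not part of the original) would be:

    # assumed addition inside ShishangSpider: quit the headless browser when the spider closes
    def closed(self, reason):
        self.driver.quit()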
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from openpyxl import Workbook


class TaobaoPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # header row: name, price, shop
        self.ws.append(['名称', '价格', '店铺'])

    def process_item(self, item, spider):
        line = [item['name'], item['price'], item['dian_name']]
        self.ws.append(line)
        # save after every item so partial results survive an interruption
        self.wb.save('时尚男鞋.xlsx')
        return item

    def close_spider(self, spider):
        pass
DOWNLOADER_MIDDLEWARES = {
    'taobao.middlewares.taobaospidermiddleware': 543,
}
ITEM_PIPELINES = {
    'taobao.pipelines.TaobaoPipeline': 300,
}