1. Plain-text storage
The general structure looks like this:
with open('name.txt', 'w', encoding='utf-8') as f:
    f.write(data)  # data is the text to be saved
# the with statement closes the file automatically, so an extra f.close() is not needed
A complete example, a Scrapy spider that saves Tieba posts to a text file:
import re

import scrapy


class NovelSpider(scrapy.Spider):
    name = 'novel'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/p/5815118868?pn=1']
    # opened once for the whole crawl; every post is appended to this file
    f = open('tieba.txt', 'a', encoding='utf-8')

    def parse(self, response):
        # Each of these divs wraps one post: the content together with the author info
        div_list = response.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')
        for div in div_list:
            # Only posts written by the thread starter carry the louzhubiaoshi_wrap class
            author = div.xpath('.//div[@class="louzhubiaoshi_wrap"]').extract()
            if len(author) != 0:
                # Ways to get all the text inside a tag:
                # 1. take the outermost tag, walk every child tag and collect its text
                # 2. strip the tags with a regex, e.g. re.compile('<.*?>')
                # 3. text() returns the tag's own text, //text() returns the tag's and its children's text
                # 4. xpath('string(.)') returns all the text already concatenated
                content = div.xpath('.//div[@class="d_post_content j_d_post_content "]') \
                             .xpath('string(.)').extract()[0]
                # Strip whitespace and replace Chinese commas, then write one post per line
                content = re.sub(r'\s', '', content)
                content = re.sub(',', ' ', content)
                self.f.write(content)
                self.f.write('\n')
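Assuming the spider sits inside an ordinary Scrapy project, running it from the project root writes the posts into tieba.txt:

scrapy crawl novel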
To store the data as JSON instead, a pipeline can be implemented as follows:
import codecs, os, json


class HongxiutianxiangPipeline(object):
    def __init__(self):
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        self.file.write('{"book_list": [')

    def process_item(self, item, spider):
        # Convert the item to a dict, serialize it, and append it with a trailing comma
        res = dict(item)
        json_str = json.dumps(res, ensure_ascii=False)
        self.file.write(json_str)
        self.file.write(',\n')
        return item

    def close_spider(self, spider):
        # Remove the ',\n' left after the last item, then close the array and the object
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.write(']}')
        self.file.close()
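Once the spider has finished and close_spider has trimmed the trailing comma, book.json is plain JSON and can be loaded back directly; a minimal check, assuming the file produced above:

import json

with open('book.json', encoding='utf-8') as f:
    data = json.load(f)
print(len(data['book_list']), 'books stored')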
The pipeline below also writes JSON; its main job is to open the file with an explicit encoding so the output does not come out garbled:
# Open the target file with an explicit encoding to avoid garbled characters
import codecs
import json


class XiaoshuoPipeline(object):
    def __init__(self):
        # w  : write only, the file is created if it does not exist
        # w+ : read and write, the file is created if it does not exist
        # r / r+ : read (and write), an exception is raised if the file does not exist
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')

    # Keep this method whenever items are written to a local file or a database
    def process_item(self, item, spider):
        # Convert the item object into a dict
        res = dict(item)
        # Convert the dict into a string; ensure_ascii=False keeps non-ASCII text readable.
        # Writing the dict itself to the file would fail, so it has to be serialized first
        json_str = json.dumps(res, ensure_ascii=False)
        # Write one JSON object per line
        self.file.write(json_str)
        self.file.write('\n')
        return item

    def open_spider(self, spider):
        print("spider started")
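Because this pipeline writes one JSON object per line (JSON Lines) rather than a single array, the file has to be read back line by line; a minimal sketch for the book.json written above:

import json

items = []
with open('book.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            items.append(json.loads(line))
print(len(items), 'items loaded')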
2. Storing data in a SQLite3 database
In a Scrapy project this is normally done in a pipeline. Uncomment ITEM_PIPELINES in settings.py and register the pipeline:
ITEM_PIPELINES = {
    'Job.pipelines.JobDBPipeline': 300,
}
The key is the project name plus the pipelines module plus the class you want to enable (here the class that actually writes to the database); the number is the priority, and the smaller the number, the earlier the pipeline runs.
import sqlite3


class JobPipeline(object):
    def process_item(self, item, spider):
        return item


class JobDBPipeline(object):
    def open_spider(self, spider):
        # Connect to (or create) the database file and make sure the table exists
        self.connect = sqlite3.connect('JobDB')
        self.cursor = self.connect.cursor()
        self.cursor.execute(
            'create table if not exists JobTable'
            '(job_name, job_company, job_place, job_time, min_salary, max_salary)')
        self.connect.commit()

    def process_item(self, item, spider):
        # Parameterized query: safer than formatting the values into the SQL string
        self.cursor.execute(
            'insert into JobTable(job_name, job_company, job_place, job_time, min_salary, max_salary) '
            'VALUES (?, ?, ?, ?, ?, ?)',
            (item['job_name'], item['job_company'], item['job_place'],
             item['job_time'], item['min_salary'], item['max_salary']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
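To check what actually got stored, the table can be queried back with the same sqlite3 module; a quick verification sketch against the JobDB file created above:

import sqlite3

connect = sqlite3.connect('JobDB')
cursor = connect.cursor()
for row in cursor.execute('select job_name, job_company, min_salary, max_salary from JobTable limit 5'):
    print(row)
connect.close()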
3. Storing data in an Excel sheet, using the Douban Top 250 movies as an example
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import xlwt, requests
from lxml import etree


class DBmovie(object):
    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.headers = UserAgent()
        self.workBook = None
        self.sheet = None
        # The next row to write; row 0 holds the column headers
        self.record = 1

    def spider_manage(self):
        self.excel_build()
        self.get_url_code()
        self.workBook.save('电影表.xls')

    def excel_build(self):
        # Create the workbook and write the header row
        self.workBook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workBook.add_sheet('电影排行榜')
        self.sheet.write(0, 0, '电影排名')
        self.sheet.write(0, 1, '电影名称')
        self.sheet.write(0, 2, '演员与导演')
        self.sheet.write(0, 3, '电影评分')
        self.sheet.write(0, 4, '电影影评')

    def get_url_code(self, url=''):
        headers = {
            'User-Agent': self.headers.random
        }
        full_url = self.base_url + url
        response = requests.get(full_url, headers=headers).text
        code = etree.HTML(response)
        item_div = code.xpath('//div[@class="item"]')
        for tag in item_div:
            # The title may be split across several spans, so join them
            movie_name = tag.xpath('.//div[@class="hd"]/a/span/text()')
            name = ''
            for movie in movie_name:
                name += movie
            movie_rank = tag.xpath('div/em[@class=""]/text()')[0]
            movie_author = tag.xpath('.//div[@class="bd"]/p/text()')[0]
            movie_author = movie_author.strip('\n').replace(' ', '')
            movie_grade = tag.xpath('.//span[@class="rating_num"]/text()')[0]
            movie_comment = tag.xpath('.//div[@class="star"]/span[last()]/text()')[0]
            movie_comment = movie_comment[0:-3]
            # Write one movie per row
            self.sheet.write(self.record, 0, movie_rank)
            self.sheet.write(self.record, 1, name)
            self.sheet.write(self.record, 2, movie_author)
            self.sheet.write(self.record, 3, movie_grade)
            self.sheet.write(self.record, 4, movie_comment)
            self.record += 1
        self.get_next_page(code)

    def get_next_page(self, code):
        # Follow the "next page" link until there is none left
        next_url = code.xpath('//span[@class="next"]/a/@href')
        if len(next_url) == 0:
            print("this is already the last page")
            return
        self.get_url_code(next_url[0])


movie = DBmovie()
movie.spider_manage()
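To double-check the result, the saved .xls file can be opened again with xlrd; a small read-back sketch, assuming the 电影表.xls produced above (xlwt and xlrd only handle the old .xls format, not .xlsx):

import xlrd

workBook = xlrd.open_workbook('电影表.xls')
sheet = workBook.sheet_by_index(0)
# print the header row plus the first few movies
for row in range(min(sheet.nrows, 6)):
    print(sheet.row_values(row))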
4. Storing data in a MySQL database
import pymysql


class DianyingtiantangPipeline(object):
    def __init__(self):
        # MySQL usually listens on port 3306; adjust the connection details to your local setup
        self.connect = pymysql.connect(host='localhost', user='root', password='123456',
                                       db='Movie', port=3305)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Parameterized query: pymysql escapes the values itself
        self.cursor.execute('insert into movieTable (name, href) VALUES (%s, %s)',
                            (item['name'], item['href']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
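The pipeline assumes the Movie database and movieTable already exist. A one-off setup sketch with pymysql (only the name and href columns are taken from the insert above; the column types and charset are assumptions):

import pymysql

# same connection details as the pipeline above
connect = pymysql.connect(host='localhost', user='root', password='123456', port=3305)
cursor = connect.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS Movie DEFAULT CHARACTER SET utf8mb4')
cursor.execute('USE Movie')
cursor.execute('CREATE TABLE IF NOT EXISTS movieTable ('
               'id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(255), href VARCHAR(512))')
connect.commit()
cursor.close()
connect.close()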
5. Exporting data as a table (CSV) from the terminal
scrapy crawl <spider name> -o <spider name>.csv
6. Changing the export encoding to UTF-8 from the terminal to avoid garbled Chinese
scrapy crawl <spider name> -o <spider name>.json -s FEED_EXPORT_ENCODING=utf-8
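The same option can also be set once in the project's settings.py instead of being passed on every run:

# settings.py
FEED_EXPORT_ENCODING = 'utf-8'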
7. Saving the images in the scraped data to a local folder
Add the following to settings.py:
IMAGES_STORE = '***' (the folder the downloaded images are saved into; a relative path or the path of an existing folder both work)
IMAGES_URLS_FIELD = '***' (the item field that holds the image download URLs; in the spider, take care to put the URL into the item as a list, i.e. wrapped in [])
To save plain files (novels and the like) to disk, the matching settings are:
FILES_STORE = '****' (works the same way as the image store)
FILES_URLS_FIELD = '***' (same as above)
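Putting the image settings together, here is a minimal sketch of how the pieces line up (the img_url field, the ImgItem class and the 'images' folder are made-up examples; the built-in ImagesPipeline also has to be enabled in ITEM_PIPELINES and needs Pillow installed):

# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'images'        # downloaded images end up under this folder
IMAGES_URLS_FIELD = 'img_url'  # the item field that holds the list of image URLs

# items.py
import scrapy

class ImgItem(scrapy.Item):
    img_url = scrapy.Field()

# in the spider: hand the URL over as a list, as noted above
item = ImgItem()
item['img_url'] = [response.urljoin(img_src)]  # img_src is the extracted image URL
yield item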