在爬虫中获取数据后的各种存储方法

一、文本存储

大体结构是这样的

# Basic pattern for saving text: open the file for writing as UTF-8 and write
# the content. The ``with`` statement closes the file when the block ends, so
# no explicit f.close() is needed.
with open('名字.text', 'w', encoding='utf-8') as f:
    # write() requires the text to store; replace this placeholder with the
    # scraped data.
    f.write('要写入的内容')

实例如下

class NovelSpider(scrapy.Spider):
    """Spider that appends the thread starter's posts of one Tieba thread to ``tieba.txt``.

    Each matching post becomes one line of the output file, with all
    whitespace removed and ASCII commas replaced by spaces.
    """

    name = 'novel'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/p/5815118868?pn=1']

    # Opened once, in append mode, when the class is defined; shared by every
    # parse() call and released in closed().
    f = open('tieba.txt', 'a', encoding='utf-8')

    def parse(self, response):
        """Write the text of every post made by the thread starter to ``self.f``.

        Args:
            response: the downloaded thread page.
        """
        # Each of these divs wraps one post (author information plus content).
        div_list = response.xpath('//div[@class="l_post l_post_bright j_l_post clearfix  "]')
        for div in div_list:
            # Per the original author's note, only posts written by the thread
            # starter contain an element with the louzhubiaoshi_wrap class.
            author = div.xpath('.//div[@class="louzhubiaoshi_wrap"]').extract()
            print("----------------------------------")
            print(author)
            if not author:
                continue

            # string(.) concatenates the text of a node and all of its
            # descendants, so nested tags inside the post are flattened.
            content_list = div.xpath(
                './/div[@class="d_post_content j_d_post_content "]'
            ).xpath('string(.)').extract()
            if not content_list:
                # No content node in this post: nothing to store.
                continue

            # str.split() with no argument splits on any whitespace run, so
            # joining the pieces removes every whitespace character.
            content = ','.join(
                ''.join(text.split()).replace(',', ' ')
                for text in content_list
            )
            self.f.write(content)
            self.f.write('\n')

    def closed(self, reason):
        """Close the output file when the spider finishes.

        Scrapy calls this hook with the reason the spider was closed.
        """
        self.f.close()

如要将数据存储成json格式的话,具体实现如下

import codecs,os,json
class HongxiutianxiangPipeline(object):
    """Item pipeline that stores every item in ``book.json`` as one JSON document.

    The finished file has the shape ``{"book_list":[item, item, ...]}`` and
    can be loaded with ``json.load``, including when no item was scraped.
    """

    def __init__(self):
        # Mode 'w' truncates a previous export; codecs.open encodes the text
        # written to the file as UTF-8.
        self.file = codecs.open(filename='book.json', mode='w', encoding='utf-8')
        self.file.write('{"book_list":[')
        # The separator is written *before* every item except the first one,
        # so nothing has to be removed from the file afterwards.
        self._first_item = True

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and pass it on unchanged.

        Args:
            item: the scraped item; anything ``dict()`` accepts.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can process it.
        """
        if self._first_item:
            self._first_item = False
        else:
            self.file.write(',\n')
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes.
        self.file.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        """Terminate the JSON document and close the file."""
        self.file.write(']}')
        self.file.close()

 下面这种写法主要用于对数据进行转码,防止出现中文乱码,同样是存成json格式(每行一条数据):

# 用来打开指定文件,并且对文件进行转码,防止出现乱码问题
import codecs
import json
class XiaoshuoPipeline(object):
    """Item pipeline that writes each item to ``book.json`` as one JSON object per line."""

    def __init__(self):
        # 'w'  : write only.
        # 'w+' : read and write; the file is created if it does not exist.
        # 'r+' : read and write; raises an exception if the file does not exist.
        # codecs.open encodes everything written to the file as UTF-8.
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')

    def open_spider(self, spider):
        """Announce that the crawl has started."""
        print("爬虫开始了")

    # Keep this method whenever items must be written to a local file or a
    # database.
    def process_item(self, item, spider):
        """Serialize ``item`` as one line of JSON and pass it on unchanged.

        Args:
            item: the scraped item; anything ``dict()`` accepts.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.
        """
        # Convert the item object into a plain dict.
        res = dict(item)
        # A dict cannot be written to the file directly, so serialize it to a
        # string first; ensure_ascii=False keeps non-ASCII text unescaped.
        line = json.dumps(res, ensure_ascii=False)
        self.file.write(line)
        self.file.write('\n')
        return item

    def close_spider(self, spider):
        """Close the output file so buffered data reaches the disk."""
        self.file.close()

二、sqlite3微型数据库存储

在爬虫中一般都是在pipelines.py里面编写存储逻辑,然后在settings.py里面把ITEM_PIPELINES取消注释,并添加

# Enable the pipeline that stores items in SQLite. Each key is the dotted path
# "<project>.pipelines.<ClassName>" and must match the class name exactly
# (it is case-sensitive); the value is the priority.
ITEM_PIPELINES = {
    'Job.pipelines.JobDBPipeline': 300,
}

键的格式是:项目名 + 管道模块名(pipelines)+ 在管道文件里新添加的、要启用的类名(大小写必须和类名完全一致);后面的数字是优先级,数字越小优先级越高

import sqlite3
class JobPipeline(object):
    """Pass-through pipeline that hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not used."""
        return item
class JobDBPipeline(object):
    """Item pipeline that stores scraped job postings in the SQLite file ``JobDB``."""

    def open_spider(self, spider):
        """Open the database and create ``JobTable`` if it does not exist yet."""
        self.connect = sqlite3.connect('JobDB')
        self.cursor = self.connect.cursor()
        self.cursor.execute(
            'create table if not exists JobTable(job_name,job_company,job_place,job_time,min_salary,max_salary)')
        self.connect.commit()

    def process_item(self, item, spider):
        """Insert one job posting and pass the item on unchanged.

        Args:
            item: mapping with the keys ``job_name``, ``job_company``,
                ``job_place``, ``job_time``, ``min_salary`` and ``max_salary``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.

        Raises:
            KeyError: if one of the expected keys is missing from ``item``.
        """
        # "?" placeholders let sqlite3 bind the values itself. Scraped text
        # containing quotes can no longer break the statement or inject SQL.
        # Values are stored with their Python type instead of being forced
        # into string literals.
        self.cursor.execute(
            'insert into JobTable(job_name,job_company,job_place,job_time,min_salary,max_salary) '
            'VALUES (?,?,?,?,?,?)',
            (
                item['job_name'],
                item['job_company'],
                item['job_place'],
                item['job_time'],
                item['min_salary'],
                item['max_salary'],
            ),
        )
        self.connect.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the database connection."""
        self.cursor.close()
        self.connect.close()

三、将数据存储到excel表格,以豆瓣电影存储到excel表格为例

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import xlwt,requests
from lxml import etree
class DBmovie(object):
    """Scrape the Douban Top 250 movie list and save it to an Excel (.xls) workbook."""

    # Seconds to wait for the server before giving up; without a timeout
    # requests.get() can block indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        # Source of random User-Agent strings, one per request.
        self.headers = UserAgent()
        self.workBook = None
        self.sheet = None
        # Next worksheet row to fill; row 0 holds the column titles.
        self.record = 1

    def spider_manage(self):
        """Run the whole job: create the sheet, crawl all pages, save the file."""
        self.excel_build()
        self.get_url_code()
        self.workBook.save('电影表.xls')

    def excel_build(self):
        """Create the workbook and worksheet and write the header row."""
        self.workBook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workBook.add_sheet('电影排行榜')
        titles = ('电影排名', '电影名称', '演员与导演', '电影评分', '电影影评')
        for column, title in enumerate(titles):
            self.sheet.write(0, column, title)

    def get_url_code(self, url=''):
        """Download one list page, write its movies to the sheet, then follow the next page.

        Args:
            url: query-string suffix appended to ``base_url``; empty for the
                first page.
        """
        headers = {
            'User-Agent': self.headers.random
        }
        full_url = self.base_url + url
        response = requests.get(
            full_url, headers=headers, timeout=self.REQUEST_TIMEOUT).text
        code = etree.HTML(response)
        for tag in code.xpath('//div[@class="item"]'):
            # The title is spread over several <span> elements; glue them.
            name = ''.join(tag.xpath('.//div[@class="hd"]/a/span/text()'))
            movie_rank = tag.xpath('div/em[@class=""]/text()')[0]
            movie_author = tag.xpath('.//div[@class="bd"]/p/text()')[0]
            movie_author = movie_author.strip('\n').replace(' ', '')
            movie_grade = tag.xpath('.//span[@class="rating_num"]/text()')[0]
            movie_comment = tag.xpath('.//div[@class="star"]/span[last()]/text()')[0]
            # Drop the last three characters (presumably the fixed text that
            # follows the number of ratings; verify against the page).
            movie_comment = movie_comment[0:-3]

            row = (movie_rank, name, movie_author, movie_grade, movie_comment)
            for column, value in enumerate(row):
                self.sheet.write(self.record, column, value)
            self.record += 1

        self.get_next_page(code)

    def get_next_page(self, code):
        """Continue with the page linked by the "next" element, if there is one.

        Args:
            code: parsed HTML tree of the current page.
        """
        next_url = code.xpath('//span[@class="next"]/a/@href')
        if not next_url:
            print("已经是最后一页了")
            return
        self.get_url_code(next_url[0])

# Script entry point: build the scraper and run the whole job
# (spider_manage creates the sheet, crawls the pages and saves the workbook).
movie = DBmovie()
movie.spider_manage()

 

四、mysql数据库存储

class DianyingtiantangPipeline(object):
    """Item pipeline that stores movie names and links in the MySQL table ``movieTable``.

    Requires ``pymysql`` to be imported by the enclosing module.
    """

    def __init__(self):
        # NOTE(review): the connection parameters, including the password,
        # are hard-coded; consider reading them from the project settings.
        self.connect = pymysql.connect(host='localhost',user='root',password='123456',db='Movie',port=3305)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one movie row and pass the item on unchanged.

        Args:
            item: mapping with the keys ``name`` and ``href``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.

        Raises:
            KeyError: if ``name`` or ``href`` is missing from ``item``.
        """
        # Pass the values separately so the driver escapes them. Scraped text
        # containing quotes can no longer break the statement or inject SQL.
        self.cursor.execute(
            'insert into movieTable (name ,href) VALUES (%s,%s)',
            (item['name'], item['href']),
        )
        self.connect.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the database connection."""
        self.cursor.close()
        self.connect.close()

 

五、在爬虫终端命令形式存成表格形式

# Run the spider and write its output to a CSV file via the -o option.
scrapy crawl 爬虫名 -o 爬虫名.csv

 

六、用终端命令将导出数据的编码方式改成utf-8,防止中文乱码

# -s overrides a setting for this run. The setting is spelled
# FEED_EXPORT_ENCODING; a misspelled name is not recognized, so the export
# encoding would stay at its default.
scrapy crawl 爬虫名 -o 爬虫名.json -s FEED_EXPORT_ENCODING=UTF-8

 

七、将数据中的图片存到本地文件夹

在setting里面添加

IMAGES_STORE = '***'(文件名、默认是在当前目录的父级,在这里你也可选择已存在文件夹的路径)

IMAGES_URLS_FIELD = '***' (填写item中存放图片下载地址的字段名;在爬虫文件中要特别注意,把得到的数据转储到items里面的时候,该字段的值必须是列表,也就是图片地址要带【】)

将文本文件存到本地(例如小说之类)

FILES_STORE = '****' (性质和存图片的一样)

FILES_URLS_FIELD = '***'(同上)

 

你可能感兴趣的:(在爬虫中获取数据后的各种存储方法)