csv

Fast CSV reading, removing blank lines, custom field order, export encoding, log level, ignoring vs. retrying failed requests, and increasing the maximum thread pool size.

Fast CSV reading, with a progress bar:

# Read the "达观杯" competition CSV data file in chunks, with a progress bar
import pandas as pd
from tqdm import tqdm

def reader_pandas(file, chunkSize=100000, partitions=10 ** 4):
    # Build an iterator over the file instead of loading it all at once
    reader = pd.read_csv(file, iterator=True)
    chunks = []
    with tqdm(range(partitions), 'Reading ...') as t:
        for _ in t:
            try:
                chunk = reader.get_chunk(chunkSize)
                chunks.append(chunk)
            except StopIteration:
                break
    # Stitch the chunks back together into a single DataFrame
    return pd.concat(chunks, ignore_index=True)
print(reader_pandas("./data/train_set.csv"))
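pandas can also drive the chunked read by itself: passing chunksize to read_csv returns an iterable of DataFrames, so there is no need to guess the number of partitions. A minimal alternative sketch (reader_pandas_chunked is not part of the original post; tqdm simply shows progress without a known total):

import pandas as pd
from tqdm import tqdm

def reader_pandas_chunked(file, chunk_size=100000):
    # read_csv with chunksize yields one DataFrame per chunk
    chunks = [chunk for chunk in tqdm(pd.read_csv(file, chunksize=chunk_size),
                                      desc='Reading ...')]
    return pd.concat(chunks, ignore_index=True)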
run.py

if __name__ == '__main__':
    from scrapy import cmdline

    cmdline.execute('scrapy crawl Pakistan_thenews'.split())
    # cmdline.execute('scrapy crawl Pakistan_thenews -o ./csv_file/Pakistan_thenews_p.csv -t csv'.split())

settings.py

# Custom field order for the exported CSV columns
FEED_EXPORT_FIELDS = [
   'country',
   'category',
   'data_url',
   'title',
   'abstract',
   'content',
   'img_url',
   'press_time',
]
# Specify the column delimiter here in settings.py
# (if your Scrapy version does not honor CSV_DELIMITER, see the exporter sketch after this block)
# CSV_DELIMITER = '\t'

# Export encoding (gb18030, a superset of GBK, keeps Chinese text intact)
FEED_EXPORT_ENCODING = "gb18030"

# Log level
# LOG_LEVEL = 'INFO'
# LOG_LEVEL = 'ERROR'
# LOG_FILE = 'mySpider.log'

# HTTP status codes that trigger a retry; to ignore errors without re-requesting, set this to []
# RETRY_HTTP_CODES = []
RETRY_HTTP_CODES = [500, 502, 503, 504, 508, 400, 403, 404, 408, 520]

# Increase the maximum size of the reactor thread pool (Scrapy's default is 10)
REACTOR_THREADPOOL_MAXSIZE = 20
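
To actually change the delimiter, one portable approach is to subclass CsvItemExporter, whose extra keyword arguments are forwarded to csv.writer (visible in the exporter source quoted below), and register the subclass through Scrapy's FEED_EXPORTERS setting. A minimal sketch; the module path myproject.exporters and the class name are placeholders:

# exporters.py inside your project (hypothetical module)
from scrapy.exporters import CsvItemExporter

class TabDelimitedCsvItemExporter(CsvItemExporter):
    def __init__(self, file, **kwargs):
        kwargs.setdefault('delimiter', '\t')  # forwarded to csv.writer
        super().__init__(file, **kwargs)

# then in settings.py:
# FEED_EXPORTERS = {'csv': 'myproject.exporters.TabDelimitedCsvItemExporter'}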

Removing blank lines

# In scrapy.exporters.CsvItemExporter, pass newline='' to io.TextIOWrapper;
# this stops line endings being doubled into blank rows between records (seen on Windows)
import csv
import io

import six
from scrapy.exporters import BaseItemExporter


class CsvItemExporter(BaseItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        self._configure(kwargs, dont_fail=True)
        if not self.encoding:
            self.encoding = 'utf-8'
        self.include_headers_line = include_headers_line
        # Wrap the binary stream; newline='' lets csv.writer control line
        # endings, which removes the spurious blank line after every row
        self.stream = io.TextIOWrapper(
            file,
            newline='',
            line_buffering=False,
            write_through=True,
            encoding=self.encoding
        ) if six.PY3 else file
        self.csv_writer = csv.writer(self.stream, **kwargs)
        self._headers_not_written = True
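
Rather than editing the installed Scrapy package in place, the patched class can live in your own project and be wired in through the same FEED_EXPORTERS mechanism as the delimiter sketch above (myproject.exporters is again a placeholder path):

# settings.py
FEED_EXPORTERS = {
    'csv': 'myproject.exporters.CsvItemExporter',  # the patched exporter above
}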
