https://docs.python.org/3/library/csv.html
这段代码批量给 .csv 文件加上 headers(thanks Dalao for the help)。注意:readlines() 返回的每一行都保留行尾的换行符,如果再用 "\n".join 拼接,行与行之间就会多出空行。
import os
def add_header(path, header):
    """Insert *header* as a new first line of the text file at *path*.

    The existing content is streamed line by line into a temporary sibling
    file (``path + ".tmp"``) which then replaces the original, so the whole
    file is never held in memory and a failure part-way through leaves the
    original untouched.

    Args:
        path: Path of the file to modify.
        header: Header text, without a trailing newline.

    Raises:
        OSError: If the file cannot be read, written or replaced.
        UnicodeError: If the file cannot be decoded with the default encoding.
    """
    tmp_path = path + ".tmp"
    try:
        # newline="" disables newline translation in both directions, so the
        # original rows are copied with their line endings unchanged.
        with open(path, newline="") as src, \
                open(tmp_path, "w", newline="") as dst:
            dst.write(header + "\n")
            # Lines read from the file already end with their own newline;
            # they must be written back as-is, not joined with "\n" again.
            dst.writelines(src)
        os.replace(tmp_path, path)
    finally:
        # Only still present when something failed before the replace.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Directory containing this script; every .csv file in it gets the header.
dir_name = os.path.dirname(os.path.realpath(__file__))
print('dir_name:', dir_name)
columns = ["question, potentially_related_question, answer"]
for f_name in os.listdir(dir_name):
    if not f_name.endswith('.csv'):
        continue
    try:
        # os.listdir returns bare names, so build the full path instead of
        # relying on the current working directory being dir_name.
        add_header(os.path.join(dir_name, f_name), "\n".join(columns))
    except (OSError, UnicodeError):
        # Best effort: report the file that could not be processed and
        # carry on with the remaining ones.
        print(f_name)
来自段老板的建议:
办法好像效率不太高啊…
你可以试试 先把原来的第一行存下来 然后直接用columns把第一行覆盖掉 再把原来的第一行append到文件尾。
不只是速度…原来的方法相当于重写了整个文件…?直接readlines也很吃内存的。
打开读一行关上
打开写一行关上
打开append一行
最简单的
但是注意算一下字符数差值。覆盖要看长度的,你是把原来的第一行覆盖掉
修改之后的代码(非原创,改自 StackOverflow 上的回答):
import os
import codecs
import csv
import sys
# import csv
# Raise the csv module's field size limit as high as the platform accepts.
# csv.field_size_limit raises OverflowError when the value is too large,
# so start from sys.maxsize and shrink by a factor of 10 until a value is
# accepted.
maxInt = sys.maxsize
decrement = True
while decrement:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Too large for this platform: try again with a tenth of the value.
        maxInt = int(maxInt / 10)
    else:
        # The limit was accepted; stop probing.
        decrement = False
def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='tasks_%s.csv', output_path='.', keep_headers=True):
    """Split a CSV stream into numbered files of at most *row_limit* rows.

    Pieces are numbered from 1 and written to
    ``os.path.join(output_path, output_name_template % piece_number)``.
    The first piece is always created, even when the input has no rows.

    Args:
        filehandler: Iterable of text lines (e.g. an open text file) holding
            the CSV data to split.
        delimiter: Field delimiter used for both reading and writing.
        row_limit: Maximum number of data rows per output file (the header
            row is not counted).
        output_name_template: ``%``-format template taking the piece number.
        output_path: Directory in which the pieces are written.
        keep_headers: If true, the first input row is treated as a header
            and repeated at the top of every piece.
    """
    reader = csv.reader(filehandler, delimiter=delimiter)
    # next(..., None) keeps an empty input from raising StopIteration.
    headers = next(reader, None) if keep_headers else None

    def open_piece(piece):
        """Open output file number *piece* and write the header, if any."""
        path = os.path.join(output_path, output_name_template % piece)
        # newline='' lets the csv writer control line endings itself;
        # without it Windows output gets an extra blank line per row.
        out_file = open(path, 'w', newline='')
        writer = csv.writer(out_file, delimiter=delimiter)
        if headers is not None:
            writer.writerow(headers)
        return out_file, writer

    current_piece = 1
    out_file, writer = open_piece(current_piece)
    try:
        for i, row in enumerate(reader):
            # Row index i belongs to the next piece once the rows written so
            # far fill every piece opened up to now.
            if i >= row_limit * current_piece:
                out_file.close()
                current_piece += 1
                out_file, writer = open_piece(current_piece)
            writer.writerow(row)
    finally:
        # Close (and thereby flush) the piece currently being written, even
        # if reading or writing failed.
        out_file.close()
# Split the source file into pieces using the default settings.
# The 'rU' mode was removed in Python 3.11 (open() raises ValueError), and
# the csv module expects its input to be opened with newline=''. The with
# block also closes the input file once splitting is finished.
with open('/your/path/here/ori_tasks.csv', newline='') as source_file:
    split(source_file)