with open('C:/Users/asus/Desktop/Python/test.csv') as f:
    for line in f:          # the file object is itself an iterator over lines
        print(line)
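If you want each row already split into fields rather than raw text, the standard-library csv module streams the file just as lazily. A minimal sketch, assuming the same path and the ';' separator used in the pandas examples further down:

import csv

with open('C:/Users/asus/Desktop/Python/test.csv', newline='') as f:
    for row in csv.reader(f, delimiter=';'):
        print(row)  # each row comes back as a list of field strings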
def read_in_block(file_path):
    BLOCK_SIZE = 1024
    with open(file_path, 'r') as f:
        while True:
            block = f.read(BLOCK_SIZE)
            if block:
                yield block
            else:
                return  # reached the end of the file, stop the generator

for block in read_in_block('C:/Users/asus/Desktop/Python/test.csv'):
    print(block)
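Since read_in_block() yields one block at a time, any aggregate over the whole file runs in constant memory. For example, a sketch that reuses the generator above to count characters and lines without ever holding the file:

total_chars = 0
total_lines = 0
for block in read_in_block('C:/Users/asus/Desktop/Python/test.csv'):
    total_chars += len(block)          # characters in this block (text mode)
    total_lines += block.count('\n')   # newlines seen so far
print(total_chars, total_lines)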
import pandas as pd

data = pd.read_csv('C:/Users/asus/Desktop/Python/test.csv', chunksize=1000000, header=None, sep=';')
for chunk in data:
    print(chunk)
Playing with method three genuinely got me excited: 57 s! And it gave back something that looked and felt just like a DataFrame. chunksize is one small parameter of pandas' read_csv(); in ordinary use of read_csv() it defaults to None. But when chunksize is set to a number, read_csv() reads the data iteratively instead of all at once, and returns a TextFileReader iterator. Note, a TextFileReader!! The call is data = pd.read_csv('C:/Users/asus/Desktop/Python/test.csv', chunksize=1000000, header=None, sep=';').
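You can confirm this before touching any data; a quick check on the data object created above (the exact module path of TextFileReader varies between pandas versions):

print(type(data))  # e.g. <class 'pandas.io.parsers.readers.TextFileReader'>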
# print(data)
df = pd.DataFrame(columns=[0, 1, 2, 3, 4, 5, 6, 7, 8])  # initialize an empty DataFrame
for chunk in data:
    # print(type(chunk))
    # print(chunk)
    df = pd.concat([df, chunk], ignore_index=True)  # join the chunks into one DataFrame
print(df)
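One caveat: calling pd.concat inside the loop copies the accumulated frame on every pass, which gets slower and slower as the chunks pile up. The usual pattern is to collect the chunks in a list and concatenate once; a sketch under the same setup (data must be a freshly created TextFileReader here, since an exhausted one yields nothing):

chunks = []
for chunk in data:
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)  # single concatenation at the end
print(df)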
for chunk in data:
    # print(type(chunk))
    # print(chunk)
    chunk = chunk.dropna(axis=1)      # drop columns that contain missing values
    chunk = chunk.drop_duplicates()   # drop duplicate rows within the chunk
    df = pd.concat([df, chunk], ignore_index=True)
print(df)
My file contains a lot of junk data, so after cleaning the dataset shrank considerably; I think that is exactly why the program stopped raising memory errors. Since chunksize=1000000 reads the data one million rows at a time, in many segments, duplicates can still straddle chunk boundaries, so after assembling the new DataFrame you still need another round of cleaning, merging and deduplication. The program took 68 s.
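If you want to verify how much the cleaning actually saved, pandas can report the in-memory size of the result; a quick check on the df from the snippet above (deep=True also counts the payload of object/string columns):

print(df.memory_usage(deep=True).sum(), 'bytes')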
reader = pd.read_csv('C:/Users/asus/Desktop/Python/test.csv', iterator=True, sep=';')
loop = True
chunksize = 100000
chunks = []
while loop:
    try:
        chunk = reader.get_chunk(chunksize)   # pull the next 100,000 rows
        chunk = chunk.dropna(axis=1)
        chunk = chunk.drop_duplicates()
        chunks.append(chunk)
    except StopIteration:
        loop = False
        print("Iteration is stopped.")
df = pd.concat(chunks, ignore_index=True)
df = df.dropna(axis=1)        # final cross-chunk cleanup
df = df.drop_duplicates()
print(df)
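The while/try/except dance around get_chunk() is only needed when you want to vary the chunk size between pulls; passing chunksize directly gives an iterator a plain for loop can drain, so the same pipeline can be written more compactly. A sketch, with the same path and separator assumed:

chunks = []
for chunk in pd.read_csv('C:/Users/asus/Desktop/Python/test.csv', chunksize=100000, sep=';'):
    chunks.append(chunk.dropna(axis=1).drop_duplicates())
df = pd.concat(chunks, ignore_index=True).dropna(axis=1).drop_duplicates()
print(df)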