前边我们已经学习了如何处理单个csv文件
我们做数据处理的目的是为了处理多个文件
下面来看一下多个csv文件的处理,还是提供两种方式:csv模块和pandas
#!/usr/bin/env python3
#coding='utf-8'
import sys
import csv
import glob
import os
def count_rows_and_columns(csv_path):
    """Return ``(rows, columns)`` for a single CSV file.

    The header line is included in the row total, and the column total is
    the number of fields in the header. A completely empty file has no
    header, so it is reported as ``(0, 0)`` instead of failing on
    ``len(None)``.
    """
    with open(csv_path, 'r', newline='') as fileread:
        filereader = csv.reader(fileread)
        header = next(filereader, None)
        if header is None:
            # Nothing to read at all: no header and no data rows.
            return 0, 0
        # Start at 1 because the header was already consumed above.
        row_counter = 1 + sum(1 for _ in filereader)
    return row_counter, len(header)


def main():
    """Print row/column totals for every ``*.csv`` file in ``sys.argv[1]``."""
    input_path = sys.argv[1]
    count = 0  # number of files processed
    # glob expands the pattern; os.path.join only prefixes the directory,
    # it does not make the path absolute.
    for input_file in glob.glob(os.path.join(input_path, '*.csv')):
        rows, columns = count_rows_and_columns(input_file)
        print('{}:\t{} rows\t{} colums'.format(
            os.path.basename(input_file), rows, columns))
        count += 1
    print('总共处理了{}文件'.format(count))


if __name__ == '__main__':
    main()
#执行结果:
ceshi1.csv: 13 rows 5 colums
ceshi3.csv: 11 rows 5 colums
ceshi2.csv: 30 rows 4 colums
总共处理了3文件
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
import os
import glob
# Report the data-row and column totals of every *.csv file in the
# directory given as the first command-line argument.
processed = 0  # number of files handled so far
csv_dir = sys.argv[1]
csv_paths = glob.glob(os.path.join(csv_dir, '*.csv'))
for csv_path in csv_paths:
    frame = pd.read_csv(csv_path)
    # shape is (number of data rows, number of columns).
    n_rows, n_cols = frame.shape
    print('{}:\t{}row\t{}columns'.format(os.path.basename(csv_path), n_rows, n_cols))
    processed += 1
print('总共处理{}文件'.format(processed))
#注意了,pandas统计行数时不会把表头和空行算进去
#执行结果:
ceshi1.csv: 12row 5columns
ceshi3.csv: 10row 5columns
ceshi2.csv: 29row 4columns
总共处理3文件
pandas会把第一行当作表头(列名),不计入数据行数,所以每个文件的行数比csv方式少1
#!/usr/bin/env python3
#coding='utf-8'
import sys
import csv
import glob
import os
def concat_csv_files(input_files, output_file):
    """Append the rows of every file in *input_files* to *output_file*.

    The header row is written once, taken from the first non-empty input;
    the header line of every later input is dropped. Empty input files are
    skipped instead of aborting the run with ``StopIteration``.
    *output_file* is opened in append mode, so content it already holds is
    preserved.
    """
    header_written = False
    for filename in input_files:
        with open(filename, 'r', newline='') as fileread:
            filereader = csv.reader(fileread)
            header = next(filereader, None)
            if header is None:
                # Empty input: it contributes neither a header nor rows.
                continue
            with open(output_file, 'a', newline='') as filewrite:
                filewriter = csv.writer(filewrite)
                if not header_written:
                    filewriter.writerow(header)
                    header_written = True
                # The reader is already positioned past the header line.
                filewriter.writerows(filereader)


def main():
    """Merge ``data*.txt`` files from ``sys.argv[1]`` into ``sys.argv[2]``."""
    input_path = sys.argv[1]
    output_file = sys.argv[2]  # destination of the merged rows
    dir_file = os.path.join(input_path, 'data*.txt')
    concat_csv_files(glob.glob(dir_file), output_file)


if __name__ == '__main__':
    main()
#执行结果
Customer ID,Customer Name,Invoice Number,Sale Amount,Purchase Date
1234,John Smith,100-0002,"$1,200.00",1/1/14
2345,Mary Harrison,100-0003,"$1,425.00",1/6/14
3456,Lucy Gomez,100-0004,"$1,390.00",1/11/14
4567,Rupert Jones,100-0005,"$1,257.00",1/18/14
5678,Jenny Walters,100-0006,"$1,725.00",1/24/14
8765,Tony Song,100-0015,"$1,167.00",3/8/14
2345,Mary Harrison,100-0016,"$1,789.00",3/17/14
6543,Rachel Paz,100-0017,"$2,042.00",3/22/14
3456,Lucy Gomez,100-0018,"$1,511.00",3/28/14
4321,Susan Wallace,100-0019,"$2,280.00",3/30/14
9876,Daniel Farber,100-0008,"$1,115.00",2/2/14
8765,Laney Stone,100-0009,"$1,367.00",2/8/14
7654,Roger Lipney,100-0010,"$2,135.00",2/15/14
6543,Thomas Haines,100-0011,"$1,346.00",2/17/14
5432,Anushka Vaz,100-0012,"$1,560.00",2/21/14
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
import os
import glob
# Stack every data*.txt file found in the directory given as the first
# command-line argument into one DataFrame and print it.
source_dir = sys.argv[1]
matching_paths = glob.glob(os.path.join(source_dir, 'data*.txt'))
# One DataFrame per input file, kept in glob order.
frames = [pd.read_csv(path, index_col=None) for path in matching_paths]
# axis=0 stacks the frames vertically; ignore_index=True renumbers the rows
# instead of keeping each file's own row labels.
merged = pd.concat(frames, axis=0, ignore_index=True)
print(merged)
Customer ID Customer Name ... Sale Amount Purchase Date
0 1234 John Smith ... $1,200.00 1/1/14
1 2345 Mary Harrison ... $1,425.00 1/6/14
2 3456 Lucy Gomez ... $1,390.00 1/11/14
3 4567 Rupert Jones ... $1,257.00 1/18/14
4 5678 Jenny Walters ... $1,725.00 1/24/14
5 8765 Tony Song ... $1,167.00 3/8/14
6 2345 Mary Harrison ... $1,789.00 3/17/14
7 6543 Rachel Paz ... $2,042.00 3/22/14
8 3456 Lucy Gomez ... $1,511.00 3/28/14
9 4321 Susan Wallace ... $2,280.00 3/30/14
10 9876 Daniel Farber ... $1,115.00 2/2/14
11 8765 Laney Stone ... $1,367.00 2/8/14
12 7654 Roger Lipney ... $2,135.00 2/15/14
13 6543 Thomas Haines ... $1,346.00 2/17/14
14 5432 Anushka Vaz ... $1,560.00 2/21/14
[15 rows x 5 columns]
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
import os
import glob
# For each data*.txt file in the directory given as the first command-line
# argument, compute the mean and sum of its 'Sale Amount' column, then print
# one summary row per file.
source_dir = sys.argv[1]
per_file_stats = []
for path in glob.glob(os.path.join(source_dir, 'data*.txt')):
    frame = pd.read_csv(path, index_col=None)
    # Strip the '$' sign and the thousands separators so each value
    # parses as a float.
    amounts = pd.DataFrame([
        float(str(raw).strip('$').replace(',', ''))
        for raw in frame.loc[:, 'Sale Amount']
    ])
    stats = pd.DataFrame(
        {'mean': amounts.mean(), 'sum': amounts.sum()},
        columns=['mean', 'sum'],
    )
    per_file_stats.append(stats)
# Stack the per-file rows vertically and renumber them from 0.
summary = pd.concat(per_file_stats, axis=0, ignore_index=True)
print(summary)
mean sum
0 1399.4 6997.0
1 1757.8 8789.0
2 1504.6 7523.0