python处理多个csv文件

前面我们已经学习了如何处理单个csv文件。
但在实际的数据处理中,往往需要一次处理多个文件。
下面来看一下多个csv文件的处理,仍然提供csv和pandas两种方式。

文件计数与文件中行数的统计

csv方式
#!/usr/bin/env python3
#coding='utf-8'

import sys
import csv
import glob
import os

def count_rows_and_columns(csv_path):
    """Return ``(row_count, column_count)`` for a single CSV file.

    The header line is included in the row count, and the column count is
    the number of fields in the header. A completely empty file yields
    ``(0, 0)`` instead of failing on a missing header.
    """
    with open(csv_path, 'r', newline='') as fileread:
        filereader = csv.reader(fileread)
        header = next(filereader, None)
        if header is None:
            # Empty file: there is no header row to count or measure.
            return 0, 0
        # Start from 1 because the header row has already been consumed.
        row_counter = 1 + sum(1 for _ in filereader)
    return row_counter, len(header)


def main():
    """Print row/column counts for every ``*.csv`` file in ``sys.argv[1]``."""
    input_path = sys.argv[1]
    count = 0  # number of files processed
    # os.path.join builds the glob pattern; glob expands it to matching paths.
    for input_file in glob.glob(os.path.join(input_path, '*.csv')):
        row_counter, column_counter = count_rows_and_columns(input_file)
        print('{}:\t{} rows\t{} colums'.format(os.path.basename(input_file), row_counter, column_counter))
        count += 1
    print('总共处理了{}文件'.format(count))


if __name__ == '__main__':
    main()
#执行结果:
ceshi1.csv:	13 rows	5 colums
ceshi3.csv:	11 rows	5 colums
ceshi2.csv:	30 rows	4 colums
总共处理了3文件
pandas方式
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
import os
import glob
# Directory to scan, taken from the first command-line argument.
input_path = sys.argv[1]
count = 0  # number of CSV files processed so far

csv_pattern = os.path.join(input_path, '*.csv')
for filename in glob.glob(csv_pattern):
    table = pd.read_csv(filename)
    # shape is (data rows, columns); the header line is used for the column
    # labels and therefore is not part of the row count.
    row_count, columns = table.shape
    print('{}:\t{}row\t{}columns'.format(os.path.basename(filename), row_count, columns))
    count += 1
print('总共处理{}文件'.format(count))
#注意:pandas会把第一行当作表头(列名),不计入行数,并且默认会跳过空行
#执行结果:
ceshi1.csv:	12row	5columns
ceshi3.csv:	10row	5columns
ceshi2.csv:	29row	4columns
总共处理3文件

pandas会把表头行作为列名,不计入数据行数,所以每个文件的行数都比csv方式少1

将多个文件合并

csv方式
#!/usr/bin/env python3
#coding='utf-8'

import sys
import csv
import glob
import os

def merge_csv_files(input_files, output_file):
    """Concatenate CSV files into ``output_file`` with a single header row.

    The header of the first non-empty input is written once; the first row
    of every later input is treated as a repeated header and dropped.

    Args:
        input_files: Iterable of paths to the CSV files to merge, in order.
        output_file: Path of the merged file. It is overwritten, so running
            the merge again does not append a second header or duplicate
            rows. It is created even when nothing is merged.

    Returns:
        The number of input files whose contents were written.
    """
    output_abspath = os.path.abspath(output_file)
    header_written = False
    merged = 0
    # Open the output once instead of reopening it in append mode per input.
    with open(output_file, 'w', newline='') as filewrite:
        filewriter = csv.writer(filewrite)
        for filename in input_files:
            if os.path.abspath(filename) == output_abspath:
                # A previous output that matches the input pattern must not
                # be merged into itself.
                continue
            with open(filename, 'r', newline='') as fileread:
                filereader = csv.reader(fileread)
                header = next(filereader, None)
                if header is None:
                    # Empty input file: nothing to copy.
                    continue
                if not header_written:
                    filewriter.writerow(header)
                    header_written = True
                filewriter.writerows(filereader)
            merged += 1
    return merged


def main():
    """Merge ``data*.txt`` from ``sys.argv[1]`` into the file ``sys.argv[2]``."""
    input_path = sys.argv[1]
    output_file = sys.argv[2]
    dir_file = os.path.join(input_path, 'data*.txt')
    # glob.glob returns a list, so the inputs are fixed before the output
    # file is created or truncated.
    merge_csv_files(glob.glob(dir_file), output_file)


if __name__ == '__main__':
    main()
#执行结果
Customer ID,Customer Name,Invoice Number,Sale Amount,Purchase Date
1234,John Smith,100-0002,"$1,200.00",1/1/14
2345,Mary Harrison,100-0003,"$1,425.00",1/6/14
3456,Lucy Gomez,100-0004,"$1,390.00",1/11/14
4567,Rupert Jones,100-0005,"$1,257.00",1/18/14
5678,Jenny Walters,100-0006,"$1,725.00",1/24/14
8765,Tony Song,100-0015,"$1,167.00",3/8/14
2345,Mary Harrison,100-0016,"$1,789.00",3/17/14
6543,Rachel Paz,100-0017,"$2,042.00",3/22/14
3456,Lucy Gomez,100-0018,"$1,511.00",3/28/14
4321,Susan Wallace,100-0019,"$2,280.00",3/30/14
9876,Daniel Farber,100-0008,"$1,115.00",2/2/14
8765,Laney Stone,100-0009,"$1,367.00",2/8/14
7654,Roger Lipney,100-0010,"$2,135.00",2/15/14
6543,Thomas Haines,100-0011,"$1,346.00",2/17/14
5432,Anushka Vaz,100-0012,"$1,560.00",2/21/14

pandas方式
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
import os
import glob
input_path = sys.argv[1]
# Load every matching file into its own DataFrame.
matching_files = glob.glob(os.path.join(input_path, 'data*.txt'))
datalist = [pd.read_csv(path, index_col=None) for path in matching_files]
# axis=0 stacks the frames vertically; ignore_index=True renumbers the rows.
data = pd.concat(datalist, axis=0, ignore_index=True)
print(data)
    Customer ID  Customer Name  ... Sale Amount Purchase Date
0          1234     John Smith  ...   $1,200.00        1/1/14
1          2345  Mary Harrison  ...   $1,425.00        1/6/14
2          3456     Lucy Gomez  ...   $1,390.00       1/11/14
3          4567   Rupert Jones  ...   $1,257.00       1/18/14
4          5678  Jenny Walters  ...   $1,725.00       1/24/14
5          8765      Tony Song  ...   $1,167.00        3/8/14
6          2345  Mary Harrison  ...   $1,789.00       3/17/14
7          6543     Rachel Paz  ...   $2,042.00       3/22/14
8          3456     Lucy Gomez  ...   $1,511.00       3/28/14
9          4321  Susan Wallace  ...   $2,280.00       3/30/14
10         9876  Daniel Farber  ...   $1,115.00        2/2/14
11         8765    Laney Stone  ...   $1,367.00        2/8/14
12         7654   Roger Lipney  ...   $2,135.00       2/15/14
13         6543  Thomas Haines  ...   $1,346.00       2/17/14
14         5432    Anushka Vaz  ...   $1,560.00       2/21/14

[15 rows x 5 columns]

计算每个文件的销售额平均值与总和

#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
import os
import glob
def _parse_amount(value):
    """Convert a currency string such as '$1,200.00' to a float."""
    return float(str(value).strip('$').replace(',', ''))


input_path = sys.argv[1]
datalist = []  # one single-row summary frame per input file
dirfile = glob.glob(os.path.join(input_path, 'data*.txt'))
for filename in dirfile:
    sales = pd.read_csv(filename, index_col=None)
    amounts = [_parse_amount(value) for value in sales['Sale Amount']]
    cost = pd.DataFrame(amounts)
    # mean() and sum() each return a Series, which become the two columns.
    summary = pd.DataFrame({'mean': cost.mean(), 'sum': cost.sum()}, columns=['mean', 'sum'])
    datalist.append(summary)

# Stack the per-file summaries vertically and renumber the rows.
result = pd.concat(datalist, axis=0, ignore_index=True)
print(result)
     mean     sum
0  1399.4  6997.0
1  1757.8  8789.0
2  1504.6  7523.0

你可能感兴趣的:(python数据分析基础)