10.Python-csv文件的处理及pandas的使用

1.数据准备

csv文件的一个主要优点是有很多程序可以存储,转换和处理纯文本文件。
csv 只保存数据,不保存公式。

C:\Users\4201.HJSC\Desktop\Python_exercise
vi supplier_data.csv 
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,2341,$500 ,1/20/2014
Supplier x,001-1001,2341,$501 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014

2.基础python,不使用csv模块。

sys.argv[0]:脚本名称。
sys.argv[1]:脚本第一个输入参数。
sys.argv[2]:脚本第二个输入参数。
#map函数将str函数应用于header_list中每个元素,确保每个元素都是字符串
#join函数在header_list中每个元素中间插入逗号,将列表转换成字符串。
#split函数将字符串用逗号分割变成列表。
#Python 2 的open没有newline参数:需将 open(output_file,'w',newline='') 替换为 open(output_file,'wb')
#Python 2 的open没有newline参数:需将 open(input_file,'r',newline='') 替换为 open(input_file,'rb')

vi lcsv_read_with_simple_parsing_and_write.py 
#!/usr/bin/env python3
"""Copy a CSV file line by line using only basic string operations.

Usage:
    lcsv_read_with_simple_parsing_and_write.py INPUT_CSV OUTPUT_CSV

Every line of INPUT_CSV is split on commas, the resulting list is printed,
and the values are re-joined with commas and written to OUTPUT_CSV.
"""
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Text mode is required on Python 3: 'rb' yields bytes, and bytes cannot be
# split on the str ',' or written back together with the str '\n'.
# newline='' disables newline translation, so strip() removes the original
# line ending and exactly the '\n' appended below is written.
with open(input_file, 'r', newline='') as filereader:
    with open(output_file, 'w', newline='') as filewriter:
        header = filereader.readline()
        header = header.strip()
        # split() turns the comma-separated string into a list.
        header_list = header.split(',')
        print(header_list)
        # map() applies str to every element so join() only sees strings;
        # join() puts a comma between the elements to rebuild one line.
        filewriter.write(','.join(map(str, header_list)) + '\n')
        for row in filereader:
            row = row.strip()
            row_list = row.split(',')
            print(row_list)
            filewriter.write(','.join(map(str, row_list)) + '\n')
			
#结果
[root@mysql51 python_scripts]# python lcsv_read_with_simple_parsing_and_write.py supplier_data.csv  output.csv
['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
['Supplier x', '001-1001', '2341', '$500 ', '1/20/2014']
['Supplier x', '001-1001', '2341', '$501 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier z', '920-4803', '3321', '$615 ', '2/3/2014']
['Supplier z', '920-4804', '3321', '$615 ', '2/10/2014']
['Supplier z', '920-4805', '3321', '$615 ', '2/17/2014']
['Supplier z', '920-4806', '3321', '$615 ', '2/24/2014']

[root@mysql51 python_scripts]# more output.csv 
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,2341,$500 ,1/20/2014
Supplier x,001-1001,2341,$501 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014

3.pandas 

vi pandas_parsing_and_write.py 
#!/usr/bin/env python3
"""Read a CSV file with pandas, print it, and write it to a new CSV file.

Usage:
    pandas_parsing_and_write.py INPUT_CSV OUTPUT_CSV
"""
import sys

import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

# Load the whole CSV file into a DataFrame.
frame = pd.read_csv(input_file)
print(frame)

# index=False keeps the DataFrame's row index out of the output file.
frame.to_csv(path_or_buf=output_file, index=False)

#同列表,字典,元组相似,数据框也是存储数据的一种格式。
#数据框中保留了表格这种数据组织方式,不需要使用列表套列表的方式来分析数据。

python C:\Users\4201.HJSC\PycharmProjects\pythonProject\pandas_parsing_and_write.py C:\Users\4201.HJSC\Desktop\Python_exercise\supplier_data.csv C:\Users\4201.HJSC\Desktop\Python_exercise\output_pandas.csv
报错:
No module named pandas  #提示没有pandas;
pip install pandas      #下载Pandas ;
python.exe -m pip install --upgrade pip  #升级pip; 

重新执行。
C:\Users\4201.HJSC>python C:\Users\4201.HJSC\PycharmProjects\pythonProject\pandas_parsing_and_write.py C:\Users\4201.HJSC\Desktop\Python_exercise\supplier_data.csv C:\Users\4201.HJSC\Desktop\Python_exercise\output_pandas.csv
   Supplier Name Invoice Number  Part Number   Cost Purchase Date
0     Supplier x       001-1001         2341  $500      1/20/2014
1     Supplier x       001-1001         2341  $501      1/20/2014
2     Supplier x       001-1001         5467  $750      1/20/2014
3     Supplier x       001-1001         5467  $750      1/20/2014
4     Supplier y        50-9501         7009  $250      1/30/2014
5     Supplier y        50-9501         7009  $250      1/30/2014
6     Supplier y        50-9505         6650  $125       2/3/2014
7     Supplier y        50-9505         6650  $125       2/3/2014
8     Supplier z       920-4803         3321  $615       2/3/2014
9     Supplier z       920-4804         3321  $615      2/10/2014
10    Supplier z       920-4805         3321  $615      2/17/2014
11    Supplier z       920-4806         3321  $615      2/24/2014

#此时已经生成文件: output_pandas.csv
#如果金额字段中包含逗号,则需要使用正则表达式搜索带有逗号的值,然后删除逗号再处理。

4.基础python,使用csv;

#csv模块就是被设计用于处理数值中嵌入逗号等情况的。
vi 2csv_reader_parsing_and_write.py 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Copy a CSV file row by row using the standard-library csv module.

Usage:
    2csv_reader_parsing_and_write.py INPUT_CSV OUTPUT_CSV

Each parsed row is printed as a list and written unchanged to OUTPUT_CSV.
"""
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# On Python 3 the csv module reads and writes str, so both files are opened
# in text mode. newline='' leaves line-ending handling to the csv module.
with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file, delimiter=',')
        filewriter = csv.writer(csv_out_file, delimiter=',')
        for row_list in filereader:
            print(row_list)
            # writerow() writes the values of one list as one output line.
            filewriter.writerow(row_list)
			
			
#writerow函数将每行中的列表值写入输出文件。


python  2csv_reader_parsing_and_write.py  supplier_data.csv 2output_csv.csv

[root@mysql51 python_scripts]# python  2csv_reader_parsing_and_write.py  supplier_data.csv 2output_csv.csv
['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
['Supplier x', '001-1001', '2341', '$500 ', '1/20/2014']
['Supplier x', '001-1001', '2341', '$501 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier z', '920-4803', '3321', '$615 ', '2/3/2014']
['Supplier z', '920-4804', '3321', '$615 ', '2/10/2014']
['Supplier z', '920-4805', '3321', '$615 ', '2/17/2014']
['Supplier z', '920-4806', '3321', '$615 ', '2/24/2014']
[root@mysql51 python_scripts]# 
[root@mysql51 python_scripts]# more 2output_csv.csv
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,2341,$500 ,1/20/2014
Supplier x,001-1001,2341,$501 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014

5.基础python -csv

vi 3csv_reader_value_meets_condition.py 
#!/usr/bin/env python3
"""Keep only the CSV rows for supplier z or with a cost above 600.

Usage:
    3csv_reader_value_meets_condition.py INPUT_CSV OUTPUT_CSV

The header row is always copied. A data row is copied when its first column
is "Supplier z" (compared case-insensitively) or its fourth column, read as
a number, is greater than 600.
"""
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# On Python 3 the csv module reads and writes str, so both files are opened
# in text mode. newline='' leaves line-ending handling to the csv module.
with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        header = next(filereader)
        filewriter.writerow(header)
        for row_list in filereader:
            # First column: supplier name.
            supplier = str(row_list[0]).strip()
            # Fourth column: remove the '$' sign and any thousands commas
            # so the value can be converted with float().
            cost = str(row_list[3]).strip('$').replace(',', '')
            # Compare case-insensitively: the sample data spells the name
            # 'Supplier z', which an exact match on 'Supplier Z' never hits.
            if supplier.lower() == 'supplier z' or float(cost) > 600:
                filewriter.writerow(row_list)


#结果
[root@mysql51 python_scripts]# python 3csv_reader_value_meets_condition.py supplier_data.csv 3output_csv.csv
[root@mysql51 python_scripts]# 
[root@mysql51 python_scripts]# 
[root@mysql51 python_scripts]# cat 3output_csv.csv
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014

6.pandas的loc函数

vi pandas_value_meets_condition.py 
#!/usr/bin/env python3
"""Filter a CSV file with pandas: supplier name contains "z" or Cost > 600.

Usage:
    pandas_value_meets_condition.py INPUT_CSV OUTPUT_CSV

The Cost column is converted to float before filtering, so the rows written
to OUTPUT_CSV carry the numeric cost without the '$' sign.
"""
import sys

import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# Remove the '$' sign so Cost can be compared as a number.
data_frame['Cost'] = data_frame['Cost'].str.strip('$').astype(float)

# case=False: the sample data spells the name 'Supplier z', which a
# case-sensitive search for 'Z' never matches.
is_supplier_z = data_frame['Supplier Name'].str.contains('z', case=False)
is_expensive = data_frame['Cost'] > 600

# loc[row_mask, :] selects the matching rows and keeps every column.
data_frame_value_meets_condition = data_frame.loc[is_supplier_z | is_expensive, :]
data_frame_value_meets_condition.to_csv(output_file, index=False)

python C:\Users\4201.HJSC\PycharmProjects\pythonProject\pandas_value_meets_condition.py \
C:\Users\4201.HJSC\Desktop\Python_exercise\supplier_data.csv \
C:\Users\4201.HJSC\Desktop\Python_exercise\4output_csv.csv
由于linux上没有pandas,numpy等包,不能运行,在windows上运行。
运行后生成:4output_csv.csv

7.总结

操作csv文件:csv模块,pandas模块。

你可能感兴趣的:(Python开发及自动化,pandas,python,数据分析)