1.数据准备
csv文件的一个主要优点是有很多程序可以存储,转换和处理纯文本文件。
csv 只保存数据,不保存公式。
C:\Users\4201.HJSC\Desktop\Python_exercise
vi supplier_data.csv
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,2341,$500 ,1/20/2014
Supplier x,001-1001,2341,$501 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014
2.基础python,不使用csv模块。
sys.argv[0]:脚本名称。
sys.argv[1]:脚本第一个输入参数。
sys.argv[2]:脚本第二个输入参数。
#map函数将str函数应用于header_list中每个元素,确保每个元素都是字符串
#join函数在header_list中每个元素中间插入逗号,将列表转换成字符串。
#split函数将字符串用逗号分割变成列表。
#仅在Python 2中:output_file,'w',newline=''替换为:output_file,'wb'
#仅在Python 2中:input_file,'r',newline=''替换为:input_file,'rb'
vi lcsv_read_with_simple_parsing_and_write.py
#!/usr/bin/env python3
import sys
def copy_csv_with_simple_parsing(input_file, output_file):
    """Copy a CSV file row by row using plain string parsing (no csv module).

    Each line is stripped, split on commas into a list, printed, and then
    re-joined with commas and written to the output file.  Because the split
    is a naive one, fields that contain embedded commas are NOT handled
    correctly; use the csv module for such data.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
    """
    # Python 3 needs text mode here: in 'rb'/'wb' mode the lines are bytes,
    # so split(',') and write(str) would both raise TypeError.  newline=''
    # disables newline translation so exactly '\n' is written.
    with open(input_file, 'r', newline='') as filereader, \
            open(output_file, 'w', newline='') as filewriter:
        header = filereader.readline()
        if not header:
            # Empty input file: nothing to copy.
            return
        # split() turns the comma-separated string into a list.
        header_list = header.strip().split(',')
        print(header_list)
        # map(str, ...) makes sure every element is a string; join() inserts
        # a comma between the elements and turns the list back into a string.
        filewriter.write(','.join(map(str, header_list)) + '\n')
        for row in filereader:
            row_list = row.strip().split(',')
            print(row_list)
            filewriter.write(','.join(map(str, row_list)) + '\n')


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit(f'Usage: {sys.argv[0]} input_file output_file')
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    copy_csv_with_simple_parsing(input_file, output_file)
#结果
[root@mysql51 python_scripts]# python lcsv_read_with_simple_parsing_and_write.py supplier_data.csv output.csv
['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
['Supplier x', '001-1001', '2341', '$500 ', '1/20/2014']
['Supplier x', '001-1001', '2341', '$501 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier z', '920-4803', '3321', '$615 ', '2/3/2014']
['Supplier z', '920-4804', '3321', '$615 ', '2/10/2014']
['Supplier z', '920-4805', '3321', '$615 ', '2/17/2014']
['Supplier z', '920-4806', '3321', '$615 ', '2/24/2014']
[root@mysql51 python_scripts]# more output.csv
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,2341,$500 ,1/20/2014
Supplier x,001-1001,2341,$501 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014
3.pandas
vi pandas_parsing_and_write.py
#!/usr/bin/env python3
import sys
import pandas as pd
def copy_csv_with_pandas(input_file, output_file):
    """Read a CSV file into a pandas DataFrame, print it, and write it back out.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
    """
    data_frame = pd.read_csv(input_file)  # load the CSV file into a DataFrame
    print(data_frame)
    # index=False keeps the DataFrame's row index out of the output file.
    data_frame.to_csv(output_file, index=False)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit(f'Usage: {sys.argv[0]} input_file output_file')
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    copy_csv_with_pandas(input_file, output_file)
#同列表,字典,元组相似,数据框也是存储数据的一种格式。
#数据框中保留了表格这种数据组织方式,不需要使用列表套列表的方式来分析数据。
python C:\Users\4201.HJSC\PycharmProjects\pythonProject\pandas_parsing_and_write.py C:\Users\4201.HJSC\Desktop\Python_exercise\supplier_data.csv C:\Users\4201.HJSC\Desktop\Python_exercise\output_pandas.csv
报错:
No module named pandas #提示没有pandas;
pip install pandas #下载Pandas ;
python.exe -m pip install --upgrade pip #升级pip(不是升级pandas);
重新执行。
C:\Users\4201.HJSC>python C:\Users\4201.HJSC\PycharmProjects\pythonProject\pandas_parsing_and_write.py C:\Users\4201.HJSC\Desktop\Python_exercise\supplier_data.csv C:\Users\4201.HJSC\Desktop\Python_exercise\output_pandas.csv
Supplier Name Invoice Number Part Number Cost Purchase Date
0 Supplier x 001-1001 2341 $500 1/20/2014
1 Supplier x 001-1001 2341 $501 1/20/2014
2 Supplier x 001-1001 5467 $750 1/20/2014
3 Supplier x 001-1001 5467 $750 1/20/2014
4 Supplier y 50-9501 7009 $250 1/30/2014
5 Supplier y 50-9501 7009 $250 1/30/2014
6 Supplier y 50-9505 6650 $125 2/3/2014
7 Supplier y 50-9505 6650 $125 2/3/2014
8 Supplier z 920-4803 3321 $615 2/3/2014
9 Supplier z 920-4804 3321 $615 2/10/2014
10 Supplier z 920-4805 3321 $615 2/17/2014
11 Supplier z 920-4806 3321 $615 2/24/2014
#此时已经生成文件: output_pandas.csv
#如果金额字段中包含逗号,则需要使用正则表达式搜索带有逗号的值,然后删除逗号再处理。
4.基础python,使用csv;
#csv模块就是被设计用于处理数值中嵌入逗号这类情况的。
vi 2csv_reader_parsing_and_write.py
#encoding=utf-8
#!/usr/bin/env python3
import csv
import sys
def copy_csv_with_csv_module(input_file, output_file):
    """Copy a CSV file row by row using csv.reader and csv.writer.

    Every row is printed as a list and then written unchanged to the output
    file.  Unlike a naive split(','), the csv module parses quoted fields,
    so values with embedded commas stay in a single field.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
    """
    # On Python 3 the csv module needs text-mode files opened with
    # newline=''; binary mode ('rb'/'wb') makes csv.reader raise
    # "iterator should return strings, not bytes".
    with open(input_file, 'r', newline='') as csv_in_file, \
            open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file, delimiter=',')
        filewriter = csv.writer(csv_out_file, delimiter=',')
        for row_list in filereader:
            print(row_list)
            # writerow() writes the list of values of one row to the output.
            filewriter.writerow(row_list)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit(f'Usage: {sys.argv[0]} input_file output_file')
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    copy_csv_with_csv_module(input_file, output_file)
python 2csv_reader_parsing_and_write.py supplier_data.csv 2output_csv.csv
[root@mysql51 python_scripts]# python 2csv_reader_parsing_and_write.py supplier_data.csv 2output_csv.csv
['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
['Supplier x', '001-1001', '2341', '$500 ', '1/20/2014']
['Supplier x', '001-1001', '2341', '$501 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier x', '001-1001', '5467', '$750 ', '1/20/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9501', '7009', '$250 ', '1/30/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier y', '50-9505', '6650', '$125 ', '2/3/2014']
['Supplier z', '920-4803', '3321', '$615 ', '2/3/2014']
['Supplier z', '920-4804', '3321', '$615 ', '2/10/2014']
['Supplier z', '920-4805', '3321', '$615 ', '2/17/2014']
['Supplier z', '920-4806', '3321', '$615 ', '2/24/2014']
[root@mysql51 python_scripts]#
[root@mysql51 python_scripts]# more 2output_csv.csv
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,2341,$500 ,1/20/2014
Supplier x,001-1001,2341,$501 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9501,7009,$250 ,1/30/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier y,50-9505,6650,$125 ,2/3/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014
5.基础python -csv
vi 3csv_reader_value_meets_condition.py
#!/usr/bin/env python3
import csv
import sys
def filter_rows_by_supplier_or_cost(input_file, output_file):
    """Write the header plus every row whose supplier is "Supplier Z" or whose cost exceeds 600.

    The supplier name is read from the first column and compared
    case-insensitively.  The cost is read from the fourth column; surrounding
    whitespace, the dollar sign and thousands separators are removed before
    it is converted to float.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.

    Raises:
        ValueError: If a cost value that has to be checked is not numeric.
        IndexError: If a data row has fewer than four columns.
    """
    # Text mode with newline='' is what the csv module requires on Python 3.
    with open(input_file, 'r', newline='') as csv_in_file, \
            open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        header = next(filereader, None)
        if header is None:
            # Empty input file: there is no header and nothing to filter.
            return
        filewriter.writerow(header)
        for row_list in filereader:
            supplier = str(row_list[0]).strip()  # value of the first column
            # Fourth column: drop whitespace, the '$' sign and commas.
            cost = str(row_list[3]).strip().strip('$').replace(',', '')
            # The sample data spells the name 'Supplier z', so an exact
            # comparison with 'Supplier Z' would never match.
            if supplier.lower() == 'supplier z' or float(cost) > 600:
                filewriter.writerow(row_list)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit(f'Usage: {sys.argv[0]} input_file output_file')
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    filter_rows_by_supplier_or_cost(input_file, output_file)
#结果
[root@mysql51 python_scripts]# python 3csv_reader_value_meets_condition.py supplier_data.csv 3output_csv.csv
[root@mysql51 python_scripts]#
[root@mysql51 python_scripts]#
[root@mysql51 python_scripts]# cat 3output_csv.csv
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier x,001-1001,5467,$750 ,1/20/2014
Supplier z,920-4803,3321,$615 ,2/3/2014
Supplier z,920-4804,3321,$615 ,2/10/2014
Supplier z,920-4805,3321,$615 ,2/17/2014
Supplier z,920-4806,3321,$615 ,2/24/2014
6.pandas的loc函数
vi pandas_value_meets_condition.py
#!/usr/bin/env python3
import pandas as pd
import sys
def filter_rows_with_pandas(input_file, output_file):
    """Keep rows whose supplier name contains "Z" or whose cost exceeds 600.

    The 'Cost' column is converted to float (whitespace, '$' and thousands
    separators are removed first), so the output file contains numeric costs.
    The supplier match ignores case.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
    """
    data_frame = pd.read_csv(input_file)
    # Clean the cost text before the float conversion; a value such as
    # '$1,750 ' would otherwise make astype(float) fail.
    data_frame['Cost'] = (
        data_frame['Cost']
        .astype(str)
        .str.strip()
        .str.strip('$')
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    # case=False because the sample data spells the name 'Supplier z';
    # na=False keeps the mask boolean when a supplier name is missing.
    is_supplier_z = data_frame['Supplier Name'].str.contains('Z', case=False, na=False)
    is_expensive = data_frame['Cost'] > 600
    data_frame_value_meets_condition = data_frame.loc[is_supplier_z | is_expensive, :]
    data_frame_value_meets_condition.to_csv(output_file, index=False)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit(f'Usage: {sys.argv[0]} input_file output_file')
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    filter_rows_with_pandas(input_file, output_file)
python C:\Users\4201.HJSC\PycharmProjects\pythonProject\pandas_value_meets_condition.py C:\Users\4201.HJSC\Desktop\Python_exercise\supplier_data.csv C:\Users\4201.HJSC\Desktop\Python_exercise\4output_csv.csv
由于linux上没有pandas,numpy等包,不能运行,改在windows上运行。
运行后生成:4output_csv.csv
7.总结
操作csv文件:csv模块,pandas模块。