CSV(comma-separated value,逗号分隔值)文件格式是一种非常简单的数据存储与分享
方式。 CSV 文件将数据表格存储为纯文本,表格(或电子表格)中的每个单元格都是一个
数值或字符串。
本文主要介绍如何通过 csv 库和 pandas 库对 CSV 文件进行处理操作。
#使用的测试数据
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier X,001-1001,2341,$500.00,1/20/14
Supplier X,001-1001,2341,$500.00,1/20/14
Supplier X,001-1001,5467,$750.00,1/20/14
Supplier X,001-1001,5467,$750.00,1/20/14
Supplier Y,50-9501,7009,$250.00,1/30/14
Supplier Y,50-9501,7009,$250.00,1/30/14
Supplier Y,50-9505,6650,$125.00,2/3/14
Supplier Y,50-9505,6650,$125.00,2/3/14
Supplier Z,920-4803,3321,$615.00,2/3/14
Supplier Z,920-4804,3321,$615.00,2/10/14
Supplier Z,920-4805,3321,$615.00,2/17/14
Supplier Z,920-4806,3321,$615.00,2/24/14
input_file = sys.argv[1] # first argument given after the script name on the command line
output_file = sys.argv[2]# second argument given after the script name on the command line
#执行的时候
python3 test.py ceshi.csv out.txt
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
# Source and destination paths come from the command line.
input_file = sys.argv[1]
output_file = sys.argv[2]

# read_csv() loads the whole file into a DataFrame object.
data_frame = pd.read_csv(input_file)

# to_csv() writes the DataFrame to output_file; index=False leaves the
# DataFrame's row index out of the written file.
data_frame.to_csv(output_file, index=False)
#!/usr/bin/env python
#coding='utf-8'
import sys
import csv
# Source and destination paths come from the command line.
input_file = sys.argv[1]
output_file = sys.argv[2]

# Open the input read-only and the output write-only. newline='' is the
# mode the csv module documentation recommends for files it reads/writes.
with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)

    # The first row is the header: copy it across before the data rows.
    writer.writerow(next(reader))

    # Echo every remaining row and write it to the output unchanged.
    for record in reader:
        print(record)
        writer.writerow(record)
上边的读写是最简单的操作,而我们使用数据时往往都要通过筛选,来获取我们认为有用的数据,下面我们来进行条件筛选
我们来筛选 Supplier Name 中包含 Z 的行,或者 Cost 大于 600 的行(两个条件满足其一即可)
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# Cost values carry a leading "$": strip it and convert to float so the
# column can be compared numerically.
cost_over_600 = data_frame['Cost'].str.strip('$').astype(float) > 600.0

# Boolean mask: True where the supplier name contains "Z".
name_has_z = data_frame['Supplier Name'].str.contains('Z')

# Keep every row for which at least one of the two conditions holds.
data_result = data_frame.loc[name_has_z | cost_over_600, :]
print(data_result)

# Write the matching rows to the output file.
data_result.to_csv(output_file, index=False)
#!/usr/bin/env python
#coding='utf-8'
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)

    # Pass the header row through unchanged.
    writer.writerow(next(reader))

    for record in reader:
        # Column 3 is the cost with a leading "$"; column 0 is the
        # supplier name.
        amount = float(record[3].strip('$'))
        supplier = record[0]

        # Keep the row when either condition holds.
        if 'Z' in supplier or amount > 600.0:
            print(record)
            writer.writerow(record)
使用pandas时,我们可以把数据看作一个字典(pandas的DataFrame是一种类似于字典的结构),可以直接通过列名来操作;
而csv方式是通过列表进行操作的,我们需要关心索引值。
个人感觉pandas比较方便。
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]

# Purchase dates to keep, written exactly as they appear in the file.
wanted_dates = ['1/20/14', '1/30/14']

data_frame = pd.read_csv(input_file)

# isin() gives a boolean mask: True where the purchase date is one of
# wanted_dates.
date_matches = data_frame['Purchase Date'].isin(wanted_dates)
data_result = data_frame.loc[date_matches, :]
print(data_result)

data_result.to_csv(output_file, index=False)
下面是使用csv库实现同样筛选的代码:
#!/usr/bin/env python
#coding='utf-8'
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]

# Purchase dates to keep, written exactly as they appear in the file.
wanted_dates = ['1/20/14', '1/30/14']

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)

    # Pass the header row through unchanged.
    writer.writerow(next(reader))

    for record in reader:
        # Column 4 holds the purchase date; keep the row only when that
        # date is in the wanted list.
        purchase_date = record[4]
        if purchase_date in wanted_dates:
            print(record)
            writer.writerow(record)
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]

# Zero-based positions of the columns to keep.
column_positions = [0, 3]

data_frame = pd.read_csv(input_file)

# iloc selects by position: every row, only the listed columns.
data_result = data_frame.iloc[:, column_positions]
print(data_result)

data_result.to_csv(output_file, index=False)
#!/usr/bin/env python3
#coding='utf-8'
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]

# Zero-based positions of the columns to keep.
my_columns = [0, 3]

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)

    # The header is not special-cased: it is trimmed like any other row.
    for record in reader:
        selected = [record[position] for position in my_columns]
        print(selected)
        writer.writerow(selected)
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]

# Names of the columns to keep.
wanted_columns = ['Supplier Name', 'Cost']

data_frame = pd.read_csv(input_file)

# loc selects by label: every row, only the named columns.
data_result = data_frame.loc[:, wanted_columns]
print(data_result)

data_result.to_csv(output_file, index=False)
#!/usr/bin/env python3
#coding='utf-8'
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]

# Names of the columns to copy to the output file.
my_row = ['Supplier Name', 'Cost']

with open(input_file, 'r', newline='') as fileread, \
        open(output_file, 'w', newline='') as filewrite:
    filereader = csv.reader(fileread)
    filewriter = csv.writer(filewrite)

    header = next(filereader)

    # Positions of the requested columns, in the order they occur in the
    # input file. Names in my_row that the file does not contain are
    # skipped.
    my_columns = [index for index, name in enumerate(header)
                  if name in my_row]

    # Build the output header from the columns actually selected rather
    # than from my_row itself, so the header always lines up with the data
    # rows even if my_row lists the names in a different order or contains
    # a name that is missing from the file.
    filewriter.writerow([header[index] for index in my_columns])

    for row in filereader:
        row_list = [row[index] for index in my_columns]
        print(row_list)
        filewriter.writerow(row_list)
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# loc slices by label and includes both ends of the slice, so this keeps
# the rows labelled 3 and 4 (all columns).
selected_rows = data_frame.loc[3:4, :]
print(selected_rows)

selected_rows.to_csv(output_file, index=False)
# pandas also provides a drop() method for excluding unwanted rows.
#!/usr/bin/env python3
#coding='utf-8'
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)

    # Row numbers start at 0 and count every line the reader yields,
    # including the first one; only rows numbered 2 and 3 are kept.
    for count, row in enumerate(reader):
        if 2 <= count <= 3:
            print(row)
            writer.writerow(row)
csv方式就是定义一个列表,将列表写入文件即可。
下面只介绍一下pandas方式。
#!/usr/bin/env python
#coding='utf-8'
import sys
import pandas as pd
input_file = sys.argv[1]
# Read from the command line like in the other scripts, but not used
# below: this snippet only prints the result.
output_file = sys.argv[2]

# Column labels to assign to the data.
headers = [
    'Supplier Name1',
    'Invoice Number',
    'Part Number',
    'Cost',
    'Purchase Date',
]

# header=None tells pandas not to take column names from the file;
# names= supplies the labels to use instead.
data_frame = pd.read_csv(input_file, header=None, names=headers)
print(data_frame)