python之csv文件处理(基础)

CSV(comma-separated value,逗号分隔值)文件格式是一种非常简单的数据存储与分享
方式。CSV 文件将数据表格存储为纯文本,表格(或电子表格)中的每个单元格都是一个
数值或字符串。

主要介绍通过csv库和pandas库对csv文件的处理操作

#使用的测试数据
Supplier Name,Invoice Number,Part Number,Cost,Purchase Date
Supplier X,001-1001,2341,$500.00,1/20/14
Supplier X,001-1001,2341,$500.00,1/20/14
Supplier X,001-1001,5467,$750.00,1/20/14
Supplier X,001-1001,5467,$750.00,1/20/14
Supplier Y,50-9501,7009,$250.00,1/30/14
Supplier Y,50-9501,7009,$250.00,1/30/14
Supplier Y,50-9505,6650,$125.00,2/3/14
Supplier Y,50-9505,6650,$125.00,2/3/14
Supplier Z,920-4803,3321,$615.00,2/3/14
Supplier Z,920-4804,3321,$615.00,2/10/14
Supplier Z,920-4805,3321,$615.00,2/17/14
Supplier Z,920-4806,3321,$615.00,2/24/14
input_file = sys.argv[1]  # first command-line argument given after the script name
output_file = sys.argv[2]  # second command-line argument given after the script name

#执行的时候
python3 test.py ceshi.csv out.txt

csv的读写

使用pandas进行csv文件的读写操作
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Read a CSV file with pandas and write its contents to a second file.

Usage: python3 <script> <input_csv> <output_file>
"""

import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

# read_csv() parses the file and hands back a DataFrame object.
data_frame = pd.read_csv(input_file)

# to_csv() writes the DataFrame to the path in output_file; index=False
# keeps the DataFrame's row index out of the written file.
data_frame.to_csv(output_file, index=False)
使用csv库进行csv文件的读写
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Copy a CSV file row by row with the csv module.

The header row is copied silently; every following row is printed and
then written to the output file.

Usage: python3 <script> <input_csv> <output_file>
"""

import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

# Input is opened read-only and output write-only; newline='' leaves
# line-ending handling to the csv module.
with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)

    # The first row of the file is the header.
    writer.writerow(next(reader))

    for record in reader:
        print(record)
        writer.writerow(record)

上边的读写是最简单的操作,而我们使用数据时往往都要通过筛选,来获取我们认为有用的数据,下面我们来进行条件筛选

筛选特定的行

我们来筛选 Supplier Name 中包含 Z 的行,或者 Cost 大于 600 的行(两个条件满足其一即可)

pandas方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Keep rows whose supplier name contains 'Z' or whose cost exceeds 600.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# Cost is stored as text such as "$500.00": strip the dollar sign and
# convert to float so it can be compared numerically.
cost_values = data_frame['Cost'].str.strip('$').astype(float)

# One boolean mask per condition; a row is kept when either mask is True.
name_has_z = data_frame['Supplier Name'].str.contains('Z')
cost_above_600 = cost_values > 600.0
selected_rows = data_frame.loc[name_has_z | cost_above_600, :]
print(selected_rows)

# Write the matching rows to the output file, without the row index.
selected_rows.to_csv(output_file, index=False)
csv方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Keep rows whose supplier name contains 'Z' or whose cost exceeds 600.

The header row is always copied; each matching data row is printed and
written to the output file.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)
    writer.writerow(next(reader))  # header row goes through unfiltered

    for record in reader:
        # Column 3 holds the cost as text such as "$500.00".
        amount = float(record[3].strip('$'))
        supplier = record[0]
        keep = 'Z' in supplier or amount > 600.0
        if not keep:
            continue
        print(record)
        writer.writerow(record)

使用pandas时,我们可以把DataFrame看作一个类似于字典的结构,可以直接通过列名来操作;
而csv方式是通过列表进行操作的,我们需要关心索引值。
个人感觉pandas比较方便

行中的值属于某个集合

pandas方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Keep rows whose purchase date is one of a fixed set of dates.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

wanted_dates = ['1/20/14', '1/30/14']

data_frame = pd.read_csv(input_file)

# isin() is the pandas way to test membership: it yields one boolean per
# row, True where the value appears in wanted_dates.
date_matches = data_frame['Purchase Date'].isin(wanted_dates)
matching_rows = data_frame.loc[date_matches, :]

print(matching_rows)
matching_rows.to_csv(output_file, index=False)
在这里插入代码片
csv方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Keep rows whose purchase date is one of a fixed set of dates.

The header row is always copied; each matching data row is printed and
written to the output file.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

# The dates a row must carry (in column 4) to be kept.
wanted_dates = ['1/20/14', '1/30/14']

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)
    writer.writerow(next(reader))  # header row goes through unfiltered

    for record in reader:
        purchase_date = record[4]
        if purchase_date not in wanted_dates:
            continue
        print(record)
        writer.writerow(record)

根据列的索引值,选取列

pandas方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Select columns by position (columns 0 and 3) with pandas.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# iloc indexes by integer position: every row, columns 0 and 3.
chosen_columns = data_frame.iloc[:, [0, 3]]

print(chosen_columns)
chosen_columns.to_csv(output_file, index=False)
csv方式
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Select columns by position (columns 0 and 3) with the csv module.

Every row, the header included, is reduced to the chosen columns,
printed, and written to the output file.

Usage: python3 <script> <input_csv> <output_file>
"""

import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

# Zero-based positions of the columns to keep.
my_columns = [0, 3]

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)
    for record in reader:
        picked = [record[position] for position in my_columns]
        print(picked)
        writer.writerow(picked)

根据列的名称,获取列

pandas方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Select the 'Supplier Name' and 'Cost' columns by name with pandas.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# loc indexes by label: every row, the two named columns.
wanted_labels = ['Supplier Name', 'Cost']
chosen_columns = data_frame.loc[:, wanted_labels]

print(chosen_columns)
chosen_columns.to_csv(output_file, index=False)
csv方式
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Select columns by header name with the csv module.

The header row of the input is scanned for the names listed in my_row.
The matching columns are kept in the order they appear in the file, and
each reduced data row is printed and written to the output file.

Usage: python3 <script> <input_csv> <output_file>
"""

import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

# Names of the columns to keep.
my_row = ['Supplier Name', 'Cost']

with open(input_file, 'r', newline='') as fileread, \
        open(output_file, 'w', newline='') as filewrite:
    filereader = csv.reader(fileread)
    filewriter = csv.writer(filewrite)
    header = next(filereader)

    # Positions of the requested columns, in file order.
    my_columns = [
        index for index, title in enumerate(header) if title in my_row
    ]

    # Build the output header from the same positions used for the data
    # rows. Writing my_row directly would mislabel the columns whenever a
    # requested name is absent from the file or my_row is ordered
    # differently from the file.
    filewriter.writerow([header[index] for index in my_columns])

    for row in filereader:
        row_list = [row[index] for index in my_columns]
        print(row_list)
        filewriter.writerow(row_list)

选取连续的行或指定的行

pandas方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Select a contiguous run of rows (labels 3 through 4) with pandas.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)

# loc slices by row label and includes both endpoints, so this keeps the
# rows labelled 3 and 4, with all columns.
row_slice = data_frame.loc[3:4, :]

print(row_slice)
row_slice.to_csv(output_file, index=False)
# pandas also provides a drop() method for excluding unwanted rows.
csv方式
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Select rows by position with the csv module.

Rows are numbered from 0 starting at the first line of the file (the
header line is row 0). Rows 2 and 3 are printed and written to the
output file; no header row is written.

Usage: python3 <script> <input_csv> <output_file>
"""

import sys
import csv

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    reader = csv.reader(source)
    writer = csv.writer(target)
    for position, record in enumerate(reader):
        if 2 <= position <= 3:
            print(record)
            writer.writerow(record)

添加标题行

csv方式就是定义一个列表,将列表写入文件即可
下面只介绍一下pandas方式

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Attach a header row to a CSV file with pandas.

The column names in headers are applied to the input, the labelled table
is printed, and it is then written to the output file.

Usage: python3 <script> <input_csv> <output_file>
"""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]
headers = ['Supplier Name1', 'Invoice Number',
           'Part Number', 'Cost', 'Purchase Date']

# header=None tells pandas that the file has no header line, so its first
# line is read as data; names supplies the column labels to use instead.
data_framet = pd.read_csv(input_file, header=None, names=headers)
print(data_framet)

# Persist the labelled table. output_file was previously read from the
# command line but never used, so the result was only printed.
data_framet.to_csv(output_file, index=False)

你可能感兴趣的:(python数据分析基础)