下文来自于我阅读书籍《科学数据处理》时的笔记。书上有些代码我还不太熟悉,所以先在这里留一些坑,日后学会了就补上。如果大家愿意留言解答,小白感激不尽。
以下都只是代码部分,相关注解会在我后续学习中进行补全,所以这个坑一定要来补!
#!/usr/bin/env python3
# Copy a comma-separated text file to a new file using plain string
# handling, echoing every parsed row to stdout along the way.
# Usage: <script> <input_file> <output_file>
import sys

# These paths assume the .py file sits in the same folder as the file being
# processed; if it does not (or simply to keep things easy), absolute paths
# work just as well.
input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as filereader, \
        open(output_file, 'w', newline='') as filewriter:
    # The first line is the header: trim surrounding whitespace, then split
    # it on commas into a list of column names.
    header_list = filereader.readline().strip().split(',')
    print(header_list)
    # Re-join the fields with commas and terminate the line so the header
    # lands in the output file as one row.
    filewriter.write(','.join(header_list) + '\n')
    for line in filereader:
        row_list = line.strip().split(',')
        print(row_list)
        # Same as for the header: write the fields back out as one
        # comma-separated line.
        filewriter.write(','.join(row_list) + '\n')
#!/usr/bin/env python3
# Load a CSV file into a pandas DataFrame, print it, and write it back out
# as CSV.
# Usage: <script> <input_file> <output_file>
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
print(data_frame)
# index=False keeps the DataFrame's row index out of the output file.
data_frame.to_csv(output_file, index=False)
#!/usr/bin/env python3
# Copy a CSV file row by row with the csv module, printing each row as it
# is copied.
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    # Read and write through the csv module.
    # delimiter=',' is the default separator, so the argument could be
    # omitted when both the input and output files are comma-separated.
    filereader = csv.reader(csv_in_file, delimiter=',')
    filewriter = csv.writer(csv_out_file, delimiter=',')
    for row_list in filereader:
        print(row_list)
        filewriter.writerow(row_list)
#!/usr/bin/env python3
# Filter a CSV file: keep the header plus every row whose first column is
# 'Supplier Z' or whose fourth column, read as a dollar amount, is above
# 600.0.
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    # Pull the first row off the reader as the header and copy it through.
    header = next(filereader)
    filewriter.writerow(header)
    for row_list in filereader:
        # First column of the row, with surrounding whitespace removed.
        supplier = str(row_list[0]).strip()
        # Fourth column with the dollar sign and thousands separators
        # removed, so that float() can parse it.
        cost = str(row_list[3]).strip('$').replace(',', '')
        is_supplier_z = supplier == 'Supplier Z'
        # `or` short-circuits: the cost is only parsed when the supplier
        # test has not already selected the row.
        if is_supplier_z or float(cost) > 600.0:
            filewriter.writerow(row_list)
#!/usr/bin/env python3
# Filter a CSV file with pandas: keep the rows whose 'Supplier Name'
# contains 'Z' or whose 'Cost' is above 600.0, and write them to a new CSV.
# The 'Cost' column is written out as plain floats.
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# Cost holds dollar strings that may include thousands separators
# (e.g. '$1,234.56'). Remove the separators as well as the dollar sign;
# otherwise astype(float) raises ValueError on any such amount.
data_frame['Cost'] = (
    data_frame['Cost']
    .str.strip('$')
    .str.replace(',', '', regex=False)
    .astype(float)
)
# loc selects rows and columns at the same time: rows by the boolean mask,
# and ':' for every column.
matches_condition = (
    data_frame['Supplier Name'].str.contains('Z')
    | (data_frame['Cost'] > 600.0)
)
data_frame_value_meets_condition = data_frame.loc[matches_condition, :]
data_frame_value_meets_condition.to_csv(output_file, index=False)
#!/usr/bin/env python3
# Filter a CSV file: keep the header plus every row whose fifth column is
# one of the dates listed in important_dates.
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

important_dates = ['1/20/14', '1/30/14']

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    header = next(filereader)
    filewriter.writerow(header)
    for row_list in filereader:
        # Pick out the field to test (the fifth column).
        a_date = row_list[4]
        # Keep the row only when that field is one of the listed dates.
        if a_date in important_dates:
            filewriter.writerow(row_list)
#!/usr/bin/env python3
# Filter a CSV file with pandas: keep the rows whose 'Purchase Date' is one
# of the dates listed in important_dates.
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
important_dates = ['1/20/14', '1/30/14']
# isin does the matching: it yields a boolean mask that is True where the
# value appears in important_dates.
date_is_important = data_frame['Purchase Date'].isin(important_dates)
data_frame_value_in_set = data_frame.loc[date_is_important, :]
data_frame_value_in_set.to_csv(output_file, index=False)
#!/usr/bin/env python3
# Filter a CSV file: keep the header plus every row whose second column
# matches a regular expression for values starting with '001-'.
# Usage: <script> <input_file> <output_file>
import csv
import re
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Pattern matching: select values that begin with '001-'.
# A named group has to be spelled (?P<name>...); without the <name> part
# re.compile() raises re.error. re.I makes the match case-insensitive.
pattern = re.compile(r'(?P<my_pattern_group>^001-.*)', re.I)

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    header = next(filereader)
    filewriter.writerow(header)
    for row_list in filereader:
        invoice_number = row_list[1]
        if pattern.search(invoice_number):
            filewriter.writerow(row_list)
对于正则表达式不熟悉的朋友,可以移步我的另外一个blog:
https://blog.csdn.net/qq_44671752/article/details/104384168
#!/usr/bin/env python3
# Filter a CSV file with pandas: keep the rows whose 'Invoice Number'
# starts with "001-".
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# Boolean mask: True for rows whose invoice number has the wanted prefix.
has_prefix = data_frame['Invoice Number'].str.startswith("001-")
data_frame_value_matches_pattern = data_frame.loc[has_prefix, :]
data_frame_value_matches_pattern.to_csv(output_file, index=False)
#!/usr/bin/env python3
# Select columns from a CSV file by position: every row (the first one
# included) is reduced to the columns listed in my_columns.
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Zero-based positions of the columns to keep.
my_columns = [0, 3]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    for row_list in filereader:
        # Build the output row from the wanted positions, in my_columns
        # order.
        row_list_output = [row_list[position] for position in my_columns]
        filewriter.writerow(row_list_output)
#!/usr/bin/env python3
# Select columns from a CSV file by position with pandas and write them to
# a new CSV file.
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# iloc indexes by integer position: ':' takes every row, the list picks the
# columns at positions 0 and 3.
column_positions = [0, 3]
data_frame_column_by_index = data_frame.iloc[:, column_positions]
data_frame_column_by_index.to_csv(output_file, index=False)
#!/usr/bin/env python3
# Select columns from a CSV file by header name: the output holds the
# columns named in my_columns, in the order they appear in the input file.
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Header names of the columns to keep.
my_columns = ['Invoice Number', 'Purchase Date']

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    header = next(filereader, None)
    if header is None:
        # An empty input has no header to look the names up in; stop with a
        # clear message instead of failing later on len(None).
        sys.exit('No header row found in ' + input_file)
    # Positions of the wanted columns, in the input file's column order.
    my_columns_index = [
        index_value
        for index_value, column_name in enumerate(header)
        if column_name in my_columns
    ]
    # Write the header from the positions actually found, so it always lines
    # up with the data rows even when the file orders the columns
    # differently from my_columns or lacks one of them.
    filewriter.writerow([header[index_value]
                         for index_value in my_columns_index])
    for row_list in filereader:
        row_list_output = [row_list[index_value]
                           for index_value in my_columns_index]
        filewriter.writerow(row_list_output)
#!/usr/bin/env python3
# Select columns from a CSV file by header name with pandas and write them
# to a new CSV file.
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# The selection is again given directly to loc: ':' takes every row, the
# list names the columns to keep.
wanted_columns = ['Invoice Number', 'Purchase Date']
data_frame_column_by_name = data_frame.loc[:, wanted_columns]
data_frame_column_by_name.to_csv(output_file, index=False)
#!/usr/bin/env python3
# Select a contiguous run of rows from a CSV file by position and write
# them, with each field stripped of surrounding whitespace, to a new file.
# Usage: <script> <input_file> <output_file>
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Zero-based position of the row currently being read.
row_counter = 0

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)
    for row in filereader:
        # Pick specific rows by counting: only positions 3 through 15
        # (inclusive) are written.
        if 3 <= row_counter <= 15:
            filewriter.writerow([value.strip() for value in row])
        row_counter += 1
#!/usr/bin/env python3
# Trim unwanted leading and trailing rows from a CSV file with pandas,
# promote the first remaining row to column names, and write the result.
# Usage: <script> <input_file> <output_file>
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# header=None: no row is treated as the header, so every line of the file
# is loaded as data.
data_frame = pd.read_csv(input_file, header=None)
# Use drop to remove the unwanted rows directly, by row label.
unwanted_rows = [0, 1, 2, 16, 17, 18]
data_frame = data_frame.drop(unwanted_rows)
# The first row left after the drop supplies the column names...
data_frame.columns = data_frame.iloc[0]
# ...and is then removed from the data by reindexing on every remaining
# label except 3.
remaining_labels = data_frame.index.drop(3)
data_frame = data_frame.reindex(remaining_labels)
data_frame.to_csv(output_file, index=False)
。。。未完待续