Python的pandas库+CSV文件处理详细使用,以及与一般Python处理操作对比

Python的pandas库+CSV文件处理详细使用,以及与一般Python处理操作对比

下文来自于我阅读书籍《科学数据处理》的笔记,可能对于书上有些代码并不熟悉,所以留一些坑在这里,日后学会了就补上。如果大家愿意留言解答,小白感激不尽。
以下都只是代码部分,相关注解会在我后续学习中进行补全,所以这个坑一定要来补!

1.读写CSV文件(1)

一般处理

#!/usr/bin/env python3
# Copy a comma-separated file line by line using only basic string handling.
# The first command-line argument is the input path, the second the output path.
import sys

# Relative paths are resolved against the current working directory;
# pass absolute paths if the files live elsewhere.
input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as filereader:
    with open(output_file, 'w', newline='') as filewriter:
        # The first line is treated as the header row.
        header = filereader.readline()
        header = header.strip()
        # A plain split breaks on every comma, including commas that sit
        # inside quoted fields.
        header_list = header.split(',')
        print(header_list)
        filewriter.write(','.join(map(str, header_list)) + '\n')
        for row in filereader:
            row = row.strip()
            row_list = row.split(',')
            print(row_list)
            # Re-join the fields with commas and terminate the line so the
            # row is written to the output file in CSV form again.
            filewriter.write(','.join(map(str, row_list)) + '\n')

pandas处理

#!/usr/bin/env python3
# Load a CSV file into a DataFrame, print it, and write it back out.
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

frame = pd.read_csv(filepath_or_buffer=input_file)
print(frame)
# index=False keeps the DataFrame's row index out of the output file.
frame.to_csv(path_or_buf=output_file, index=False)

读写CSV文件(2)

一般处理

#!/usr/bin/env python3
# Copy a CSV file row by row with the csv module, echoing each row.
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as source, \
        open(output_file, 'w', newline='') as target:
    # delimiter=',' is the csv module's default, so it only needs to be
    # spelled out when the files use a different separator.
    writer = csv.writer(target, delimiter=',')
    for fields in csv.reader(source, delimiter=','):
        print(fields)
        writer.writerow(fields)

2.筛选特定的行

一般操作

#!/usr/bin/env python3
# Keep the header plus every row whose first column is 'Supplier Z' or
# whose fourth column, read as a dollar amount, exceeds 600.0.
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    records = csv.reader(csv_in_file)
    out = csv.writer(csv_out_file)
    # The first row is the header; pass it through unchanged.
    out.writerow(next(records))
    for record in records:
        name = str(record[0]).strip()
        # Drop the dollar sign and thousands separators before parsing.
        cost_text = str(record[3]).strip('$').replace(',', '')
        # The cost is only parsed when the supplier name did not match.
        if name != 'Supplier Z' and not (float(cost_text) > 600.0):
            continue
        out.writerow(record)

pandas处理

#!/usr/bin/env python3
# Keep rows whose 'Supplier Name' contains 'Z' or whose 'Cost' exceeds 600.0.
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# Assumes Cost is read as text such as '$1,234.50'. Remove the dollar sign
# and the thousands separators before converting; without the comma removal
# astype(float) raises ValueError on values like '$6,015.00'.
data_frame['Cost'] = (
    data_frame['Cost']
    .str.strip('$')
    .str.replace(',', '', regex=False)
    .astype(float)
)
# .loc selects rows (boolean mask) and columns (':' means all) in one step.
# na=False makes a missing supplier name count as "no match" so the mask
# stays strictly boolean.
data_frame_value_meets_condition = data_frame.loc[
    (data_frame['Supplier Name'].str.contains('Z', na=False))
    | (data_frame['Cost'] > 600.0),
    :,
]

data_frame_value_meets_condition.to_csv(output_file, index=False)

2.1 行中的值属于特定集合

一般处理

#!/usr/bin/env python3
# Keep the header plus every row whose fifth column is one of the values
# listed in important_dates.
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]
important_dates = ['1/20/14', '1/30/14']

with open(input_file, 'r', newline='') as csv_in_file, \
        open(output_file, 'w', newline='') as csv_out_file:
    reader = csv.reader(csv_in_file)
    writer = csv.writer(csv_out_file)
    writer.writerow(next(reader))
    # Column index 4 holds the value that is tested for membership.
    writer.writerows(row for row in reader if row[4] in important_dates)

pandas处理

#!/usr/bin/env python3
# Keep rows whose 'Purchase Date' is one of the values in important_dates.
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
important_dates = ['1/20/14', '1/30/14']
# isin() builds a boolean mask that is True where the value is in the list.
date_matches = data_frame['Purchase Date'].isin(important_dates)
data_frame[date_matches].to_csv(output_file, index=False)

2.2 匹配某个模式或正则表达式

一般处理

#!/usr/bin/env python3
# Keep the header plus every row whose second column starts with "001-".
import csv
import re
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Match values that begin with "001-". A named group must be written as
# (?P<name>...); the previous "(?P^001-.*)" had lost its <name> part and
# made re.compile raise re.error. re.I makes the match case-insensitive.
pattern = re.compile(r'(?P<my_pattern_group>^001-.*)', re.I)

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        header = next(filereader)
        filewriter.writerow(header)
        for row_list in filereader:
            # Column index 1 holds the value tested against the pattern.
            invoice_number = row_list[1]
            if pattern.search(invoice_number):
                filewriter.writerow(row_list)

对于正则表达式不熟悉的朋友,可以移步我的另外一个blog:

https://blog.csdn.net/qq_44671752/article/details/104384168

pandas处理

#!/usr/bin/env python3
# Keep rows whose 'Invoice Number' starts with "001-".
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
invoice_numbers = data_frame['Invoice Number']
# Boolean mask: True where the invoice number has the wanted prefix.
has_prefix = invoice_numbers.str.startswith("001-")
matching_rows = data_frame.loc[has_prefix, :]
matching_rows.to_csv(output_file, index=False)

3.选取特定列

一般处理

#!/usr/bin/env python3
# Write only the columns listed in my_columns (by position) for every row,
# header row included.
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]
# Zero-based positions of the columns to keep.
my_columns = [0, 3]

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        for row_list in filereader:
            row_list_output = [row_list[index_value] for index_value in my_columns]
            # Write once per input row, after all requested columns have
            # been collected; writing inside the column loop would emit a
            # partial row for every column.
            filewriter.writerow(row_list_output)

pandas处理

#!/usr/bin/env python3
# Write only the columns at positions 0 and 3.
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# iloc indexes by position: ':' keeps every row, the list picks the columns.
column_positions = [0, 3]
selected = data_frame.iloc[:, column_positions]
selected.to_csv(output_file, index=False)

3.1索引列标题

一般处理

#!/usr/bin/env python3
# Write only the columns whose header title is listed in my_columns.
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]
# Header titles of the columns to keep.
my_columns = ['Invoice Number', 'Purchase Date']
my_columns_index = []

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        # next(..., None) returns None for an empty input file; in that
        # case there is nothing to select and the output stays empty.
        header = next(filereader, None)
        if header is not None:
            # Finish scanning the whole header before touching the data
            # rows, so every requested column position is known first.
            my_columns_index = [
                index_value
                for index_value, title in enumerate(header)
                if title in my_columns
            ]
            # Emit the titles in file order so they line up with the data
            # columns written below.
            filewriter.writerow([header[index_value] for index_value in my_columns_index])
            for row_list in filereader:
                row_list_output = [row_list[index_value] for index_value in my_columns_index]
                # One output row per input row.
                filewriter.writerow(row_list_output)

pandas处理

#!/usr/bin/env python3
# Write only the 'Invoice Number' and 'Purchase Date' columns.
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# Selecting with a list of column titles keeps every row and only those columns.
wanted_columns = ['Invoice Number', 'Purchase Date']
data_frame[wanted_columns].to_csv(output_file, index=False)

4.选取连续的行

一般处理

#!/usr/bin/env python3
# Write rows 3 through 15 (zero-based, inclusive) of the input file, with
# surrounding whitespace stripped from every value.
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        # enumerate advances the counter for every row, not only for the
        # rows that are written; a counter incremented inside the `if`
        # would stay at 0 and nothing would ever be selected.
        for row_counter, row in enumerate(filereader):
            if row_counter > 15:
                # Past the wanted range: no need to read the rest.
                break
            if row_counter >= 3:
                filewriter.writerow([value.strip() for value in row])

pandas处理

#!/usr/bin/env python3
# Keep rows 3 through 15 (zero-based, inclusive) of the input file; the
# first kept row supplies the column titles of the output.
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# header=None: no line is treated as a header, so rows are numbered from 0.
data_frame = pd.read_csv(input_file, header=None)
# Select the wanted rows by position instead of dropping a hard-coded list
# of labels: drop([0, 1, 2, 16, 17, 18]) raises KeyError when the file has
# fewer than 19 rows and leaves extra rows in place when it has more.
data_frame = data_frame.iloc[3:16].copy()
# The first kept row holds the column titles (raises IndexError if the file
# has no row at position 3).
data_frame.columns = data_frame.iloc[0]
# Remove that title row from the data itself.
data_frame = data_frame.iloc[1:]
data_frame.to_csv(output_file, index=False)

。。。未完待续

你可能感兴趣的:(Python)