Python 对文件进行操作

1.切分PDF文件提取任意页面

from PyPDF2 import PdfFileReader, PdfFileWriter

def split_pdf(fileName, result, start=0, end=None):
    """Copy a contiguous range of pages from one PDF into a new PDF file.

    Args:
        fileName: Source PDF, passed straight to ``PdfFileReader``.
        result: Path of the output PDF; opened in ``'wb'`` mode, so an
            existing file at that path is overwritten.
        start: Zero-based index of the first page to copy.
        end: Index one past the last page to copy. ``None`` means
            "through the last page of the source".
    """
    # Open the source PDF.
    pdf_src = PdfFileReader(fileName)
    if end is None:
        # Default to the total page count of the source.
        end = pdf_src.getNumPages()

    # Collect every requested page *before* touching the output file, so
    # that a failure while fetching a page does not leave an empty or
    # half-written result file behind.
    pdf = PdfFileWriter()
    for num in range(start, end):
        pdf.addPage(pdf_src.getPage(num))

    # Only now create/truncate the output and write the assembled PDF.
    with open(result, 'wb') as fp:
        pdf.write(fp)

# Example: copy pages with indices 0..6 of ./123.pdf into ./1.pdf.
path = './123.pdf'
split_pdf(path, result='./1.pdf', start=0, end=7)

2.读取csv文件

import csv

# Print every row of data.csv as a list of strings.
# The file is opened in a ``with`` block so the handle is always closed, and
# with newline="" as the csv module documentation recommends, so that
# newlines embedded in quoted fields are handled by the csv reader itself.
with open("data.csv", newline="") as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        print(row)
import pandas as pd

# Same file, this time loaded in one call with pandas and printed as a table.
csv_reader = pd.read_csv(filepath_or_buffer="data.csv")
print(csv_reader)

#csv_reader = pd.read_table("data.csv", sep=",")
#print(csv_reader)

3.读写txt文件

#read files
def loadData(fileName):
    """Read a text file and return its lines.

    Args:
        fileName: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        A list with one string per line, each keeping its trailing newline
        as produced by ``readlines()``.
    """
    with open(fileName) as f:
        # Previously the lines were read and then discarded; hand them back
        # to the caller so the function is actually usable.
        return f.readlines()

#write files
def writeData(pathFile, outText):
    """Append every item of ``outText`` to ``pathFile``, one item per line.

    Args:
        pathFile: Path of the target file; opened in ``"a+"`` mode, so it is
            created when missing and existing content is kept.
        outText: Iterable of strings; a newline is added after each one.
    """
    with open(pathFile, "a+") as out:
        out.writelines(item + "\n" for item in outText)

4.对比两个文件的数据

import pandas as pd

# Two small sample frames with the same columns, compared column by column.
train_data = pd.DataFrame({
    'a': [1, 2, 3, 4, 5, 5],
    'b': ['a', 'c', 'a', 't', 'b', 'p'],
})
test_data = pd.DataFrame({
    'a': [1, 12, 13, 14, 19, 19],
    'b': ['g', 'b', 'y', 'a', 'd', 'y'],
})
col_list = train_data.columns.tolist()
for column in col_list:
    # set() drops duplicates and gives access to the set-algebra operators
    # (intersection, symmetric difference, union, difference).
    train_values = set(train_data[column])
    test_values = set(test_data[column])
    print('set1:', train_values)
    reports = (
        ('相同的元素:', train_values & test_values),   # present in both
        ('不同的元素:', train_values ^ test_values),   # present in exactly one
        ('并集:', train_values | test_values),        # present in either
        # Only in train_data; same as (union - test_values).
        ('差集1:', train_values - test_values),
        # Only in test_data; same as (union - train_values).
        ('差集2:', test_values - train_values),
    )
    for label, values in reports:
        print(column, label, values)

5.Excel 转PDF

可使用 pywin32(win32com)调用本机安装的 Excel,将工作簿导出为 PDF。

你可能感兴趣的:(python)