1. 切分 PDF 文件,提取任意页面
from PyPDF2 import PdfFileReader, PdfFileWriter
def split_pdf(fileName, result, start=0, end=None):
    """Copy a range of pages from one PDF into a new PDF file.

    Args:
        fileName: Path of the source PDF to read.
        result: Path of the PDF file to write; it is opened in 'wb' mode,
            so an existing file at that path is overwritten.
        start: Index of the first page to copy (passed to ``getPage``).
        end: Index one past the last page to copy. ``None`` means the
            source's page count, i.e. copy through the final page.
    """
    # Open the source PDF.
    pdf_src = PdfFileReader(fileName)
    if end is None:
        # Default to the total number of pages.
        end = pdf_src.getNumPages()

    # Collect every requested page before touching the output path, so that
    # a failing page lookup does not leave behind an empty result file.
    pdf = PdfFileWriter()
    for num in range(start, end):
        pdf.addPage(pdf_src.getPage(num))

    # Write the assembled pages to the result PDF.
    with open(result, 'wb') as fp:
        pdf.write(fp)
# Example run: copy page indices 0 through 6 of ./123.pdf into ./1.pdf.
path = './123.pdf'
split_pdf(fileName=path, result='./1.pdf', start=0, end=7)
2.读取csv文件
import csv
# Read data.csv with the standard-library csv module and print every row.
# The file is opened in a with-block so it is closed once reading is done,
# and with newline="" as the csv module documentation requires, so that
# newlines embedded in quoted fields are parsed correctly.
with open("data.csv", newline="") as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        print(row)
import pandas as pd
# Load data.csv with pandas and print the parsed result.
# The original notes also listed pd.read_table("data.csv", sep=",")
# followed by the same print as an alternative way to load the file.
csv_reader = pd.read_csv(filepath_or_buffer="data.csv")
print(csv_reader)
3.读取txt文件
#read files
def loadData(fileName):
    """Read a text file and return its lines.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list with one string per line, exactly as produced by
        ``readlines()`` (line endings are kept).
    """
    with open(fileName) as f:
        # Return the lines to the caller instead of reading and discarding them.
        return f.readlines()
#write files
def writeData(pathFile, outText):
    """Append the items of outText to a text file, one item per line.

    Args:
        pathFile: Path of the file to append to; opened in "a+" mode.
        outText: Iterable of strings; each one is written followed by a
            newline character.
    """
    with open(pathFile, "a+") as out_file:
        out_file.writelines(entry + "\n" for entry in outText)
4.对比两个文件的数据
import pandas as pd
# Two small sample frames; their columns are compared value-by-value below.
train_data = pd.DataFrame(
    {
        'a': [1, 2, 3, 4, 5, 5],
        'b': ['a', 'c', 'a', 't', 'b', 'p'],
    }
)
test_data = pd.DataFrame(
    {
        'a': [1, 12, 13, 14, 19, 19],
        'b': ['g', 'b', 'y', 'a', 'd', 'y'],
    }
)

col_list = list(train_data.columns)
for i in col_list:
    # set() builds an unordered collection of unique elements: duplicates
    # are dropped, and intersection, difference, union etc. are available.
    set1 = set(train_data[i])
    set2 = set(test_data[i])
    print('set1:', set1)
    # Values present in both frames.
    print(i, '相同的元素:', set1.intersection(set2))
    # Values present in exactly one of the two frames.
    print(i, '不同的元素:', set1.symmetric_difference(set2))
    # Values present in either frame.
    print(i, '并集:', set1.union(set2))
    # In train_data but not in test_data; equal to (set1 | set2) - set2.
    print(i, '差集1:', set1.difference(set2))
    # In test_data but not in train_data; equal to (set1 | set2) - set1.
    print(i, '差集2:', set2.difference(set1))
5. Excel 转 PDF
所需库:pywin32