PDF2IMG
需要先安装 Python 包 img2pdf 和 pdf2image(pip install img2pdf pdf2image);另外还需要从 https://github.com/oschwartz10612/poppler-windows/releases/ 下载 poppler,解压后把解压路径下的 Library/bin 目录添加到 PATH 环境变量。
import multiprocessing
import sys
from datetime import time
import time
from pdf2image import convert_from_path
import os
from tqdm import tqdm
def getFiles(path):
    """Return the paths of all files below ``path``, searched recursively.

    Args:
        path: Root directory to walk.

    Returns:
        A list of file paths, each joined with the directory it was found in
        and with backslashes replaced by forward slashes. Directories are
        visited in sorted order and files are sorted within each directory,
        so the result does not depend on filesystem enumeration order. The
        list is empty when ``path`` does not exist or contains no files.
    """
    file_list = []
    for home, dirs, files in os.walk(path):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        file_list.extend(
            os.path.join(home, name).replace('\\', '/')
            for name in sorted(files)
        )
    return file_list
def report(outputpath, file):
    """Append a "has been converted" line for ``file`` to the report file.

    The line is appended to ``convertreport.txt`` inside ``outputpath`` and
    the same message is printed to standard output.

    Args:
        outputpath: Existing directory that holds ``convertreport.txt``.
        file: Name of the converted file, as a string.

    Returns:
        None.
    """
    message = file + " has been converted."
    report_path = os.path.join(outputpath, "convertreport.txt")
    # Explicit UTF-8 so that non-ASCII file names cannot fail to encode under
    # a narrower platform default encoding.
    with open(report_path, "a", encoding="utf-8") as report_file:
        report_file.write(message + " \n")
    print(message)
def convert2(outputpath, pages, file):
    """Save every page as ``<file>_<page number>.png`` inside ``outputpath``.

    Args:
        outputpath: Directory the PNG files are written to.
        pages: Iterable of page objects exposing ``save(path, format)``.
        file: Name used as the prefix of every output file; it is used
            verbatim, so ``a.pdf`` produces ``a.pdf_1.png``, ``a.pdf_2.png``...

    Returns:
        None.
    """
    # Page numbers start at 1.
    for page_number, page in enumerate(pages, start=1):
        output_name = file + '_' + str(page_number) + '.png'
        page.save(os.path.join(outputpath, output_name), 'PNG')
def convert(path, outputpath):
    """Render one PDF file to PNG images inside ``outputpath``.

    The PDF at ``path`` is loaded with ``convert_from_path`` (called with
    dpi 500 and ``size=(1300, 1500)``) and the resulting pages are handed to
    ``convert2``, which saves them as ``<pdf file name>_<page number>.png``.

    Args:
        path: Path of the PDF file to render.
        outputpath: Directory that receives the PNG files; it is created,
            including missing parents, when it does not exist yet.

    Returns:
        ``outputpath``, unchanged.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(outputpath, exist_ok=True)
    pages = convert_from_path(str(path), 500, size=(1300, 1500))
    filename = os.path.basename(path)
    convert2(outputpath, pages, filename)
    return outputpath
def main(path=r'C:\Users\Administrator\Desktop\1/'):
    """Convert every PDF found below ``path`` into PNG files in ``path``/pics.

    Prints the elapsed wall-clock time when done.

    Args:
        path: Root directory searched recursively for PDF files. The default
            is the location that used to be hard-coded in this function.
    """
    output_dir = os.path.join(path, 'pics')
    time_start = time.time()
    # Compare the extension case-insensitively so "X.PDF" is not skipped.
    pdf_files = [f for f in getFiles(path) if f.lower().endswith('.pdf')]
    if pdf_files:
        # Create the output directory once instead of checking per file.
        os.makedirs(output_dir, exist_ok=True)
    for pdf_file in tqdm(pdf_files):
        convert(pdf_file, output_dir)
    time_end = time.time()
    print('use time', time_end - time_start)


if __name__ == '__main__':
    main()
IMG2PDF
import os
import img2pdf
from PIL import Image
def doImg2Pdf(fileName, num=14):
    """Bundle the images of a directory into PDFs of ``num`` pages each.

    The files directly inside ``fileName`` are taken in natural name order
    (digit runs compare as numbers, so ``x_2.png`` precedes ``x_10.png``),
    split into consecutive groups of ``num`` and each group is written to
    ``sample1_<n>.pdf`` (n = 1, 2, ...) in the current working directory.
    The last PDF receives whatever files remain.

    Args:
        fileName: Directory that contains the images (a directory, despite
            the parameter name). A trailing path separator is optional.
        num: Number of pages (images) per PDF; defaults to 14.

    Raises:
        ValueError: If ``num`` is smaller than 1.
        SystemExit: If the directory holds fewer than ``num`` files.
    """
    import re  # local import: only needed for the natural sort key

    if num < 1:
        raise ValueError("num must be at least 1")

    def natural_key(name):
        # re.split with a capturing group alternates text / digit parts, so
        # odd positions are always digit runs and compare as integers.
        parts = re.split(r'(\d+)', name)
        key = [int(p) if i % 2 else p.lower() for i, p in enumerate(parts)]
        return key, name

    # Sub-directories cannot be converted to pages, so only files count.
    file_list = sorted(
        (entry for entry in os.listdir(fileName)
         if os.path.isfile(os.path.join(fileName, entry))),
        key=natural_key,
    )

    if num > len(file_list):
        print('num长度需小于:', len(file_list))
        raise SystemExit

    # Ceiling division: a partial last group still gets its own PDF.
    num_file = (len(file_list) + num - 1) // num
    print(num_file)

    for n in range(1, num_file + 1):
        batch = file_list[num * (n - 1):num * n]
        png_list = [os.path.join(fileName, png_name) for png_name in batch]
        # Convert before opening the output so that a failed conversion does
        # not leave an empty PDF behind.
        pdf_bytes = img2pdf.convert(png_list)
        with open("sample1_%s.pdf" % n, "wb") as f:
            f.write(pdf_bytes)
    print("转换完成")
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    doImg2Pdf(r'C:\Users\Administrator\Desktop\1\pics\\')