实现PDF转HTML有多种方式,
今天记录其中一种:先把PDF的每一页转成图片,再把图片拼接成HTML。
【原理分析】利用PDFBox解析PDF文档,按页拆分,将每一页渲染为一张图片,再把这些图片依次拼接到HTML中,即可实现PDF转HTML,并且能够保留原PDF的文档格式。
【maven依赖】
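<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.4</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.4</version>
</dependency>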
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
public class Pdf2HTML {
public static void PdfToImage(String pdfurl) {
StringBuffer buffer = new StringBuffer();
FileOutputStream fos;
PDDocument document;
File pdfFile;
int size;
BufferedImage image;
FileOutputStream out;
Long randStr = 0l;
//PDF转换成HTML保存的文件夹
String path = "E:\\Pdf2HTML";
File htmlsDir = new File(path);
if (!htmlsDir.exists()) {
htmlsDir.mkdirs();
}
File htmlDir = new File(path + "/");
if (!htmlDir.exists()) {
htmlDir.mkdirs();
}
try {
//遍历处理pdf附件
randStr = System.currentTimeMillis();
buffer.append("\r\n");
buffer.append("\r\n");
buffer.append("\r\n");
buffer.append("\r\n");
buffer.append("\r\n");
buffer.append("\r\n");
document = new PDDocument();
//pdf附件
pdfFile = new File(pdfurl);
document = PDDocument.load(pdfFile, (String) null);
size = document.getNumberOfPages();
Long start = System.currentTimeMillis(), end = null;
System.out.println("===>pdf : " + pdfFile.getName() + " , size : " + size);
PDFRenderer reader = new PDFRenderer(document);
for (int i = 0; i < size; i++) {
//image = newPDFRenderer(document).renderImageWithDPI(i,130,ImageType.RGB);
image = reader.renderImage(i, 1.5f);
//生成图片,保存位置
out = new FileOutputStream(path + "/" + "image" + "_" + i + ".jpg");
ImageIO.write(image, "png", out); //使用png的清晰度
//将图片路径追加到网页文件里
buffer.append("\r\n");
image = null;
out.flush();
out.close();
}
reader = null;
document.close();
buffer.append("\r\n");
buffer.append("