史上最简单的JAVA实现PDF转HTML

实现PDF转HTML有多种方式,今天记录一下以图片形式转化为HTML的做法。

【原理分析】利用pdfbox解析pdf文档,按页拆分,将每一页渲染为一张图片,再把这些图片依次拼接到HTML中,即可实现PDF转HTML,并且可以保留原pdf的文档版式。

【maven依赖】

        
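		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.4</version>
		</dependency>

		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox-tools</artifactId>
			<version>2.0.4</version>
		</dependency>

		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>fontbox</artifactId>
			<version>2.0.4</version>
		</dependency>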
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * Converts a PDF into a single HTML page by rendering every PDF page to a PNG
 * image and referencing those images, in page order, from the HTML file.
 *
 * <p>The images and the HTML file are written into the same folder, so the
 * HTML refers to the images by relative file name.
 */
public class Pdf2HTML {

    /** Folder that receives the page images and the HTML file when none is given. */
    private static final String DEFAULT_OUTPUT_DIR = "E:\\Pdf2HTML";

    /** Scale factor handed to the renderer (PDFBox documents 1 as 72 DPI). */
    private static final float RENDER_SCALE = 1.5f;

    /** Image format used for the rendered pages; also used as the file extension. */
    private static final String IMAGE_FORMAT = "png";

    /**
     * Converts the given PDF into images plus an HTML page inside the default
     * output folder.
     *
     * <p>Failures are not thrown: they are reported on standard output together
     * with a stack trace.
     *
     * @param pdfurl file-system path of the PDF to convert
     */
    public static void PdfToImage(String pdfurl) {
        PdfToImage(pdfurl, DEFAULT_OUTPUT_DIR);
    }

    /**
     * Converts the given PDF into images plus an HTML page inside
     * {@code outputDir}.
     *
     * <p>Each page {@code i} (zero-based) is written as {@code image_i.png}; the
     * HTML file is named after the current time in milliseconds. Failures are
     * not thrown: they are reported on standard output together with a stack
     * trace.
     *
     * <p>NOTE: image names do not include the timestamp, so converting a second
     * PDF into the same folder overwrites the images of the first one.
     *
     * @param pdfurl    file-system path of the PDF to convert
     * @param outputDir folder that receives the images and the HTML file; it is
     *                  created when it does not exist yet
     */
    public static void PdfToImage(String pdfurl, String outputDir) {
        try {
            File targetDir = new File(outputDir);
            if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
                throw new IOException("Cannot create output directory: " + targetDir);
            }

            long stamp = System.currentTimeMillis();
            File pdfFile = new File(pdfurl);

            StringBuilder html = new StringBuilder();
            html.append("<!DOCTYPE html>\r\n");
            html.append("<html>\r\n");
            html.append("<head>\r\n");
            html.append("<meta charset=\"UTF-8\">\r\n");
            html.append("<style>img { display: block; width: 100%; max-width: 100%; "
                    + "margin-top: 6px; background-color: #fff; }</style>\r\n");
            html.append("</head>\r\n");
            html.append("<body style=\"background-color: gray;\">\r\n");

            // try-with-resources closes the document even when rendering fails.
            try (PDDocument document = PDDocument.load(pdfFile, (String) null)) {
                int size = document.getNumberOfPages();
                long start = System.currentTimeMillis();
                System.out.println("===>pdf : " + pdfFile.getName() + " , size : " + size);

                PDFRenderer renderer = new PDFRenderer(document);
                for (int i = 0; i < size; i++) {
                    BufferedImage image = renderer.renderImage(i, RENDER_SCALE);

                    // The extension matches the encoded format so viewers are not misled.
                    String imageName = "image_" + i + "." + IMAGE_FORMAT;
                    File imageFile = new File(targetDir, imageName);
                    if (!ImageIO.write(image, IMAGE_FORMAT, imageFile)) {
                        throw new IOException("No image writer available for format: " + IMAGE_FORMAT);
                    }

                    // Relative reference: the HTML file lives next to the images.
                    html.append("<img src=\"").append(imageName)
                            .append("\" alt=\"page ").append(i + 1).append("\"/>\r\n");
                }

                long elapsedMillis = System.currentTimeMillis() - start;
                System.out.println("===> Reading pdf times: " + (elapsedMillis / 1000));
            }

            html.append("</body>\r\n");
            html.append("</html>");

            // Write the HTML file into the output folder, encoded as declared in <meta>.
            File htmlFile = new File(targetDir, stamp + ".html");
            System.out.println(htmlFile.getPath());
            try (FileOutputStream fos = new FileOutputStream(htmlFile)) {
                fos.write(html.toString().getBytes(StandardCharsets.UTF_8));
            }
        } catch (Exception e) {
            // Kept as a catch-all so existing callers never see an exception.
            System.out.println("===>Reader parse pdf to png error : " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: converts a sample PDF into the default output folder.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        String pdf = "E:\\Pdf2HTML\\java开发工程师.pdf";

        // Pass in the PDF location.
        PdfToImage(pdf);
    }
}

 

 

 

 

你可能感兴趣的:(java常用工具)