java 把一本pdf内容是扫描件的书转换成txt文本

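第一步:导入jar包(在 Maven 的 pom.xml 中添加以下依赖)

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.12</version>
        </dependency>
        <dependency>
            <groupId>com.baidu.aip</groupId>
            <artifactId>java-sdk</artifactId>
            <version>4.8.0</version>
        </dependency>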

第二步:提取pdf中的图片

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;

import javax.imageio.ImageIO;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

/**
 * Extracts the embedded images of a PDF and saves them as sequentially
 * numbered JPEG files ({@code 1.jpg}, {@code 2.jpg}, ...) in an output directory.
 *
 * <p>Images are visited page by page, starting with the first page of the
 * document. A failure on one image is reported and does not stop the run.
 */
public class Test2 {

    /**
     * Name of an image XObject that is never exported. The value is specific to
     * the PDF this tool was written for (presumably a watermark or stamp added
     * by the PDF producer -- confirm before reusing with another document).
     */
    private static final String SKIPPED_IMAGE_NAME = "QuickPDFIm848de7a9";

    /**
     * Entry point: reads the hard-coded PDF and writes its images to the
     * hard-coded output directory.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String file = "/Users/jin/Downloads/xxxxx.pdf";
        String path = "/Users/jin/Downloads/img/";

        // ImageIO does not create missing parent directories, so do it up front.
        File outputDir = new File(path);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            System.err.println("Cannot create output directory: " + path);
            return;
        }

        // try-with-resources closes both the stream and the document, even on failure.
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = PDDocument.load(fis)) {
            int pages = document.getNumberOfPages();
            int count = 1;
            // Page indexes are 0-based: start at 0 so the first page is not skipped.
            for (int j = 0; j < pages; j++) {
                PDPage page = document.getPage(j);
                PDResources resources = page.getResources();
                if (resources == null) {
                    continue;
                }
                Iterable<COSName> xobjects = resources.getXObjectNames();
                if (xobjects == null) {
                    continue;
                }
                for (COSName key : xobjects) {
                    if (!resources.isImageXObject(key) || SKIPPED_IMAGE_NAME.equals(key.getName())) {
                        continue;
                    }
                    File target = new File(path + count + ".jpg");
                    try {
                        PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                        if (writeJpeg(image.getImage(), target)) {
                            // Report the number of the file that was just written,
                            // and only consume a number when a file really exists.
                            System.out.println(count);
                            count++;
                        } else {
                            System.err.println("No JPEG writer accepted image " + key.getName()
                                    + " on page " + (j + 1));
                        }
                    } catch (Exception e) {
                        // Best effort: one undecodable image must not abort the
                        // whole extraction, but the failure must be visible.
                        System.err.println("Failed to extract image " + key.getName()
                                + " on page " + (j + 1) + ": " + e);
                    }
                }
            }
        } catch (Exception e) {
            System.err.println("Failed to process " + file + ": " + e);
        }
    }

    /**
     * Writes {@code image} to {@code target} as JPEG.
     *
     * <p>{@code ImageIO.write} returns {@code false} when no writer can encode
     * the given image (for example, an image type the JPEG writer does not
     * support). In that case the image is redrawn onto an opaque RGB canvas
     * with a white background and written again.
     *
     * @param image  decoded image to save
     * @param target destination file
     * @return {@code true} if a file was written, {@code false} if no writer
     *         accepted the image even after conversion
     * @throws java.io.IOException if writing the file fails
     */
    private static boolean writeJpeg(BufferedImage image, File target) throws java.io.IOException {
        if (ImageIO.write(image, "jpg", target)) {
            return true;
        }
        BufferedImage rgb = new BufferedImage(image.getWidth(), image.getHeight(),
                BufferedImage.TYPE_INT_RGB);
        java.awt.Graphics2D graphics = rgb.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, java.awt.Color.WHITE, null);
        } finally {
            graphics.dispose();
        }
        return ImageIO.write(rgb, "jpg", target);
    }
}

第三步:把图片转换成文本保存(这里使用的是百度文字识别)

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;

import org.json.JSONArray;
import org.json.JSONObject;

import com.baidu.aip.ocr.AipOcr;
import com.jin.demo.util.FileUtil;

/**
 * Sends numbered page images to the Baidu OCR service and appends every
 * recognized text line to one text file.
 */
public class Sample {
    // Baidu AIP credentials: APP_ID / API key / secret key (placeholders).
    public static final String APP_ID = "xxx";
    public static final String API_KEY = "xxx";
    public static final String SECRET_KEY = "xxx";

    /**
     * Entry point: runs OCR on {@code 4.jpg} through {@code 221.jpg} in the
     * hard-coded image directory and appends the text to the hard-coded output file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the OCR client.
        AipOcr client = new AipOcr(APP_ID, API_KEY, SECRET_KEY);

        // Optional: network timeouts.
        client.setConnectionTimeoutInMillis(2000);
        client.setSocketTimeoutInMillis(60000);

        // This property can also be supplied as a JVM startup argument instead.
        System.setProperty("aip.log4j.conf", "path/to/your/log4j.properties");

        String filePath = "/Users/jin/Desktop/book/xxx.txt";
        System.out.println("---begin---");
        // The range 4..221 is hard-coded (presumably the body pages of the
        // author's book -- adjust for another document).
        for (int i = 4; i < 222; i++) {
            System.out.println("---" + i + "---");
            String path = "/Users/jin/Downloads/img/" + i + ".jpg";
            JSONObject res = client.basicGeneral(path, new HashMap<String, String>());
            // A response without "words_result" cannot be read here (presumably
            // an error reply from the service). Report it and continue with the
            // next image instead of aborting the run with a JSONException.
            JSONArray jsonArray = res.optJSONArray("words_result");
            if (jsonArray == null) {
                System.err.println("OCR returned no words_result for " + path + ": " + res);
                continue;
            }
            for (int j = 0; j < jsonArray.length(); j++) {
                JSONObject jsonObject = jsonArray.getJSONObject(j);
                String content = jsonObject.getString("words");
                addContent(filePath, content);
            }
        }
        System.out.println("---over---");
    }

    /**
     * Appends {@code content} followed by a line separator to the file at
     * {@code path}, creating the file if it does not exist. Text is written as
     * UTF-8 so the result does not depend on the platform default charset.
     *
     * <p>I/O failures are reported on standard error; they are not rethrown.
     *
     * @param path    file to append to
     * @param content text line to append
     */
    public static void addContent(String path, String content) {
        // try-with-resources closes the writer on every path; the second
        // FileOutputStream argument selects append mode.
        try (PrintWriter pw = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(path, true), StandardCharsets.UTF_8))) {
            pw.println(content);
            // PrintWriter swallows write errors; checkError() flushes and reveals them.
            if (pw.checkError()) {
                System.err.println("Failed to write to " + path);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

你可能感兴趣的:(java 把一本pdf内容是扫描件的书转换成txt文本)