JAVA识别PDF和OFD电子发票并解析为java对象

上一篇我们介绍了用 Java 识别电子发票中发票税号等信息的几种可行方案,博主最后选择了识别文件二维码的方式。文章末尾也提到,这种方式有局限性:取到的信息有限,而且 OFD 格式还需要另想办法。接下来我们就说一下如何解决这个问题,以及如何识别 OFD 格式发票文件中的内容:

想看上一篇思路的请看博主的这篇文章:

java实现电子发票中的发票税号等信息识别的几种可用方案

https://blog.csdn.net/Alex_81D/article/details/128923743

看看这一篇发票识别的做法:

先看一下效果:

这是原图:

JAVA识别PDF和OFD电子发票并解析为java对象_第1张图片

这个是识别后的效果:

JAVA识别PDF和OFD电子发票并解析为java对象_第2张图片

完全一致。

不卖关子了,经过全网寻找,下面这个开源项目是比较靠谱的一个:

开源地址:https://github.com/sanluan/einvoice

电子发票识别,可识别 电子普票 电子专票 文件类型支持 pdf ofd

在线识别页面 http://www.heycore.com/invoice.html

如果需要打开ofd文件,再推荐个网站:

https://inv-veri.chinatax.gov.cn/xgxz.html

直接在国家税务总局网站下载,这样ofd格式就可以打开使用了。

JAVA识别PDF和OFD电子发票并解析为java对象_第3张图片

部分代码信息:

接口类:InvoiceController


import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.dom4j.DocumentException;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

import com.sanluan.einvoice.service.Invoice;
import com.sanluan.einvoice.service.OfdInvoiceExtractor;
import com.sanluan.einvoice.service.PdfInvoiceExtractor;

@RestController
@RequestMapping("/invoice")
public class InvoiceController {

    @Value("${backupPath}")
    private String backupPath;

    private static ThreadLocal> threadLocal = new ThreadLocal<>();
    private static final String FILE_NAME_FORMAT_STRING = "yyyy/MM-dd/HH-mm-ssSSSS";
    public static final RequestConfig defaultRequestConfig = RequestConfig.custom().setSocketTimeout(5000).setConnectTimeout(5000)
            .setConnectionRequestTimeout(5000).build();

    /**
     * @param pattern
     * @return date format
     */
    public static DateFormat getDateFormat(String pattern) {
        Map map = threadLocal.get();
        DateFormat format = null;
        if (null == map) {
            map = new HashMap<>();
            format = new SimpleDateFormat(pattern);
            map.put(pattern, format);
            threadLocal.set(map);
        } else {
            format = map.computeIfAbsent(pattern, k -> new SimpleDateFormat(k));
        }
        return format;
    }

    @RequestMapping(value = "/extrat")
    public Invoice extrat(@RequestParam(value = "file", required = false) MultipartFile file, String url) {
        String fileName = getDateFormat(FILE_NAME_FORMAT_STRING).format(new Date());
        File dest = null;
        boolean ofd = false;
        if (null != file && !file.isEmpty()) {
            if (file.getOriginalFilename().toLowerCase().endsWith(".ofd")) {
                ofd = true;
                dest = new File(backupPath, fileName + ".ofd");
            } else {
                dest = new File(backupPath, fileName + ".pdf");
            }
            dest.getParentFile().mkdirs();
            try {
                FileUtils.copyInputStreamToFile(file.getInputStream(), dest);
            } catch (IOException e) {
            }
        } else if (null != url) {
            if (url.toLowerCase().endsWith(".ofd")) {
                ofd = true;
                dest = new File(backupPath, fileName + ".ofd");
            } else {
                dest = new File(backupPath, fileName + ".pdf");
            }
            dest.getParentFile().mkdirs();
            try (CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(defaultRequestConfig).build();) {
                HttpUriRequest request = new HttpGet(url);
                try (CloseableHttpResponse response = httpclient.execute(request)) {
                    HttpEntity entity = response.getEntity();
                    if (null != entity) {
                        BufferedInputStream inputStream = new BufferedInputStream(entity.getContent());
                        FileUtils.copyInputStreamToFile(inputStream, dest);
                        EntityUtils.consume(entity);
                    }
                }
            } catch (Exception e) {
            }
        }
        Invoice result = null;
        try {
            if (null != dest) {
                if (ofd) {
                    result = OfdInvoiceExtractor.extract(dest);
                } else {
                    result = PdfInvoiceExtractor.extract(dest);
                }
                if (null != result.getAmount()) {
                    dest.delete();
                }
            } else {
                result = new Invoice();
                result.setTitle("error");
            }
        } catch (IOException | DocumentException e) {
            e.printStackTrace();
            result = new Invoice();
            result.setTitle("error");
        }
        return result;
    }
}

pdf解析类:

import java.awt.Rectangle;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

/**
 * 专用于处理电子发票识别的类
 *
 */

public class PdfInvoiceExtractor {

    public static Invoice extract(File file) throws IOException {
        Invoice invoice = new Invoice();
        PDDocument doc = PDDocument.load(file);
        PDPage firstPage = doc.getPage(0);
        int pageWidth = Math.round(firstPage.getCropBox().getWidth());
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setSortByPosition(true);
        String fullText = textStripper.getText(doc);
        if (firstPage.getRotation() != 0) {
            pageWidth = Math.round(firstPage.getCropBox().getHeight());
        }
        String allText = replace(fullText).replaceAll("(", "(").replaceAll(")", ")").replaceAll("¥", "¥");
        {
            String reg = "机器编号:(?\\d{12})|发票代码:(?\\d{12})|发票号码:(?\\d{8})|:(?\\d{4}年\\d{2}月\\d{2}日)"
                    + "|校验码:(?\\d{20}|\\S{4,})";
            Pattern pattern = Pattern.compile(reg);
            Matcher matcher = pattern.matcher(allText);
            while (matcher.find()) {
                if (matcher.group("machineNumber") != null) {
                    invoice.setMachineNumber(matcher.group("machineNumber"));
                } else if (matcher.group("code") != null) {
                    invoice.setCode(matcher.group("code"));
                } else if (matcher.group("number") != null) {
                    invoice.setNumber(matcher.group("number"));
                } else if (matcher.group("date") != null) {
                    invoice.setDate(matcher.group("date"));
                } else if (matcher.group("checksum") != null) {
                    invoice.setChecksum(matcher.group("checksum"));
                }
            }
        }
        {
            String reg = "合计¥?(?[^ \\f\\n\\r\\t\\v\\*]*)(?:¥?(?\\S*)|\\*+)\\s";
            Pattern pattern = Pattern.compile(reg);
            Matcher matcher = pattern.matcher(allText);
            if (matcher.find()) {
                try {
                    invoice.setAmount(new BigDecimal(matcher.group("amount")));
                } catch (Exception e) {
                }
                try {
                    invoice.setTaxAmount(new BigDecimal(matcher.group("taxAmount")));
                } catch (Exception e) {
                    invoice.setTaxAmount(new BigDecimal(0));
                }
            }
        }
        if (null == invoice.getAmount()) {
            String reg = "合\\u0020*计\\u0020*¥?(?[^ ]*)\\u0020+¥?(?:(?\\S*)|\\*+)\\s";
            Pattern pattern = Pattern.compile(reg);
            Matcher matcher = pattern.matcher(fullText);
            if (matcher.find()) {
                try {
                    invoice.setAmount(new BigDecimal(matcher.group("amount")));
                } catch (Exception e) {
                    invoice.setAmount(new BigDecimal(0));
                }
                try {
                    invoice.setTaxAmount(new BigDecimal(matcher.group("taxAmount")));
                } catch (Exception e) {
                    invoice.setTaxAmount(new BigDecimal(0));
                }
            }
        }
        {
            String reg = "价税合计\\u0028大写\\u0029(?\\S*)\\u0028小写\\u0029¥?(?\\S*)\\s";
            Pattern pattern = Pattern.compile(reg);
            Matcher matcher = pattern.matcher(allText);
            if (matcher.find()) {
                invoice.setTotalAmountString(matcher.group("amountString"));
                try {
                    invoice.setTotalAmount(new BigDecimal(matcher.group("amount")));
                } catch (Exception e) {
                    invoice.setTotalAmount(new BigDecimal(0));
                }
            }
        }
        {
            String reg = "收款人:(?\\S*)复核:(?\\S*)开票人:(?\\S*)销售方";
            Pattern pattern = Pattern.compile(reg);
            Matcher matcher = pattern.matcher(allText);
            if (matcher.find()) {
                invoice.setPayee(matcher.group("payee"));
                invoice.setReviewer(matcher.group("reviewer"));
                invoice.setDrawer(matcher.group("drawer"));
            }
            if (allText.indexOf("通行费") > 0 && allText.indexOf("车牌号") > 0) {
                invoice.setType("通行费");
            }
            Pattern type00Pattern = Pattern.compile("(?

\\S*)通发票"); Matcher m00 = type00Pattern.matcher(allText); if (m00.find()) { invoice.setTitle(m00.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "通发票"); if (null == invoice.getType()) { invoice.setType("普通发票"); } } else { Pattern type01Pattern = Pattern.compile("(?

\\S*)用发票"); Matcher m01 = type01Pattern.matcher(allText); if (m01.find()) { invoice.setTitle(m01.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "用发票"); if (null == invoice.getType()) { invoice.setType("专用发票"); } } } } PDFKeyWordPosition kwp = new PDFKeyWordPosition(); Map> positionListMap = kwp .getCoordinate(Arrays.asList("机器编号", "税率", "价税合计", "合计", "开票日期", "规格型号", "车牌号", "开户行及账号", "密", "码", "区"), doc); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); PDFTextStripperByArea detailStripper = new PDFTextStripperByArea(); detailStripper.setSortByPosition(true); { Position machineNumber; if (positionListMap.get("机器编号").size() > 0) { machineNumber = positionListMap.get("机器编号").get(0); } else { machineNumber = positionListMap.get("开票日期").get(0); machineNumber.setY(machineNumber.getY() + 30); } Position taxRate = positionListMap.get("税率").get(0); Position totalAmount = positionListMap.get("价税合计").get(0); Position amount = positionListMap.get("合计").get(0); Position model = null; if (!positionListMap.get("规格型号").isEmpty()) { model = positionListMap.get("规格型号").get(0); } else { model = positionListMap.get("车牌号").get(0); model.setX(model.getX() - 15); } List account = positionListMap.get("开户行及账号"); Position buyer; Position seller; if (account.size() < 2) { buyer = new Position(51, 122); seller = new Position(51, 341); } else { buyer = account.get(0); seller = account.get(1); } int maqX = 370; List mi = positionListMap.get("密"); List ma = positionListMap.get("码"); List qu = positionListMap.get("区"); for (int i = 0; i < mi.size(); i++) { float x1 = mi.get(i).getX(); for (int j = 0; j < ma.size(); j++) { float x2 = ma.get(j).getX(); if (Math.abs(x1 - x2) < 5) { for (int k = 0; k < qu.size(); k++) { float x3 = qu.get(k).getX(); if (Math.abs(x2 - x3) < 5) { maqX = Math.round((x1 + x2 + x3) / 3); } } } } } { int x = Math.round(model.getX()) - 13; int y = Math.round(taxRate.getY()) + 5; // 用税率的y坐标作参考 int h = Math.round(amount.getY()) 
- Math.round(taxRate.getY()) - 25; // 价税合计的y坐标减去税率的y坐标 detailStripper.addRegion("detail", new Rectangle(0, y, pageWidth, h)); stripper.addRegion("detailName", new Rectangle(0, y, x, h)); stripper.addRegion("detailPrice", new Rectangle(x, y, pageWidth, h)); } { int x = maqX + 10; int y = Math.round(machineNumber.getY()) + 10; int w = pageWidth - maqX - 10; int h = Math.round(taxRate.getY() - 5) - y; stripper.addRegion("password", new Rectangle(x, y, w, h)); } { int x = Math.round(buyer.getX()) - 15; // 开户行及账号的x为参考 int y = Math.round(machineNumber.getY()) + 10; // 机器编号的y坐标为参考 int w = maqX - x - 5; // 密码区x坐标为参考 int h = Math.round(buyer.getY()) - y + 20; // 开户行及账号的y坐标为参考 stripper.addRegion("buyer", new Rectangle(x, y, w, h)); } { int x = Math.round(seller.getX()) - 15; // 开户行及账号为x参考 int y = Math.round(totalAmount.getY()) + 10; // 价税合计的y坐标为参考 int w = maqX - x - 5; // 密码区的x为参考 int h = Math.round(seller.getY()) - y + 20; // 开户行及账号的y为参考 stripper.addRegion("seller", new Rectangle(x, y, w, h)); } } stripper.extractRegions(firstPage); detailStripper.extractRegions(firstPage); doc.close(); invoice.setPassword(StringUtils.trim(stripper.getTextForRegion("password"))); String reg = "名称:(?\\S*)|纳税人识别号:(?\\S*)|地址、电话:(?

\\S*)|开户行及账号:(?\\S*)|电子支付标识:(?\\S*)"; { String buyer = replace(stripper.getTextForRegion("buyer")); Pattern pattern = Pattern.compile(reg); Matcher matcher = pattern.matcher(buyer); while (matcher.find()) { if (matcher.group("name") != null) { invoice.setBuyerName(matcher.group("name")); } else if (matcher.group("code") != null) { invoice.setBuyerCode(matcher.group("code")); } else if (matcher.group("address") != null) { invoice.setBuyerAddress(matcher.group("address")); } else if (matcher.group("account") != null) { invoice.setBuyerAccount(matcher.group("account")); } else if (matcher.group("account2") != null) { invoice.setBuyerAccount(matcher.group("account2")); } } } { String seller = replace(stripper.getTextForRegion("seller")); Pattern pattern = Pattern.compile(reg); Matcher matcher = pattern.matcher(seller); while (matcher.find()) { if (matcher.group("name") != null) { invoice.setSellerName(matcher.group("name")); } else if (matcher.group("code") != null) { invoice.setSellerCode(matcher.group("code")); } else if (matcher.group("address") != null) { invoice.setSellerAddress(matcher.group("address")); } else if (matcher.group("account") != null) { invoice.setSellerAccount(matcher.group("account")); } } } { List skipList = new ArrayList<>(); List detailList = new ArrayList<>(); String[] detailPriceStringArray = stripper.getTextForRegion("detailPrice").replaceAll(" ", " ").replaceAll(" ", " ") .replaceAll("\r", "").split("\\n"); for (String detailString : detailPriceStringArray) { Detail detail = new Detail(); detail.setName(""); String[] itemArray = StringUtils.split(detailString, " "); if (2 == itemArray.length) { detail.setAmount(new BigDecimal(itemArray[0])); detail.setTaxAmount(new BigDecimal(itemArray[1])); detailList.add(detail); } else if (2 < itemArray.length) { detail.setAmount(new BigDecimal(itemArray[itemArray.length - 3])); String taxRate = itemArray[itemArray.length - 2]; if (taxRate.indexOf("免税") > 0 || taxRate.indexOf("不征税") > 0 || 
taxRate.indexOf("出口零税率") > 0 || taxRate.indexOf("普通零税率") > 0 || taxRate.indexOf("%") < 0) { detail.setTaxRate(new BigDecimal(0)); detail.setTaxAmount(new BigDecimal(0)); } else { BigDecimal rate = new BigDecimal(Integer.parseInt(taxRate.replaceAll("%", ""))); detail.setTaxRate(rate.divide(new BigDecimal(100))); detail.setTaxAmount(new BigDecimal(itemArray[itemArray.length - 1])); } for (int j = 0; j < itemArray.length - 3; j++) { if (itemArray[j].matches("^(-?\\d+)(\\.\\d+)?$")) { if (null == detail.getCount()) { detail.setCount(new BigDecimal(itemArray[j])); } else { detail.setPrice(new BigDecimal(itemArray[j])); } } else { if (itemArray.length >= j + 1 && !itemArray[j + 1].matches("^(-?\\d+)(\\.\\d+)?$")) { detail.setUnit(itemArray[j + 1]); detail.setModel(itemArray[j]); j++; } else if (itemArray[j].length() > 2) { detail.setModel(itemArray[j]); } else { detail.setUnit(itemArray[j]); } } } detailList.add(detail); } else { skipList.add(detailString); } } String[] detailNameStringArray = stripper.getTextForRegion("detailName").replaceAll(" ", " ").replaceAll(" ", " ") .replaceAll("\r", "").split("\\n"); String[] detailStringArray = replace(detailStripper.getTextForRegion("detail")).replaceAll("\r", "").split("\\n"); int i = 0, j = 0, h = 0, m = 0; Detail lastDetail = null; for (String detailString : detailStringArray) { if (m < detailNameStringArray.length) { if (detailString.matches("\\S+\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*") && !detailString.matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*") && detailString.matches("\\S+\\d+%[\\-\\d]+\\S*") || detailStringArray.length > i + 1 && detailStringArray[i + 1].matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")) { if (j < detailList.size()) { lastDetail = detailList.get(j); lastDetail.setName(detailNameStringArray[m]); } j++; } else if (null != lastDetail && StringUtils.isNotBlank(detailNameStringArray[m])) { if (skipList.size() > h) { String skip = skipList.get(h); if (detailString.endsWith(skip)) { if (detailString.equals(skip)) { m--; 
} else { lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]); } lastDetail.setModel(lastDetail.getModel() + skip); h++; } else { lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]); } } else { lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]); } } } i++; m++; } invoice.setDetailList(detailList); } return invoice; } public static String replace(String str) { return str.replaceAll(" ", "").replaceAll(" ", "").replaceAll(":", ":").replaceAll(" ", ""); }

这块里面的内容请各位老铁自行封装。

OfdInvoiceExtractor类

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.springframework.util.StreamUtils;

/**
 * Extracts the fields of an electronic invoice from an OFD file.
 *
 * <p>An OFD file is read as a zip archive. The structured values come from
 * {@code Doc_0/Attachs/original_invoice.xml}; the upper-case total and the title are cut out of the
 * page text in {@code Doc_0/Pages/Page_0/Content.xml}.
 *
 * <p>NOTE(review): the XML comes from an uploaded or downloaded file; make sure the dom4j version
 * in use does not resolve external entities when parsing text.
 */
public class OfdInvoiceExtractor {

    private static final Charset UTF_8 = Charset.forName("utf-8");
    private static final BigDecimal HUNDRED = new BigDecimal(100);
    /**
     * Closing tag of a text run in the page content. The published listing showed an empty search
     * string at this place (the tag had evidently been stripped) — TODO confirm against a real file.
     */
    private static final String TEXT_CODE_END = "</ofd:TextCode>";

    /**
     * Parses the given OFD invoice.
     *
     * @param file
     *            the OFD file to read
     * @return the recognised invoice; values missing from the file stay {@code null}, except tax
     *         amounts and tax rates, which default to zero
     * @throws IOException
     *             if the archive cannot be read or lacks one of the two required entries
     * @throws DocumentException
     *             if the invoice XML is not well-formed
     */
    public static Invoice extract(File file) throws IOException, DocumentException {
        String body;
        String content;
        // try-with-resources closes the archive even when an entry is missing or unreadable.
        try (ZipFile zipFile = new ZipFile(file)) {
            body = readEntry(zipFile, "Doc_0/Attachs/original_invoice.xml");
            content = readEntry(zipFile, "Doc_0/Pages/Page_0/Content.xml");
        }
        Document document = DocumentHelper.parseText(body);
        Element root = document.getRootElement();
        Invoice invoice = new Invoice();
        invoice.setMachineNumber(root.elementTextTrim("MachineNo"));
        invoice.setCode(root.elementTextTrim("InvoiceCode"));
        invoice.setNumber(root.elementTextTrim("InvoiceNo"));
        invoice.setDate(root.elementTextTrim("IssueDate"));
        invoice.setChecksum(root.elementTextTrim("InvoiceCheckCode"));
        invoice.setAmount(toDecimal(root.elementTextTrim("TaxExclusiveTotalAmount"), null));
        // Tax-exempt invoices carry no tax nodes; treat the tax as zero.
        invoice.setTaxAmount(toDecimal(root.elementTextTrim("TaxTotalAmount"), BigDecimal.ZERO));
        int ind = content.indexOf("圆整");
        if (ind >= 0) {
            // Text from the end of the enclosing opening tag up to and including "圆整".
            invoice.setTotalAmountString(content.substring(content.lastIndexOf(">", ind) + 1, ind + 2));
        }
        invoice.setTotalAmount(toDecimal(root.elementTextTrim("TaxInclusiveTotalAmount"), null));
        invoice.setPayee(root.elementTextTrim("Payee"));
        invoice.setReviewer(root.elementTextTrim("Checker"));
        invoice.setDrawer(root.elementTextTrim("InvoiceClerk"));
        // The title is taken to be the first text run of the page.
        int index = content.indexOf(TEXT_CODE_END);
        invoice.setTitle(index < 0 ? "" : content.substring(content.lastIndexOf(">", index) + 1, index));
        invoice.setType("普通发票");
        if (invoice.getTitle().contains("专用发票")) {
            invoice.setType("专用发票");
        } else if (invoice.getTitle().contains("通行费")) {
            invoice.setType("通行费");
        }
        invoice.setPassword(root.elementText("TaxControlCode"));
        Element buyer = root.element("Buyer");
        if (null != buyer) {
            invoice.setBuyerName(buyer.elementTextTrim("BuyerName"));
            invoice.setBuyerCode(buyer.elementTextTrim("BuyerTaxID"));
            invoice.setBuyerAddress(buyer.elementTextTrim("BuyerAddrTel"));
            invoice.setBuyerAccount(buyer.elementTextTrim("BuyerFinancialAccount"));
        }
        Element seller = root.element("Seller");
        if (null != seller) {
            invoice.setSellerName(seller.elementTextTrim("SellerName"));
            invoice.setSellerCode(seller.elementTextTrim("SellerTaxID"));
            invoice.setSellerAddress(seller.elementTextTrim("SellerAddrTel"));
            invoice.setSellerAccount(seller.elementTextTrim("SellerFinancialAccount"));
        }
        List<Detail> detailList = new ArrayList<>();
        Element details = root.element("GoodsInfos");
        if (null != details) {
            List<Element> elements = details.elements();
            for (Element element : elements) {
                Detail detail = new Detail();
                detail.setName(element.elementTextTrim("Item"));
                detail.setAmount(toDecimal(element.elementTextTrim("Amount"), null));
                detail.setTaxAmount(toDecimal(element.elementTextTrim("TaxAmount"), BigDecimal.ZERO));
                detail.setCount(toDecimal(element.elementTextTrim("Quantity"), null));
                detail.setPrice(toDecimal(element.elementTextTrim("Price"), null));
                detail.setUnit(element.elementTextTrim("MeasurementDimension"));
                detail.setModel(element.elementTextTrim("Specification"));
                detail.setTaxRate(toTaxRate(element.elementTextTrim("TaxScheme")));
                detailList.add(detail);
            }
        }
        invoice.setDetailList(detailList);
        return invoice;
    }

    /** Reads one archive entry as UTF-8 text; a missing entry is reported as an {@link IOException}. */
    private static String readEntry(ZipFile zipFile, String name) throws IOException {
        ZipEntry entry = zipFile.getEntry(name);
        if (null == entry) {
            throw new IOException("OFD entry not found: " + name);
        }
        try (InputStream input = zipFile.getInputStream(entry)) {
            return StreamUtils.copyToString(input, UTF_8);
        }
    }

    /** Parses a decimal number, returning {@code fallback} for missing, empty or malformed text. */
    private static BigDecimal toDecimal(String text, BigDecimal fallback) {
        if (null == text || text.isEmpty()) {
            return fallback;
        }
        try {
            return new BigDecimal(text);
        } catch (NumberFormatException ignored) {
            // Best effort: a malformed number must not abort the whole invoice.
            return fallback;
        }
    }

    /** Converts a percentage such as {@code 13%} into {@code 0.13}; anything non-numeric yields zero. */
    private static BigDecimal toTaxRate(String scheme) {
        if (null == scheme) {
            return BigDecimal.ZERO;
        }
        BigDecimal percent = toDecimal(scheme.replace("%", "").trim(), null);
        return null == percent ? BigDecimal.ZERO : percent.divide(HUNDRED);
    }
}

请注意空指针:element.elementTextTrim("TaxAmount") 在节点不存在时会返回 null,直接传给 new BigDecimal(...) 就会抛出 NullPointerException。

并不是每张发票都有这个节点,比如下面这张免税发票就没有,所以使用前请先判空,或按自己的需要修改。

JAVA识别PDF和OFD电子发票并解析为java对象_第4张图片

VO对象:

import java.math.BigDecimal;
import java.util.List;

/**
 * Mutable value object holding everything recognised on one electronic invoice.
 *
 * <p>All fields start out as {@code null} and are filled in through the setters by the extractors.
 */
public class Invoice {
    // Header.
    private String title;
    private String machineNumber;
    private String code;
    private String number;
    private String date;
    private String checksum;
    // Buyer.
    private String buyerName;
    private String buyerCode;
    private String buyerAddress;
    private String buyerAccount;
    // Password area and totals.
    private String password;
    private BigDecimal amount;
    private BigDecimal taxAmount;
    private String totalAmountString;
    private BigDecimal totalAmount;
    // Seller.
    private String sellerName;
    private String sellerCode;
    private String sellerAddress;
    private String sellerAccount;
    // Signers, type and line items.
    private String payee;
    private String reviewer;
    private String drawer;
    private String type;
    private List detailList;

    /** @return the invoice title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the invoice title to set */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the machine number */
    public String getMachineNumber() {
        return this.machineNumber;
    }

    /** @param machineNumber the machine number to set */
    public void setMachineNumber(String machineNumber) {
        this.machineNumber = machineNumber;
    }

    /** @return the invoice code */
    public String getCode() {
        return this.code;
    }

    /** @param code the invoice code to set */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the invoice number */
    public String getNumber() {
        return this.number;
    }

    /** @param number the invoice number to set */
    public void setNumber(String number) {
        this.number = number;
    }

    /** @return the issue date as text */
    public String getDate() {
        return this.date;
    }

    /** @param date the issue date to set, as text */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the check code */
    public String getChecksum() {
        return this.checksum;
    }

    /** @param checksum the check code to set */
    public void setChecksum(String checksum) {
        this.checksum = checksum;
    }

    /** @return the buyer's name */
    public String getBuyerName() {
        return this.buyerName;
    }

    /** @param buyerName the buyer's name to set */
    public void setBuyerName(String buyerName) {
        this.buyerName = buyerName;
    }

    /** @return the buyer's taxpayer id */
    public String getBuyerCode() {
        return this.buyerCode;
    }

    /** @param buyerCode the buyer's taxpayer id to set */
    public void setBuyerCode(String buyerCode) {
        this.buyerCode = buyerCode;
    }

    /** @return the buyer's address and phone */
    public String getBuyerAddress() {
        return this.buyerAddress;
    }

    /** @param buyerAddress the buyer's address and phone to set */
    public void setBuyerAddress(String buyerAddress) {
        this.buyerAddress = buyerAddress;
    }

    /** @return the buyer's bank and account */
    public String getBuyerAccount() {
        return this.buyerAccount;
    }

    /** @param buyerAccount the buyer's bank and account to set */
    public void setBuyerAccount(String buyerAccount) {
        this.buyerAccount = buyerAccount;
    }

    /** @return the text of the password area */
    public String getPassword() {
        return this.password;
    }

    /** @param password the text of the password area to set */
    public void setPassword(String password) {
        this.password = password;
    }

    /** @return the total amount excluding tax */
    public BigDecimal getAmount() {
        return this.amount;
    }

    /** @param amount the total amount excluding tax to set */
    public void setAmount(BigDecimal amount) {
        this.amount = amount;
    }

    /** @return the total tax amount */
    public BigDecimal getTaxAmount() {
        return this.taxAmount;
    }

    /** @param taxAmount the total tax amount to set */
    public void setTaxAmount(BigDecimal taxAmount) {
        this.taxAmount = taxAmount;
    }

    /** @return the total including tax, written out in words */
    public String getTotalAmountString() {
        return this.totalAmountString;
    }

    /** @param totalAmountString the total including tax, written out in words, to set */
    public void setTotalAmountString(String totalAmountString) {
        this.totalAmountString = totalAmountString;
    }

    /** @return the total amount including tax */
    public BigDecimal getTotalAmount() {
        return this.totalAmount;
    }

    /** @param totalAmount the total amount including tax to set */
    public void setTotalAmount(BigDecimal totalAmount) {
        this.totalAmount = totalAmount;
    }

    /** @return the seller's name */
    public String getSellerName() {
        return this.sellerName;
    }

    /** @param sellerName the seller's name to set */
    public void setSellerName(String sellerName) {
        this.sellerName = sellerName;
    }

    /** @return the seller's taxpayer id */
    public String getSellerCode() {
        return this.sellerCode;
    }

    /** @param sellerCode the seller's taxpayer id to set */
    public void setSellerCode(String sellerCode) {
        this.sellerCode = sellerCode;
    }

    /** @return the seller's address and phone */
    public String getSellerAddress() {
        return this.sellerAddress;
    }

    /** @param sellerAddress the seller's address and phone to set */
    public void setSellerAddress(String sellerAddress) {
        this.sellerAddress = sellerAddress;
    }

    /** @return the seller's bank and account */
    public String getSellerAccount() {
        return this.sellerAccount;
    }

    /** @param sellerAccount the seller's bank and account to set */
    public void setSellerAccount(String sellerAccount) {
        this.sellerAccount = sellerAccount;
    }

    /** @return the payee */
    public String getPayee() {
        return this.payee;
    }

    /** @param payee the payee to set */
    public void setPayee(String payee) {
        this.payee = payee;
    }

    /** @return the reviewer */
    public String getReviewer() {
        return this.reviewer;
    }

    /** @param reviewer the reviewer to set */
    public void setReviewer(String reviewer) {
        this.reviewer = reviewer;
    }

    /** @return the person who issued the invoice */
    public String getDrawer() {
        return this.drawer;
    }

    /** @param drawer the person who issued the invoice, to set */
    public void setDrawer(String drawer) {
        this.drawer = drawer;
    }

    /** @return the invoice type */
    public String getType() {
        return this.type;
    }

    /** @param type the invoice type to set */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the line items; the extractors fill this list with {@code Detail} objects */
    public List getDetailList() {
        return this.detailList;
    }

    /** @param detailList the line items to set */
    public void setDetailList(List detailList) {
        this.detailList = detailList;
    }

    /** Lists every field as {@code name=value}, in declaration order, inside {@code Invoice [...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Invoice [");
        text.append("title=").append(title);
        text.append(", machineNumber=").append(machineNumber);
        text.append(", code=").append(code);
        text.append(", number=").append(number);
        text.append(", date=").append(date);
        text.append(", checksum=").append(checksum);
        text.append(", buyerName=").append(buyerName);
        text.append(", buyerCode=").append(buyerCode);
        text.append(", buyerAddress=").append(buyerAddress);
        text.append(", buyerAccount=").append(buyerAccount);
        text.append(", password=").append(password);
        text.append(", amount=").append(amount);
        text.append(", taxAmount=").append(taxAmount);
        text.append(", totalAmountString=").append(totalAmountString);
        text.append(", totalAmount=").append(totalAmount);
        text.append(", sellerName=").append(sellerName);
        text.append(", sellerCode=").append(sellerCode);
        text.append(", sellerAddress=").append(sellerAddress);
        text.append(", sellerAccount=").append(sellerAccount);
        text.append(", payee=").append(payee);
        text.append(", reviewer=").append(reviewer);
        text.append(", drawer=").append(drawer);
        text.append(", type=").append(type);
        text.append(", detailList=").append(detailList);
        return text.append("]").toString();
    }
}

/**
 * Mutable bean describing one detail entry: a name, model and unit plus the
 * numeric count, price, amount, tax rate and tax amount.
 * <p>
 * All properties start out as {@code null} and are stored exactly as supplied.
 */
class Detail {
    private String name;
    private String model;
    private String unit;
    private BigDecimal count;
    private BigDecimal price;
    private BigDecimal amount;
    private BigDecimal taxRate;
    private BigDecimal taxAmount;

    /**
     * @return the name of this entry
     */
    public String getName() {
        return this.name;
    }

    /**
     * @param name
     *            the name to store
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the model of this entry
     */
    public String getModel() {
        return this.model;
    }

    /**
     * @param model
     *            the model to store
     */
    public void setModel(String model) {
        this.model = model;
    }

    /**
     * @return the unit of this entry
     */
    public String getUnit() {
        return this.unit;
    }

    /**
     * @param unit
     *            the unit to store
     */
    public void setUnit(String unit) {
        this.unit = unit;
    }

    /**
     * @return the count of this entry
     */
    public BigDecimal getCount() {
        return this.count;
    }

    /**
     * @param count
     *            the count to store
     */
    public void setCount(BigDecimal count) {
        this.count = count;
    }

    /**
     * @return the price of this entry
     */
    public BigDecimal getPrice() {
        return this.price;
    }

    /**
     * @param price
     *            the price to store
     */
    public void setPrice(BigDecimal price) {
        this.price = price;
    }

    /**
     * @return the amount of this entry
     */
    public BigDecimal getAmount() {
        return this.amount;
    }

    /**
     * @param amount
     *            the amount to store
     */
    public void setAmount(BigDecimal amount) {
        this.amount = amount;
    }

    /**
     * @return the tax rate of this entry
     */
    public BigDecimal getTaxRate() {
        return this.taxRate;
    }

    /**
     * @param taxRate
     *            the tax rate to store
     */
    public void setTaxRate(BigDecimal taxRate) {
        this.taxRate = taxRate;
    }

    /**
     * @return the tax amount of this entry
     */
    public BigDecimal getTaxAmount() {
        return this.taxAmount;
    }

    /**
     * @param taxAmount
     *            the tax amount to store
     */
    public void setTaxAmount(BigDecimal taxAmount) {
        this.taxAmount = taxAmount;
    }

    /**
     * Renders all properties on one line in the form
     * {@code Detail [name=..., model=..., ...]}; unset properties print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Detail [");
        text.append("name=").append(name);
        text.append(", model=").append(model);
        text.append(", unit=").append(unit);
        text.append(", count=").append(count);
        text.append(", price=").append(price);
        text.append(", amount=").append(amount);
        text.append(", taxRate=").append(taxRate);
        text.append(", taxAmount=").append(taxAmount);
        return text.append("]").toString();
    }
}
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

public class PDFKeyWordPosition extends PDFTextStripper {

    private List keywordList;
    private Map> positionListMap;

    public PDFKeyWordPosition() throws IOException {
        super();
    }

    // 获取坐标信息
    public Map> getCoordinate(List keywordList, PDDocument document) throws IOException {
        super.setSortByPosition(true);
        this.keywordList = keywordList;
        this.positionListMap = new HashMap<>();
        super.setStartPage(1);
        super.setEndPage(1);
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        super.writeText(document, dummy);
        return positionListMap;
    }

    @Override
    protected void writeString(String string, List textPositions) throws IOException {
        for (String keyword : keywordList) {
            Integer foundIndex = 0;
            List positionList = positionListMap.computeIfAbsent(keyword, k -> new ArrayList<>());
            for (int i = 0; i < textPositions.size(); i++) {
                TextPosition textPosition = textPositions.get(i);
                String str = textPosition.getUnicode();
                if (0 < str.length() && str.charAt(0) == keyword.charAt(foundIndex)) {
                    foundIndex++;
                    int count = foundIndex;
                    for (int j = foundIndex; j < keyword.length(); j++) {
                        if (i + j >= textPositions.size()) {
                            break;
                        } else {
                            String s = textPositions.get(i + j).getUnicode();
                            if (0 < s.length() && s.charAt(0) == keyword.charAt(j)) {
                                count++;
                            }
                        }
                    }
                    if (count == keyword.length()) {
                        foundIndex = 0;
                        Position position = new Position();
                        position.setX(textPosition.getX());
                        position.setY(textPosition.getY());
                        positionList.add(position);
                        positionListMap.put(keyword, positionList);
                    }
                }
            }
        }
    }

}

/**
 * Mutable pair of {@code float} coordinates.
 */
class Position {
    float x;
    float y;

    /**
     * Creates a position with both coordinates at their default value of {@code 0.0f}.
     */
    public Position() {
    }

    /**
     * Creates a position with the given coordinates.
     *
     * @param x
     *            the x coordinate
     * @param y
     *            the y coordinate
     */
    public Position(float x, float y) {
        this.x = x;
        this.y = y;
    }

    /**
     * @return the x coordinate
     */
    public float getX() {
        return this.x;
    }

    /**
     * @param x
     *            the x coordinate to store
     */
    public void setX(float x) {
        this.x = x;
    }

    /**
     * @return the y coordinate
     */
    public float getY() {
        return this.y;
    }

    /**
     * @param y
     *            the y coordinate to store
     */
    public void setY(float y) {
        this.y = y;
    }

    /**
     * Renders the coordinates in the form {@code Position [x=..., y=...]}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Position [x=");
        text.append(x).append(", y=").append(y).append("]");
        return text.toString();
    }
}

配置文件:

基本上就是这样了,有兴趣的同学可以参考上面的开源项目自行使用。

你可能感兴趣的:(大数据基础,后端,java基础,java)