增值税电子发票识别-OCR

设置草图

增值税电子发票识别-OCR_第1张图片

 

增值税电子发票识别-OCR_第2张图片

回复"发票"即可获取源代码。

源码:

发票类型类:

/**
 * Invoice categories recognised by the scanner.
 *
 * <p>Each constant carries a human-readable display name (the title text
 * searched for on the invoice) and a short type code that is stored in
 * {@code InvoiceInfo.invoiceType}.
 */
public enum InvoiceType {
    PLAIN_INVOICE("增值税电子普通发票", "0");

    // Enum constants are shared singletons, so their state must be immutable.
    private final String name;
    private final String type;

    InvoiceType(String name, String type) {
        this.name = name;
        this.type = type;
    }

    /**
     * @return the display name of this invoice type
     */
    public String getName() {
        return name;
    }

    /**
     * @return the short code of this invoice type
     */
    public String getType() {
        return type;
    }

    /**
     * Looks up a constant by its display name.
     *
     * @param name display name to search for; may be {@code null}
     * @return the first constant whose display name equals {@code name},
     *         or {@code null} when there is no match
     */
    public static InvoiceType getByName(String name) {
        for (InvoiceType candidate : values()) {
            if (candidate.name.equals(name)) {
                return candidate;
            }
        }
        return null;
    }

    @Override
    public String toString() {
        return "InvoiceType{"
                + "name='" + name + '\''
                + ", type='" + type + '\''
                + '}';
    }
}

发票信息类:

/**
 * Plain data holder for the fields read from one invoice.
 *
 * <p>All values are kept as raw strings. Accessors are not written by hand;
 * the Lombok {@code @Data} annotation is relied on to provide the
 * getters/setters that {@code ScanPdfInvoiceUtils} calls.
 */
@Data
public class InvoiceInfo {
    // Purchaser (buyer) information
    private String purchaserName;
    private String purchaserTaxNo;
    private String purchaserAddr;
    private String purchaserTel;
    // Purchaser address and telephone as a single combined string
    private String purchaserAddrAndTel;

    private String purchaserBank;
    private String purchaserBankNo;
    // Purchaser bank and account number as a single combined string
    private String purchaserBankAndNo;

    // Seller information
    private String sellerName;
    private String sellerTaxNo;
    private String sellerAddr;
    private String sellerTel;
    // Seller address and telephone as a single combined string
    private String sellerAddrAndTel;

    private String sellerBank;
    private String sellerBankNo;
    // Seller bank and account number as a single combined string
    private String sellerBankAndNo;

    private String invoiceNo;
    private String invoiceCode;
    // ScanPdfInvoiceUtils stores the InvoiceType type code here (e.g. "0")
    private String invoiceType;
    // Invoice issue date; format: yyyy-MM-dd
    // NOTE(review): the scanner in this file only strips non-Latin-1 characters
    // from the extracted date line — confirm the stored format really matches.
    private String kprq;
    // Cipher (password) area of the invoice
    private String secretArea;

    // Check code
    private String checkCode;

    // Amount including tax
    private String hsje;
    // Amount excluding tax
    private String bhsje;
    // Tax amount
    private String se;
    // Remarks
    private String remark;

    // Payee
    private String skr;
    // Reviewer
    private String fhr;
    // Issuer (person who drew up the invoice)
    private String kpr;
}

PDF 扫描工具类:

/**
 * Reads VAT electronic invoice PDFs with PDFBox and fills an {@link InvoiceInfo}.
 *
 * <p>Two techniques are combined:
 * <ul>
 *   <li>text extraction from fixed rectangles of the first page
 *       ({@link #ocrInoivceArea(String)}), and</li>
 *   <li>OCR of fixed pixel rectangles of the first page rendered at 300 DPI
 *       ({@link #saveAsPng(String, InvoiceInfo)}).</li>
 * </ul>
 * All rectangles are hard-coded, so the class only works for PDFs that share
 * the page layout these coordinates were measured on.
 */
public class ScanPdfInvoiceUtils {

    /** Label/value separator as typed on an ASCII keyboard. */
    private static final char ASCII_COLON = ':';

    /** Full-width colon (U+FF1A), the usual label/value separator in Chinese text. */
    private static final char FULL_WIDTH_COLON = '\uFF1A';

    /** Resolution used when rendering the page for OCR; the pixel rectangles depend on it. */
    private static final float RENDER_DPI = 300f;

    /** Sample file used by {@link #main(String[])} when no path is given. */
    private static final String DEFAULT_SAMPLE_PDF = "/Users/grant/Pictures/03300180011130339349.pdf";

    /**
     * Diagnostic dump of a PDF: prints the XObject names of every page and the
     * position-sorted text of the whole document to standard output.
     *
     * <p>No field of the returned object is populated by this method.
     *
     * @param filePath path of the PDF file
     * @return a new, empty {@link InvoiceInfo}
     * @throws IOException if the file cannot be read or parsed
     */
    public InvoiceInfo ocrInvoice(String filePath) throws IOException {
        InvoiceInfo info = new InvoiceInfo();
        File file = Paths.get(filePath).toFile();

        try (InputStream in = new FileInputStream(file);
             RandomAccessRead rbuffer = new RandomAccessBuffer(in)) {
            PDFParser parser = new PDFParser(rbuffer);
            parser.parse();

            try (PDDocument document = parser.getPDDocument()) {
                for (PDPage pdPage : document.getPages()) {
                    PDResources pdResources = pdPage.getResources();
                    if (pdResources == null) {
                        continue;
                    }
                    Iterable<COSName> xObjectNames = pdResources.getXObjectNames();
                    if (xObjectNames == null) {
                        continue;
                    }
                    // Iterating (instead of a single unguarded next()) is safe for
                    // pages that carry no XObjects at all.
                    for (COSName cosName : xObjectNames) {
                        System.out.println(">>>>>>>>>>>" + cosName.getName());
                    }
                }

                PDFTextStripper stripper = new PDFTextStripper();
                // Emit text in reading order rather than content-stream order.
                stripper.setSortByPosition(true);
                stripper.setStartPage(1);
                stripper.setEndPage(document.getNumberOfPages());

                System.out.println(stripper.getText(document));
            }
        }

        return info;
    }

    /**
     * Extracts invoice fields from the first page of a PDF.
     *
     * <p>Invoice type, amounts, cipher area and the header block (code, number,
     * date, check code) come from text extraction over fixed rectangles; the
     * remaining fields are then filled by {@link #saveAsPng(String, InvoiceInfo)}.
     * The crop-box height and width of the page are printed to standard output.
     *
     * <p>The method name keeps its historical spelling because callers depend on it.
     *
     * @param filePath path of the PDF file
     * @return the populated invoice data
     * @throws IOException if the file cannot be read, parsed or rendered
     */
    public InvoiceInfo ocrInoivceArea(String filePath) throws IOException {
        InvoiceInfo info = new InvoiceInfo();
        File file = Paths.get(filePath).toFile();

        try (InputStream in = new FileInputStream(file);
             RandomAccessRead rbuffer = new RandomAccessBuffer(in)) {
            PDFParser parser = new PDFParser(rbuffer);
            parser.parse();

            try (PDDocument document = parser.getPDDocument()) {
                PDPage pdPage = document.getPage(0);

                System.out.println(pdPage.getCropBox().getHeight());
                System.out.println(pdPage.getCropBox().getWidth());

                PDFTextStripperByArea area = new PDFTextStripperByArea();
                area.setSortByPosition(true);
                registerRegions(area);
                area.extractRegions(pdPage);

                for (String regionName : area.getRegions()) {
                    applyRegionText(info, regionName, area.getTextForRegion(regionName));
                }
            }
        }

        saveAsPng(filePath, info);
        return info;
    }

    /**
     * Registers the fixed page rectangles (x, y, width, height) to extract text from.
     * Only some of them are consumed by {@link #applyRegionText}; the others are
     * kept registered exactly as before.
     */
    private static void registerRegions(PDFTextStripperByArea area) {
        area.addRegion("invoiceType", new Rectangle2D.Double(190, 0, 210, 90));
        area.addRegion("invoiceRightT", new Rectangle2D.Double(417, 0, 171, 90));

        area.addRegion("secretArea", new Rectangle2D.Double(355, 59, 354, 84));
        area.addRegion("purchaser", new Rectangle2D.Double(40, 95, 200, 60));
        area.addRegion("seller", new Rectangle2D.Double(42, 291, 171, 60));

        area.addRegion("remark", new Rectangle2D.Double(360, 297, 208, 57));
        area.addRegion("hsje", new Rectangle2D.Double(471, 280, 105, 17));

        area.addRegion("bhsje", new Rectangle2D.Double(381, 253, 88, 17));
        area.addRegion("se", new Rectangle2D.Double(500, 253, 88, 17));

        area.addRegion("skr", new Rectangle2D.Double(30, 360, 120, 15));
        area.addRegion("fhr", new Rectangle2D.Double(180, 360, 120, 15));

        area.addRegion("kpr", new Rectangle2D.Double(310, 360, 80, 15));
    }

    /** Copies the text extracted for one named region into the matching invoice field. */
    private static void applyRegionText(InvoiceInfo info, String regionName, String text) {
        if (text == null) {
            return;
        }
        switch (regionName) {
            case "invoiceType":
                if (text.contains(InvoiceType.PLAIN_INVOICE.getName()) || text.contains("普")) {
                    info.setInvoiceType(InvoiceType.PLAIN_INVOICE.getType());
                }
                break;
            case "hsje":
                if (!text.isEmpty()) {
                    info.setHsje(stripCurrencySign(text));
                }
                break;
            case "bhsje":
                if (!text.isEmpty()) {
                    info.setBhsje(stripCurrencySign(text));
                }
                break;
            case "se":
                if (!text.isEmpty()) {
                    info.setSe(stripCurrencySign(text));
                }
                break;
            case "secretArea":
                info.setSecretArea(text);
                break;
            case "invoiceRightT":
                applyHeaderLines(info, text.split("\n"));
                break;
            default:
                break;
        }
    }

    /**
     * Fills code, number, issue date and check code from the header lines, in
     * that order. Lines that are missing are skipped instead of failing with an
     * {@code ArrayIndexOutOfBoundsException}.
     */
    private static void applyHeaderLines(InvoiceInfo info, String[] lines) {
        if (lines.length > 0) {
            info.setInvoiceCode(cleanHeaderValue(lines[0]));
        }
        if (lines.length > 1) {
            info.setInvoiceNo(cleanHeaderValue(lines[1]));
        }
        if (lines.length > 2) {
            info.setKprq(cleanHeaderValue(lines[2]));
        }
        if (lines.length > 3) {
            info.setCheckCode(cleanHeaderValue(lines[3]));
        }
    }

    /** Drops the label, all whitespace and every character outside U+0000..U+00FF. */
    private static String cleanHeaderValue(String line) {
        String value = subString(line);
        value = StringUtils.replaceAll(value, "\\s*", "");
        return StringUtils.replaceAll(value, "[^\\x00-\\xff]", "");
    }

    /** Removes the yen/yuan sign (U+00A5 or its full-width form U+FFE5) and trims. */
    private static String stripCurrencySign(String amount) {
        return amount.replaceAll("[\u00A5\uFFE5]", "").trim();
    }

    /**
     * Renders the first page at 300 DPI and OCRs fixed pixel rectangles of it into
     * {@code info}: remark, payee, reviewer, issuer, and the name / tax number /
     * address-and-telephone / bank-and-account lines of seller and purchaser.
     *
     * <p>Despite its name, this method does not write any image file.
     * The rectangles are fixed; {@link BufferedImage#getSubimage} rejects them if
     * the rendered page is smaller than expected.
     *
     * @param filePath path of the PDF file
     * @param info     invoice object to fill; fields are overwritten
     * @throws IOException if the file cannot be read, parsed or rendered
     */
    public static void saveAsPng(String filePath, InvoiceInfo info) throws IOException {
        File file = Paths.get(filePath).toFile();
        BufferedImage img;

        try (InputStream in = new FileInputStream(file);
             RandomAccessRead rbuffer = new RandomAccessBuffer(in)) {
            PDFParser parser = new PDFParser(rbuffer);
            parser.parse();

            try (PDDocument document = parser.getPDDocument()) {
                img = new PDFRenderer(document).renderImageWithDPI(0, RENDER_DPI);
            }
        }

        // Remark: stored exactly as the OCR engine returns it (no label stripping).
        byte[] remarkBytes = OcrUtils.imageToBytes(img.getSubimage(1517, 1221, 954, 233));
        info.setRemark(OcrUtils.ocrImg(remarkBytes));

        // Payee, reviewer, issuer
        info.setSkr(ocrField(img, 83, 1458, 705, 183));
        info.setFhr(ocrField(img, 790, 1458, 509, 183));
        info.setKpr(ocrField(img, 1300, 1458, 499, 183));

        // Seller information
        info.setSellerName(ocrField(img, 183, 1214, 1260, 60));
        info.setSellerTaxNo(ocrField(img, 183, 1271, 1260, 60));
        info.setSellerAddrAndTel(ocrField(img, 183, 1335, 1260, 60));
        info.setSellerBankAndNo(ocrField(img, 183, 1393, 1260, 60));

        // Purchaser information
        info.setPurchaserName(ocrField(img, 172, 348, 1270, 69));
        info.setPurchaserTaxNo(ocrField(img, 172, 417, 1270, 69));
        info.setPurchaserAddrAndTel(ocrField(img, 172, 486, 1270, 69));
        info.setPurchaserBankAndNo(ocrField(img, 172, 555, 1270, 69));
    }

    /** OCRs one pixel rectangle of the page, removes whitespace and drops the label. */
    private static String ocrField(BufferedImage page, int x, int y, int width, int height)
            throws IOException {
        byte[] bytes = OcrUtils.imageToBytes(page.getSubimage(x, y, width, height));
        String text = OcrUtils.ocrImg(bytes);
        text = StringUtils.replaceAll(text, "\\s*", "");
        return subString(text);
    }

    /**
     * Returns the part of {@code str} after its last colon, i.e. the value of a
     * "label: value" string.
     *
     * <p>An ASCII colon takes precedence; the full-width colon (U+FF1A) is only
     * used when no ASCII colon is present. Strings without any colon are returned
     * unchanged, and {@code null} is returned as {@code null}.
     */
    private static String subString(String str) {
        if (str == null) {
            return null;
        }
        int separator = str.lastIndexOf(ASCII_COLON);
        if (separator < 0) {
            separator = str.lastIndexOf(FULL_WIDTH_COLON);
        }
        return separator < 0 ? str : str.substring(separator + 1);
    }

    /**
     * Command-line entry point: scans one invoice PDF and prints the result as JSON.
     *
     * <p>Original author's note: length coefficient 0.2345, width coefficient 0.2388.
     * NOTE(review): presumably the pixel-to-point factors used to derive the page
     * rectangles from the 300 DPI image (72 / 300 = 0.24) — confirm.
     *
     * @param args optional; {@code args[0]} is the PDF path, otherwise a built-in
     *             sample path is used
     * @throws IOException if the file cannot be read, parsed or rendered
     */
    public static void main(String[] args) throws IOException {
        String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_SAMPLE_PDF;

        ScanPdfInvoiceUtils pdfInvoiceUtils = new ScanPdfInvoiceUtils();
        InvoiceInfo info = pdfInvoiceUtils.ocrInoivceArea(pdfPath);
        System.out.println(JSONUtil.toJsonStr(info));
    }
}

 

你可能感兴趣的:(JAVA)