itext 导入pdf,读取pdf内容,并获取内容的坐标

首先在 pom.xml 中引入 iText 的 Maven 依赖(jar 包)

<dependencies>
	<dependency>
	    <groupId>com.itextpdf</groupId>
	    <artifactId>itextpdf</artifactId>
	    <version>5.5.11</version>
	</dependency>
	
	<dependency>
	    <groupId>com.itextpdf</groupId>
	    <artifactId>itext-asian</artifactId>
	    <version>5.2.0</version>
	</dependency>
</dependencies>

读取上传pdf文件的内容


/**
 * Handles a multipart PDF upload: for every uploaded file, extracts the text of all
 * pages and prints it to standard output.
 *
 * @param request  the incoming request; it is cast to {@code MultipartHttpServletRequest}
 *                 to reach the uploaded files
 * @param response not used by this method
 * @return an {@code AjaxJson} whose message is "添加成功", or "添加失败" when an
 *         {@code IOException} was caught while reading any of the files
 * @throws Exception declared by the signature; {@code IOException}s raised while
 *                   reading a PDF are caught and reported through the message instead
 */
@SuppressWarnings("unchecked")
@RequestMapping(params = "cgImportPdf", method = RequestMethod.POST)
@ResponseBody
public AjaxJson cgImportPdf(HttpServletRequest request, HttpServletResponse response) throws Exception {
	String msg = "添加成功";
	AjaxJson j = new AjaxJson();
	MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
	Map<String, MultipartFile> fileMap = multipartRequest.getFileMap();

	for (Map.Entry<String, MultipartFile> entity : fileMap.entrySet()) {
		MultipartFile file = entity.getValue(); // the uploaded file object
		PdfReader reader = null;
		try {
			// A PdfReader can be created from an input stream or from a file path.
			reader = new PdfReader(file.getInputStream());
			int pageNum = reader.getNumberOfPages();
			// StringBuilder avoids re-copying the accumulated text on every page.
			StringBuilder pageContent = new StringBuilder();
			for (int i = 1; i <= pageNum; i++) { // page numbers are 1-based
				pageContent.append(PdfTextExtractor.getTextFromPage(reader, i));
			}
			// Full text of the PDF file.
			System.out.println("pageContent:" + pageContent);
		} catch (IOException e) {
			msg = "添加失败";
			e.printStackTrace();
		} finally {
			// reader is still null when the PdfReader constructor itself failed;
			// closing unconditionally would throw a NullPointerException here.
			if (reader != null) {
				reader.close();
			}
		}
	}

	j.setMsg(msg);
	return j;
}

根据坐标获取pdf文件内容


/**
 * Handles a multipart PDF upload: for every uploaded file, prints the text found inside
 * a fixed rectangular region of each page, followed by the full text of the document.
 *
 * @param request  the incoming request; it is cast to {@code MultipartHttpServletRequest}
 *                 to reach the uploaded files
 * @param response not used by this method
 * @return an {@code AjaxJson} whose message is "添加成功", or "添加失败" when an
 *         {@code IOException} was caught while reading any of the files
 * @throws Exception declared by the signature; {@code IOException}s raised while
 *                   reading a PDF are caught and reported through the message instead
 */
@SuppressWarnings("unchecked")
@RequestMapping(params = "cgImportPdf", method = RequestMethod.POST)
@ResponseBody
public AjaxJson cgImportPdf(HttpServletRequest request, HttpServletResponse response) throws Exception {
	String msg = "添加成功";
	AjaxJson j = new AjaxJson();
	MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
	Map<String, MultipartFile> fileMap = multipartRequest.getFileMap();

	for (Map.Entry<String, MultipartFile> entity : fileMap.entrySet()) {
		MultipartFile file = entity.getValue(); // the uploaded file object
		PdfReader reader = null;
		try {
			// A PdfReader can be created from an input stream or from a file path.
			reader = new PdfReader(file.getInputStream());

			// Region to extract from; arguments are x, y, width, height.
			Rectangle2D.Float attachedF = new Rectangle2D.Float(225.1f, 39.5f, 7.738739f, 339.5f);
			RenderFilter attached = new RegionTextRenderFilter(attachedF);

			int pageNum = reader.getNumberOfPages();
			// StringBuilder avoids re-copying the accumulated text on every page.
			StringBuilder pageContent = new StringBuilder();
			for (int i = 1; i <= pageNum; i++) { // page numbers are 1-based
				pageContent.append(PdfTextExtractor.getTextFromPage(reader, i));

				// Text that falls inside the region; a new strategy is created for each page.
				TextExtractionStrategy strategy =
						new FilteredTextRenderListener(new LocationTextExtractionStrategy(), attached);
				String attachedV = PdfTextExtractor.getTextFromPage(reader, i, strategy);
				System.out.println("attachedV:" + attachedV);
			}
			// Full text of the PDF file.
			System.out.println("pageContent:" + pageContent);
		} catch (IOException e) {
			msg = "添加失败";
			e.printStackTrace();
		} finally {
			// reader is still null when the PdfReader constructor itself failed;
			// closing unconditionally would throw a NullPointerException here.
			if (reader != null) {
				reader.close();
			}
		}
	}

	j.setMsg(msg);
	return j;
}

获取pdf文件内容的坐标

首先创建一个获取坐标的工具类,实现 RenderListener 接口,并重写其方法

package com.jeecg.ldcorder.service;

import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
 
import javax.imageio.ImageIO;
 
import com.itextpdf.awt.geom.Rectangle2D;
import com.itextpdf.awt.geom.RectangularShape;
import com.itextpdf.text.BaseColor;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
/**
 * RenderListener that records, for every non-empty text chunk it receives, the chunk's
 * text and a bounding rectangle derived from its baseline and ascent line.
 */
public class TestRenderListener implements RenderListener {
	/** One rectangle per distinct rectangle y value; chunks sharing a y are merged into it. */
	public List<Rectangle2D.Float> rectText = new ArrayList<Rectangle2D.Float>();
	/** Concatenated text for the entry at the same index in {@link #rectText}. */
	public List<String> textList = new ArrayList<String>();
	/** The y value of the entry at the same index in {@link #rectText}. */
	public List<Float> listY = new ArrayList<Float>();
	/** One single-entry map (text -> rectangle) per non-empty chunk, in arrival order. */
	public List<Map<String,Rectangle2D.Float>> rows_text_rect = new ArrayList<>();
	/** Path of the PDF file; never assigned inside this class. */
	protected String filepath = null;

	public TestRenderListener() {
	}

	/** No-op. Per the original author's notes this fires on the "BT" operator. */
	@Override
	public void beginTextBlock() {
		// intentionally empty
	}

	/**
	 * Records the chunk's text and bounding rectangle, merging it into an existing row
	 * when a previous chunk produced exactly the same rectangle y value.
	 *
	 * @param renderInfo the text chunk being rendered
	 */
	@Override
	public void renderText(TextRenderInfo renderInfo) {
		String text = renderInfo.getText();
		if (text.length() <= 0) {
			return;
		}

		// Baseline gives the lower-left corner, ascent line the upper-right corner.
		RectangularShape baseBox = renderInfo.getBaseline().getBoundingRectange();
		Rectangle2D.Float ascentBox = renderInfo.getAscentLine().getBoundingRectange();

		// One unit of padding is applied below and above.
		float minX = (float) baseBox.getMinX();
		float minY = (float) baseBox.getMinY() - 1;
		float maxX = (float) ascentBox.getMaxX();
		float maxY = (float) ascentBox.getMaxY() + 1;

		Rectangle2D.Float rect = new Rectangle2D.Float(minX, minY, maxX - minX, maxY - minY);

		System.out.println("text:"+text+"--x:"+rect.x + "--y:"+rect.y + "--width:"+rect.width + "--height:"+rect.height);

		int row = listY.indexOf(rect.y);
		if (row >= 0) {
			// Same y as an earlier chunk: keep the smaller x, add the widths, append the text.
			Rectangle2D.Float previous = rectText.get(row);
			float mergedX;
			if (rect.x > previous.x) {
				mergedX = previous.x;
			} else {
				mergedX = rect.x;
			}
			float mergedWidth = rect.width + previous.width;
			rectText.set(row, new Rectangle2D.Float(mergedX, rect.y, mergedWidth, rect.height));
			textList.set(row, textList.get(row) + text);
		} else {
			rectText.add(rect);
			textList.add(text);
			listY.add(rect.y);
		}

		Map<String,Rectangle2D.Float> chunk = new HashMap<>();
		chunk.put(text, rect);
		rows_text_rect.add(chunk);
	}

	/** No-op. Per the original author's notes this fires on the "ET" operator. */
	@Override
	public void endTextBlock() {
		// intentionally empty
	}

	/** No-op; images are ignored. */
	@Override
	public void renderImage(ImageRenderInfo renderInfo) {
		// intentionally empty
	}
}

调用创建的工具类,获取内容坐标


/**
 * Handles a multipart PDF upload: for every uploaded file, runs a
 * {@code TestRenderListener} over each page and prints every recorded text chunk with
 * its rectangle, followed by the full text of the document.
 *
 * @param request  the incoming request; it is cast to {@code MultipartHttpServletRequest}
 *                 to reach the uploaded files
 * @param response not used by this method
 * @return an {@code AjaxJson} whose message is "添加成功", or "添加失败" when an
 *         {@code IOException} was caught while reading any of the files
 * @throws Exception declared by the signature; {@code IOException}s raised while
 *                   reading a PDF are caught and reported through the message instead
 */
@SuppressWarnings("unchecked")
@RequestMapping(params = "cgImportPdf", method = RequestMethod.POST)
@ResponseBody
public AjaxJson cgImportPdf(HttpServletRequest request, HttpServletResponse response) throws Exception {
	String msg = "添加成功";
	AjaxJson j = new AjaxJson();
	MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
	Map<String, MultipartFile> fileMap = multipartRequest.getFileMap();

	for (Map.Entry<String, MultipartFile> entity : fileMap.entrySet()) {
		MultipartFile file = entity.getValue(); // the uploaded file object
		PdfReader reader = null;
		try {
			// A PdfReader can be created from an input stream or from a file path.
			reader = new PdfReader(file.getInputStream());

			// Content parser that feeds page content to a listener.
			PdfReaderContentParser parser = new PdfReaderContentParser(reader);
			int pageNum = reader.getNumberOfPages();
			// StringBuilder avoids re-copying the accumulated text on every page.
			StringBuilder pageContent = new StringBuilder();
			for (int i = 1; i <= pageNum; i++) { // page numbers are 1-based
				pageContent.append(PdfTextExtractor.getTextFromPage(reader, i));

				// A fresh listener per page so results from different pages do not mix.
				TestRenderListener listener = new TestRenderListener();
				parser.processContent(i, listener);

				// Each map holds one text chunk and its rectangle.
				List<Map<String, Rectangle2D.Float>> list_text = listener.rows_text_rect;
				for (Map<String, Rectangle2D.Float> map : list_text) {
					for (Map.Entry<String, Rectangle2D.Float> entry : map.entrySet()) {
						System.out.println(entry.getKey() + "---" + entry.getValue());
					}
				}
			}
			// Full text of the PDF file.
			System.out.println("pageContent:" + pageContent);
		} catch (IOException e) {
			msg = "添加失败";
			e.printStackTrace();
		} finally {
			// reader is still null when the PdfReader constructor itself failed;
			// closing unconditionally would throw a NullPointerException here.
			if (reader != null) {
				reader.close();
			}
		}
	}

	j.setMsg(msg);
	return j;
}

你可能感兴趣的:(Java,java)