iText 与 PDFBox 都可以用作 PDF 解析工具,下面简单介绍如何用 iText 与 PDFBox 定位文本坐标。
itext:
import java.io.IOException;
import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
/**
 * Demonstrates how to locate a keyword inside a PDF with iText's content
 * parser and report where it was found.
 */
public class Demo
{
    /** Keyword that each rendered text chunk is checked against. */
    private static final String KEY_WORD = "KEYWORD";

    /**
     * Searches every page of the PDF for {@link #KEY_WORD}.
     *
     * <p>Each text chunk reported by the parser is tested with
     * {@code String.contains}; when several chunks match, the last one
     * encountered (highest page, latest chunk) wins.
     *
     * @param filePath path of the PDF file to scan
     * @return {@code float[3]} holding {@code [0]} = x and {@code [1]} = y of
     *         the matching chunk's baseline bounding rectangle and
     *         {@code [2]} = 1-based page number, or {@code null} when no chunk
     *         matched (or the file failed before any match was seen)
     */
    private float[] getKeyWords(String filePath)
    {
        // Per-call state: a fresh locator means an earlier search can never
        // leak its result into this one.
        KeywordLocator locator = new KeywordLocator();
        PdfReader pdfReader = null;
        try
        {
            pdfReader = new PdfReader(filePath);
            int pageNum = pdfReader.getNumberOfPages();
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
            // Page numbers are 1-based and inclusive, so the last page is pageNum.
            for (int page = 1; page <= pageNum; page++)
            {
                locator.currentPage = page;
                parser.processContent(page, locator);
            }
        }
        catch (IOException e)
        {
            // Best effort, as before: report the failure and return whatever
            // was found up to this point.
            e.printStackTrace();
        }
        finally
        {
            if (pdfReader != null)
            {
                pdfReader.close();
            }
        }
        return locator.result;
    }

    /**
     * Render listener that remembers the position of the most recent text
     * chunk containing {@link #KEY_WORD}.
     */
    private static final class KeywordLocator implements RenderListener
    {
        /** 1-based number of the page currently being parsed. */
        private int currentPage;
        /** Last match as {x, y, page}; {@code null} until something matches. */
        private float[] result;

        @Override
        public void renderText(TextRenderInfo textRenderInfo)
        {
            String text = textRenderInfo.getText();
            if (text == null || !text.contains(KEY_WORD))
            {
                return;
            }
            // NOTE(review): only single chunks are tested; a keyword that the
            // PDF splits across several chunks would not be found.
            Float boundingRectange = textRenderInfo.getBaseline()
                    .getBoundingRectange();
            result = new float[] { boundingRectange.x, boundingRectange.y,
                    currentPage };
        }

        @Override
        public void renderImage(ImageRenderInfo renderInfo)
        {
            // Images are irrelevant for keyword lookup.
        }

        @Override
        public void endTextBlock()
        {
            // Nothing to do.
        }

        @Override
        public void beginTextBlock()
        {
            // Nothing to do.
        }
    }
}
PDFBox 坐标定位:把逐个字符的 TextPosition 拼成字符串,与关键字比较,从而定位关键字的坐标。
/**
* Gets the coordinates of the table header, table footer and table title.
*
* NOTE(review): the method signature and several declarations below appear to
* have lost their generic parameters and right-hand sides (probably stripped
* as HTML tags); the method name is presumably getTableLoactions, as called
* from readTableCoor - confirm against the original source.
*
* @param pdfInfoRegular presumably supplies the header/footer/title keywords - confirm
* @param file the PDF passed to PDDocument.load
* @return the position list: start, end and title arrays ([0]=x, [1]=y,
*         [2]=height) followed by a one-element array holding the page index;
*         it is re-created for every page that has all three matches, so it
*         reflects the last such page
* @author
*/
public List
List
PDDocument document = null;
float[] text_Start=null;// table-header coordinates: [0]=x, [1]=y, [2]=line height
float[] text_End=null;// table-footer coordinates: same layout as above
float[] text_title=null;// table-title coordinates: same layout as above
float[] text_page=null;
try {
// NOTE(review): no document.close() is visible in this method.
document = PDDocument.load(file);
List allPages = document.getDocumentCatalog().getAllPages();
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
for (int i = 0; i < allPages.size(); i++) {
// Fresh arrays per page so entries already added to position are not overwritten.
text_Start=new float[3];
text_End=new float[3];
text_title=new float[3];
text_page=new float[1];
PrintTextLocatins2 printer = new PrintTextLocatins2();
// NOTE(review): declaration of list is truncated here.
List
PDPage page = (PDPage) allPages.get(i);
PDStream contents = page.getContents();
if (contents != null) {
printer.processStream(page, page.findResources(), page
.getContents().getStream());
}
list = printer.getList();
if (null !=list && list.size()>0) {
// NOTE(review): the declarations of text_S, text_E and text_T are truncated
// here; presumably they hold the matches for the start, end and title keywords.
List
List
List
if(text_S!=null&&text_S.size()>0&&text_E!=null&&text_E.size()>0&&text_T!=null&&text_T.size()>0){
position=new ArrayList<>();
// Add the start coordinates; element 1 of each match list is used
// (presumably the keyword's last character - confirm).
text_Start[0]=text_S.get(1).getX();
text_Start[1]=text_S.get(1).getY();
text_Start[2]=text_S.get(1).getHeightDir();
position.add(text_Start);
// Add the end coordinates
text_End[0]=text_E.get(1).getX();
text_End[1]=text_E.get(1).getY();
text_End[2]=text_E.get(1).getHeightDir();
position.add(text_End);
// Add the title coordinates
text_title[0]=text_T.get(1).getX();
text_title[1]=text_T.get(1).getY();
text_title[2]=text_T.get(1).getHeightDir();
position.add(text_title);
// Add the page index (0-based loop counter)
text_page[0]=i;
position.add(text_page);
// break;
}
}
}
return position;
}catch (Exception e) {
// Failures are printed and whatever position currently holds is returned.
e.printStackTrace();
}
return position;
}
/**
* Finds occurrences of a keyword in a list of per-character text positions.
*
* NOTE(review): the signature, the list_xy declaration and the condition on
* the inner if appear truncated (generic parameters and everything after a
* "less than" sign were probably stripped); the method name is not visible.
*
* @param list the per-character positions to search
* @param str the header or footer keyword
* @return list_xy, to which the first and the last position of every
*         occurrence of str are appended, in that order
* @author
*/
public static List
List
for (int i = 0; i < list.size(); i++) {
// Cheap pre-check: only try a full match when this character occurs in str.
if (str.contains(list.get(i).getCharacter())) {
// Concatenate up to str.length() characters starting at index i.
StringBuffer textstr = new StringBuffer();
for (int j = 0; j < str.length(); j++) {
// NOTE(review): condition truncated; presumably a bounds check of i+j against list.size().
if((i+j)
textstr.append(list.get(i + j).getCharacter());
}
}
// On an exact match, record the positions of the first and last character.
if (str.equals(textstr.toString())) {
list_xy.add(list.get(i));
list_xy.add(list.get(i + str.length() - 1));
}
}
}
return list_xy;
}
/**
* Collects the table coordinate information for every entry of the given list.
*
* NOTE(review): generic parameters and the end of the signature appear to
* have been stripped from this listing.
*
* Calls getTableLoactions(pir, path) for each PdfInfoRegular in list and adds
* the result to coors; an entry that throws is printed and skipped.
* @author
*/
public List> readTableCoor(String path,List
List> coors =new ArrayList<>();
for (PdfInfoRegular pir : list) {
try {
coors.add(getTableLoactions(pir, path));
} catch (Exception e) {
// Keep going with the remaining entries after reporting the failure.
e.printStackTrace();
}
}
return coors;
}
}