Java使用PDFBox API实现对PDF文档进行关键字检索

        最近项目上需要获取 PDF 文档中某个字/词的坐标。网上的方案普遍基于 iText 实现,但实际使用后发现 iText 的文字提取粒度不稳定:有时提取出一个字,有时提取出一整段话,不太方便。在解决问题的过程中,我发现了 PDFBox 的文字提取 API,稍加整合后实现了关键字检索,定位基本做到零偏移。不过效率上感觉不够理想,又说不上哪里可以优化。下面贴出代码,希望大家帮忙完善。

依赖:Apache PDFBox 2.x(原文此处为依赖配置截图)

 

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

/**
 * Searches a PDF document for a keyword and reports the position of every
 * occurrence.
 *
 * <p>The class extends {@link PDFTextStripper} and overrides
 * {@link #writeString(String, List)} so that, instead of emitting text, each
 * batch of {@link TextPosition} objects handed over by the stripper is scanned
 * for the keyword. A match is only detected when all of its characters arrive
 * in the same {@code writeString} call; keywords that the stripper splits
 * across calls are not found.
 *
 * <p>Instances carry per-search state and are not safe for concurrent use.
 */
public class BoxKeyPosition extends PDFTextStripper {

	/** Keyword to look for, one element per {@code char} of the original string. */
	private char[] key;
	/** Raw bytes of the PDF document to search. */
	private byte[] src;
	/** Matches collected by {@link #writeString} for the page currently being processed. */
	private final List<float[]> pagelist = new ArrayList<float[]>();

	/**
	 * Creates a searcher for the given keyword and document.
	 *
	 * @param keyWords keyword to search for; must not be {@code null}
	 * @param src      raw bytes of the PDF document
	 * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
	 */
	public BoxKeyPosition(String keyWords, byte[] src) throws IOException {
		super();
		super.setSortByPosition(true);
		this.src = src;
		this.key = keyWords.toCharArray();
	}

	/** @return the keyword as a character array (the internal array, not a copy) */
	public char[] getKey() {
		return key;
	}

	/** @param key the keyword to search for, one element per character */
	public void setKey(char[] key) {
		this.key = key;
	}

	/** @return the raw bytes of the PDF document (the internal array, not a copy) */
	public byte[] getSrc() {
		return src;
	}

	/** @param src raw bytes of the PDF document to search */
	public void setSrc(byte[] src) {
		this.src = src;
	}

	/**
	 * Loads the document from {@link #getSrc()} and collects every occurrence of
	 * the keyword, page by page.
	 *
	 * <p>Each call performs a fresh search and returns a new list; results of
	 * earlier calls are not carried over.
	 *
	 * @return one {@code float[3]} per match: index 0 is
	 *         {@code getX() + key.length * getWidth() / 2} of the first matched
	 *         glyph, index 1 is its {@code getY() - getHeight()}, and index 2 is
	 *         the 1-based page number
	 * @throws IOException if the document cannot be loaded or its text cannot be extracted
	 */
	public List<float[]> getPosition() throws IOException {
		List<float[]> result = new ArrayList<float[]>();
		PDDocument doc = null;
		try {
			doc = PDDocument.load(src);
			int pages = doc.getNumberOfPages();

			// Re-assert position sorting in case a caller changed it after construction.
			super.setSortByPosition(true);
			for (int page = 1; page <= pages; page++) {
				pagelist.clear();
				// Restrict extraction to a single page so that every match collected
				// during this pass can be tagged with the page number afterwards.
				super.setStartPage(page);
				super.setEndPage(page);
				// The extracted text itself is not needed; it goes to a throwaway buffer.
				Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
				super.writeText(doc, dummy);
				for (float[] match : pagelist) {
					match[2] = page;
				}
				result.addAll(pagelist);
			}
			return result;

		} finally {
			pagelist.clear();
			if (doc != null) {
				doc.close();
			}
		}

	}

	/**
	 * Scans one batch of text positions for the keyword instead of writing the
	 * text to the output. Every match is appended to the current page's list.
	 *
	 * @param string        the text of the batch (unused)
	 * @param textPositions the glyph positions that make up {@code string}
	 * @throws IOException declared by the overridden method; not thrown here
	 */
	@Override
	protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
		if (key == null || key.length == 0) {
			// Nothing to search for.
			return;
		}
		// A match needs key.length consecutive positions, so later start indices cannot match.
		int lastStart = textPositions.size() - key.length;
		for (int i = 0; i <= lastStart; i++) {
			if (!matchesAt(textPositions, i)) {
				continue;
			}
			TextPosition first = textPositions.get(i);
			float[] idx = new float[3];
			// Horizontal anchor: offset from the first glyph by half the keyword's
			// width, estimated by assuming every glyph is as wide as the first one.
			idx[0] = first.getX() + key.length * first.getWidth() / 2;
			idx[1] = first.getY() - first.getHeight();
			// idx[2] (page number) is filled in by getPosition().
			pagelist.add(idx);
		}
	}

	/**
	 * Tells whether the keyword occurs in {@code textPositions} starting at
	 * {@code start}. Each key character is compared against the whole Unicode
	 * string of one text position, so a position carrying more than one
	 * character never matches.
	 */
	private boolean matchesAt(List<TextPosition> textPositions, int start) {
		for (int j = 0; j < key.length; j++) {
			String unicode = textPositions.get(start + j).getUnicode();
			if (!String.valueOf(key[j]).equals(unicode)) {
				return false;
			}
		}
		return true;
	}
}

 

 

 

 

 

你可能感兴趣的:(工具类)