接口类:
package org.aoe.software.pdf;

import java.io.InputStream;

/**
 * Converts a PDF into an XML description of its pages.
 *
 * <p>Layout of the generated XML:
 * <pre>{@code
 * <pdf id="00000001" fileName="temp0001.pdf">
 *   <page pageIndex="1">
 *     <text>
 *       <tr colX="x1:x2" colY="y1:y2">ssssssssss</tr>
 *     </text>
 *     <table colX="x1:x2:x3" colY="y1:y2:y3:y4">
 *       <tr>
 *         <td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2">TTTT</td>
 *       </tr>
 *     </table>
 *   </page>
 * </pdf>
 * }</pre>
 *
 * <p>Meaning of the elements and attributes:
 * <ul>
 *   <li>{@code id}: unique identifier of the PDF, supplied by the caller; may be empty.</li>
 *   <li>{@code fileName}: name of the PDF file with any directory part removed; must not
 *       be empty.</li>
 *   <li>{@code page}: one page of the document; {@code pageIndex} is its page number.</li>
 *   <li>{@code text}: paragraph (non-table) content of the page.</li>
 *   <li>{@code table}: table content of the page.</li>
 *   <li>{@code tr}: a row; {@code td}: a table cell.</li>
 *   <li>{@code colX} / {@code colY}: colon-separated coordinate groups. {@code colX} is the
 *       X coordinate of the rectangle's lower-left corner, {@code colY} the Y coordinate of
 *       its upper-right corner. On a {@code td} they describe the position of the cell's
 *       data content. The number of columns ({@code cols}) of a table is derived from the
 *       table's {@code colX}, the number of rows ({@code rows}) from its {@code colY}.</li>
 *   <li>{@code colspan}: column merge. A value greater than 1 is the total number of
 *       columns covered starting at the current column; 2 means the cell is merged with
 *       the one cell to its right, and so on.</li>
 *   <li>{@code rowspan}: row merge. A value greater than 1 is the total number of rows
 *       covered starting at the current row; 2 means the cell is merged with the one cell
 *       below it, and so on.</li>
 * </ul>
 */
public class PDFToXml {

    /** XML declaration written as the first line of every generated document. */
    private static final String XML_HEAD = "<?xml version=\"1.0\" encoding=\"GBK\"?>";

    /** Line terminator used between the parts of the generated document. */
    private static final String NEW_LINE = "\r\n";

    /**
     * Converts the PDF stored at a local path into an XML string.
     *
     * @param fileName path of the PDF file (may contain directories, with either
     *                 {@code /} or {@code \} separators); must not be {@code null}
     * @param fileID   caller-supplied document id; {@code null} is written as an empty id
     * @return the XML document: declaration, {@code <pdf>} element and the page content
     *         produced by {@code ExtractRawStream.generateXMLFile}
     */
    public static String ConvertToXML(String fileName, String fileID) {
        // Resolve the short name first so a null fileName fails before any PDF work starts.
        String fileShortName = shortName(fileName);
        String pages = ExtractRawStream.generateXMLFile(fileName, "tmp.xml", fileID);
        return wrapDocument(fileShortName, fileID, pages);
    }

    /**
     * Converts the PDF stored at a local path and saves the XML to {@code savePath}.
     *
     * @param fileName path of the PDF file; must not be {@code null}
     * @param fileID   caller-supplied document id; may be {@code null}
     * @param savePath destination handed to {@code FileUtils.save}
     * @return the result of {@code FileUtils.save}
     */
    public static boolean ConvertToXML(String fileName, String fileID, String savePath) {
        return FileUtils.save(ConvertToXML(fileName, fileID), savePath);
    }

    /**
     * Converts a PDF supplied as a stream into an XML string.
     *
     * <p>Previously this overload assembled the document and then returned {@code null};
     * it now returns the assembled document, like the path-based overload.
     *
     * @param stream   the PDF content
     * @param fileName name associated with the stream (used for the {@code fileName}
     *                 attribute and passed on to the extractor); must not be {@code null}
     * @param fileID   caller-supplied document id; {@code null} is written as an empty id
     * @return the XML document: declaration, {@code <pdf>} element and the page content
     *         produced by {@code ExtractRawStream.generateXMLFile}
     */
    public static String ConvertToXML(InputStream stream, String fileName, String fileID) {
        String fileShortName = shortName(fileName);
        String pages = ExtractRawStream.generateXMLFile(stream, fileName, fileID);
        return wrapDocument(fileShortName, fileID, pages);
    }

    /**
     * Converts a PDF supplied as a stream and saves the XML to {@code savePath}.
     *
     * @param stream   the PDF content
     * @param fileName name associated with the stream; must not be {@code null}
     * @param fileID   caller-supplied document id; may be {@code null}
     * @param savePath destination handed to {@code FileUtils.save}
     * @return the result of {@code FileUtils.save}
     */
    public static boolean ConvertToXML(InputStream stream, String fileName, String fileID,
            String savePath) {
        return FileUtils.save(ConvertToXML(stream, fileName, fileID), savePath);
    }

    /** Strips the directory part from a path, accepting both {@code /} and {@code \}. */
    private static String shortName(String fileName) {
        String normalized = fileName.replace("\\", "/");
        int lastSlash = normalized.lastIndexOf("/");
        return lastSlash == -1 ? normalized : normalized.substring(lastSlash + 1);
    }

    /** Surrounds the extracted page markup with the XML declaration and {@code <pdf>} element. */
    private static String wrapDocument(String fileShortName, String fileID, String pages) {
        StringBuilder sb = new StringBuilder();
        sb.append(XML_HEAD).append(NEW_LINE);
        sb.append("<pdf id=\"" + (fileID == null ? "" : fileID)
                + "\" fileName=\"" + fileShortName + "\">").append(NEW_LINE);
        sb.append(pages).append(NEW_LINE);
        sb.append("</pdf>").append(NEW_LINE);
        return sb.toString();
    }

    /** Manual smoke test against a fixed local file. */
    public static void main(String[] args) {
        System.out.println(ConvertToXML("r:/a.pdf", "1111", "r:/zzz.xml"));
    }
}
package org.aoe.software.pdf;

import java.io.InputStream;
import java.util.Map;

import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;

/**
 * Extracts the content of a PDF page by page through JPedal's
 * {@code extractTextAsTable} and renders each page as a {@code <page>} fragment
 * containing either a {@code <table>} or a {@code <text>} element.
 *
 * <p>All methods share one static {@link PdfDecoder}; NOTE(review): that makes concurrent
 * calls unsafe - confirm callers are single-threaded.
 */
public class ConvertUtils {

    private static final String NEW_LINE = "\r\n";

    /** Decoder shared by every call of {@link #parse(String)} / {@link #parse(InputStream)}. */
    private static PdfDecoder decodePdf = new PdfDecoder(false);

    /**
     * Optional fixed extraction rectangle. While {@code defX1} is -1 (its only value in
     * this class) the page's media box is used instead.
     */
    private static int defX1 = -1, defX2, defY1, defY2;

    /**
     * Opens the PDF at the given path and returns its pages as XML fragments.
     *
     * @param pdfFilepath path of the PDF file
     * @return the concatenated {@code <page>} fragments; empty when extraction is not
     *         allowed or the file is encrypted without a password
     */
    public static String parse(String pdfFilepath) {
        try {
            decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
            PdfDecoder.init(true);
            decodePdf.openPdfFile(pdfFilepath);
        } catch (Exception e) {
            // Best effort: the failure is reported and extraction is still attempted.
            e.printStackTrace();
        }
        return parseContent(decodePdf);
    }

    /**
     * Opens a PDF from a stream and returns its pages as XML fragments.
     *
     * @param is the PDF content
     * @return the concatenated {@code <page>} fragments; empty when extraction is not
     *         allowed or the file is encrypted without a password
     */
    public static String parse(InputStream is) {
        try {
            decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
            PdfDecoder.init(true);
            decodePdf.openPdfFileFromInputStream(is, false);
        } catch (Exception e) {
            // Best effort: the failure is reported and extraction is still attempted.
            e.printStackTrace();
        }
        return parseContent(decodePdf);
    }

    /**
     * Walks every page of the opened document and appends one {@code <page>} element per
     * page. The decoder is closed before returning.
     *
     * @param pdfDecoder decoder with a document already opened
     * @return the generated fragments (possibly empty)
     */
    private static String parseContent(PdfDecoder pdfDecoder) {
        StringBuilder sb = new StringBuilder();
        if (!pdfDecoder.isExtractionAllowed()) {
            System.out.println("Text extraction not allowed");
        } else if (pdfDecoder.isEncrypted() && !pdfDecoder.isPasswordSupplied()) {
            System.out.println("Encrypted settings");
            System.out.println("Please look at Viewer for code sample to handle such files");
            System.out.println("Or get support/consultancy");
        } else {
            // page range
            int start = 1, end = pdfDecoder.getPageCount();
            try {
                for (int page = start; page < end + 1; page++) {
                    sb.append("<page pageIndex=\"" + page + "\">").append(NEW_LINE);
                    pdfDecoder.decodePage(page);
                    PdfGroupingAlgorithms currentGrouping = pdfDecoder.getGroupingObject();
                    PdfPageData currentPageData = pdfDecoder.getPdfPageData();

                    // Co-ordinates are x1,y1 (top left hand corner), x2,y2 (bottom right).
                    int x1, y1, x2, y2;
                    if (defX1 == -1) {
                        x1 = currentPageData.getMediaBoxX(page);
                        x2 = currentPageData.getMediaBoxWidth(page) + x1;
                        y2 = currentPageData.getMediaBoxY(page);
                        y1 = currentPageData.getMediaBoxHeight(page) + y2;
                    } else {
                        x1 = defX1;
                        y1 = defY1;
                        x2 = defX2;
                        y2 = defY2;
                    }

                    try {
                        Map<?, ?> tableContent = currentGrouping.extractTextAsTable(
                                x1, y1, x2, y2, page,
                                false, // csv
                                false, false, false, 0);
                        // get the text from the Map object
                        String tableText = stripNoise((String) tableContent.get("content"));

                        if (isTable(tableText)) {
                            // NOTE(review): colX receives the <tr> count and colY the total
                            // <td> count, not coordinates - kept as in the original output.
                            int rows = getCount(tableText, "<tr>");
                            int cols = getCount(tableText, "<td>");
                            sb.append("<table colX=\"" + rows + "\" colY=\"" + cols + "\">")
                                    .append(tableText).append("</table>").append(NEW_LINE);
                        } else {
                            // Plain text: drop the remaining row/cell markup.
                            tableText = ignoreTag("<tr>", tableText);
                            tableText = ignoreTag("</tr>", tableText);
                            tableText = ignoreTag("<td>", tableText);
                            tableText = ignoreTag("</td>", tableText);
                            sb.append("<text>").append(NEW_LINE);
                            sb.append("<tr colX=\"" + x1 + ":" + x2 + "\" colY=\"" + y1 + ":" + y2
                                    + "\">" + tableText + "</tr>").append(NEW_LINE);
                            sb.append("</text>").append(NEW_LINE);
                        }
                    } catch (PdfException e) {
                        pdfDecoder.closePdfFile();
                        e.printStackTrace();
                    }

                    // remove data once written out
                    pdfDecoder.flushObjectValues(false);
                    sb.append("</page>").append(NEW_LINE);
                }
            } catch (Exception e) {
                pdfDecoder.closePdfFile();
                e.printStackTrace();
            }
            pdfDecoder.flushObjectValues(true); // flush any text data read
        }
        pdfDecoder.closePdfFile();
        return sb.toString();
    }

    /** Removes the markup this converter does not care about from JPedal's table output. */
    private static String stripNoise(String tableText) {
        tableText = ignoreTag("<TABLE>", tableText);
        tableText = ignoreTag("</TABLE>", tableText);
        tableText = ignoreTag(" nowrap", tableText);
        // As written this pattern is empty and removes nothing; kept to preserve the
        // original sequence. NOTE(review): a character may have been lost here.
        tableText = ignoreTag("", tableText);
        tableText = ignoreTag("<SpaceCount space=\"\\d+\" />", tableText);
        tableText = ignoreTag("<td></td>", tableText);
        tableText = ignoreTag("<tr></tr>", tableText);
        return tableText;
    }

    /** Deletes every match of the regular expression {@code tag} from {@code origin}. */
    private static String ignoreTag(String tag, String origin) {
        return origin.replaceAll(tag, "");
    }

    /** Counts the non-overlapping occurrences of the literal {@code tag} in {@code table}. */
    private static int getCount(String table, String tag) {
        int count = 0;
        int index = 0;
        while ((index = table.indexOf(tag, index)) != -1) {
            count++;
            index += tag.length();
        }
        return count;
    }

    /**
     * Decides whether the extracted markup should be emitted as a table: it is one as soon
     * as any row contains more than one {@code <td>}.
     *
     * <p>Every row is inspected, including the only row of a single-row result, and the
     * scan resumes after each row's closing tag. The earlier loop skipped one row and
     * advanced its cursor by the row length, which could rescan the same row.
     *
     * @param tableText markup after {@link #stripNoise(String)}
     * @return {@code true} when at least one row has two or more cells
     */
    private static boolean isTable(String tableText) {
        final String rowOpen = "<tr>";
        final String rowClose = "</tr>";
        int searchFrom = 0;
        while (true) {
            int rowStart = tableText.indexOf(rowOpen, searchFrom);
            if (rowStart == -1) {
                return false; // no more rows
            }
            int rowEnd = tableText.indexOf(rowClose, rowStart);
            if (rowEnd == -1) {
                return false; // unterminated row: nothing reliable left to inspect
            }
            String row = tableText.substring(rowStart, rowEnd);
            if (row.indexOf("<td>") != row.lastIndexOf("<td>")) {
                return true;
            }
            searchFrom = rowEnd + rowClose.length();
        }
    }
}
package org.aoe.software.pdf; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import jxl.Workbook; import jxl.write.Label; import jxl.write.WritableCellFormat; import jxl.write.WritableSheet; import jxl.write.WritableWorkbook; import jxl.write.WriteException; import org.aoe.software.pdf.po.Page; import org.aoe.software.pdf.po.Table; import org.aoe.software.pdf.po.TableTd; import org.aoe.software.pdf.po.TableTr; import org.aoe.software.pdf.po.Text; import org.aoe.software.pdf.po.TextTr; import org.dom4j.Document; import org.dom4j.DocumentHelper; import org.dom4j.Element; import org.jpedal.PdfDecoder; import org.jpedal.exception.PdfException; import org.jpedal.exception.PdfSecurityException; import org.jpedal.fonts.FontMappings; import org.jpedal.grouping.PdfGroupingAlgorithms; import org.jpedal.objects.PdfPageData; import org.jpedal.utils.Strip; public class ExtractRawStream { //DX20130502 decode page no public int decode_pageno; /**flag to show if we print messages*/ public static boolean outputMessages=true; /**word count - used for testing*/ private int wordsExtracted=0; /**correct separator for OS */ String separator = System.getProperty("file.separator"); /**the decoder object which decodes the pdf and returns a data object*/ PdfDecoder decodePdf = null; /**flag to show if file or byte array*/ private boolean isFile=true; /**byte array*/ private byte[] byteArray=null; /**used in our regression tests to limit to first 10 pages*/ public static boolean isTest=false; private List<Rect> relist = new ArrayList<Rect>(); private List<TextLine> textlist = new ArrayList<TextLine>(); private Rect page_rect = new Rect(); private static String file_name = ""; /** * routine to decode a file */ private void decodeFile(String file_name) { //PdfDecoder returns a PdfException if there is a problem try { decodePdf = new PdfDecoder(true); //incase fonts not embedded 
FontMappings.setFontReplacements(); decodePdf.setExtractionMode(PdfDecoder.TEXT); //extract just text PdfDecoder.init(true); //make sure widths in data CRITICAL if we want to split lines correctly!! decodePdf.useTextExtraction(); //always reset to use unaltered co-ords - allow use of rotated or unrotated // co-ordinates on pages with rotation (used to be in PdfDecoder) PdfGroupingAlgorithms.useUnrotatedCoords=false; /** * open the file (and read metadata including pages in file) */ if(outputMessages) System.out.println("Opening file :" + file_name); if(isFile) decodePdf.openPdfFile(file_name); else decodePdf.openPdfArray(byteArray); } catch (PdfSecurityException e) { System.err.println("Exception " + e+" in pdf code for wordlist"+file_name); } catch (PdfException e) { System.err.println("Exception " + e+" in pdf code for wordlist"+file_name); } catch (Exception e) { System.err.println("Exception " + e+" in pdf code for wordlist"+file_name); e.printStackTrace(); } /** * extract data from pdf (if allowed). 
*/ if(!decodePdf.isExtractionAllowed()){ if(outputMessages) System.out.println("Text extraction not allowed"); }else if (decodePdf.isEncrypted() && !decodePdf.isPasswordSupplied()) { if(outputMessages){ System.out.println("Encrypted settings"); System.out.println("Please look at Viewer for code sample to handle such files"); } } else{ /** * extract data from pdf */ try { //for (int page = start; page < end + 1; page++) { //read pages //decode the page decodePdf.decodePage(decode_pageno); //String contents[] = decodePdf.; // //debug only // FileWriter fw = new FileWriter("d:/abc/commands/raw-" + decode_pageno + ".txt"); StringBuffer sb = new StringBuffer(); for (int i=0;i<decodePdf.stream_data.length;i++) { sb.append((char)decodePdf.stream_data[i]); //fw.write(decodePdf.stream_data[i]); //debug only } // fw.flush(); //debug only /**use whole page size for demo - get data from PageData object*/ PdfPageData currentPageData = decodePdf.getPdfPageData(); int x1 = currentPageData.getMediaBoxX(decode_pageno); int x2 = currentPageData.getMediaBoxWidth(decode_pageno)+x1; int y2 = currentPageData.getMediaBoxX(decode_pageno); int y1 = currentPageData.getMediaBoxHeight(decode_pageno)-y2; //报存页面坐标参数 page_rect.setX(x1); page_rect.setWidth(x2 - x1); page_rect.setY(y2); page_rect.setHeight(y1-y2); //OutputStreamWriter output_stream = // new OutputStreamWriter( // new FileOutputStream(outputDir + "raw-re-"+decode_pageno + ".txt"), // "UTF-8"); StringBuffer line = new StringBuffer(); for (int j=0;j<sb.length();j++) { line.append(sb.charAt(j)); if (sb.charAt(j) == 10) { if ((line.toString().split(" ").length == 5) && ((line.toString().split(" ")[4].equals("re" + (char)10)) ||(line.toString().split(" ")[4].equals("re" + (char)13 + (char)10)))) { String[] command = line.toString().split(" "); Rect rect = new Rect(); rect.setX(Double.parseDouble(command[0])); rect.setY(page_rect.getHeight() - Double.parseDouble(command[1]) - Double.parseDouble(command[3])); 
rect.setWidth(Double.parseDouble(command[2])); rect.setHeight(Double.parseDouble(command[3])); //output.append(line); relist.add(rect); } line.setLength(0); } } // //debug only // if (decode_pageno == 6) { // StraightLines.printLines("d:/abc/relist_6.txt", relist); // } StraightLines.processReCommands(relist, decode_pageno); StraightLines.sortByYMinAsc(relist); //for (int j=0;j<relist.size();j++) { // Rect rect = relist.get(j); // output_stream.write(rect.getX() + " " + rect.getY() + " " + rect.getWidth() + " " + rect.getHeight() + " re" + "\n\r"); // } // output_stream.flush(); /** create a grouping object to apply grouping to data*/ PdfGroupingAlgorithms currentGrouping =decodePdf.getGroupingObject(); List<?> words =null; try{ /* words =currentGrouping.extractTextAsWordlist( x1, y1, x2, y2, decode_pageno, true,"&:=()!;.,\\/\"\"\'\'"); */ words =currentGrouping.extractTextAsWordlist( x1, y1, x2, y2, decode_pageno, true,""); } catch (PdfException e) { decodePdf.closePdfFile(); System.err.println("Exception= "+ e+" in "+file_name); } //DX20130614 if (words == null) { decodePdf.closePdfFile(); return; } Iterator<?> wordIterator=words.iterator(); while(wordIterator.hasNext()){ String currentWord=(String) wordIterator.next(); /**remove the XML formatting if present - not needed for pure text*/ currentWord=Strip.convertToText(currentWord, decodePdf.isXMLExtraction()); /**if(currentWord.indexOf(" ")!=-1){ System.out.println("word="+currentWord); System.exit(1); }*/ /** * these co-ordinates are absolute from the bottom of the page (MediaBox) * If you are extracting image (which may use crop, use need to modify as below */ double wx1 = Double.parseDouble((String) wordIterator.next()); double wy1 = Double.parseDouble((String) wordIterator.next()); double wx2 = Double.parseDouble((String) wordIterator.next()); double wy2 = Double.parseDouble((String) wordIterator.next()); /**this could be inserting into a database instead*/ TextLine text = new TextLine(); 
text.getRect().setX(wx1); text.getRect().setY(page_rect.getHeight() - wy1); text.getRect().setWidth(wx2); text.getRect().setHeight(wy1 - wy2); text.setText(currentWord); textlist.add(text); } System.out.println("Page " + decode_pageno + " extracted!"); //} } catch (Exception e) { decodePdf.closePdfFile(); System.err.println("Exception "+ e+" in "+file_name); e.printStackTrace(); } /** * flush data structures - not strictly required but included * as example */ decodePdf.flushObjectValues(true); //flush any text data read /**tell user*/ if(outputMessages) System.out.println("Text read"); } /**close the pdf file*/ decodePdf.closePdfFile(); decodePdf=null; } ////////////////////////////////////////////////////////////////////////// /** * main routine which checks for any files passed and runs the demo */ public static void main(String[] args) { FileUtils.save(generateXMLFile("r:/a.pdf", "R:/out.xml", "00000001"), "r:/z.xml"); } /** * return words extracted. We use this in some tests. */ public int getWordsExtractedCount() { return wordsExtracted; } /* * extract raw commands */ public List<Rect> parseFilePage(String filename, int pageno) { setDecode_pageno(pageno); decodeFile(filename); StraightLines.sortByXMax(relist); return relist; } public int getDecode_pageno() { return decode_pageno; } public void setDecode_pageno(int decode_pageno) { this.decode_pageno = decode_pageno; } private static double MINIMUN_LINE_LENGTH = 2; //算法描述 /* * 先获得所有横线,每两条相邻横线为一行 * 再每一行,获得所有有效竖线(同时与上下横线交叉的竖线),每两条相邻竖线为一列 * 最后生成单元表 * 获得所有水平线 */ public static String generateExcelTables(Element pageElement, int page_no, List<Rect> lines, List<TextLine> textlist) { List<Rect> column_lines = new ArrayList<Rect>(); List<Rect> horizontal_lines = new ArrayList<Rect>(); List<Rect> vertical_lines = new ArrayList<Rect>(); //去除短线 for (int i=0;i<lines.size();i++) { if ((lines.get(i).getWidth() > MINIMUN_LINE_LENGTH) && (lines.get(i).getX() > 0)){ horizontal_lines.add(lines.get(i)); } } 
StraightLines.sortByYMinAsc(horizontal_lines); //获得垂直线 for (int i=0;i<lines.size();i++) { if ((lines.get(i).getHeight() > MINIMUN_LINE_LENGTH) && (lines.get(i).getY() > 0)) { vertical_lines.add(lines.get(i)); } } StraightLines.sortByYMax(vertical_lines); /*if (pageElement.attribute("pageindex").getStringValue().equals("27")) { System.out.println("debug"); }*/ for (int i=0;i<horizontal_lines.size()-1;i++) { Rect topline, bottomline; topline = horizontal_lines.get(i); bottomline = horizontal_lines.get(i+1); for (int j=0;j<vertical_lines.size();j++) { //找到交叉该对水平线的垂直线 if (((vertical_lines.get(j).getY() - MINIMUN_LINE_LENGTH < topline.getY()) && (vertical_lines.get(j).getY() + vertical_lines.get(j).getHeight() + MINIMUN_LINE_LENGTH > topline.getY())) && ((vertical_lines.get(j).getY() - MINIMUN_LINE_LENGTH < bottomline.getY()) && (vertical_lines.get(j).getY() + vertical_lines.get(j).getHeight() + MINIMUN_LINE_LENGTH > bottomline.getY()))) { //如果结果中不存在该垂直线,则加入 boolean bFind = false; for (int k=0;k<column_lines.size();k++) { if (column_lines.get(k).getX() == vertical_lines.get(j).getX()) { bFind = true; break; } } if (!bFind) column_lines.add(vertical_lines.get(j)); } } } StraightLines.sortByXMin(column_lines); List<Rect> mergedhlines; mergedhlines = StraightLines.mergeHorizontalLines(horizontal_lines); StraightLines.sortByYMinAsc(mergedhlines); StraightLines.sortByXMin(vertical_lines); //xml表元素 Element tableElement = pageElement.addElement("table"); //Add by tangxc. 
//tableElement.addAttribute("border", "1"); String str_colX = ""; for (int j=0;j<column_lines.size();j++) { if (j==0) { str_colX = (int)column_lines.get(j).getX() + ""; } else { str_colX = str_colX + ":" + (int)column_lines.get(j).getX(); } } String str_colY = ""; TextLine.sortByYMinAsc(textlist); //在EXCEL文件中生成表格 //WritableWorkbook workbook = initOutputExcelFile(); //WritableSheet sheet = workbook.createSheet("Page", 0); int first_column_rowspan = 0; //第一列的行和并 /*Table table = null;*/ for (int i=0;i<mergedhlines.size()-1;i++) { Rect topline, bottomline; topline = mergedhlines.get(i); bottomline = mergedhlines.get(i+1); double leftline = 0; leftline = StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, 0); if (leftline == 0) { if (tableElement.nodeCount() > 0) { first_column_rowspan = 0; //DX20130704 reset tableElement.addAttribute("colX", str_colX); tableElement.addAttribute("colY", str_colY); tableElement = pageElement.addElement("table"); //Add by tangxc. //tableElement.addAttribute("border", "1"); str_colY = ""; } /*str_colY = ""; TextTr tr = new TextTr(); tr.setColX(str_colX); tr.setColY(str_colY); tr.setContent(""); Text txt = new Text(); txt.addTr(tr); page.addText(txt); */ continue; //没有交叉线 } else { if (str_colY.equals("")) { str_colY = (int)mergedhlines.get(i).getY() + ":" + (int)mergedhlines.get(i+1).getY(); } else { str_colY = str_colY + ":" + (int)mergedhlines.get(i+1).getY(); } if ((tableElement.nodeCount() > 0) && (i == (mergedhlines.size()-2))) { tableElement.addAttribute("colX", str_colX); tableElement.addAttribute("colY", str_colY); } /*table = new Table();*/ } Element rowElement = tableElement.addElement("tr"); /*TableTr tr = new TableTr(); table.addTr(tr);*/ do { double nextline = 0; boolean bFind = false; for (int j=0;j<column_lines.size();j++) { if (column_lines.get(j).getX() == leftline) { bFind = true; //找下一根交叉线 nextline = StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, leftline); if (nextline==0) { 
break; //没有下一根交叉线 } for (int m=j+1;m<column_lines.size();m++) { if (column_lines.get(m).getX() == nextline) { Element cellElement = null; //单元格的坐标 Rect cell_rect = new Rect(); cell_rect.setX(leftline); cell_rect.setWidth(nextline); cell_rect.setY(topline.getY()); cell_rect.setHeight(bottomline.getY()); /*TableTd td = null; */ if (leftline == StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, 0)) { //第一列 if (first_column_rowspan > 1) { first_column_rowspan--; continue; } else { first_column_rowspan = 0; } //设置边框 cellElement = rowElement.addElement("td"); cellElement.addAttribute("colspan", (m-j) + ""); /*td = new TableTd(); td.setColspan(String.valueOf(m-j)); tr.addTd(td);*/ //设置行合并 if ((bottomline.getX()-topline.getX())>10) { first_column_rowspan = 2; for (int p=i+2;p<mergedhlines.size();p++) { if ((mergedhlines.get(p).getX()-topline.getX())>10) { cell_rect.setHeight(mergedhlines.get(p).getY()); first_column_rowspan++; } else { break; } } cellElement.addAttribute("rowspan", (first_column_rowspan) + ""); /*td.setRowspan(String.valueOf(first_column_rowspan)); */ } } else { //非第一列 cellElement = rowElement.addElement("td"); cellElement.addAttribute("colspan", (m-j) + ""); /*td = new TableTd(); td.setColspan(String.valueOf(m-j)); tr.addTd(td);*/ } Element textElement = cellElement.addElement("text"); //SetSheetCell(sheet, i+1, j+1, 1, m-j, ""); //查找在topline, bottomline, leftline, nextline区域内的字符串,放入EXCEL表格 for (int n=0;n<textlist.size();n++) { TextLine textline = textlist.get(n); int rowspan = 1; if (cellElement.attribute("rowspan") != null) { rowspan = Integer.parseInt(cellElement.attribute("rowspan").getStringValue()); } if ((textline.getRect().getX() >= cell_rect.getX()) && ((textline.getRect().getX()) < cell_rect.getWidth()) && ((textline.getRect().getY()) >= cell_rect.getY()) && ((textline.getRect().getY()) < cell_rect.getHeight())) { textElement = cellElement.element("text"); mergeElement(textElement, textline); //mergeElement(td, textline); // 
textElement.addAttribute("height", (int)textline.getRect().getHeight() + ""); // textElement.addAttribute("width", (int)(textline.getRect().getWidth() - textline.getRect().getX()) + ""); // textElement.addAttribute("x", (int)textline.getRect().getX() + ""); // textElement.addAttribute("y", (int)textline.getRect().getY() + ""); // textElement.setText(textline.getText()); //cellText = cellText + textline.getText(); //找到 //SetSheetCell(sheet, i+1, j+1, 1, m-j, textline.getText()); } } //cellElement.setText(cellText); } } } } leftline = nextline; } while (leftline != 0); } //pageElement.elements().remove(pageElement.elements().size()); Rect[] tables = new Rect[pageElement.elements().size()]; int table_indexes[] = new int[pageElement.elements().size()]; //表的元素索引值 int last_table_index = -1; //最后一张表格索引号 for (int i=0;i<pageElement.elements().size();i++) { Element node = (Element)pageElement.elements().get(i); if ((node.attribute("colX") == null) || (node.attributeValue("colX").equals(""))) continue; tables[i] = new Rect(); table_indexes[i] = i; String cols_X[] = node.attributeValue("colX").split(":"); String cols_Y[] = node.attributeValue("colY").split(":"); tables[i].setX(Double.parseDouble(cols_X[0])); tables[i].setY(Double.parseDouble(cols_Y[0])); tables[i].setWidth((Double.parseDouble(cols_X[cols_X.length-1])-tables[i].getX())); tables[i].setHeight((Double.parseDouble(cols_Y[cols_Y.length-1])-tables[i].getY())); last_table_index = i; } Page page = new Page(); page.setCurrentNum(page_no); //表格数据后期处理 //去除空行 for (int i=0;i<pageElement.elements().size();i++) { Element table = (Element)pageElement.elements().get(i); Table tab = new Table(); tab.setColX(table.attributeValue("colX")); tab.setColY(table.attributeValue("colY")); page.addTable(tab); List<Integer> empty_row_index_list = new ArrayList<Integer>(); for (int j=0;j<table.elements().size();j++) { Element tr = (Element)table.elements().get(j); TableTr myTr = new TableTr(); tab.addTr(myTr); boolean b_empty_row = 
tr.elements().size()>0?true:false; for (int k=0;k<tr.elements().size();k++) { Element td = (Element)tr.elements().get(k); if (!td.getStringValue().equals("")) { b_empty_row = false; Element text = td.element("text"); TableTd myTd = new TableTd(); int x = Integer.parseInt(text.attributeValue("x")); int y = Integer.parseInt(text.attributeValue("y")); int w = Integer.parseInt(text.attributeValue("width")); int h = Integer.parseInt(text.attributeValue("height")); myTd.setColX(x+":"+(x+w)); myTd.setColY(y+":"+(y+h)); myTd.setColspan(td.attributeValue("colspan")); myTd.setRowspan(td.attributeValue("rowspan")); myTd.setContent(td.getStringValue()); myTr.addTd(myTd); continue; } } if (b_empty_row) { empty_row_index_list.add(j); } } for (int l=empty_row_index_list.size();l>0;l--) { table.elements().remove((int)((Integer)empty_row_index_list.get(l-1).intValue())); } } //表外文本 Rect lastrect = null; for (TextLine textline:textlist) { boolean inserted = false; Rect rect = textline.getRect(); for (int i=0;i<tables.length;i++) { if (tables[i] == null) continue; if (rect.getY()<tables[i].getY() || ((rect.getY() > tables[i].getY()) && (rect.getY() < (tables[i].getY() + tables[i].getHeight())) && (rect.getX() < tables[i].getX())) || ((rect.getY() > tables[i].getY()) && (rect.getY() < (tables[i].getY() + tables[i].getHeight())) && (rect.getX() > (tables[i].getX() + tables[i].getWidth())))) { //是否表外数据 if ((i==0) || (rect.getY() > (tables[i-1].getY() + tables[i-1].getHeight()))) { int step = 1; Element element = DocumentHelper.createElement("text"); element.addAttribute("x", ""+(int)rect.getX()); element.addAttribute("y", ""+(int)rect.getY()); element.addAttribute("width", ""+(int)rect.getWidth()); element.addAttribute("height", ""+(int)rect.getHeight()); element.setText(textline.getText()); TextTr myTr = new TextTr(); myTr.setColX(rect.getX() +":" + ((int)rect.getX() + (int)rect.getWidth())); myTr.setColY(rect.getY() +":" + ((int)rect.getY() + (int)rect.getHeight())); 
myTr.setContent(textline.getText()); Text myTxt = new Text(); myTxt.addTr(myTr); page.addText(myTxt); pageElement.content().add(table_indexes[i], element); if (lastrect == null) { lastrect = rect; } else { if (Math.abs(lastrect.getY() - rect.getY())>(lastrect.getHeight()/2)) { lastrect = rect; //element = DocumentHelper.createElement("br"); //pageElement.content().add(table_indexes[i], element); //element.addElement("br"); pageElement.content().add(pageElement.indexOf(element), DocumentHelper.createElement("br")); step = 2; } } for (int j=i;j<table_indexes.length;j++) { table_indexes[j]+=step; } inserted = true; break; } } } if (!inserted) { if ((last_table_index < 0) || (rect.getY() > (tables[last_table_index].getY()+tables[last_table_index].getHeight()))) { Element element = pageElement.addElement("text"); if (lastrect == null) { lastrect = rect; } else { if (Math.abs(lastrect.getY() - rect.getY())>(lastrect.getHeight()/2)) { lastrect = rect; //element.addElement("br"); //pageElement.content().add(pageElement.indexOf(element), DocumentHelper.createElement("br")); } } element.addAttribute("x", ""+(int)rect.getX()); element.addAttribute("y", ""+(int)rect.getY()); element.addAttribute("width", ""+(int)rect.getWidth()); element.addAttribute("height", ""+(int)rect.getHeight()); element.setText(textline.getText()); TextTr myTr = new TextTr(); myTr.setColX(rect.getX() +":" + ((int)rect.getX() + (int)rect.getWidth())); myTr.setColY(rect.getY() +":" + ((int)rect.getY() + (int)rect.getHeight())); myTr.setContent(textline.getText()); Text myTxt = new Text(); myTxt.addTr(myTr); page.addText(myTxt); inserted = true; } } } //uninitOutputExcelFile(workbook); return page.toString(); } public static WritableWorkbook initOutputExcelFile() { WritableWorkbook workbook = null; WritableSheet sheet = null; try { workbook = Workbook.createWorkbook(new File("d:/output.xls")); sheet = workbook.createSheet("Page", 0); } catch (IOException e) { // TODO Auto-generated catch block 
e.printStackTrace();
        }
        return workbook;
    }

    /**
     * Writes the pending workbook content and then closes the workbook.
     * I/O and write failures are printed to stderr and otherwise ignored.
     *
     * @param workbook the workbook to write and close
     */
    public static void uninitOutputExcelFile(WritableWorkbook workbook) {
        try {
            workbook.write();
            workbook.close();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (WriteException e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends {@code text} to whatever the cell at ({@code column}, {@code row}) already
     * contains, gives the cell a thin border on all four sides and merges it with its
     * neighbours so that it covers {@code colspan} columns and {@code rowspan} rows.
     *
     * @param sheet   sheet to modify
     * @param row     zero-based row of the top-left cell
     * @param column  zero-based column of the top-left cell
     * @param rowspan number of rows the merged cell covers
     * @param colspan number of columns the merged cell covers
     * @param text    text appended to the current cell content
     */
    private static void SetSheetCell(WritableSheet sheet, int row, int column, int rowspan, int colspan, String text) {
        try {
            // jxl addresses cells as (column, row), not (row, column).
            Label number = new Label(column, row, sheet.getCell(column, row).getContents() + text);
            WritableCellFormat format = new WritableCellFormat();
            format.setBorder(jxl.format.Border.LEFT, jxl.format.BorderLineStyle.THIN);   // border style
            format.setBorder(jxl.format.Border.RIGHT, jxl.format.BorderLineStyle.THIN);  // border style
            format.setBorder(jxl.format.Border.TOP, jxl.format.BorderLineStyle.THIN);    // border style
            format.setBorder(jxl.format.Border.BOTTOM, jxl.format.BorderLineStyle.THIN); // border style
            number.setCellFormat(format);
            sheet.addCell(number);
            // The merge range is inclusive, hence the "- 1" on both spans.
            sheet.mergeCells(column, row, column + colspan - 1, row + rowspan - 1);
        } catch (WriteException e) {
            e.printStackTrace();
        }
    }

    /**
     * Convenience overload of {@link #generateXMLFile(String, String, String)} that uses an
     * empty document id.
     */
    public static String generateXMLFile(String filename, String xml_filename) {
        return generateXMLFile(filename, xml_filename, "");
    }

    /**
     * Extracts the text and tables of every page of a PDF file and returns them as an XML
     * fragment (one chunk per page, concatenated and post-processed by {@link #filte}).
     *
     * @param filename     path of the PDF file to read
     * @param xml_filename unused; the XML is returned, not written to a file
     * @param id           document id stored on the root element
     * @return the generated XML fragment, or {@code ""} when the PDF cannot be opened
     */
    public static String generateXMLFile(String filename, String xml_filename, String id) {
        PdfDecoder decodePdf = new PdfDecoder(true);
        try {
            decodePdf.openPdfFile(filename);
        } catch (PdfException e1) {
            e1.printStackTrace();
            // Report failure to the caller instead of terminating the whole JVM.
            return "";
        }
        return extractPages(decodePdf, filename, id);
    }

    /**
     * Same as {@link #generateXMLFile(String, String, String)} but opens the PDF from a stream
     * to determine the page count.
     *
     * @param stream   PDF content
     * @param filename name of the PDF; also handed to the per-page parser
     * @param id       document id stored on the root element
     * @return the generated XML fragment, or {@code ""} when the PDF cannot be opened
     */
    public static String generateXMLFile(InputStream stream, String filename, String id) {
        PdfDecoder decodePdf = new PdfDecoder(true);
        try {
            decodePdf.openPdfFileFromInputStream(stream, false);
        } catch (PdfException e1) {
            e1.printStackTrace();
            // Report failure to the caller instead of terminating the whole JVM.
            return "";
        }
        // NOTE(review): the pages are still parsed via parseFilePage(filename, ...), i.e. from
        // the file name rather than from the stream - confirm that filename is a readable path.
        return extractPages(decodePdf, filename, id);
    }

    /**
     * Walks all pages of an already opened PDF, parses each page and collects the XML produced
     * by generateExcelTables. The decoder is only used for the page count and is closed before
     * returning.
     */
    private static String extractPages(PdfDecoder decodePdf, String filename, String id) {
        StringBuffer sb = new StringBuffer();
        ExtractRawStream ers = new ExtractRawStream();
        Element rootElement = ers.initXMLFile(filename, id);
        try {
            int start = 1, end = decodePdf.getPageCount();
            for (int page_no = start; page_no < end + 1; page_no++) {
                // Per-page state is accumulated in the instance, so reset it for each page.
                ers.relist.clear();
                ers.textlist.clear();
                ers.page_rect.clear();
                Element pageElement = rootElement.addElement("page");
                pageElement.addAttribute("pageindex", page_no + "");
                ers.parseFilePage(filename, page_no);
                sb.append(ExtractRawStream.generateExcelTables(pageElement, page_no, ers.relist, ers.textlist));
            }
        } finally {
            decodePdf.closePdfFile();
        }
        return filte(sb);
    }

    /**
     * Removes attributes that were rendered from null values and some empty table elements
     * from the generated XML text.
     */
    private static String filte(StringBuffer sb) {
        String result = sb.toString();
        result = ignoreTag("colX=\"null\"", result);
        result = ignoreTag("colY=\"null\"", result);
        result = ignoreTag("<table colX=\"null\" colY=\"null\"></table>", result);
        result = ignoreTag("<table></table>", result);
        result = ignoreTag("<table ></table>", result);
        result = ignoreTag(" colspan=\"null\"", result);
        result = ignoreTag(" rowspan=\"null\"", result);
        return result;
    }

    /**
     * Creates a new dom4j document with a {@code pdf} root element carrying the document id and
     * the file name without its directory part.
     */
    private Element initXMLFile(String file_name, String id) {
        Document document = DocumentHelper.createDocument();
        Element rootElement = document.addElement("pdf");
        rootElement.addAttribute("id", id);
        rootElement.addAttribute("filename", new File(file_name).getName());
        return rootElement;
    }

    /** Creates the parent directory of {@code filename} when it does not exist yet. */
    private static void checkPath(String filename) {
        File file = new File(filename).getParentFile();
        if (file != null && !file.exists()) {
            file.mkdirs();
        }
    }

    /**
     * Merges a text line into an XML element: the first line initialises the element's
     * x/y/width/height attributes and text, later lines grow the stored box and append their
     * text.
     *
     * <p>The code treats {@code textline.getRect().getWidth()} as the right edge of the line
     * (it subtracts {@code getX()} to obtain a width), while {@code getHeight()} is used as a
     * real height.
     */
    @SuppressWarnings("deprecation")
    private static void mergeElement(Element textElement, TextLine textline) {
        String x = "";
        String y = "";
        String width = "";
        String height = "";
        if (textElement.attribute("x") != null) {
            x = textElement.attribute("x").getStringValue();
            y = textElement.attribute("y").getStringValue();
            width = textElement.attribute("width").getStringValue();
            height = textElement.attribute("height").getStringValue();
        }
        if (x == null || x.equals("null") || x.equals("")) {
            // First line for this element: take the box straight from the text line.
            x = (int) textline.getRect().getX() + "";
            y = (int) textline.getRect().getY() + "";
            width = (int) (textline.getRect().getWidth() - textline.getRect().getX()) + "";
            height = (int) textline.getRect().getHeight() + "";
            textElement.addAttribute("x", x);
            textElement.addAttribute("y", y);
            textElement.addAttribute("width", width);
            textElement.addAttribute("height", height);
            textElement.setText(textline.getText());
        } else {
            // Element already has a box: extend it so that it also covers the new line.
            Rect rect = new Rect();
            rect.setX((Double.parseDouble(x) < textline.getRect().getX()) ? Double.parseDouble(x) : textline.getRect().getX());
            rect.setY((Double.parseDouble(y) < textline.getRect().getY()) ? Double.parseDouble(y) : textline.getRect().getY());
            double x_max = Double.parseDouble(x) + Double.parseDouble(width);
            if (x_max < textline.getRect().getWidth()) {
                rect.setWidth(textline.getRect().getWidth() - textline.getRect().getX());
            } else {
                rect.setWidth(Double.parseDouble(width));
            }
            double y_max = Double.parseDouble(y) + Double.parseDouble(height);
            if (y_max < (textline.getRect().getY() + textline.getRect().getHeight())) {
                rect.setHeight(textline.getRect().getY() + textline.getRect().getHeight() - rect.getY());
            } else {
                rect.setHeight(y_max - rect.getY());
            }
            textElement.addAttribute("x", (int) rect.getX() + "");
            textElement.addAttribute("y", (int) rect.getY() + "");
            textElement.addAttribute("width", (int) rect.getWidth() + "");
            textElement.addAttribute("height", (int) rect.getHeight() + "");
            textElement.setText(textElement.getText() + textline.getText());
        }
    }

    /**
     * Copies the position and text of a text line into a table cell. {@code colX} and
     * {@code colY} are written as {@code "min:max"} integer ranges.
     *
     * <p>The ranges are computed numerically; concatenating the string forms of the start and
     * the extent (e.g. "10" and "5" giving "105") does not yield a coordinate.
     */
    private static void mergeElement(TableTd td, TextLine textline) {
        Rect bounds = textline.getRect();
        int x = (int) bounds.getX();
        int y = (int) bounds.getY();
        // getWidth() is used as the right edge here, so subtract x to obtain the extent.
        int width = (int) (bounds.getWidth() - bounds.getX());
        int height = (int) bounds.getHeight();
        td.setColX(x + ":" + (x + width));
        td.setColY(y + ":" + (y + height));
        td.setContent(textline.getText());
    }

    /**
     * Data extraction for tables without drawn borders. Unfinished: it currently only runs
     * {@link TextLine#groupByX(List)} on the text lines (which sorts the list and snaps close
     * X values together) and discards the result.
     */
    public void noframe_table_parse(List<Rect> lines, List<TextLine> textlist) {
        TextLine.groupByX(textlist);
    }

    /** Removes every literal occurrence of {@code tag} from {@code origin}. */
    private static String ignoreTag(String tag, String origin) {
        // Literal replacement: the tags are plain text, not regular expressions.
        return origin.replace(tag, "");
    }
}
package org.aoe.software.pdf; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; public class FileUtils { public static boolean save(String data, String filepath){ OutputStream os = null; try { os = new FileOutputStream(filepath); BufferedWriter out = new BufferedWriter(new OutputStreamWriter(os)); out.write(data); out.flush(); return true; } catch (Exception e) { return false; } finally{ if(os != null){ try { os.close(); } catch (IOException e) { os = null; } } } } }
package org.aoe.software.pdf; public class Rect { double x; double y; double width; double height; Rect() { } Rect(Rect rect) { set(rect); } public void set(Rect rect) { this.x = rect.getX(); this.y = rect.getY(); this.width = rect.getWidth(); this.height = rect.getHeight(); } public double getX() { return x; } public void setX(double x) { this.x = x; } public double getY() { return y; } public void setY(double y) { this.y = y; } public double getWidth() { return width; } public void setWidth(double width) { this.width = width; } public double getHeight() { return height; } public void setHeight(double height) { this.height = height; } public void clear() { x = 0; y = 0; width = 0; height = 0; } }
package org.aoe.software.pdf; import java.util.ArrayList; import java.util.List; public class TextLine { private String text; private Rect rect = new Rect(); public String getText() { return text; } public void setText(String text) { this.text = text; } public Rect getRect() { return rect; } public void setRect(Rect rect) { this.rect = rect; } public static void sortByYMinAsc(List<TextLine> textlist) { int i, j; boolean ischanged = false; for (j=textlist.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (textlist.get(i).getRect().getY() > textlist.get(i+1).getRect().getY()) { TextLine temp = textlist.get(i); textlist.set(i, textlist.get(i+1)); textlist.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } public static double MINIMUN_LINE_LENGTH = 3; public static List<List<TextLine>> groupByX(List<TextLine> lines) { if (lines.size() == 0) return null; TextLine.sortByXMinAsc(lines); List<List<TextLine>> out = new ArrayList<List<TextLine>>(); double lastx = 0; lastx = lines.get(0).getRect().getX(); List<TextLine> current = new ArrayList<TextLine>(); for (TextLine line:lines) { //分组 if ((line.getRect().getX() - lastx) > MINIMUN_LINE_LENGTH) { out.add(current); current = new ArrayList<TextLine>(); lastx = line.getRect().getX(); current.add(line); } else { line.getRect().setX(lastx); //去掉双线 current.add(line); } } out.add(current); return out; } public static List<List<TextLine>> groupByY(List<TextLine> lines) { if (lines.size() == 0) return null; TextLine.sortByYMinAsc(lines); List<List<TextLine>> out = new ArrayList<List<TextLine>>(); double lasty = 0; lasty = lines.get(0).getRect().getY(); List<TextLine> current = new ArrayList<TextLine>(); for (TextLine line:lines) { //分组 if ((line.getRect().getY() - lasty) > MINIMUN_LINE_LENGTH) { out.add(current); current = new ArrayList<TextLine>(); lasty = line.getRect().getY(); current.add(line); } else { line.getRect().setY(lasty); //去掉双线 current.add(line); } } out.add(current); return out; } //按x轴值排序 
public static void sortByXMinAsc(List<TextLine> in) { int i, j; boolean ischanged = false; for (j=in.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (in.get(i).getRect().getX() < in.get(i+1).getRect().getX()) { TextLine temp = in.get(i); in.set(i, in.get(i+1)); in.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } }
package org.aoe.software.pdf; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class StraightLines { //按x轴值排序 public static void sortByXMin(List<Rect> in) { int i, j; boolean ischanged = false; for (j=in.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (in.get(i).getX() > in.get(i+1).getX()) { Rect temp = in.get(i); in.set(i, in.get(i+1)); in.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } //按x轴值排序 public static void sortByXMinAsc(List<Rect> in) { int i, j; boolean ischanged = false; for (j=in.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (in.get(i).getX() < in.get(i+1).getX()) { Rect temp = in.get(i); in.set(i, in.get(i+1)); in.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } //按y轴值排序 public static void sortByYMin(List<Rect> in) { int i, j; boolean ischanged = false; for (j=in.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (in.get(i).getY() < in.get(i+1).getY()) { Rect temp = in.get(i); in.set(i, in.get(i+1)); in.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } //按y轴值排序 public static void sortByYMinAsc(List<Rect> in) { int i, j; boolean ischanged = false; for (j=in.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (in.get(i).getY() > in.get(i+1).getY()) { Rect temp = in.get(i); in.set(i, in.get(i+1)); in.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } //按宽度排序 public static void sortByXMax(List<Rect> in) { int i, j; boolean ischanged = false; for (j=in.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (in.get(i).getWidth() > in.get(i+1).getWidth()) { Rect temp = in.get(i); in.set(i, in.get(i+1)); in.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } //按高度排序 public static void sortByYMax(List<Rect> in) { int i, j; boolean ischanged = false; for (j=in.size();j>0;j--) { ischanged = false; for (i=0;i<j-1;i++) { if (in.get(i).getHeight() > 
in.get(i+1).getHeight()) { Rect temp = in.get(i); in.set(i, in.get(i+1)); in.set(i+1, temp); ischanged = true; } } if (!ischanged) break; } } public static List<Rect> mergeHorizontalLines(List<Rect> horizontal_lines) { List<Rect> merged_lines = new ArrayList<Rect>(); if ((horizontal_lines == null)) return merged_lines; //水平按序排列 StraightLines.sortByYMinAsc(horizontal_lines); double lasty = 0; Rect temp = new Rect(); for (int i=0;i<horizontal_lines.size();i++) { Rect line = horizontal_lines.get(i); if (lasty == 0) { lasty = line.getY(); temp = new Rect(); temp.setX(line.getX()); temp.setWidth(line.getWidth()); temp.setY(line.getY()); temp.setHeight(line.getHeight()); continue; } if (line.getY() != lasty) { lasty = line.getY(); merged_lines.add(temp); temp = new Rect(); temp.setX(line.getX()); temp.setWidth(line.getWidth()); temp.setY(line.getY()); temp.setHeight(line.getHeight()); } else { //合并线 if (temp.getWidth() == 0) { temp.setX(line.getX()); temp.setWidth(line.getWidth()); temp.setY(line.getY()); temp.setHeight(line.getHeight()); } else { double xMin = (line.getX() < temp.getX())?line.getX():temp.getX(); if ((line.getX() + line.getWidth()) > (temp.getX() + temp.getWidth())) { temp.setWidth((line.getX() + line.getWidth()) - temp.getX()); } else { temp.setWidth((temp.getX() + temp.getWidth()) - temp.getX()); } temp.setX(xMin); } } } if (temp.getWidth() > 0) { merged_lines.add(temp); } return merged_lines; } public static List<Rect> mergeVerticalLines(List<Rect> vertical_lines) { List<Rect> merged_lines = new ArrayList<Rect>(); if ((vertical_lines == null)) return merged_lines; //水平按序排列 StraightLines.sortByXMinAsc(vertical_lines); double lasty = 0; Rect temp = new Rect(); for (int i=0;i<vertical_lines.size();i++) { Rect line = vertical_lines.get(i); if (lasty == 0) { lasty = line.getY(); temp = new Rect(); temp.setX(line.getX()); temp.setWidth(line.getWidth()); temp.setY(line.getY()); temp.setHeight(line.getHeight()); continue; } if (line.getY() != lasty) { lasty = 
line.getY(); merged_lines.add(temp); temp = new Rect(); temp.setX(line.getX()); temp.setWidth(line.getWidth()); temp.setY(line.getY()); temp.setHeight(line.getHeight()); } else { //合并线 if (temp.getWidth() == 0) { temp.setX(line.getX()); temp.setWidth(line.getWidth()); temp.setY(line.getY()); temp.setHeight(line.getHeight()); } else { double xMin = (line.getX() < temp.getX())?line.getX():temp.getX(); if ((line.getX() + line.getWidth()) > (temp.getX() + temp.getWidth())) { temp.setWidth((line.getX() + line.getWidth()) - temp.getX()); } else { temp.setWidth((temp.getX() + temp.getWidth()) - temp.getX()); } temp.setX(xMin); } } } if (temp.getWidth() > 0) { merged_lines.add(temp); } return merged_lines; } public static double MINIMUN_LINE_LENGTH = 3; public static double getNextVerticalLine(Rect topline, Rect bottomline, List<Rect> in_vertical_lines, double startx) { List<Rect> vertical_lines = in_vertical_lines; StraightLines.sortByXMin(vertical_lines); double result = 0; boolean bFind = false; int start; if (startx==0) { start = -1; } else { for (start=0;start<vertical_lines.size();start++) { if (vertical_lines.get(start).getX() == startx) { if (((vertical_lines.get(start).getY() - MINIMUN_LINE_LENGTH < topline.getY()) && (vertical_lines.get(start).getY() + vertical_lines.get(start).getHeight() + MINIMUN_LINE_LENGTH > topline.getY())) && ((vertical_lines.get(start).getY() - MINIMUN_LINE_LENGTH < bottomline.getY()) && (vertical_lines.get(start).getY() + vertical_lines.get(start).getHeight() + MINIMUN_LINE_LENGTH > bottomline.getY()))) { for (;start<vertical_lines.size() && vertical_lines.get(start).getX() == startx;start++); start--; bFind = true; break; } } } } bFind = false; for (int i=start+1;i<vertical_lines.size();i++) { if (((vertical_lines.get(i).getY() - MINIMUN_LINE_LENGTH < topline.getY()) && (vertical_lines.get(i).getY() + vertical_lines.get(i).getHeight() + MINIMUN_LINE_LENGTH > topline.getY())) && ((vertical_lines.get(i).getY() - MINIMUN_LINE_LENGTH < 
bottomline.getY()) && (vertical_lines.get(i).getY() + vertical_lines.get(i).getHeight() + MINIMUN_LINE_LENGTH > bottomline.getY()))) { bFind = true; result = vertical_lines.get(i).getX(); break; } } return result; } public static void processReCommands(List<Rect> relist, int page_no) { List<Rect> temp = new ArrayList<Rect>(); //Rect line; for (Rect line:relist) { if ((line.getWidth()>MINIMUN_LINE_LENGTH) && (line.getHeight()>MINIMUN_LINE_LENGTH)) { Rect topline = new Rect(); topline.setX(line.getX()); topline.setY(line.getY()); topline.setWidth(line.getWidth()); topline.setHeight(0.1); Rect bottomline = new Rect(); bottomline.setX(line.getX()); bottomline.setY(line.getY()+line.getHeight()); bottomline.setWidth(line.getWidth()); bottomline.setHeight(0.1); Rect leftline = new Rect(); leftline.setX(line.getX()); leftline.setY(line.getY()); leftline.setHeight(line.getHeight()); leftline.setWidth(0.1); Rect rightline = new Rect(); rightline.setX(line.getX()+line.getWidth()); rightline.setY(line.getY()); rightline.setHeight(line.getHeight()); rightline.setWidth(0.1); temp.add(topline); temp.add(bottomline); temp.add(leftline); temp.add(rightline); } else { temp.add(line); } } relist.clear(); relist.addAll(temp); temp.clear(); List<Rect> horizontal_lines = new ArrayList<Rect>(); List<Rect> vertical_lines = new ArrayList<Rect>(); Rect lastline = new Rect(); //获得水平线 for (int i=0;i<relist.size();i++) { if (relist.get(i).getWidth() > MINIMUN_LINE_LENGTH) { horizontal_lines.add(relist.get(i)); } } StraightLines.sortByYMinAsc(horizontal_lines); for (Rect line:horizontal_lines) { if ((lastline.getHeight() == 0) && (lastline.getWidth() == 0)) { lastline.set(line); continue; } if (Math.abs((lastline.getY() - line.getY())) > MINIMUN_LINE_LENGTH) { temp.add(lastline); lastline = new Rect(); lastline.set(line); } else { if (lastline.getX() > line.getX()) { lastline.setX(line.getX()); } if ((lastline.getX() + lastline.getWidth()) < (line.getX() + line.getWidth())) { 
lastline.setWidth((line.getX() + line.getWidth()) - lastline.getX()); } } } if ((lastline.getHeight() != 0) && (lastline.getWidth() != 0)) { temp.add(lastline); } //获得垂直线 for (int i=0;i<relist.size();i++) { if (relist.get(i).getHeight() > MINIMUN_LINE_LENGTH) { if (relist.get(i).getX() != 0) { vertical_lines.add(relist.get(i)); } } } List<List<Rect>> v_list = StraightLines.groupByX(vertical_lines); List<Rect> v_lines = StraightLines.remergeVerticalLines(v_list); temp.addAll(v_lines); /* StraightLines.sortByXMinAsc(vertical_lines); lastline = new Rect(); for (Rect line:vertical_lines) { if ((lastline.getHeight() == 0) && (lastline.getWidth() == 0)) { lastline.set(line); continue; } if (Math.abs((lastline.getX() - line.getX())) > MINIMUN_LINE_LENGTH*5) { lastline.setY(lastline.getY() - 2); lastline.setHeight(lastline.getHeight() + 2); temp.add(lastline); lastline = new Rect(); lastline.set(line); } else { if (lastline.getY() > line.getY()) { lastline.setY(line.getY()); } if ((lastline.getY() + lastline.getHeight()) < (line.getY() + line.getHeight())) { lastline.setHeight((line.getY() + line.getHeight()) - lastline.getY()); } } } if ((lastline.getHeight() != 0) && (lastline.getWidth() != 0)) { temp.add(lastline); } */ relist.clear(); relist.addAll(temp); } public static void printLines(String filename, List<Rect> lines) { FileWriter out; try { out = new FileWriter(new File(filename)); for (Rect line:lines) { out.write(line.getX() + ", " + line.getY() + ", " + line.getWidth() + ", " + line.getHeight() + " re" + (char)(10) + (char)(13)); } out.flush(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static List<List<Rect>> groupByX(List<Rect> lines) { if (lines.size() == 0) return null; StraightLines.sortByXMin(lines); List<List<Rect>> out = new ArrayList<List<Rect>>(); double lastx = 0; lastx = lines.get(0).getX(); List<Rect> current = new ArrayList<Rect>(); for (Rect line:lines) { //分组 if ((line.getX() - 
lastx)>MINIMUN_LINE_LENGTH) { out.add(current); current = new ArrayList<Rect>(); lastx = line.getX(); current.add(line); } else { line.setX(lastx); //去掉双线 current.add(line); } } out.add(current); return out; } public static List<Rect> remergeVerticalLines(List<List<Rect>> lines_list) { List<Rect> out = new ArrayList<Rect>(); if (lines_list == null) return out; for (List<Rect> lines: lines_list) { StraightLines.sortByYMinAsc(lines); Rect current = new Rect(); for (Rect line: lines) { if (line.getY() == 0) continue; if (current.getY() == 0) current.set(line); if (line.getY()<=(current.getY()+current.getHeight())) { double height = 0; if ((line.getY() + line.getHeight()) > (current.getY() + current.getHeight())) { height = line.getY() + line.getHeight() - current.getY(); current.setHeight(height); } } else { out.add(current); current = new Rect(); current.set(line); } } out.add(current); } return out; } }
PO
package org.aoe.software.pdf.po; import java.util.LinkedList; import java.util.List; /** * 每页对象. * * <page pageIndex="1"> <text> <tr colX="x1:x2" colY="y1:y2">ssssssssss</tr> </text> <table colX="x1:x2:x3" colY="y1:y2:y3:y4"> <tr> <td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2">TTTT</td> </tr> </table> </page> * */ public class Page { private int currentNum; private List<Text> textList = new LinkedList<Text>(); private List<Table> tableList = new LinkedList<Table>(); private List<Integer> seqList = new LinkedList<Integer>(); // 0:text 1:table public String toString(){ StringBuffer sb = new StringBuffer(); int textIndex = 0; int tableIndex = 0; for(int i : seqList){ if(i == 0){ sb.append(textList.get(textIndex++).toString()); }else{ sb.append(tableList.get(tableIndex++).toString()); } } return String.format("<page pageIndex=\"%s\">%s</page>", currentNum, sb.toString()); } public int getCurrentNum() { return currentNum; } public void setCurrentNum(int currentNum) { this.currentNum = currentNum; } public void addText(Text text){ textList.add(text); seqList.add(0); } public void addTable(Table table){ tableList.add(table); seqList.add(1); } }
package org.aoe.software.pdf.po;

import java.util.LinkedList;
import java.util.List;

/**
 * A table: its colX/colY descriptors plus its rows in insertion order.
 */
public class Table {

    private String colX;
    private String colY;
    private List<TableTr> trList = new LinkedList<TableTr>();

    /** Renders the table element with every row in the order it was added. */
    public String toString() {
        StringBuilder rows = new StringBuilder();
        for (TableTr row : trList) {
            rows.append(row.toString());
        }
        return String.format("<table border=\"1\" colX=\"%s\" colY=\"%s\">%s</table>", colX, colY, rows.toString());
    }

    /** Appends a row to the table. */
    public void addTr(TableTr tr) {
        this.trList.add(tr);
    }

    public String getColX() {
        return this.colX;
    }

    public void setColX(String colX) {
        this.colX = colX;
    }

    public String getColY() {
        return this.colY;
    }

    public void setColY(String colY) {
        this.colY = colY;
    }
}
package org.aoe.software.pdf.po; /** * 表格的单元格。 * */ public class TableTd { private String colX; private String colY; private String colspan; private String rowspan; private String content; @Override public String toString(){ return String.format("<td colX=\"%s\" colY=\"%s\" colspan=\"%s\" rowspan=\"%s\">%s</td>", colX, colY, colspan, rowspan, content); } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public String getColX() { return colX; } public void setColX(String colX) { this.colX = colX; } public String getColY() { return colY; } public void setColY(String colY) { this.colY = colY; } public String getColspan() { return colspan; } public void setColspan(String colspan) { this.colspan = colspan; } public String getRowspan() { return rowspan; } public void setRowspan(String rowspan) { this.rowspan = rowspan; } }
package org.aoe.software.pdf.po;

import java.util.LinkedList;
import java.util.List;

/**
 * A table row: the cells it contains, in insertion order.
 */
public class TableTr {

    private List<TableTd> tdList = new LinkedList<TableTd>();

    /** Renders the row as a {@code tr} element wrapping every cell in order. */
    public String toString() {
        StringBuilder row = new StringBuilder("<tr>");
        for (TableTd cell : tdList) {
            row.append(cell.toString());
        }
        return row.append("</tr>").toString();
    }

    /** Appends a cell to the row. */
    public void addTd(TableTd td) {
        this.tdList.add(td);
    }
}
package org.aoe.software.pdf.po;

import java.util.LinkedList;
import java.util.List;

/**
 * A block of text: the text rows it contains, in insertion order.
 */
public class Text {

    private List<TextTr> trList = new LinkedList<TextTr>();

    /** Renders the block as a {@code text} element wrapping every row in order. */
    public String toString() {
        StringBuilder rows = new StringBuilder();
        for (TextTr row : trList) {
            rows.append(row.toString());
        }
        return String.format("<text>%s</text>", rows.toString());
    }

    /** Appends a text row to the block. */
    public void addTr(TextTr tr) {
        this.trList.add(tr);
    }
}
package org.aoe.software.pdf.po; /** * 文本行 。 * */ public class TextTr { private String colX; private String colY; private String content; public String toString(){ return String.format("<tr colX=\"%s\" colY=\"%s\">%s</tr>", colX, colY, content); } public String getColX() { return colX; } public void setColX(String colX) { this.colX = colX; } public String getColY() { return colY; } public void setColY(String colY) { this.colY = colY; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } }