使用jpedal解析PDF到XML

接口类:
package org.aoe.software.pdf;

import java.io.InputStream;

/**
 *Convert PDF to XML.
 *PDF转XML的格式定义

<pdf id="00000001" fileName="temp0001.pdf">
  <page pageIndex="1">
    <text>
      <tr colX="x1:x2" colY="y1:y2">ssssssssss</tr>
    </text>
    <table  colX="x1:x2:x3" colY="y1:y2:y3:y4">
      <tr>
         <td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2">TTTT</td>
      </tr>
    </table>
  </page>
</pdf>


说明:
id:表示PDF文件的唯一ID标识名,可以为空,是由调用者传入的参数
fileName:表示PDF的文件名称,不可为空(去除文件中所包含的路径),

page:表示页面信息
pageIndex:表示PDF文件的具体页码信息
text:表示PDF内容中的段落信息
table:表示PDF内容中的表格信息

tr:表示行信息
td:表示表格中的单元格信息

冒号分隔每组值
colX:
   矩形的左下角X坐标

colY:
   矩形的右上角y坐标

其中:td 中的colX,colY表示单元格中数据内容的坐标

根据表头的colX 属性描述,计算出cols:表示这个表格总的有多少列
根据表头的colY 属性描述,计算出rows:表示这个表格总的有多少行

colspan:表示列合并(表明具体的由哪些列合并在一起),如果>1,表示从当前列起合并后的总列数,等于2表示要合并右边的一列单元格组成新的单元格,其它数据以此类推
rowspan:表示行合并(表明具体的由哪些行合并在一起),如果>1,表示从当前行起合并后的总行数,等于2表示要合并下边的一行单元格组成新的单元格,其它数据以此类推



 */
public class PDFToXml {
	private static final String XML_HEAD = "<?xml version=\"1.0\" encoding=\"GBK\"?>";
	private static final String NEW_LINE = "\r\n";

	/**
	 * Converts a local PDF file into the XML document described in the class comment.
	 *
	 * @param fileName path of the PDF file; only the part after the last path
	 *                 separator is written to the {@code fileName} attribute
	 * @param fileID   caller-supplied identifier; {@code null} is written as an empty id
	 * @return the XML prolog, the {@code <pdf>} wrapper and whatever
	 *         {@code ExtractRawStream.generateXMLFile} produced for the file.
	 *         NOTE(review): the original comment promised "" on failure; this method
	 *         always returns the wrapper, so confirm how generateXMLFile reports errors.
	 */
	public static String ConvertToXML(String fileName, String fileID){
		StringBuilder sb = openDocument(fileName, fileID);
		// NOTE(review): "tmp.xml" is passed unchanged from the original code; its meaning
		// is defined by ExtractRawStream.generateXMLFile, which is not visible here.
		sb.append(ExtractRawStream.generateXMLFile(fileName, "tmp.xml", fileID)).append(NEW_LINE);
		return closeDocument(sb);
	}

	/**
	 * Converts a local PDF file and stores the resulting XML at {@code savePath}.
	 *
	 * @param fileName path of the PDF file
	 * @param fileID   caller-supplied identifier, may be {@code null}
	 * @param savePath destination handed to {@code FileUtils.save}
	 * @return the result of {@code FileUtils.save}
	 */
	public static boolean ConvertToXML(String fileName, String fileID, String savePath){
		return FileUtils.save(ConvertToXML(fileName, fileID), savePath);
	}

	/**
	 * Converts a PDF supplied as a stream into the XML document described in the class comment.
	 *
	 * @param stream   PDF content
	 * @param fileName name associated with the stream; only the part after the last
	 *                 path separator is written to the {@code fileName} attribute
	 * @param fileID   caller-supplied identifier; {@code null} is written as an empty id
	 * @return the generated XML document (previously this overload built the document
	 *         and then returned {@code null})
	 */
	public static String ConvertToXML(InputStream stream, String fileName,String fileID){
		StringBuilder sb = openDocument(fileName, fileID);
		sb.append(ExtractRawStream.generateXMLFile(stream, fileName, fileID)).append(NEW_LINE);
		return closeDocument(sb);
	}

	/**
	 * Converts a PDF supplied as a stream and stores the resulting XML at {@code savePath}.
	 *
	 * @param stream   PDF content
	 * @param fileName name associated with the stream
	 * @param fileID   caller-supplied identifier
	 * @param savePath destination handed to {@code FileUtils.save}
	 * @return the result of {@code FileUtils.save}
	 */
	public static boolean ConvertToXML(InputStream stream,String fileName,String fileID, String savePath){
		return FileUtils.save(ConvertToXML(stream, fileName, fileID), savePath);
	}

	/** Starts the document: XML prolog plus the opening {@code <pdf>} tag. */
	private static StringBuilder openDocument(String fileName, String fileID) {
		StringBuilder sb = new StringBuilder();
		sb.append(XML_HEAD).append(NEW_LINE);
		sb.append("<pdf id=\"").append(escapeAttribute(fileID == null ? "" : fileID))
				.append("\" fileName=\"").append(escapeAttribute(shortName(fileName)))
				.append("\">").append(NEW_LINE);
		return sb;
	}

	/** Appends the closing {@code </pdf>} tag and returns the finished document. */
	private static String closeDocument(StringBuilder sb) {
		sb.append("</pdf>").append(NEW_LINE);
		return sb.toString();
	}

	/** Strips any directory part (either separator style) from a file name. */
	private static String shortName(String fileName) {
		String normalized = fileName.replace("\\", "/");
		int lastSlash = normalized.lastIndexOf("/");
		return lastSlash == -1 ? normalized : normalized.substring(lastSlash + 1);
	}

	/** Escapes the characters that would otherwise break a double-quoted XML attribute. */
	private static String escapeAttribute(String value) {
		// '&' must be replaced first so the entities added below are not escaped again
		return value.replace("&", "&amp;")
				.replace("<", "&lt;")
				.replace(">", "&gt;")
				.replace("\"", "&quot;");
	}

	/////////////////////////////////////////

	/** Manual smoke test: converts a fixed sample file and prints whether saving succeeded. */
	public static void main(String[] args) {
		System.out.println(ConvertToXML("r:/a.pdf", "1111", "r:/zzz.xml"));
	}
}


package org.aoe.software.pdf;

import java.io.InputStream;
import java.util.Map;

import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;

/**
 * Extracts each page of a PDF through jpedal's table extraction and renders the
 * result as {@code <page>} fragments containing either a {@code <table>} or a
 * {@code <text>} element.
 *
 * All state is held in static fields (one shared {@link PdfDecoder}), so calls
 * must not overlap.
 */
public class ConvertUtils {
	private static final String NEW_LINE = "\r\n";
	private static PdfDecoder decodePdf = new PdfDecoder(false);
	// Optional fixed extraction rectangle; defX1 == -1 means "use the page's media box".
	// Nothing in this class assigns these, so the media box is always used.
	private static int defX1 = -1, defX2, defY1, defY2;

	/**
	 * Parses the PDF at the given path.
	 *
	 * @param pdfFilepath path of the PDF file
	 * @return the concatenated {@code <page>} fragments, or "" when the file cannot be opened
	 */
	public static String parse(String pdfFilepath) {
		try {
			decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
			PdfDecoder.init(true);
			decodePdf.openPdfFile(pdfFilepath);
		} catch (Exception e) {
			e.printStackTrace();
			// Nothing can be extracted from a file that failed to open.
			decodePdf.closePdfFile();
			return "";
		}
		return parseContent(decodePdf);
	}

	/**
	 * Parses a PDF supplied as a stream.
	 *
	 * @param is PDF content
	 * @return the concatenated {@code <page>} fragments, or "" when the stream cannot be opened
	 */
	public static String parse(InputStream is){
		try {
			decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
			PdfDecoder.init(true);
			decodePdf.openPdfFileFromInputStream(is, false);
		} catch (Exception e) {
			e.printStackTrace();
			// Nothing can be extracted from a stream that failed to open.
			decodePdf.closePdfFile();
			return "";
		}
		return parseContent(decodePdf);
	}

	/**
	 * Walks every page of an opened decoder and appends one {@code <page>} element per page.
	 * The decoder is closed before returning.
	 *
	 * @param pdfDecoder decoder with a PDF already opened
	 * @return the generated fragments; empty when extraction is not permitted
	 */
	private static String parseContent(PdfDecoder pdfDecoder){
		StringBuilder sb = new StringBuilder();
		if (!pdfDecoder.isExtractionAllowed()) {
			System.out.println("Text extraction not allowed");
		} else if (pdfDecoder.isEncrypted() && !pdfDecoder.isPasswordSupplied()) {
			System.out.println("Encrypted settings");
			System.out.println("Please look at Viewer for code sample to handle such files");
			System.out.println("Or get support/consultancy");
		} else {
			// page range
			int start = 1, end = pdfDecoder.getPageCount();

			try {
				for (int page = start; page <= end; page++) {

					sb.append("<page pageIndex=\""+ page +"\">").append(NEW_LINE);

					pdfDecoder.decodePage(page);
					PdfGroupingAlgorithms currentGrouping = pdfDecoder.getGroupingObject();
					PdfPageData currentPageData = pdfDecoder.getPdfPageData();

					// Co-ordinates are x1,y1 (top left hand corner), x2,y2 (bottom right)
					int x1, y1, x2, y2;

					if (defX1 == -1) {
						x1 = currentPageData.getMediaBoxX(page);
						x2 = currentPageData.getMediaBoxWidth(page) + x1;
						y2 = currentPageData.getMediaBoxY(page);
						y1 = currentPageData.getMediaBoxHeight(page) + y2;
					} else {
						x1 = defX1;
						y1 = defY1;
						x2 = defX2;
						y2 = defY2;
					}

					try {
						Map<?, ?> tableContent = currentGrouping.extractTextAsTable(
								x1, y1, x2, y2, page, false, // csv
								false, false, false, 0);

						// get the text from the Map object
						String tableText = (String) tableContent.get("content");

						// drop the markup we do not care about
						tableText = ignoreTag("<TABLE>", tableText);
						tableText = ignoreTag("</TABLE>", tableText);

						tableText = ignoreTag(" nowrap", tableText);
						// NOTE(review): an empty pattern removes nothing; the original literal
						// may have lost a character when the code was pasted.
						tableText = ignoreTag("", tableText);
						tableText = ignoreTag("<SpaceCount space=\"\\d+\" />", tableText);

						tableText = ignoreTag("<td></td>", tableText);
						tableText = ignoreTag("<tr></tr>", tableText);

						if (isTable(tableText)) {
							int rows = getCount(tableText, "<tr>");
							int cols = getCount(tableText, "<td>");
							// NOTE(review): colX/colY receive the <tr> count and the total <td>
							// count here, not coordinate lists - kept as in the original.
							sb.append("<table  colX=\""+rows+"\" colY=\""+cols+"\">").append(tableText).append("</table>").append(NEW_LINE);
						} else {
							tableText = ignoreTag("<tr>", tableText);
							tableText = ignoreTag("</tr>", tableText);
							tableText = ignoreTag("<td>", tableText);
							tableText = ignoreTag("</td>", tableText);
							sb.append("<text>").append(NEW_LINE);
							sb.append("<tr colX=\""+ x1 +":"+ x2 +"\" colY=\""+ y1 +":"+ y2 +"\">"+ tableText +"</tr>").append(NEW_LINE);
							sb.append("</text>").append(NEW_LINE);
						}
					} catch (PdfException e) {
						pdfDecoder.closePdfFile();
						e.printStackTrace();
					}

					// remove data once written out
					pdfDecoder.flushObjectValues(false);

					sb.append("</page>").append(NEW_LINE);
				}
			} catch (Exception e) {
				pdfDecoder.closePdfFile();
				e.printStackTrace();
			}

			pdfDecoder.flushObjectValues(true); // flush any text data read
		}
		pdfDecoder.closePdfFile();
		return sb.toString();
	}

	/**
	 * Removes every match of {@code tag} from {@code origin}.
	 * {@code tag} is interpreted as a regular expression ({@link String#replaceAll}).
	 */
	private static String ignoreTag(String tag, String origin){
		return origin.replaceAll(tag, "");
	}

	/** Counts the non-overlapping occurrences of {@code tag} in {@code table}. */
	private static int getCount(String table, String tag){
		if (tag.isEmpty()) {
			return 0; // an empty tag would never advance the cursor below
		}
		int count = 0;
		int index = 0;
		while((index = table.indexOf(tag, index)) != -1){
			count++;
			index += tag.length();
		}
		return count;
	}

	/**
	 * Decides whether the extracted markup should be emitted as a table: true as soon as
	 * one examined row contains more than one {@code <td>}.
	 *
	 * NOTE(review): the loop runs rows-1 times, so a single-row result is never a table
	 * and the last row is not examined. The original comment described "multiple rows,
	 * or one row with several columns"; the bound is kept as written - confirm intent.
	 */
	private static boolean isTable(String tableText){
		final String closeTag = "</tr>";
		int rows = getCount(tableText, "<tr>");
		int index = 0;
		for (int i = 1; i < rows; i++) {
			int tr = tableText.indexOf("<tr>", index);
			if (tr == -1) {
				break;
			}
			int closedTr = tableText.indexOf(closeTag, tr);
			if (closedTr == -1) {
				break; // unterminated row: nothing more to inspect
			}
			String line = tableText.substring(tr, closedTr);
			// Continue after this row's closing tag. Adding line.length() (as before)
			// lagged behind the real position and could find the same row again.
			index = closedTr + closeTag.length();

			if (line.indexOf("<td>") != line.lastIndexOf("<td>")) {
				return true;
			}
		}
		return false;
	}
}


package org.aoe.software.pdf;
 
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;

import org.aoe.software.pdf.po.Page;
import org.aoe.software.pdf.po.Table;
import org.aoe.software.pdf.po.TableTd;
import org.aoe.software.pdf.po.TableTr;
import org.aoe.software.pdf.po.Text;
import org.aoe.software.pdf.po.TextTr;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;
import org.jpedal.fonts.FontMappings;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Strip;

public class ExtractRawStream {
 
	//DX20130502 decode page no
	// Page number handed to PdfDecoder.decodePage by decodeFile; assigned via setDecode_pageno.
	 public int decode_pageno;
	 
    /**flag to show if we print messages*/
     public static boolean outputMessages=true;
 
    /**word count - used for testing; exposed through getWordsExtractedCount()*/
     private int wordsExtracted=0;
 
    /**correct separator for OS */
     String separator = System.getProperty("file.separator");
 
    /**the decoder object which decodes the pdf and returns a data object; created by decodeFile and reset to null when it finishes*/
     PdfDecoder decodePdf = null;
 
    /**flag to show if file or byte array: decodeFile opens the path when true, otherwise byteArray*/
     private boolean isFile=true;
 
    /**byte array opened by decodeFile when isFile is false*/
     private byte[] byteArray=null;
 
    /**used in our regression tests to limit to first 10 pages*/
     public static boolean isTest=false;
     
     // Rectangles collected by decodeFile from the "re" operators of the page's content stream.
     private List<Rect> relist = new ArrayList<Rect>();
     
     // Words with their bounding boxes, collected by decodeFile.
     private List<TextLine> textlist = new ArrayList<TextLine>();
     
     // Media-box geometry of the decoded page; filled in by decodeFile.
     private Rect page_rect = new Rect();
     
     // NOTE(review): decodeFile's parameter of the same name shadows this static field.
     private static String file_name = "";

	/**
	 * Decodes page {@code decode_pageno} of a PDF and collects its raw geometry and text.
	 *
	 * On success the method
	 * <ul>
	 *   <li>stores the media-box geometry in {@code page_rect},</li>
	 *   <li>appends one {@code Rect} to {@code relist} for every {@code x y w h re} line of
	 *       the page's content stream (Y is flipped so it is measured from the top),</li>
	 *   <li>appends one {@code TextLine} to {@code textlist} per extracted word and counts
	 *       it in {@code wordsExtracted}.</li>
	 * </ul>
	 * The decoder is closed and {@code decodePdf} reset to null before returning, except on
	 * the early return taken when no word list could be extracted (decoder closed, field kept).
	 *
	 * @param file_name path of the PDF; used only when {@code isFile} is true, otherwise
	 *                  {@code byteArray} is opened and the name only appears in messages
	 */
	private void decodeFile(String file_name) {
		boolean opened = false;

		// PdfDecoder reports problems through PdfException
		try {
			decodePdf = new PdfDecoder(true);

			// in case fonts are not embedded
			FontMappings.setFontReplacements();

			decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
			PdfDecoder.init(true);

			// make sure widths are in the data - CRITICAL if we want to split lines correctly
			decodePdf.useTextExtraction();

			// always reset to use unaltered co-ords - allow use of rotated or unrotated
			// co-ordinates on pages with rotation (used to be in PdfDecoder)
			PdfGroupingAlgorithms.useUnrotatedCoords = false;

			// open the file (and read metadata including pages in file)
			if (outputMessages)
				System.out.println("Opening file :" + file_name);

			if (isFile)
				decodePdf.openPdfFile(file_name);
			else
				decodePdf.openPdfArray(byteArray);

			opened = true;
		} catch (PdfSecurityException e) {
			System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
		} catch (PdfException e) {
			System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
		} catch (Exception e) {
			System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
			e.printStackTrace();
		}

		// Stop here when the document could not be opened. Previously execution went on
		// and dereferenced decodePdf, which is still null if the constructor itself failed.
		if (!opened) {
			if (decodePdf != null) {
				decodePdf.closePdfFile();
				decodePdf = null;
			}
			return;
		}

		// extract data from pdf (if allowed)
		if (!decodePdf.isExtractionAllowed()) {
			if (outputMessages)
				System.out.println("Text extraction not allowed");
		} else if (decodePdf.isEncrypted() && !decodePdf.isPasswordSupplied()) {
			if (outputMessages) {
				System.out.println("Encrypted settings");
				System.out.println("Please look at Viewer for code sample to handle such files");
			}
		} else {
			try {
				// decode the page
				decodePdf.decodePage(decode_pageno);

				// use whole page size - get data from PageData object
				PdfPageData currentPageData = decodePdf.getPdfPageData();

				int x1 = currentPageData.getMediaBoxX(decode_pageno);
				int x2 = currentPageData.getMediaBoxWidth(decode_pageno) + x1;

				// Bottom edge is the media box's Y origin and the top edge is origin + height,
				// matching ConvertUtils in this file. The earlier code read getMediaBoxX here
				// and subtracted, which is only correct when the media box starts at (0,0).
				int y2 = currentPageData.getMediaBoxY(decode_pageno);
				int y1 = currentPageData.getMediaBoxHeight(decode_pageno) + y2;

				// remember the page geometry
				page_rect.setX(x1);
				page_rect.setWidth(x2 - x1);
				page_rect.setY(y2);
				page_rect.setHeight(y1 - y2);

				// Scan the raw content stream line by line (LF terminated) looking for
				// rectangle operators of the form "x y w h re".
				StringBuilder line = new StringBuilder();
				for (int i = 0; i < decodePdf.stream_data.length; i++) {
					char c = (char) decodePdf.stream_data[i];
					line.append(c);
					if (c == 10) {
						String[] command = line.toString().split(" ");
						if ((command.length == 5)
								&& (command[4].equals("re" + (char)10)
										|| command[4].equals("re" + (char)13 + (char)10))) {
							Rect rect = new Rect();
							rect.setX(Double.parseDouble(command[0]));
							// flip Y so that it is measured from the top of the page
							rect.setY(page_rect.getHeight() - Double.parseDouble(command[1]) - Double.parseDouble(command[3]));
							rect.setWidth(Double.parseDouble(command[2]));
							rect.setHeight(Double.parseDouble(command[3]));
							relist.add(rect);
						}
						line.setLength(0);
					}
				}

				StraightLines.processReCommands(relist, decode_pageno);

				StraightLines.sortByYMinAsc(relist);

				// create a grouping object to apply grouping to data
				PdfGroupingAlgorithms currentGrouping = decodePdf.getGroupingObject();
				List<?> words = null;
				try {
					// empty punctuation string: words are not split on punctuation
					words = currentGrouping.extractTextAsWordlist(
							x1,
							y1,
							x2,
							y2,
							decode_pageno,
							true, "");
				} catch (PdfException e) {
					decodePdf.closePdfFile();
					System.err.println("Exception= "+ e+" in "+file_name);
				}
				//DX20130614
				if (words == null) {
					decodePdf.closePdfFile();
					return;
				}

				// The list is flat: word, x1, y1, x2, y2, word, x1, ...
				Iterator<?> wordIterator = words.iterator();
				while (wordIterator.hasNext()) {

					String currentWord = (String) wordIterator.next();
					wordsExtracted++;

					// remove the XML formatting if present - not needed for pure text
					currentWord = Strip.convertToText(currentWord, decodePdf.isXMLExtraction());

					// these co-ordinates are absolute from the bottom of the page (MediaBox)
					double wx1 = Double.parseDouble((String) wordIterator.next());
					double wy1 = Double.parseDouble((String) wordIterator.next());
					double wx2 = Double.parseDouble((String) wordIterator.next());
					double wy2 = Double.parseDouble((String) wordIterator.next());

					TextLine text = new TextLine();
					text.getRect().setX(wx1);
					text.getRect().setY(page_rect.getHeight() - wy1);
					// "width" deliberately carries the right-hand X here, as in the original
					text.getRect().setWidth(wx2);
					text.getRect().setHeight(wy1 - wy2);
					text.setText(currentWord);
					textlist.add(text);
				}

				System.out.println("Page " + decode_pageno + " extracted!");

			} catch (Exception e) {
				decodePdf.closePdfFile();
				System.err.println("Exception "+ e+" in "+file_name);
				e.printStackTrace();
			}

			// flush data structures - not strictly required but included as example
			decodePdf.flushObjectValues(true); // flush any text data read

			// tell user
			if (outputMessages)
				System.out.println("Text read");
		}

		// close the pdf file
		decodePdf.closePdfFile();

		decodePdf = null;
	}

     //////////////////////////////////////////////////////////////////////////
     /**
      * Demo entry point: converts a fixed sample PDF with {@code generateXMLFile}
      * and saves the returned XML through {@code FileUtils.save}. Arguments are ignored.
      */
     public static void main(String[] args) {
         final String xml = generateXMLFile("r:/a.pdf", "R:/out.xml", "00000001");
         FileUtils.save(xml, "r:/z.xml");
     }
 
    /**
      * Returns the current value of the {@code wordsExtracted} counter (used in some tests).
      *
      * @return the word counter of this instance
      */
     public int getWordsExtractedCount() {
         final int count = this.wordsExtracted;
         return count;
     }
     
     /**
      * Decodes a single page of a PDF and returns the rectangles collected from it.
      *
      * @param filename path of the PDF, passed to {@code decodeFile}
      * @param pageno   page number to decode
      * @return this instance's {@code relist} after sorting with {@code StraightLines.sortByXMax}
      */
     public List<Rect> parseFilePage(String filename, int pageno) {
         // select the page, then let decodeFile populate relist
         setDecode_pageno(pageno);
         decodeFile(filename);

         final List<Rect> rects = relist;
         StraightLines.sortByXMax(rects);
         return rects;
     }

	/** @return the page number that {@code decodeFile} will decode */
	public int getDecode_pageno() {
		return this.decode_pageno;
	}

	/**
	 * Selects the page that {@code decodeFile} will decode.
	 *
	 * @param decode_pageno page number to store in this instance
	 */
	public void setDecode_pageno(final int decode_pageno) {
		this.decode_pageno = decode_pageno;
	}
	
	private static double MINIMUN_LINE_LENGTH = 2;
	//算法描述
	/*
	 * 先获得所有横线,每两条相邻横线为一行
	 * 再每一行,获得所有有效竖线(同时与上下横线交叉的竖线),每两条相邻竖线为一列
	 * 最后生成单元表
	 * 获得所有水平线
	 */
	public static String generateExcelTables(Element pageElement, int page_no, List<Rect> lines, List<TextLine> textlist) {
		List<Rect> column_lines = new ArrayList<Rect>();
		
		List<Rect> horizontal_lines = new ArrayList<Rect>();
		List<Rect> vertical_lines = new ArrayList<Rect>();
		
		//去除短线
		for (int i=0;i<lines.size();i++) {
			if ((lines.get(i).getWidth() > MINIMUN_LINE_LENGTH)
					&& (lines.get(i).getX() > 0)){
				horizontal_lines.add(lines.get(i));
			}
		}
		StraightLines.sortByYMinAsc(horizontal_lines);
		
		//获得垂直线
		for (int i=0;i<lines.size();i++) {
			if ((lines.get(i).getHeight() > MINIMUN_LINE_LENGTH)
					&& (lines.get(i).getY() > 0)) {
				vertical_lines.add(lines.get(i));
			}
		}
		StraightLines.sortByYMax(vertical_lines);
		
		/*if (pageElement.attribute("pageindex").getStringValue().equals("27")) {
			System.out.println("debug");
		}*/
		for (int i=0;i<horizontal_lines.size()-1;i++) {
			Rect topline, bottomline;
			topline = horizontal_lines.get(i);
			bottomline = horizontal_lines.get(i+1);
			for (int j=0;j<vertical_lines.size();j++) {
				//找到交叉该对水平线的垂直线
				if (((vertical_lines.get(j).getY() - MINIMUN_LINE_LENGTH < topline.getY()) 
						&& (vertical_lines.get(j).getY() + vertical_lines.get(j).getHeight() + MINIMUN_LINE_LENGTH > topline.getY()))
					&& ((vertical_lines.get(j).getY() - MINIMUN_LINE_LENGTH < bottomline.getY()) 
						&& (vertical_lines.get(j).getY() + vertical_lines.get(j).getHeight() + MINIMUN_LINE_LENGTH > bottomline.getY()))) {
					//如果结果中不存在该垂直线,则加入
					boolean bFind = false;
					for (int k=0;k<column_lines.size();k++) {
						if (column_lines.get(k).getX() == vertical_lines.get(j).getX()) {
							bFind = true;
							break;
						}

					}
					if (!bFind) column_lines.add(vertical_lines.get(j));
				}

			}
		}
		
		StraightLines.sortByXMin(column_lines);
		
		List<Rect> mergedhlines;
		
		mergedhlines = StraightLines.mergeHorizontalLines(horizontal_lines);
		
		StraightLines.sortByYMinAsc(mergedhlines);
		
		StraightLines.sortByXMin(vertical_lines);
		
		//xml表元素
		Element tableElement = pageElement.addElement("table");
		//Add by tangxc.
		//tableElement.addAttribute("border", "1");
		String str_colX = "";
		for (int j=0;j<column_lines.size();j++) {
			if (j==0) {
				str_colX = (int)column_lines.get(j).getX() + "";
			} else {
				str_colX = str_colX + ":" + (int)column_lines.get(j).getX();
			}
		}
		
		String str_colY = "";
		
		TextLine.sortByYMinAsc(textlist);
		
		//在EXCEL文件中生成表格
		//WritableWorkbook workbook = initOutputExcelFile();
		//WritableSheet sheet = workbook.createSheet("Page", 0); 
		int first_column_rowspan = 0; //第一列的行和并
		/*Table table = null;*/
		for (int i=0;i<mergedhlines.size()-1;i++) {
			Rect topline, bottomline;
			topline = mergedhlines.get(i);
			bottomline = mergedhlines.get(i+1);
			double leftline = 0;
			leftline = StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, 0);
			
			if (leftline == 0) {
				if (tableElement.nodeCount() > 0) {
					first_column_rowspan = 0; //DX20130704 reset
					tableElement.addAttribute("colX", str_colX);
					tableElement.addAttribute("colY", str_colY);
					tableElement = pageElement.addElement("table");
					//Add by tangxc.
					//tableElement.addAttribute("border", "1");
					
					str_colY = "";
				}
				/*str_colY = "";
				
				TextTr tr = new TextTr();
				tr.setColX(str_colX);
				tr.setColY(str_colY);
				tr.setContent("");
				Text txt = new Text();
				txt.addTr(tr);
				page.addText(txt);
				*/
				continue; //没有交叉线
			} else {
				if (str_colY.equals("")) {
					str_colY = (int)mergedhlines.get(i).getY() + ":" + (int)mergedhlines.get(i+1).getY();
				} else {
					str_colY = str_colY + ":" + (int)mergedhlines.get(i+1).getY();
				}
				if ((tableElement.nodeCount() > 0) && (i == (mergedhlines.size()-2))) {
					tableElement.addAttribute("colX", str_colX);
					tableElement.addAttribute("colY", str_colY);
				}
				
				/*table = new Table();*/
			}
			
			
			Element rowElement = tableElement.addElement("tr");
			/*TableTr tr = new TableTr();
			table.addTr(tr);*/
			do {
				double nextline = 0;
				boolean bFind = false;
				for (int j=0;j<column_lines.size();j++) {
					if (column_lines.get(j).getX() == leftline) {
						bFind = true;
						
						//找下一根交叉线
						nextline = StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, leftline);
						if (nextline==0) {
							break; //没有下一根交叉线
						}
						
						for (int m=j+1;m<column_lines.size();m++) {
							if (column_lines.get(m).getX() == nextline) {
								Element cellElement = null;
								
								//单元格的坐标
								Rect cell_rect = new Rect();
								cell_rect.setX(leftline);
								cell_rect.setWidth(nextline);
								cell_rect.setY(topline.getY());
								cell_rect.setHeight(bottomline.getY());
								
								/*TableTd td = null;
								*/
								if (leftline == StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, 0)) { //第一列
									if (first_column_rowspan > 1) {
										first_column_rowspan--;
										continue;
									} else {
										first_column_rowspan = 0;
									}
									
									//设置边框
									cellElement = rowElement.addElement("td");
									cellElement.addAttribute("colspan", (m-j) + "");
									
									/*td = new TableTd();
									td.setColspan(String.valueOf(m-j));
									tr.addTd(td);*/
									
									//设置行合并
									if ((bottomline.getX()-topline.getX())>10) {
										first_column_rowspan = 2;
										for (int p=i+2;p<mergedhlines.size();p++) {
											if ((mergedhlines.get(p).getX()-topline.getX())>10) {
												cell_rect.setHeight(mergedhlines.get(p).getY());
												first_column_rowspan++;
											} else {
												break;
											}
										}
										cellElement.addAttribute("rowspan", (first_column_rowspan) + "");
										
										/*td.setRowspan(String.valueOf(first_column_rowspan));
										*/
									}
								} else { //非第一列
									cellElement = rowElement.addElement("td");
									cellElement.addAttribute("colspan", (m-j) + "");
									
									/*td = new TableTd();
									td.setColspan(String.valueOf(m-j));
									tr.addTd(td);*/
								}
								
								Element textElement = cellElement.addElement("text");
								
								//SetSheetCell(sheet, i+1, j+1, 1, m-j, "");
								//查找在topline, bottomline, leftline, nextline区域内的字符串,放入EXCEL表格
								for (int n=0;n<textlist.size();n++) {
									TextLine textline = textlist.get(n);
									int rowspan = 1;
									if (cellElement.attribute("rowspan") != null) {
										rowspan = Integer.parseInt(cellElement.attribute("rowspan").getStringValue());
									}
									if ((textline.getRect().getX() >= cell_rect.getX())
										&& ((textline.getRect().getX()) < cell_rect.getWidth())
										&& ((textline.getRect().getY()) >= cell_rect.getY())
										&& ((textline.getRect().getY()) < cell_rect.getHeight())) {
										
										textElement = cellElement.element("text");
										mergeElement(textElement, textline);
										//mergeElement(td, textline);
//										textElement.addAttribute("height", (int)textline.getRect().getHeight() + "");
//										textElement.addAttribute("width", (int)(textline.getRect().getWidth() - textline.getRect().getX()) + "");
//										textElement.addAttribute("x", (int)textline.getRect().getX() + "");
//										textElement.addAttribute("y", (int)textline.getRect().getY() + "");
//										textElement.setText(textline.getText());
										//cellText = cellText + textline.getText();
										//找到
										//SetSheetCell(sheet, i+1, j+1, 1, m-j, textline.getText());
									}
								}
								//cellElement.setText(cellText);
							}
						}
					}
				}
				leftline = nextline;
			} while (leftline != 0);
		}
		
		//pageElement.elements().remove(pageElement.elements().size());
		Rect[] tables = new Rect[pageElement.elements().size()];
		int table_indexes[] = new int[pageElement.elements().size()]; //表的元素索引值
		int last_table_index = -1; //最后一张表格索引号
		for (int i=0;i<pageElement.elements().size();i++) {
			
			Element node = (Element)pageElement.elements().get(i);
			
			if ((node.attribute("colX") == null) || (node.attributeValue("colX").equals(""))) continue;
			tables[i] = new Rect();
			table_indexes[i] = i;
			String cols_X[] = node.attributeValue("colX").split(":");
			String cols_Y[] = node.attributeValue("colY").split(":");
			tables[i].setX(Double.parseDouble(cols_X[0]));
			tables[i].setY(Double.parseDouble(cols_Y[0]));
			tables[i].setWidth((Double.parseDouble(cols_X[cols_X.length-1])-tables[i].getX()));
			tables[i].setHeight((Double.parseDouble(cols_Y[cols_Y.length-1])-tables[i].getY()));
			
			last_table_index = i;
		}
		
		
		Page page = new Page();
		page.setCurrentNum(page_no);
		
		//表格数据后期处理
		//去除空行
		for (int i=0;i<pageElement.elements().size();i++) {
			Element table = (Element)pageElement.elements().get(i);

			Table tab = new Table();
			tab.setColX(table.attributeValue("colX"));
			tab.setColY(table.attributeValue("colY"));
			page.addTable(tab);
			
			List<Integer> empty_row_index_list = new ArrayList<Integer>();
			for (int j=0;j<table.elements().size();j++) {
				Element tr = (Element)table.elements().get(j);
				
				TableTr myTr = new TableTr();
				tab.addTr(myTr);
				
				boolean b_empty_row = tr.elements().size()>0?true:false;
				for (int k=0;k<tr.elements().size();k++) {
					Element td = (Element)tr.elements().get(k);
					if (!td.getStringValue().equals("")) {
						b_empty_row = false;

						Element text = td.element("text");
						TableTd myTd = new TableTd();
						int x = Integer.parseInt(text.attributeValue("x"));
						int y = Integer.parseInt(text.attributeValue("y"));
						int w = Integer.parseInt(text.attributeValue("width"));
						int h = Integer.parseInt(text.attributeValue("height"));
						myTd.setColX(x+":"+(x+w));
						myTd.setColY(y+":"+(y+h));
						myTd.setColspan(td.attributeValue("colspan"));
						myTd.setRowspan(td.attributeValue("rowspan"));
						myTd.setContent(td.getStringValue());
						myTr.addTd(myTd);
						
						continue;
					}
				}
				if (b_empty_row) {
					empty_row_index_list.add(j);
				}
			}
			for (int l=empty_row_index_list.size();l>0;l--) {
				table.elements().remove((int)((Integer)empty_row_index_list.get(l-1).intValue()));
			}
		}
		
		//表外文本
		Rect lastrect = null;
		for (TextLine textline:textlist) {
			boolean inserted = false;
			Rect rect = textline.getRect();
			for (int i=0;i<tables.length;i++) {
				if (tables[i] == null) continue;
				if (rect.getY()<tables[i].getY()
						|| ((rect.getY() > tables[i].getY()) && (rect.getY() < (tables[i].getY() + tables[i].getHeight())) && (rect.getX() < tables[i].getX()))
						|| ((rect.getY() > tables[i].getY()) && (rect.getY() < (tables[i].getY() + tables[i].getHeight())) && (rect.getX() > (tables[i].getX() + tables[i].getWidth())))) {
					//是否表外数据
					if ((i==0) || (rect.getY() > (tables[i-1].getY() + tables[i-1].getHeight()))) {
						int step = 1;
						Element element = DocumentHelper.createElement("text");
						
						element.addAttribute("x", ""+(int)rect.getX());
						element.addAttribute("y", ""+(int)rect.getY());
						element.addAttribute("width", ""+(int)rect.getWidth());
						element.addAttribute("height", ""+(int)rect.getHeight());
						element.setText(textline.getText());
						
						TextTr myTr = new TextTr();
						myTr.setColX(rect.getX() +":" + ((int)rect.getX() + (int)rect.getWidth()));
						myTr.setColY(rect.getY() +":" + ((int)rect.getY() + (int)rect.getHeight()));
						myTr.setContent(textline.getText());
						Text myTxt = new Text();
						myTxt.addTr(myTr);
						page.addText(myTxt);
						
						pageElement.content().add(table_indexes[i], element);
						if (lastrect == null) {
							lastrect = rect;
						}
						else {
							if (Math.abs(lastrect.getY() - rect.getY())>(lastrect.getHeight()/2)) {
								lastrect = rect;
								//element = DocumentHelper.createElement("br");
								//pageElement.content().add(table_indexes[i], element);
								//element.addElement("br");
								pageElement.content().add(pageElement.indexOf(element), DocumentHelper.createElement("br"));
								step = 2;
							}
						}
						for (int j=i;j<table_indexes.length;j++) {
							table_indexes[j]+=step;
						}
						
						inserted = true;
						break;
					}
				}
				
			}
			if (!inserted) {
				if ((last_table_index < 0) ||
						(rect.getY() > (tables[last_table_index].getY()+tables[last_table_index].getHeight()))) {
					Element element = pageElement.addElement("text");
					if (lastrect == null) {
						lastrect = rect;
					}
					else {
						if (Math.abs(lastrect.getY() - rect.getY())>(lastrect.getHeight()/2)) {
							lastrect = rect;
							//element.addElement("br");
							//pageElement.content().add(pageElement.indexOf(element), DocumentHelper.createElement("br"));
						}
					}
					element.addAttribute("x", ""+(int)rect.getX());
					element.addAttribute("y", ""+(int)rect.getY());
					element.addAttribute("width", ""+(int)rect.getWidth());
					element.addAttribute("height", ""+(int)rect.getHeight());
					element.setText(textline.getText());
					
					
					TextTr myTr = new TextTr();
					myTr.setColX(rect.getX() +":" + ((int)rect.getX() + (int)rect.getWidth()));
					myTr.setColY(rect.getY() +":" + ((int)rect.getY() + (int)rect.getHeight()));
					myTr.setContent(textline.getText());
					Text myTxt = new Text();
					myTxt.addTr(myTr);
					page.addText(myTxt);
					
					inserted = true;
				} 
			}
		}
		//uninitOutputExcelFile(workbook);
		return page.toString();
	}
	
	/**
	 * Creates the debugging Excel workbook at the fixed location
	 * {@code d:/output.xls} and adds a single sheet named "Page" at index 0.
	 *
	 * @return the new workbook, or {@code null} when the file could not be
	 *         created (the I/O error is printed to stderr)
	 */
	public static WritableWorkbook initOutputExcelFile() {
		WritableWorkbook created = null;
		try {
			created = Workbook.createWorkbook(new File("d:/output.xls"));
			// The sheet handle itself is not needed here; callers look it up
			// on the workbook.
			created.createSheet("Page", 0);
		} catch (IOException ioe) {
			ioe.printStackTrace();
		}
		return created;
	}
	
	/**
	 * Flushes the workbook to disk and releases it.
	 *
	 * The workbook is closed even when writing fails, so the underlying file
	 * handle is not leaked. A {@code null} workbook (which
	 * {@link #initOutputExcelFile()} returns on failure) is ignored.
	 * Errors are printed to stderr and not propagated.
	 *
	 * @param workbook the workbook to write and close; may be {@code null}
	 */
	public static void uninitOutputExcelFile(WritableWorkbook workbook) {
		if (workbook == null) {
			return;
		}
		try {
			try {
				workbook.write();
			} finally {
				// Always release the file, even if write() threw.
				workbook.close();
			}
		} catch (IOException e) {
			e.printStackTrace();
		} catch (WriteException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Appends {@code text} to the cell at ({@code column}, {@code row}), gives
	 * the cell a thin border on all four sides and merges it with its
	 * neighbours so that it spans {@code colspan} columns and {@code rowspan}
	 * rows. Write errors are printed to stderr.
	 *
	 * @param sheet   sheet to modify
	 * @param row     zero-based row of the top-left cell
	 * @param column  zero-based column of the top-left cell
	 * @param rowspan number of rows the merged cell covers
	 * @param colspan number of columns the merged cell covers
	 * @param text    text appended to the cell's current contents
	 */
	private static void SetSheetCell(WritableSheet sheet, int row, int column, int rowspan, int colspan, String text)  {
		try {
			String combined = sheet.getCell(column, row).getContents() + text;

			WritableCellFormat bordered = new WritableCellFormat();
			jxl.format.Border[] sides = {
				jxl.format.Border.LEFT,
				jxl.format.Border.RIGHT,
				jxl.format.Border.TOP,
				jxl.format.Border.BOTTOM
			};
			for (int s = 0; s < sides.length; s++) {
				bordered.setBorder(sides[s], jxl.format.BorderLineStyle.THIN);
			}

			Label cell = new Label(column, row, combined);
			cell.setCellFormat(bordered);
			sheet.addCell(cell);

			// jxl takes (col1, row1, col2, row2) with inclusive end coordinates.
			int lastColumn = column + colspan-1;
			int lastRow = row + rowspan-1;
			sheet.mergeCells(column, row, lastColumn, lastRow);
		} catch (WriteException we) {
			we.printStackTrace();
		}
	}
	
	/**
	 * Convenience overload of
	 * {@link #generateXMLFile(String, String, String)} that passes an empty
	 * document id.
	 *
	 * @param filename     path of the PDF file to convert
	 * @param xml_filename forwarded unchanged to the three-argument overload
	 * @return whatever the three-argument overload returns
	 */
	public static String generateXMLFile(String filename, String xml_filename) {
		final String emptyId = "";
		return generateXMLFile(filename, xml_filename, emptyId);
	}
	
	/**
	 * Converts every page of the PDF file {@code filename} and returns the
	 * concatenated per-page markup produced by {@code generateExcelTables},
	 * post-processed by {@code filte}.
	 *
	 * If the PDF cannot be opened the error is printed and an empty string is
	 * returned. (Earlier revisions called {@code System.exit(0)} here, which
	 * terminated the whole JVM with a "success" status.) The decoder that is
	 * opened only to read the page count is closed again before parsing starts.
	 *
	 * @param filename     path of the PDF file
	 * @param xml_filename currently unused; kept for source compatibility
	 * @param id           document id stored on the dom4j root element
	 * @return the generated markup, or {@code ""} when the PDF cannot be opened
	 */
	public static String generateXMLFile(String filename, String xml_filename, String id) {
		StringBuffer sb = new StringBuffer();

		ExtractRawStream ers = new ExtractRawStream();
		Element rootElement = ers.initXMLFile(filename, id);

		PdfDecoder decodePdf = new PdfDecoder(true);
		try {
			decodePdf.openPdfFile(filename);
		} catch (PdfException e1) {
			e1.printStackTrace();
			return "";
		}
		int pageCount;
		try {
			pageCount = decodePdf.getPageCount();
		} finally {
			// The decoder is only needed for the page count.
			decodePdf.closePdfFile();
		}

		for (int page_no = 1; page_no <= pageCount; page_no++) {
			// Per-page scratch state lives on the ExtractRawStream instance.
			ers.relist.clear();
			ers.textlist.clear();
			ers.page_rect.clear();
			Element pageElement = rootElement.addElement("page");
			pageElement.addAttribute("pageindex", page_no + "");

			ers.parseFilePage(filename, page_no);

			sb.append(ExtractRawStream.generateExcelTables(pageElement, page_no, ers.relist, ers.textlist));
		}

		return filte(sb);
	}
	
	/**
	 * Stream-based variant of the conversion: the page count is read from
	 * {@code stream}, then each page is parsed and the per-page markup is
	 * concatenated and post-processed by {@code filte}.
	 *
	 * If the PDF cannot be opened the error is printed and an empty string is
	 * returned. (Earlier revisions called {@code System.exit(0)} here, which
	 * terminated the whole JVM with a "success" status.) The decoder that is
	 * opened only to read the page count is closed again before parsing starts.
	 *
	 * @param stream   PDF content
	 * @param filename name recorded on the root element and handed to
	 *                 {@code parseFilePage}
	 * @param id       document id stored on the dom4j root element
	 * @return the generated markup, or {@code ""} when the PDF cannot be opened
	 */
	public static String generateXMLFile(InputStream stream, String filename, String id) {
		StringBuffer sb = new StringBuffer();

		ExtractRawStream ers = new ExtractRawStream();
		Element rootElement = ers.initXMLFile(filename, id);

		PdfDecoder decodePdf = new PdfDecoder(true);
		try {
			decodePdf.openPdfFileFromInputStream(stream, false);
		} catch (PdfException e1) {
			e1.printStackTrace();
			return "";
		}
		int pageCount;
		try {
			pageCount = decodePdf.getPageCount();
		} finally {
			// The decoder is only needed for the page count.
			decodePdf.closePdfFile();
		}

		for (int page_no = 1; page_no <= pageCount; page_no++) {
			// Per-page scratch state lives on the ExtractRawStream instance.
			ers.relist.clear();
			ers.textlist.clear();
			ers.page_rect.clear();
			Element pageElement = rootElement.addElement("page");
			pageElement.addAttribute("pageindex", page_no + "");

			// NOTE(review): pages are parsed via `filename`, not via `stream`;
			// presumably parseFilePage re-opens the file from disk, so this
			// only works when `filename` is a readable path to the same PDF.
			// Confirm against parseFilePage.
			ers.parseFilePage(filename, page_no);

			sb.append(ExtractRawStream.generateExcelTables(pageElement, page_no, ers.relist, ers.textlist));
		}

		return filte(sb);
	}
	
	/**
	 * Strips placeholder fragments (attributes whose value is the literal
	 * text "null", and empty table tags) from the generated markup.
	 * The fragments are removed one after another in the order listed.
	 *
	 * @param sb raw markup
	 * @return the markup with every listed fragment removed
	 */
	private static String filte(StringBuffer sb){
		String[] unwanted = {
			"colX=\"null\"",
			"colY=\"null\"",
			"<table colX=\"null\" colY=\"null\"></table>",
			"<table></table>",
			"<table  ></table>",
			" colspan=\"null\"",
			" rowspan=\"null\""
		};
		String cleaned = sb.toString();
		for (int k = 0; k < unwanted.length; k++) {
			cleaned = ignoreTag(unwanted[k], cleaned);
		}
		return cleaned;
	}
	
	/**
	 * Creates a fresh dom4j document with a {@code <pdf>} root element that
	 * carries the given id and the file's name without its directory part.
	 *
	 * @param file_name path of the PDF; only the last path segment is stored
	 * @param id        value of the root element's {@code id} attribute
	 * @return the newly created root element
	 */
	private Element initXMLFile(String file_name, String id) {
		String shortName = new File(file_name).getName();
		Element root = DocumentHelper.createDocument().addElement("pdf");
		root.addAttribute("id", id);
		root.addAttribute("filename", shortName);
		return root;
	}
	
	/**
	 * Makes sure the directory that will contain {@code filename} exists,
	 * creating it (and any missing ancestors) when necessary.
	 *
	 * @param filename path of a file about to be written
	 */
	private static void checkPath(String filename) {
		File parentDir = new File(filename).getParentFile();
		// No parent component, or the directory is already there: nothing to do.
		if (parentDir == null || parentDir.exists()) {
			return;
		}
		parentDir.mkdirs();
	}
	
	/**
	 * Folds one text line into a {@code <text>} element.
	 *
	 * The first line stored in an element defines its {@code x}, {@code y},
	 * {@code width} and {@code height} attributes and its text. Every further
	 * line grows the stored box and is appended to the existing text.
	 *
	 * The text line's rect is read with {@code getWidth()} acting as the
	 * right-hand x coordinate (the stored width is {@code getWidth() - getX()}).
	 *
	 * @param textElement element to update in place
	 * @param textline    line whose box and text are merged in
	 */
	@SuppressWarnings("deprecation")
	private static void mergeElement(Element textElement, TextLine textline) {
		String storedX = "";
		String storedY = "";
		String storedWidth = "";
		String storedHeight = "";
		if (textElement.attribute("x") != null) {
			storedX = textElement.attribute("x").getStringValue();
			storedY = textElement.attribute("y").getStringValue();
			storedWidth = textElement.attribute("width").getStringValue();
			storedHeight = textElement.attribute("height").getStringValue();
		}

		boolean nothingStoredYet = storedX == null || storedX.equals("null") || storedX.equals("");
		if (nothingStoredYet) {
			// First line: take the box straight from the text line.
			Rect box = textline.getRect();
			String firstX = String.valueOf((int) box.getX());
			String firstY = String.valueOf((int) box.getY());
			String firstWidth = String.valueOf((int) (box.getWidth() - box.getX()));
			String firstHeight = String.valueOf((int) box.getHeight());

			textElement.addAttribute("x", firstX);
			textElement.addAttribute("y", firstY);
			textElement.addAttribute("width", firstWidth);
			textElement.addAttribute("height", firstHeight);
			textElement.setText(textline.getText());
			return;
		}

		// Subsequent line: combine the stored box with the new line's box.
		double oldX = Double.parseDouble(storedX);
		double lineX = textline.getRect().getX();
		double mergedX = (oldX < lineX) ? oldX : lineX;

		double oldY = Double.parseDouble(storedY);
		double lineY = textline.getRect().getY();
		double mergedY = (oldY < lineY) ? oldY : lineY;

		double oldWidth = Double.parseDouble(storedWidth);
		double oldRight = oldX + oldWidth;
		double lineRight = textline.getRect().getWidth();
		// NOTE(review): the new width is measured from the line's own x (or
		// kept as-is), not from mergedX; preserved as in the original.
		double mergedWidth = (oldRight < lineRight) ? (lineRight - lineX) : oldWidth;

		double oldBottom = oldY + Double.parseDouble(storedHeight);
		double lineBottom = lineY + textline.getRect().getHeight();
		double mergedHeight = (oldBottom < lineBottom) ? (lineBottom - mergedY) : (oldBottom - mergedY);

		textElement.addAttribute("x", String.valueOf((int) mergedX));
		textElement.addAttribute("y", String.valueOf((int) mergedY));
		textElement.addAttribute("width", String.valueOf((int) mergedWidth));
		textElement.addAttribute("height", String.valueOf((int) mergedHeight));
		textElement.setText(textElement.getText() + textline.getText());
	}
	
	/**
	 * Copies a text line's box and text onto a table cell object.
	 *
	 * {@code colX} becomes {@code "x:(x+width)"} and {@code colY} becomes
	 * {@code "y:(y+height)"}, all truncated to integers. The text line's rect
	 * is read with {@code getWidth()} acting as the right-hand x coordinate,
	 * so the width used here is {@code getWidth() - getX()}.
	 *
	 * The previous version held the coordinates as Strings and therefore
	 * concatenated them ("10" + "5" gave "105") instead of adding them; it
	 * also carried a merge branch that could never run because no earlier
	 * box was ever read from the cell.
	 *
	 * @param td       cell to populate
	 * @param textline source of the box and the content
	 */
	private static void mergeElement(TableTd td, TextLine textline) {
		Rect box = textline.getRect();
		int x = (int) box.getX();
		int y = (int) box.getY();
		int width = (int) (box.getWidth() - box.getX());
		int height = (int) box.getHeight();

		td.setColX(x + ":" + (x + width));
		td.setColY(y + ":" + (y + height));
		td.setContent(textline.getText());
	}
	
	/**
	 * Extraction of tables that have no ruling lines. Not implemented yet.
	 *
	 * The only thing this method currently does is run
	 * {@code TextLine.groupByX(textlist)} and discard the result. That call
	 * re-orders {@code textlist} in place and may overwrite the x coordinate
	 * of its entries.
	 *
	 * @param lines    ruling lines of the page; currently unused
	 * @param textlist text lines of the page; modified in place as described
	 */
	public void noframe_table_parse(List<Rect> lines, List<TextLine> textlist) {
		TextLine.groupByX(textlist);
	}
	
	/**
	 * Removes every occurrence of the literal text {@code tag} from
	 * {@code origin}.
	 *
	 * The tag is matched literally, not as a regular expression, so
	 * characters such as {@code .}, {@code (} or {@code $} in a tag have no
	 * special meaning.
	 *
	 * @param tag    exact text to remove
	 * @param origin text to clean
	 * @return {@code origin} without any occurrence of {@code tag}
	 */
	private static String ignoreTag(String tag, String origin){
		return origin.replace(tag, "");
	}
}


package org.aoe.software.pdf;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;

/**
 * Small file helper used to persist generated text.
 */
public class FileUtils {
	/**
	 * Writes {@code data} to {@code filepath}, replacing any existing file.
	 * The text is encoded with the platform default charset.
	 *
	 * @param data     text to write
	 * @param filepath destination path
	 * @return {@code true} when the text was written and flushed,
	 *         {@code false} when anything went wrong
	 */
	public static boolean save(String data, String filepath){
		OutputStream target = null;
		boolean written;
		try {
			target = new FileOutputStream(filepath);
			BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(target));
			writer.write(data);
			writer.flush();
			written = true;
		} catch (Exception failure) {
			// Any failure (bad path, null data, I/O error) is reported as false.
			written = false;
		} finally {
			if (target != null) {
				try {
					target.close();
				} catch (IOException closeFailure) {
					// A failed close does not change the reported result.
					target = null;
				}
			}
		}
		return written;
	}
}


package org.aoe.software.pdf;

/**
 * Mutable box made of four doubles: {@code x}, {@code y}, {@code width} and
 * {@code height}. A new instance has all four set to zero.
 */
public class Rect {
	double x;
	double y;
	double width;
	double height;

	/** Creates a rect with every field at zero. */
	Rect() {
	}

	/**
	 * Creates a copy of another rect.
	 *
	 * @param rect rect whose four values are copied
	 */
	Rect(Rect rect) {
		set(rect);
	}

	/**
	 * Overwrites this rect's four values with those of {@code rect}.
	 *
	 * @param rect source of the new values
	 */
	public void set(Rect rect) {
		setX(rect.getX());
		setY(rect.getY());
		setWidth(rect.getWidth());
		setHeight(rect.getHeight());
	}

	/** @return the x value */
	public double getX() {
		return this.x;
	}

	/** @param x the new x value */
	public void setX(double x) {
		this.x = x;
	}

	/** @return the y value */
	public double getY() {
		return this.y;
	}

	/** @param y the new y value */
	public void setY(double y) {
		this.y = y;
	}

	/** @return the width value */
	public double getWidth() {
		return this.width;
	}

	/** @param width the new width value */
	public void setWidth(double width) {
		this.width = width;
	}

	/** @return the height value */
	public double getHeight() {
		return this.height;
	}

	/** @param height the new height value */
	public void setHeight(double height) {
		this.height = height;
	}

	/** Resets all four values to zero. */
	public void clear() {
		this.x = 0;
		this.y = 0;
		this.width = 0;
		this.height = 0;
	}
}


package org.aoe.software.pdf;

import java.util.ArrayList;
import java.util.List;

/**
 * A piece of text together with the box it occupies, plus static helpers
 * for sorting and grouping lists of text lines by coordinate.
 */
public class TextLine {
	private String text;
	private Rect rect = new Rect();

	/** @return the text of this line */
	public String getText() {
		return text;
	}

	/** @param text the text of this line */
	public void setText(String text) {
		this.text = text;
	}

	/** @return the box of this line (the live object, not a copy) */
	public Rect getRect() {
		return rect;
	}

	/** @param rect the box of this line */
	public void setRect(Rect rect) {
		this.rect = rect;
	}

	/**
	 * Sorts the list in place by the rect's y value, smallest first.
	 * Entries with equal y keep their relative order.
	 *
	 * @param textlist list to sort
	 */
	public static void sortByYMinAsc(List<TextLine> textlist) {
		bubbleSort(textlist, false, false);
	}

	/**
	 * Largest coordinate gap that still counts as "the same position" when
	 * grouping.
	 */
	public static double MINIMUN_LINE_LENGTH = 3;

	/**
	 * Groups text lines into columns by their x value.
	 *
	 * The list is first sorted in place by ascending x. A new group starts
	 * whenever a line's x exceeds the current group's x by more than
	 * {@link #MINIMUN_LINE_LENGTH}; otherwise the line joins the current group
	 * and its x is overwritten with the group's x.
	 *
	 * Earlier revisions sorted with {@link #sortByXMinAsc(List)}, which orders
	 * by descending x, so no line ever exceeded the first x: everything ended
	 * up in a single group and every x was overwritten with the largest one.
	 *
	 * @param lines lines to group; re-ordered and modified in place
	 * @return the groups from left to right, or {@code null} for an empty list
	 */
	public static List<List<TextLine>> groupByX(List<TextLine> lines) {
		if (lines.size() == 0) return null;

		bubbleSort(lines, true, false);
		return groupSorted(lines, true);
	}

	/**
	 * Groups text lines into rows by their y value.
	 *
	 * The list is first sorted in place by ascending y. A new group starts
	 * whenever a line's y exceeds the current group's y by more than
	 * {@link #MINIMUN_LINE_LENGTH}; otherwise the line joins the current group
	 * and its y is overwritten with the group's y.
	 *
	 * @param lines lines to group; re-ordered and modified in place
	 * @return the groups in ascending y order, or {@code null} for an empty list
	 */
	public static List<List<TextLine>> groupByY(List<TextLine> lines) {
		if (lines.size() == 0) return null;

		TextLine.sortByYMinAsc(lines);
		return groupSorted(lines, false);
	}

	/**
	 * Sorts the list in place by the rect's x value.
	 *
	 * NOTE: despite the "Asc" in its name this method orders the list with the
	 * LARGEST x first. That behaviour is kept unchanged for existing callers.
	 *
	 * @param in list to sort
	 */
	public static void sortByXMinAsc(List<TextLine> in) {
		bubbleSort(in, true, true);
	}

	/**
	 * Stable in-place bubble sort on one coordinate of the lines' rects.
	 *
	 * @param list       list to sort
	 * @param byX        {@code true} to compare x values, {@code false} for y
	 * @param descending {@code true} for largest-first order
	 */
	private static void bubbleSort(List<TextLine> list, boolean byX, boolean descending) {
		for (int end = list.size(); end > 0; end--) {
			boolean swapped = false;
			for (int i = 0; i < end - 1; i++) {
				Rect left = list.get(i).getRect();
				Rect right = list.get(i + 1).getRect();
				double leftKey = byX ? left.getX() : left.getY();
				double rightKey = byX ? right.getX() : right.getY();
				boolean outOfOrder = descending ? (leftKey < rightKey) : (leftKey > rightKey);
				if (outOfOrder) {
					TextLine moved = list.get(i);
					list.set(i, list.get(i + 1));
					list.set(i + 1, moved);
					swapped = true;
				}
			}
			// A pass without swaps means the list is already ordered.
			if (!swapped) break;
		}
	}

	/**
	 * Splits an ascending-sorted, non-empty list into runs of lines whose
	 * coordinate lies within {@link #MINIMUN_LINE_LENGTH} of the run's first
	 * line, snapping each member's coordinate to that of the run.
	 *
	 * @param sorted lines sorted ascending on the chosen coordinate
	 * @param byX    {@code true} to group on x, {@code false} to group on y
	 * @return the runs in list order
	 */
	private static List<List<TextLine>> groupSorted(List<TextLine> sorted, boolean byX) {
		List<List<TextLine>> groups = new ArrayList<List<TextLine>>();
		Rect first = sorted.get(0).getRect();
		double anchor = byX ? first.getX() : first.getY();
		List<TextLine> current = new ArrayList<TextLine>();
		for (TextLine line : sorted) {
			Rect box = line.getRect();
			double value = byX ? box.getX() : box.getY();
			if ((value - anchor) > MINIMUN_LINE_LENGTH) {
				// Far enough away: close the current group and start a new one.
				groups.add(current);
				current = new ArrayList<TextLine>();
				anchor = value;
			} else if (byX) {
				box.setX(anchor); // collapse near-duplicates onto one coordinate
			} else {
				box.setY(anchor); // collapse near-duplicates onto one coordinate
			}
			current.add(line);
		}
		groups.add(current);
		return groups;
	}
}


package org.aoe.software.pdf;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Static helpers that turn the rectangles drawn on a PDF page into a set of
 * horizontal and vertical ruling lines: sorting, merging and grouping of
 * {@link Rect}s, where {@code x}/{@code y} are the origin and
 * {@code width}/{@code height} are extents.
 */
public class StraightLines {
	private static final int KEY_X = 0;
	private static final int KEY_Y = 1;
	private static final int KEY_WIDTH = 2;
	private static final int KEY_HEIGHT = 3;

	/** Returns the value of the rect field selected by one of the KEY_* constants. */
	private static double keyOf(Rect rect, int key) {
		switch (key) {
			case KEY_X:
				return rect.getX();
			case KEY_Y:
				return rect.getY();
			case KEY_WIDTH:
				return rect.getWidth();
			default:
				return rect.getHeight();
		}
	}

	/**
	 * Stable in-place bubble sort on one rect field.
	 *
	 * @param in         list to sort
	 * @param key        one of the KEY_* constants
	 * @param descending {@code true} for largest-first order
	 */
	private static void bubbleSort(List<Rect> in, int key, boolean descending) {
		for (int end = in.size(); end > 0; end--) {
			boolean swapped = false;
			for (int i = 0; i < end - 1; i++) {
				double leftKey = keyOf(in.get(i), key);
				double rightKey = keyOf(in.get(i + 1), key);
				boolean outOfOrder = descending ? (leftKey < rightKey) : (leftKey > rightKey);
				if (outOfOrder) {
					Rect moved = in.get(i);
					in.set(i, in.get(i + 1));
					in.set(i + 1, moved);
					swapped = true;
				}
			}
			// A pass without swaps means the list is already ordered.
			if (!swapped) break;
		}
	}

	/** Sorts in place by x, smallest first. */
	public static void sortByXMin(List<Rect> in) {
		bubbleSort(in, KEY_X, false);
	}

	/**
	 * Sorts in place by x, LARGEST first.
	 * NOTE: the "Asc" in the name does not match the actual order; the
	 * behaviour is kept unchanged for existing callers.
	 */
	public static void sortByXMinAsc(List<Rect> in) {
		bubbleSort(in, KEY_X, true);
	}

	/** Sorts in place by y, largest first. */
	public static void sortByYMin(List<Rect> in) {
		bubbleSort(in, KEY_Y, true);
	}

	/** Sorts in place by y, smallest first. */
	public static void sortByYMinAsc(List<Rect> in) {
		bubbleSort(in, KEY_Y, false);
	}

	/** Sorts in place by width, smallest first. */
	public static void sortByXMax(List<Rect> in) {
		bubbleSort(in, KEY_WIDTH, false);
	}

	/** Sorts in place by height, smallest first. */
	public static void sortByYMax(List<Rect> in) {
		bubbleSort(in, KEY_HEIGHT, false);
	}

	/**
	 * Widens {@code target} so that it covers both its own horizontal span and
	 * that of {@code line}. Both the left and the right edge are taken into
	 * account, so moving the left edge never shortens the right one.
	 */
	private static void extendHorizontally(Rect target, Rect line) {
		double left = Math.min(target.getX(), line.getX());
		double right = Math.max(target.getX() + target.getWidth(), line.getX() + line.getWidth());
		target.setX(left);
		target.setWidth(right - left);
	}

	/**
	 * Walks an already sorted list and merges consecutive rects that have
	 * exactly the same y into one rect covering their combined horizontal span.
	 *
	 * A remembered y of 0 means "no current line", so rects at y == 0 never
	 * start a run of their own; they are replaced by the next rect.
	 */
	private static List<Rect> mergeRunsOfEqualY(List<Rect> sorted) {
		List<Rect> merged = new ArrayList<Rect>();
		double lastY = 0;
		Rect current = new Rect();

		for (Rect line : sorted) {
			if (lastY == 0) {
				lastY = line.getY();
				current = new Rect(line);
				continue;
			}
			if (line.getY() != lastY) {
				// A different y closes the current run.
				lastY = line.getY();
				merged.add(current);
				current = new Rect(line);
			} else if (current.getWidth() == 0) {
				current.set(line);
			} else {
				extendHorizontally(current, line);
			}
		}
		if (current.getWidth() > 0) {
			merged.add(current);
		}
		return merged;
	}

	/**
	 * Sorts the lines by ascending y (in place) and merges lines that share
	 * exactly the same y into one line covering their combined horizontal span.
	 *
	 * Earlier revisions kept the old width when a merged-in line started
	 * further left, which cut off the right end of the result.
	 *
	 * @param horizontal_lines lines to merge; re-ordered in place, may be null
	 * @return the merged lines; empty when the input is null or empty
	 */
	public static List<Rect> mergeHorizontalLines(List<Rect> horizontal_lines) {
		if (horizontal_lines == null) {
			return new ArrayList<Rect>();
		}
		StraightLines.sortByYMinAsc(horizontal_lines);
		return mergeRunsOfEqualY(horizontal_lines);
	}

	/**
	 * Sorts the lines with {@link #sortByXMinAsc(List)} (in place) and then
	 * merges consecutive lines that share exactly the same y, combining their
	 * horizontal spans.
	 *
	 * NOTE(review): apart from the sort this is identical to
	 * {@link #mergeHorizontalLines(List)}; it compares y and merges widths,
	 * which looks like a copy-paste rather than a true vertical merge.
	 * Confirm the intent before relying on it.
	 *
	 * @param vertical_lines lines to merge; re-ordered in place, may be null
	 * @return the merged lines; empty when the input is null or empty
	 */
	public static List<Rect> mergeVerticalLines(List<Rect> vertical_lines) {
		if (vertical_lines == null) {
			return new ArrayList<Rect>();
		}
		StraightLines.sortByXMinAsc(vertical_lines);
		return mergeRunsOfEqualY(vertical_lines);
	}

	/** Tolerance used both as "minimum length of a real line" and as a snapping distance. */
	public static double MINIMUN_LINE_LENGTH = 3;

	/**
	 * Tells whether the vertical line reaches (within the tolerance) from the
	 * y of {@code topline} to the y of {@code bottomline}.
	 */
	private static boolean spansBoth(Rect vertical, Rect topline, Rect bottomline) {
		double from = vertical.getY() - MINIMUN_LINE_LENGTH;
		double to = vertical.getY() + vertical.getHeight() + MINIMUN_LINE_LENGTH;
		return (from < topline.getY()) && (to > topline.getY())
			&& (from < bottomline.getY()) && (to > bottomline.getY());
	}

	/**
	 * Finds the x of the next vertical line to the right of {@code startx}
	 * that spans both {@code topline} and {@code bottomline}.
	 *
	 * The list of vertical lines is sorted by ascending x in place.
	 * With {@code startx == 0} the search starts at the first line. Otherwise
	 * it starts after the last line whose x equals {@code startx}, provided
	 * one of the lines at that x spans both horizontal lines; if there is no
	 * such line nothing is searched.
	 *
	 * @param topline           upper horizontal line of the row
	 * @param bottomline        lower horizontal line of the row
	 * @param in_vertical_lines candidate vertical lines; sorted in place
	 * @param startx            x of the current left border, or 0 to start from the left
	 * @return the x of the next spanning vertical line, or 0 when there is none
	 */
	public static double getNextVerticalLine(Rect topline, Rect bottomline, List<Rect> in_vertical_lines, double startx) {
		List<Rect> vertical_lines = in_vertical_lines;
		StraightLines.sortByXMin(vertical_lines);

		int start;
		if (startx == 0) {
			start = -1;
		} else {
			for (start = 0; start < vertical_lines.size(); start++) {
				Rect candidate = vertical_lines.get(start);
				if (candidate.getX() == startx && spansBoth(candidate, topline, bottomline)) {
					// Skip every remaining line sitting at the same x.
					while (start < vertical_lines.size() && vertical_lines.get(start).getX() == startx) {
						start++;
					}
					start--;
					break;
				}
			}
			// When nothing matched, start == size and the search below is empty.
		}

		for (int i = start + 1; i < vertical_lines.size(); i++) {
			Rect candidate = vertical_lines.get(i);
			if (spansBoth(candidate, topline, bottomline)) {
				return candidate.getX();
			}
		}
		return 0;
	}

	/**
	 * Normalises the rectangles of one page into ruling lines, replacing the
	 * contents of {@code relist} with the result.
	 *
	 * Steps:
	 * 1. every rectangle that is larger than the tolerance in both directions
	 *    is replaced by its four edges (0.1 thick);
	 * 2. lines wider than the tolerance are treated as horizontal; those whose
	 *    y values lie within the tolerance are merged into one line covering
	 *    their combined horizontal span;
	 * 3. lines taller than the tolerance with x != 0 are treated as vertical;
	 *    they are grouped by x and overlapping pieces are joined.
	 *
	 * Earlier revisions lost the right end of a merged horizontal line when a
	 * merged-in piece started further left.
	 *
	 * @param relist  rectangles of the page; replaced by the resulting lines
	 * @param page_no page number; currently unused
	 */
	public static void processReCommands(List<Rect> relist, int page_no) {
		List<Rect> temp = new ArrayList<Rect>();

		// Step 1: explode real rectangles into their four edges.
		for (Rect line : relist) {
			if ((line.getWidth() > MINIMUN_LINE_LENGTH) && (line.getHeight() > MINIMUN_LINE_LENGTH)) {
				Rect topline = new Rect();
				topline.setX(line.getX());
				topline.setY(line.getY());
				topline.setWidth(line.getWidth());
				topline.setHeight(0.1);

				Rect bottomline = new Rect();
				bottomline.setX(line.getX());
				bottomline.setY(line.getY() + line.getHeight());
				bottomline.setWidth(line.getWidth());
				bottomline.setHeight(0.1);

				Rect leftline = new Rect();
				leftline.setX(line.getX());
				leftline.setY(line.getY());
				leftline.setHeight(line.getHeight());
				leftline.setWidth(0.1);

				Rect rightline = new Rect();
				rightline.setX(line.getX() + line.getWidth());
				rightline.setY(line.getY());
				rightline.setHeight(line.getHeight());
				rightline.setWidth(0.1);

				temp.add(topline);
				temp.add(bottomline);
				temp.add(leftline);
				temp.add(rightline);
			} else {
				temp.add(line);
			}
		}
		relist.clear();
		relist.addAll(temp);
		temp.clear();

		// Step 2: horizontal lines, merged when their y values are close.
		List<Rect> horizontal_lines = new ArrayList<Rect>();
		for (Rect line : relist) {
			if (line.getWidth() > MINIMUN_LINE_LENGTH) {
				horizontal_lines.add(line);
			}
		}
		StraightLines.sortByYMinAsc(horizontal_lines);

		Rect lastline = new Rect();
		for (Rect line : horizontal_lines) {
			if ((lastline.getHeight() == 0) && (lastline.getWidth() == 0)) {
				// Nothing collected yet.
				lastline.set(line);
				continue;
			}
			if (Math.abs((lastline.getY() - line.getY())) > MINIMUN_LINE_LENGTH) {
				temp.add(lastline);
				lastline = new Rect();
				lastline.set(line);
			} else {
				extendHorizontally(lastline, line);
			}
		}
		if ((lastline.getHeight() != 0) && (lastline.getWidth() != 0)) {
			temp.add(lastline);
		}

		// Step 3: vertical lines, grouped by x and re-joined.
		List<Rect> vertical_lines = new ArrayList<Rect>();
		for (Rect line : relist) {
			if ((line.getHeight() > MINIMUN_LINE_LENGTH) && (line.getX() != 0)) {
				vertical_lines.add(line);
			}
		}
		List<List<Rect>> v_list = StraightLines.groupByX(vertical_lines);
		List<Rect> v_lines = StraightLines.remergeVerticalLines(v_list);
		temp.addAll(v_lines);

		relist.clear();
		relist.addAll(temp);
	}

	/**
	 * Debug helper: writes one "x, y, width, height re" record per line to
	 * {@code filename}. I/O errors are printed to stderr. The writer is always
	 * closed, including when writing fails.
	 *
	 * @param filename file to (over)write
	 * @param lines    lines to dump
	 */
	public static void printLines(String filename, List<Rect> lines) {
		FileWriter out = null;
		try {
			out = new FileWriter(new File(filename));
			for (Rect line : lines) {
				out.write(line.getX() + ", " + line.getY() + ", " + line.getWidth() + ", " + line.getHeight() + " re" + (char)(10) + (char)(13));
			}
			out.flush();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (out != null) {
				try {
					out.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Groups lines by their x value.
	 *
	 * The list is sorted in place by ascending x. A new group starts whenever
	 * a line's x exceeds the current group's x by more than
	 * {@link #MINIMUN_LINE_LENGTH}; otherwise the line joins the current group
	 * and its x is overwritten with the group's x.
	 *
	 * @param lines lines to group; re-ordered and modified in place
	 * @return the groups from left to right, or {@code null} for an empty list
	 */
	public static List<List<Rect>> groupByX(List<Rect> lines) {
		if (lines.size() == 0) return null;

		StraightLines.sortByXMin(lines);

		List<List<Rect>> out = new ArrayList<List<Rect>>();
		double lastx = lines.get(0).getX();
		List<Rect> current = new ArrayList<Rect>();
		for (Rect line : lines) {
			if ((line.getX() - lastx) > MINIMUN_LINE_LENGTH) {
				// Far enough to the right: close the group and start a new one.
				out.add(current);
				current = new ArrayList<Rect>();
				lastx = line.getX();
			} else {
				line.setX(lastx); // collapse double lines onto one x
			}
			current.add(line);
		}
		out.add(current);

		return out;
	}

	/**
	 * Joins the pieces of each x-group into continuous vertical lines.
	 *
	 * Within a group the pieces are sorted by ascending y (in place). A piece
	 * that starts at or before the end of the current line extends it; a piece
	 * that starts after the end closes the current line and starts a new one.
	 * Pieces with y == 0 are skipped, and one line is emitted per group even
	 * when the group contributed no piece.
	 *
	 * @param lines_list groups as produced by {@link #groupByX(List)}; may be null
	 * @return the joined lines; empty when the input is null
	 */
	public static List<Rect> remergeVerticalLines(List<List<Rect>> lines_list) {
		List<Rect> out = new ArrayList<Rect>();

		if (lines_list == null) return out;

		for (List<Rect> lines : lines_list) {
			StraightLines.sortByYMinAsc(lines);
			Rect current = new Rect();
			for (Rect line : lines) {
				if (line.getY() == 0) continue;
				// y == 0 on `current` marks "no line started yet".
				if (current.getY() == 0) current.set(line);

				double currentEnd = current.getY() + current.getHeight();
				if (line.getY() <= currentEnd) {
					if ((line.getY() + line.getHeight()) > currentEnd) {
						current.setHeight(line.getY() + line.getHeight() - current.getY());
					}
				} else {
					out.add(current);
					current = new Rect();
					current.set(line);
				}
			}
			out.add(current);
		}
		return out;
	}
}


PO

package org.aoe.software.pdf.po;

import java.util.LinkedList;
import java.util.List;

/**
 * One page of the converted document.
 *
 * <p>Renders as a {@code <page>} element whose children are the added text blocks and
 * tables, in the order they were added and with no whitespace between elements, e.g.:
 * <pre>
 * &lt;page pageIndex="1"&gt;
 *   &lt;text&gt;
 *     &lt;tr colX="x1:x2" colY="y1:y2"&gt;ssssssssss&lt;/tr&gt;
 *   &lt;/text&gt;
 *   &lt;table colX="x1:x2:x3" colY="y1:y2:y3:y4"&gt;
 *     &lt;tr&gt;
 *       &lt;td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2"&gt;TTTT&lt;/td&gt;
 *     &lt;/tr&gt;
 *   &lt;/table&gt;
 * &lt;/page&gt;
 * </pre>
 */
public class Page {
	private int currentNum;
	private List<Text> textList = new LinkedList<Text>();
	private List<Table> tableList = new LinkedList<Table>();
	private List<Integer> seqList = new LinkedList<Integer>(); // 0:text 1:table

	/**
	 * Renders this page and all of its children as an XML fragment.
	 *
	 * @return the {@code <page>} element with {@code pageIndex} set to the page number
	 */
	@Override
	public String toString(){
		StringBuilder sb = new StringBuilder();
		// Walk both child lists with iterators: indexed get() on a LinkedList is linear,
		// which made the previous index-based loop quadratic in the number of children.
		java.util.Iterator<Text> texts = textList.iterator();
		java.util.Iterator<Table> tables = tableList.iterator();
		for(int kind : seqList){
			if(kind == 0){
				sb.append(texts.next().toString());
			}else{
				sb.append(tables.next().toString());
			}
		}
		return String.format("<page pageIndex=\"%s\">%s</page>", currentNum, sb.toString());
	}

	/** @return the page number written to the {@code pageIndex} attribute */
	public int getCurrentNum() {
		return currentNum;
	}

	/** @param currentNum the page number to write to the {@code pageIndex} attribute */
	public void setCurrentNum(int currentNum) {
		this.currentNum = currentNum;
	}

	/**
	 * Appends a text block as the next child of this page.
	 *
	 * @param text the text block to append
	 */
	public void addText(Text text){
		textList.add(text);
		seqList.add(0);
	}

	/**
	 * Appends a table as the next child of this page.
	 *
	 * @param table the table to append
	 */
	public void addTable(Table table){
		tableList.add(table);
		seqList.add(1);
	}
}


package org.aoe.software.pdf.po;

import java.util.LinkedList;
import java.util.List;

/**
 * A table, rendered as a {@code <table>} element containing its rows.
 */
public class Table {
	/** Template for the rendered element: colX, colY, concatenated rows. */
	private static final String TABLE_TEMPLATE =
			"<table border=\"1\" colX=\"%s\" colY=\"%s\">%s</table>";

	private final List<TableTr> trList = new LinkedList<TableTr>();
	private String colX;
	private String colY;

	/**
	 * Appends a row to the end of this table.
	 *
	 * @param tr the row to append
	 */
	public void addTr(TableTr tr){
		trList.add(tr);
	}

	/** @return the value written to the {@code colX} attribute */
	public String getColX() {
		return colX;
	}

	/** @param colX the value to write to the {@code colX} attribute */
	public void setColX(String colX) {
		this.colX = colX;
	}

	/** @return the value written to the {@code colY} attribute */
	public String getColY() {
		return colY;
	}

	/** @param colY the value to write to the {@code colY} attribute */
	public void setColY(String colY) {
		this.colY = colY;
	}

	/**
	 * Renders this table and its rows, in insertion order, as an XML fragment.
	 *
	 * @return the {@code <table>} element
	 */
	@Override
	public String toString(){
		StringBuilder rows = new StringBuilder();
		for (TableTr row : trList) {
			rows.append(row.toString());
		}
		return String.format(TABLE_TEMPLATE, colX, colY, rows.toString());
	}
}


package org.aoe.software.pdf.po;

/**
 * A single table cell, rendered as a {@code <td>} element.
 *
 * <p>All values are held as plain strings and written out verbatim; a value that was
 * never set is rendered as the text {@code null}.
 *
 * <p>NOTE(review): {@code content} is inserted without XML escaping; confirm that
 * callers escape it beforehand.
 */
public class TableTd {
	/** Template for the rendered element: colX, colY, colspan, rowspan, content. */
	private static final String TD_TEMPLATE =
			"<td colX=\"%s\" colY=\"%s\" colspan=\"%s\" rowspan=\"%s\">%s</td>";

	private String colX;
	private String colY;
	private String colspan;
	private String rowspan;
	private String content;

	/** @return the value written to the {@code colX} attribute */
	public String getColX() {
		return this.colX;
	}

	/** @param colX the value to write to the {@code colX} attribute */
	public void setColX(String colX) {
		this.colX = colX;
	}

	/** @return the value written to the {@code colY} attribute */
	public String getColY() {
		return this.colY;
	}

	/** @param colY the value to write to the {@code colY} attribute */
	public void setColY(String colY) {
		this.colY = colY;
	}

	/** @return the value written to the {@code colspan} attribute */
	public String getColspan() {
		return this.colspan;
	}

	/** @param colspan the value to write to the {@code colspan} attribute */
	public void setColspan(String colspan) {
		this.colspan = colspan;
	}

	/** @return the value written to the {@code rowspan} attribute */
	public String getRowspan() {
		return this.rowspan;
	}

	/** @param rowspan the value to write to the {@code rowspan} attribute */
	public void setRowspan(String rowspan) {
		this.rowspan = rowspan;
	}

	/** @return the cell's text */
	public String getContent() {
		return this.content;
	}

	/** @param content the cell's text */
	public void setContent(String content) {
		this.content = content;
	}

	/**
	 * Renders this cell as an XML fragment.
	 *
	 * @return the {@code <td>} element
	 */
	@Override
	public String toString(){
		return String.format(TD_TEMPLATE, colX, colY, colspan, rowspan, content);
	}
}


package org.aoe.software.pdf.po;

import java.util.LinkedList;
import java.util.List;

/**
 * A table row, rendered as a {@code <tr>} element containing its cells.
 */
public class TableTr {
	private final List<TableTd> tdList = new LinkedList<TableTd>();

	/**
	 * Appends a cell to the end of this row.
	 *
	 * @param td the cell to append
	 */
	public void addTd(TableTd td){
		tdList.add(td);
	}

	/**
	 * Renders this row and its cells, in insertion order, as an XML fragment.
	 *
	 * @return the {@code <tr>} element
	 */
	@Override
	public String toString(){
		StringBuilder xml = new StringBuilder("<tr>");
		for (TableTd cell : tdList) {
			xml.append(cell.toString());
		}
		return xml.append("</tr>").toString();
	}
}


package org.aoe.software.pdf.po;

import java.util.LinkedList;
import java.util.List;

/**
 * A block of text, rendered as a {@code <text>} element containing its lines.
 */
public class Text {
	/** Template for the rendered element: concatenated lines. */
	private static final String TEXT_TEMPLATE = "<text>%s</text>";

	private final List<TextTr> trList = new LinkedList<TextTr>();

	/**
	 * Appends a line to the end of this text block.
	 *
	 * @param tr the line to append
	 */
	public void addTr(TextTr tr){
		trList.add(tr);
	}

	/**
	 * Renders this block and its lines, in insertion order, as an XML fragment.
	 *
	 * @return the {@code <text>} element
	 */
	@Override
	public String toString(){
		StringBuilder rows = new StringBuilder();
		for (TextTr row : trList) {
			rows.append(row.toString());
		}
		return String.format(TEXT_TEMPLATE, rows.toString());
	}

}


package org.aoe.software.pdf.po;

/**
 * A single line of text, rendered as a {@code <tr>} element.
 *
 * <p>All values are held as plain strings and written out verbatim; a value that was
 * never set is rendered as the text {@code null}.
 *
 * <p>NOTE(review): {@code content} is inserted without XML escaping; confirm that
 * callers escape it beforehand.
 */
public class TextTr {
	/** Template for the rendered element: colX, colY, content. */
	private static final String TR_TEMPLATE = "<tr colX=\"%s\" colY=\"%s\">%s</tr>";

	private String colX;
	private String colY;
	private String content;

	/** @return the value written to the {@code colX} attribute */
	public String getColX() {
		return this.colX;
	}

	/** @param colX the value to write to the {@code colX} attribute */
	public void setColX(String colX) {
		this.colX = colX;
	}

	/** @return the value written to the {@code colY} attribute */
	public String getColY() {
		return this.colY;
	}

	/** @param colY the value to write to the {@code colY} attribute */
	public void setColY(String colY) {
		this.colY = colY;
	}

	/** @return the text of this line */
	public String getContent() {
		return this.content;
	}

	/** @param content the text of this line */
	public void setContent(String content) {
		this.content = content;
	}

	/**
	 * Renders this line as an XML fragment.
	 *
	 * @return the {@code <tr>} element
	 */
	@Override
	public String toString(){
		return String.format(TR_TEMPLATE, colX, colY, content);
	}
}


你可能感兴趣的:(xml)