lucene入门

最近一直在研究 Lucene,把入门的程序和大家分享:

对索引的操作类:

Java代码
  1. public   class  IndexDao {  
  2.   
  3.     public  IndexDao() {  
  4.         try  {  
  5.             indexWriter = new  IndexWriter(Constants.INDEX_STORE_PATH,  
  6.                     Constants.analyzer, MaxFieldLength.LIMITED);  
  7.         } catch  (Exception e) {  
  8.             e.printStackTrace();  
  9.         }  
  10.     }  
  11.     public  IndexDao(Directory dir) {  
  12.         try  {  
  13.             indexWriter = new  IndexWriter(dir,Constants.analyzer,MaxFieldLength.LIMITED);  
  14.         } catch  (Exception e) {  
  15.             e.printStackTrace();  
  16.         }  
  17.     }  
  18.     public  IndexDao( boolean  isCreate) {  
  19.         try  {  
  20.             indexWriter = new  IndexWriter(Constants.INDEX_STORE_PATH,Constants.analyzer, isCreate,MaxFieldLength.LIMITED);  
  21.         } catch  (Exception e) {  
  22.             e.printStackTrace();  
  23.         }  
  24.     }  
  25.   
  26.     // 索引器   
  27.     private  IndexWriter indexWriter =  null ;  
  28.   
  29.     /**  
  30.      * 添加/创建索引  
  31.      *   
  32.      * @param folder  
  33.      * @throws IOException  
  34.      * @throws CorruptIndexException  
  35.      */   
  36.     public   void  saveIndex(File folder, String[] unIndeies)  
  37.             throws  CorruptIndexException, IOException {  
  38.         if  (folder.isDirectory()) {  
  39.             String[] files = folder.list();  
  40.             for  ( int  i =  0 ; i < files.length; i++) {  
  41.                 File f = new  File(folder, files[i]);  
  42.                 if  (!f.isHidden()) {  
  43.                     if  (f.isDirectory()) {  
  44.                         saveIndex(f, unIndeies);// ② 递归   
  45.                     }  
  46.                     String fileTyep = ReadFile.validateFile(f);  
  47.                     for  ( int  j =  0 ; j < unIndeies.length; j++) {  
  48.                         if  (fileTyep.equalsIgnoreCase(unIndeies[j])) {  
  49.                             System.out.println("正在建立索引 : "  + f.getName() +  "" );  
  50.                             Document doc = ReadFile.indexFile(f);  
  51.                             indexWriter.addDocument(doc);  
  52.                         }  
  53.                     }  
  54.                 }  
  55.             }  
  56.         }  
  57.     }  
  58.   
  59.     /**  
  60.      * Term是搜索的最小单位,代表某个 Field 中的一个关键词,如:<title, lucene> new Term( "title",  
  61.      * "lucene" ); new Term( "id", "5" ); new Term( "id", UUID );  
  62.      *   
  63.      * @param term  
  64.      */   
  65.     public   void  deleteIndex(Term term) {  
  66.         try  {  
  67.             indexWriter.deleteDocuments(term);  
  68.         } catch  (Exception e) {  
  69.             throw   new  RuntimeException(e);  
  70.         } finally  {  
  71.             try  {  
  72.                 indexWriter.close();  
  73.             } catch  (Exception e) {  
  74.                 e.printStackTrace();  
  75.             }  
  76.         }  
  77.     }  
  78.   
  79.     /**  
  80.      * 更新索引 indexWriter.deleteDocuments(term); indexWriter.addDocument(doc);  
  81.      *   
  82.      * @param term  
  83.      * @param doc  
  84.      */   
  85.     public   void  updateIndex(Term term, Document doc) {  
  86.         try  {  
  87.             indexWriter.updateDocument(term, doc);  
  88.         } catch  (Exception e) {  
  89.             throw   new  RuntimeException(e);  
  90.         } finally  {  
  91.             try  {  
  92.                 indexWriter.close();  
  93.             } catch  (Exception e) {  
  94.                 e.printStackTrace();  
  95.             }  
  96.         }  
  97.     }  
  98.   
  99.     /**  
  100.      * 查询 totalPage = recordCount / pageSize; if (recordCount % pageSize &gt; 0)  
  101.      * totalPage++;  
  102.      *   
  103.      * @param queryString  
  104.      * @param firstResult  
  105.      * @param maxResults  
  106.      * @return  
  107.      */   
  108.     public  QueryResult search(String queryString,  int  firstResult,  
  109.             int  maxResults) {  
  110.         try  {  
  111.             // 1,把要搜索的文本解析为 Query   
  112.             String[] fields = { "name", "content"  };  
  113.             Map<String, Float> boosts = new  HashMap<String, Float>();  
  114.             boosts.put("name" , 2f);  
  115.             boosts.put("content" , 3f);  //默认为1.0f   
  116.             QueryParser queryParser = new  MultiFieldQueryParser(fields,  
  117.                     Constants.analyzer, boosts);  
  118.             Query query = queryParser.parse(queryString);  
  119. //          Query query = IKQueryParser.parse("content", queryString);   
  120.             Date start = new  Date();  
  121.             QueryResult result = search(query, firstResult, maxResults);  
  122.             Date end = new  Date();  
  123.             System.out.println("检索完成,用时"  + (end.getTime() - start.getTime())  
  124.                     + "毫秒" );  
  125.             return  result;  
  126.         } catch  (Exception e) {  
  127.             throw   new  RuntimeException(e);  
  128.         }  
  129.     }  
  130.   
  131.     public  QueryResult search(Query query,  int  firstResult,  int  maxResults) {  
  132.         IndexSearcher indexSearcher = null ;  
  133.         try  {  
  134.             // 2,进行查询   
  135.             indexSearcher = new  IndexSearcher(Constants.INDEX_STORE_PATH);  
  136.             Filter filter = new  RangeFilter( "size" ,  
  137.                     NumberTools.longToString(0 ), NumberTools  
  138.                             .longToString(1000000 ),  true, true );  
  139.             // 排序   
  140.             Sort sort = new  Sort();  
  141.             sort.setSort(new  SortField( "size" ));  // 默认为升序   
  142.             // sort.setSort(new SortField("size", true));   
  143.             TopDocs topDocs = indexSearcher.search(query, filter, 10000 , sort);  
  144.             int  recordCount = topDocs.totalHits;  
  145.             List<Document> recordList = new  ArrayList<Document>();  
  146.             // 准备高亮器   
  147.             Formatter formatter = new  SimpleHTMLFormatter( "<font color='red'>" ,  
  148.                     "</font>" );  
  149.             Scorer scorer = new  QueryScorer(query);  
  150.             Highlighter highlighter = new  Highlighter(formatter, scorer);  
  151.             Fragmenter fragmenter = new  SimpleFragmenter( 50 );  
  152.             highlighter.setTextFragmenter(fragmenter);  
  153.             // 3,取出当前页的数据   
  154.             int  end = Math.min(firstResult + maxResults, topDocs.totalHits);  
  155.             for  ( int  i = firstResult; i < end; i++) {  
  156.                 ScoreDoc scoreDoc = topDocs.scoreDocs[i];  
  157.                 int  docSn = scoreDoc.doc;  // 文档内部编号   
  158.                 Document doc = indexSearcher.doc(docSn); // 根据编号取出相应的文档   
  159.                 // 高亮 返回高亮后的结果,如果当前属性值中没有出现关键字,会返回 null   
  160.                 String hc = highlighter.getBestFragment(Constants.analyzer,  
  161.                         "content" , doc.get( "content" ));  
  162.                 if  (hc ==  null ) {  
  163.                     String content = doc.get("content" );  
  164.                     int  endIndex = Math.min( 50 , content.length());  
  165.                     hc = content.substring(0 , endIndex); // 最多前50个字符   
  166.                 }  
  167.                 doc.getField("content" ).setValue(hc);  
  168.                 recordList.add(doc);  
  169.             }  
  170.             // 返回结果   
  171.             return   new  QueryResult(recordCount, recordList);  
  172.         } catch  (Exception e) {  
  173.             throw   new  RuntimeException(e);  
  174.         } finally  {  
  175.             try  {  
  176.                 indexSearcher.close();  
  177.             } catch  (IOException e) {  
  178.                 e.printStackTrace();  
  179.             }  
  180.         }  
  181.     }  
  182.   
  183.     public   void  close() {  
  184.         // 对索引进行优化   
  185.         try  {  
  186.             indexWriter.optimize();  
  187.             indexWriter.close();  
  188.         } catch  (CorruptIndexException e) {  
  189.             e.printStackTrace();  
  190.         } catch  (IOException e) {  
  191.             e.printStackTrace();  
  192.         }  
  193.     }  
  194.   
  195.     public   void  readIndex(String key, String value) {  
  196.           
  197.         IndexReader reader;  
  198.         try  {  
  199. //          Directory fsDir = FSDirectory.getDirectory(   
  200. //                  Constants.INDEX_STORE_PATH, false);   
  201. //          if (IndexReader.isLocked(fsDir)) {   
  202. //              System.out.println("------unlock-----");   
  203. //              IndexReader.unlock(fsDir);   
  204. //          }   
  205.             reader = IndexReader.open(Constants.INDEX_STORE_PATH);  
  206.             for  ( int  i =  0 ; i < reader.numDocs(); i++)  
  207. //          System.out.println(reader.document(i));   
  208.             System.out.println("版本:"  + reader.getVersion());  
  209.             System.out.println("索引内的文档数量:"  + reader.numDocs());  
  210.             Term term = new  Term(key, value);  
  211.             TermDocs docs = reader.termDocs(term);  
  212.             IndexSearcher indexSearcher = null ;  
  213.             indexSearcher = new  IndexSearcher(Constants.INDEX_STORE_PATH);  
  214.             while  (docs.next()) {  
  215.                 int  docSn = docs.doc();  // 文档内部编号   
  216.                 Document doc = indexSearcher.doc(docSn); // 根据编号取出相应的文档   
  217.                 System.out.println("文档路径 "  + doc.get( "path" ));  
  218.                 System.out.println("含有所查找的 "  + term +  "的Document的编号为: " + docs.doc());  
  219.                 System.out.println("Term在文档中的出现 "  + docs.freq()+ " 次" );  
  220.             }  
  221.         } catch  (CorruptIndexException e) {  
  222.             e.printStackTrace();  
  223.         } catch  (IOException e) {  
  224.             e.printStackTrace();  
  225.         }  
  226.     }  
  227. }  
/**
 * Small data-access helper around a Lucene index: it owns one
 * {@link IndexWriter} for adding, deleting and updating documents, and opens
 * short-lived searchers/readers on {@code Constants.INDEX_STORE_PATH} for
 * queries and diagnostics.
 *
 * <p>Note that {@link #deleteIndex(Term)} and
 * {@link #updateIndex(Term, Document)} close the writer when they finish, so
 * an instance cannot be used for further writes after either call.
 */
public class IndexDao {

	/** Maximum number of hits requested from the searcher for one query. */
	private static final int MAX_HITS = 10000;

	/** Size of a highlighted fragment and of the fallback summary, in characters. */
	private static final int SUMMARY_LENGTH = 50;

	// Index writer; remains null if the constructor failed to open the index.
	private IndexWriter indexWriter = null;

	/**
	 * Opens a writer on {@code Constants.INDEX_STORE_PATH}. A failure is only
	 * printed; the writer then stays {@code null}.
	 */
	public IndexDao() {
		try {
			indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
					Constants.analyzer, MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Opens a writer on the given directory. A failure is only printed; the
	 * writer then stays {@code null}.
	 *
	 * @param dir the Lucene directory holding the index
	 */
	public IndexDao(Directory dir) {
		try {
			indexWriter = new IndexWriter(dir, Constants.analyzer,
					MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Opens a writer on {@code Constants.INDEX_STORE_PATH}, passing
	 * {@code isCreate} through to the {@link IndexWriter} constructor. A
	 * failure is only printed; the writer then stays {@code null}.
	 *
	 * @param isCreate the create flag handed to the writer
	 */
	public IndexDao(boolean isCreate) {
		try {
			indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
					Constants.analyzer, isCreate, MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Recursively walks {@code folder} and adds one document per non-hidden
	 * file whose type (as reported by {@code ReadFile.validateFile}) matches,
	 * ignoring case, one of the entries in {@code unIndeies}.
	 *
	 * @param folder    the directory to scan; nothing happens if it is not a directory
	 * @param unIndeies the file types that should be indexed
	 * @throws IOException           if the writer fails to add a document
	 * @throws CorruptIndexException if the index is corrupt
	 */
	public void saveIndex(File folder, String[] unIndeies)
			throws CorruptIndexException, IOException {
		if (!folder.isDirectory()) {
			return;
		}
		String[] files = folder.list();
		if (files == null) {
			// File.list() returns null when the directory cannot be read.
			return;
		}
		for (int i = 0; i < files.length; i++) {
			File f = new File(folder, files[i]);
			if (f.isHidden()) {
				continue;
			}
			if (f.isDirectory()) {
				// Descend, but never index the directory entry itself, even
				// when its name happens to look like "something.txt".
				saveIndex(f, unIndeies);
				continue;
			}
			String fileType = ReadFile.validateFile(f);
			for (int j = 0; j < unIndeies.length; j++) {
				if (fileType.equalsIgnoreCase(unIndeies[j])) {
					System.out.println("正在建立索引 : " + f.getName() + "");
					indexWriter.addDocument(ReadFile.indexFile(f));
					// One document per file, even if the type is listed twice.
					break;
				}
			}
		}
	}

	/**
	 * Deletes every document containing {@code term}, then closes the writer.
	 *
	 * <p>A term is the smallest unit of search: one keyword within one field,
	 * e.g. {@code new Term("title", "lucene")} or {@code new Term("id", "5")}.
	 *
	 * @param term identifies the documents to delete
	 * @throws RuntimeException wrapping any failure of the delete
	 */
	public void deleteIndex(Term term) {
		try {
			indexWriter.deleteDocuments(term);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriterQuietly();
		}
	}

	/**
	 * Replaces the documents matching {@code term} with {@code doc} via
	 * {@code IndexWriter.updateDocument}, then closes the writer.
	 *
	 * @param term identifies the documents to replace
	 * @param doc  the replacement document
	 * @throws RuntimeException wrapping any failure of the update
	 */
	public void updateIndex(Term term, Document doc) {
		try {
			indexWriter.updateDocument(term, doc);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriterQuietly();
		}
	}

	/**
	 * Parses {@code queryString} against the "name" and "content" fields
	 * (with boosts 2 and 3 respectively), runs the query and prints the
	 * elapsed time.
	 *
	 * <p>Callers can derive the page count as
	 * {@code totalPage = recordCount / pageSize}, plus one if
	 * {@code recordCount % pageSize > 0}.
	 *
	 * @param queryString the text to search for
	 * @param firstResult index of the first hit to return
	 * @param maxResults  maximum number of hits to return
	 * @return the total hit count together with the requested page of documents
	 * @throws RuntimeException wrapping any parse or search failure
	 */
	public QueryResult search(String queryString, int firstResult,
			int maxResults) {
		try {
			// 1. Turn the search text into a Query.
			String[] fields = { "name", "content" };
			Map<String, Float> boosts = new HashMap<String, Float>();
			boosts.put("name", 2f);
			boosts.put("content", 3f); // a field without an entry defaults to 1.0f
			QueryParser queryParser = new MultiFieldQueryParser(fields,
					Constants.analyzer, boosts);
			Query query = queryParser.parse(queryString);

			long startMillis = System.currentTimeMillis();
			QueryResult result = search(query, firstResult, maxResults);
			long elapsedMillis = System.currentTimeMillis() - startMillis;
			System.out.println("检索完成,用时" + elapsedMillis + "毫秒");
			return result;
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Runs {@code query} restricted to documents whose "size" field lies in
	 * [0, 1000000], sorted by "size", and returns one page of results with
	 * the "content" field replaced by a highlighted fragment (or by its first
	 * {@value #SUMMARY_LENGTH} characters when nothing can be highlighted).
	 *
	 * <p>At most {@value #MAX_HITS} hits are fetched, so pages beyond that
	 * window come back empty even though the reported total may be larger.
	 *
	 * @param query       the query to execute
	 * @param firstResult index of the first hit to return; negative values are treated as 0
	 * @param maxResults  maximum number of hits to return
	 * @return the total hit count together with the requested page of documents
	 * @throws RuntimeException wrapping any search failure
	 */
	public QueryResult search(Query query, int firstResult, int maxResults) {
		IndexSearcher indexSearcher = null;
		try {
			// 2. Execute the query.
			indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
			Filter filter = new RangeFilter("size",
					NumberTools.longToString(0),
					NumberTools.longToString(1000000), true, true);
			Sort sort = new Sort();
			sort.setSort(new SortField("size")); // ascending by default
			TopDocs topDocs = indexSearcher.search(query, filter, MAX_HITS, sort);
			int recordCount = topDocs.totalHits;
			ScoreDoc[] hits = topDocs.scoreDocs;
			List<Document> recordList = new ArrayList<Document>();

			// Prepare the highlighter.
			Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
					"</font>");
			Scorer scorer = new QueryScorer(query);
			Highlighter highlighter = new Highlighter(formatter, scorer);
			Fragmenter fragmenter = new SimpleFragmenter(SUMMARY_LENGTH);
			highlighter.setTextFragmenter(fragmenter);

			// 3. Collect the requested page. The window is bounded by the hits
			// actually fetched (not totalHits, which may exceed MAX_HITS) and
			// computed in long arithmetic so first + max cannot overflow.
			int start = Math.max(0, firstResult);
			long requestedEnd = (long) firstResult + (long) maxResults;
			int end = (int) Math.min(requestedEnd, (long) hits.length);
			for (int i = start; i < end; i++) {
				int docSn = hits[i].doc; // internal document number
				Document doc = indexSearcher.doc(docSn);
				String content = doc.get("content");
				if (content != null) {
					// getBestFragment returns null when the text contains none
					// of the query terms; fall back to a plain prefix then.
					String hc = highlighter.getBestFragment(Constants.analyzer,
							"content", content);
					if (hc == null) {
						hc = content.substring(0,
								Math.min(SUMMARY_LENGTH, content.length()));
					}
					doc.getField("content").setValue(hc);
				}
				recordList.add(doc);
			}
			return new QueryResult(recordCount, recordList);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			// The searcher is still null if its constructor threw; closing it
			// blindly would replace the real error with a NullPointerException.
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Optimizes the index and closes the writer. The writer is closed even
	 * when optimizing fails, so the index lock is not left behind. Does
	 * nothing if the writer was never opened.
	 */
	public void close() {
		if (indexWriter == null) {
			return;
		}
		try {
			indexWriter.optimize();
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeWriterQuietly();
		}
	}

	/**
	 * Diagnostic dump: prints the index version, the document count and, for
	 * every document containing the term {@code key:value}, its stored
	 * "path", its internal number and the term frequency.
	 *
	 * @param key   the field name of the term
	 * @param value the term text
	 */
	public void readIndex(String key, String value) {
		IndexReader reader = null;
		TermDocs docs = null;
		try {
			reader = IndexReader.open(Constants.INDEX_STORE_PATH);
			System.out.println("版本:" + reader.getVersion());
			System.out.println("索引内的文档数量:" + reader.numDocs());
			Term term = new Term(key, value);
			docs = reader.termDocs(term);
			while (docs.next()) {
				int docSn = docs.doc(); // internal document number
				// Load the document from the same reader that produced the
				// number, so the number and the document always match.
				Document doc = reader.document(docSn);
				System.out.println("文档路径 " + doc.get("path"));
				System.out.println("含有所查找的 " + term + "的Document的编号为: " + docSn);
				System.out.println("Term在文档中的出现 " + docs.freq() + " 次");
			}
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (docs != null) {
				try {
					docs.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/** Closes the writer, printing (not propagating) any failure. */
	private void closeWriterQuietly() {
		try {
			indexWriter.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}



读取文件工具类:

Java代码
  1. public   class  ReadFile {  
  2.   
  3.     public   static  String readWord(File f) {  
  4.         StringBuffer content = new  StringBuffer( "" ); // 文档内容   
  5.         try  {  
  6.              HWPFDocument doc = new  HWPFDocument( new  FileInputStream(f));  
  7.              Range range = doc.getRange();  
  8.              int  paragraphCount = range.numParagraphs(); // 段落   
  9.              for  ( int  i =  0 ; i < paragraphCount; i++) { // 遍历段落读取数据   
  10.              Paragraph pp = range.getParagraph(i);  
  11.              content.append(pp.text());  
  12.              }  
  13. //           System.out.println("-------word--------"+content.toString());   
  14.         } catch  (Exception e) {  
  15.             System.out.println("建立索引出错 : "  + f.getAbsolutePath() +  "" );  
  16.             e.printStackTrace();  
  17.         }  
  18.         return  content.toString().trim();  
  19.     }  
  20.   
  21.     public   static  String readPdf(File f){  
  22.         StringBuffer content = new  StringBuffer( "" ); // 文档内容   
  23.         PDDocument pdfDocument = null ;  
  24.         try  {  
  25.             if (f.length()> 10048576 ){  
  26.                 DecimalFormat df = new  DecimalFormat( "#.00" );  
  27.                 System.out.println("---------------------文件大小------" +df.format(( double ) f.length() /  1048576 ) +  "M" );  
  28.                 return  f.getName();  
  29.             }  
  30.             FileInputStream fis = new  FileInputStream(f);  
  31.             PDFTextStripper stripper = new  PDFTextStripper();  
  32.             pdfDocument = PDDocument.load(fis);  
  33.             if (pdfDocument.isEncrypted()){  
  34.                 return  f.getName();  
  35.             }  
  36.               
  37.             StringWriter writer = new  StringWriter();  
  38.             stripper.writeText(pdfDocument, writer);  
  39.             content.append(writer.getBuffer().toString());  
  40.             fis.close();  
  41.         } catch  (IOException e) {  
  42.             System.out.println("建立索引出错 : "  + f.getAbsolutePath() +  "" );  
  43.             System.err.println("IOException="  + e);  
  44.             //System.exit(1);   
  45.         } finally  {  
  46.             if  (pdfDocument !=  null ) {  
  47.                 // System.err.println("Closing document " + f + "...");   
  48.                 org.pdfbox.cos.COSDocument cos = pdfDocument.getDocument();  
  49.                 try  {  
  50.                     cos.close();  
  51.                     // System.err.println("Closed " + cos);   
  52.                     pdfDocument.close();  
  53.                 } catch  (IOException e) {  
  54.                     System.out.println("建立索引出错 : "  + f.getAbsolutePath() +  "" );  
  55.                     e.printStackTrace();  
  56.                 }  
  57.             }  
  58.         }  
  59. //       System.out.println("-------pdf--------"+content.toString());   
  60.         return  content.toString().trim();  
  61.     }  
  62.       
  63.     public   static  String readHtml(File f) {  
  64.         StringBuffer content = new  StringBuffer( "" );  
  65.         FileInputStream fis = null ;  
  66.         try  {  
  67.             fis = new  FileInputStream(f);  
  68.             // 读取页面 这里的字符编码要注意,要对上html头文件的一致,否则会出乱码   
  69.             BufferedReader reader = new  BufferedReader( new  InputStreamReader(fis,  "gb2312" ));  
  70.             String line = null ;  
  71.             while  ((line = reader.readLine()) !=  null ) {  
  72.                 content.append(line + "\n" );  
  73.             }  
  74.             reader.close();  
  75.         } catch  (Exception e) {  
  76.             System.out.println("建立索引出错 : "  + f.getAbsolutePath() +  "" );  
  77.             e.printStackTrace();  
  78.         }  
  79.         String contentString = content.toString();  
  80. //      System.out.println("---------htm索引----"+contentString);   
  81.         return  contentString;  
  82.     }  
  83.   
  84.     public   static  String readTxt(File f) {  
  85.         StringBuffer content = new  StringBuffer( "" );  
  86.         try  {  
  87.             BufferedReader reader = new  BufferedReader( new  InputStreamReader(  
  88.                     new  FileInputStream(f)));  
  89.             for  (String line =  null ; (line = reader.readLine()) !=  null ;) {  
  90.                 content.append(line).append("\n" );  
  91.             }  
  92.         } catch  (IOException e) {  
  93.             System.out.println("建立索引出错 : "  + f.getAbsolutePath() +  "" );  
  94.             e.printStackTrace();  
  95.         }  
  96.         return  content.toString().trim();  
  97.     }  
  98.       
  99.     public   static  String readExcel(File f,String fileType){  
  100.         StringBuffer content = new  StringBuffer( "" );  
  101.             try {  
  102.                 ExcelReader er=new  ExcelReader(f,fileType);   
  103.                 String line=er.readLine();  
  104.                 content.append(line).append("\n" );  
  105.                 while (line!= null ){  
  106.                     line=er.readLine();  
  107.                     content.append(line).append("\n" );  
  108.                 }  
  109.                 er.close();  
  110.             }catch (Exception e){  
  111.                 System.out.println("建立索引出错 : "  + f.getAbsolutePath() +  "" );  
  112.                 e.printStackTrace();  
  113.             }  
  114.             return  content.toString();  
  115.     }  
  116.   
  117.     public   static  String validateFile(File f) {  
  118.         String fileType = "otherType" ;  
  119.         String fileName = f.getName();  
  120.         if  (fileName.lastIndexOf( '.' ) == - 1 ) {  
  121.             fileType = "dir" ;  
  122.             return  fileType;  
  123.         }  
  124.         fileName = fileName.substring(fileName.lastIndexOf('.' ) +  1 , fileName  
  125.                 .length());  
  126.           
  127.         int  i =  0 ;  
  128.         String [] extension=Constants.EXTENSION;  
  129.         for  (i =  0 ; i < extension.length; i++) {  
  130.             if  (fileName.equalsIgnoreCase(extension[i])) {  
  131.                 fileType = extension[i];  
  132.                 break ;  
  133.             }  
  134.         }  
  135.         return  fileType;  
  136.     }  
  137.   
  138.     public   static  Document indexFile(File f) {  
  139.         Document doc = new  Document();  
  140.         try  {  
  141.             doc.add(new  Field( "name" , f.getName(), Store.YES, Index.ANALYZED));  
  142.             doc.add(new  Field( "size" , NumberTools.longToString(f.length()),  
  143.                     Store.YES, Index.NOT_ANALYZED));  
  144.             doc.add(new  Field( "path" , f.getAbsolutePath(), Store.YES,  
  145.                     Index.NOT_ANALYZED));  
  146.             String fileType = validateFile(f);  
  147.             if  (fileType.equals( "txt" )) {  
  148.                 doc.add(new  Field( "content" , ReadFile.readTxt(f), Store.YES,  
  149.                         Index.ANALYZED));  
  150.             } else   if  (fileType.equals( "pdf" )) {  
  151.                 doc.add(new  Field( "content" , ReadFile.readPdf(f), Store.YES,  
  152.                         Index.ANALYZED));  
  153.             } else   if  (fileType.equals( "doc" )) {  
  154.                 doc.add(new  Field( "content" , ReadFile.readWord(f), Store.YES,  
  155.                         Index.ANALYZED));  
  156.             } else   if  (fileType.equals( "htm" )) {  
  157.                 doc.add(new  Field( "content" , ReadFile.readHtml(f), Store.YES,  
  158.                         Index.ANALYZED));  
  159.             } else   if (fileType.equals( "xls" )){  
  160.                 doc.add(new  Field( "content" , ReadFile.readExcel(f, fileType), Store.YES,  
  161.                         Index.ANALYZED));  
  162.             }else  {  
  163.                 doc.add(new  Field( "content" , f.getName(), Store.YES, Index.ANALYZED));  
  164.             }  
  165.         } catch  (Exception e) {  
  166.             System.out.println("建立索引出错 : "  + f.getAbsolutePath() +  "" );  
  167.             e.printStackTrace();  
  168.         }  
  169.         return  doc;  
  170.     }  
  171. }  
/**
 * Static helpers that extract plain text from files of various types
 * (Word, PDF, HTML, text, Excel) and wrap a file into a Lucene
 * {@link Document} for indexing.
 *
 * <p>The readers are best-effort: a failure is printed and whatever text
 * was collected so far is returned, so one bad file does not abort an
 * indexing run.
 */
public class ReadFile {

	/**
	 * PDFs larger than this many bytes are not parsed; only their file name
	 * is indexed.
	 * NOTE(review): the value is kept exactly as found; it is close to, but
	 * not equal to, 10 MiB (10485760) - confirm which was intended.
	 */
	private static final long MAX_PDF_BYTES = 10048576L;

	/**
	 * Reads a Word document and concatenates the text of all its paragraphs.
	 *
	 * @param f the .doc file
	 * @return the trimmed text; empty or partial if reading failed
	 */
	public static String readWord(File f) {
		StringBuffer content = new StringBuffer("");
		FileInputStream in = null;
		try {
			in = new FileInputStream(f);
			HWPFDocument doc = new HWPFDocument(in);
			Range range = doc.getRange();
			int paragraphCount = range.numParagraphs();
			for (int i = 0; i < paragraphCount; i++) {
				Paragraph pp = range.getParagraph(i);
				content.append(pp.text());
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
			e.printStackTrace();
		} finally {
			closeQuietly(in);
		}
		return content.toString().trim();
	}

	/**
	 * Extracts the text of a PDF. For files above {@link #MAX_PDF_BYTES} and
	 * for encrypted documents the file name is returned instead of the text.
	 *
	 * @param f the PDF file
	 * @return the trimmed text, or the file name as described above; empty
	 *         or partial if reading failed
	 */
	public static String readPdf(File f){
		if (f.length() > MAX_PDF_BYTES) {
			DecimalFormat df = new DecimalFormat("#.00");
			System.out.println("---------------------文件大小------"+df.format((double) f.length() / 1048576) + "M");
			return f.getName();
		}
		StringBuffer content = new StringBuffer("");
		PDDocument pdfDocument = null;
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(f);
			PDFTextStripper stripper = new PDFTextStripper();
			pdfDocument = PDDocument.load(fis);
			if (pdfDocument.isEncrypted()) {
				return f.getName();
			}
			StringWriter writer = new StringWriter();
			stripper.writeText(pdfDocument, writer);
			content.append(writer.getBuffer().toString());
		} catch (IOException e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
			System.err.println("IOException=" + e);
		} finally {
			if (pdfDocument != null) {
				org.pdfbox.cos.COSDocument cos = pdfDocument.getDocument();
				try {
					cos.close();
					pdfDocument.close();
				} catch (IOException e) {
					System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
					e.printStackTrace();
				}
			}
			// Also covers the early returns and the failure paths, where the
			// stream used to stay open.
			closeQuietly(fis);
		}
		return content.toString().trim();
	}

	/**
	 * Reads an HTML file line by line, decoding it as gb2312. The charset
	 * has to agree with the one declared in the page, otherwise the text
	 * comes out garbled.
	 *
	 * @param f the HTML file
	 * @return the file's text with each line terminated by '\n' (not
	 *         trimmed); empty or partial if reading failed
	 */
	public static String readHtml(File f) {
		StringBuffer content = new StringBuffer("");
		FileInputStream fis = null;
		BufferedReader reader = null;
		try {
			fis = new FileInputStream(f);
			reader = new BufferedReader(new InputStreamReader(fis, "gb2312"));
			String line = null;
			while ((line = reader.readLine()) != null) {
				content.append(line + "\n");
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
			e.printStackTrace();
		} finally {
			closeQuietly(reader);
			// The reader is never created if the charset lookup fails, so
			// the raw stream is closed separately.
			closeQuietly(fis);
		}
		return content.toString();
	}

	/**
	 * Reads a text file line by line using the platform default charset.
	 *
	 * @param f the text file
	 * @return the trimmed text with lines joined by '\n'; empty or partial
	 *         if reading failed
	 */
	public static String readTxt(File f) {
		StringBuffer content = new StringBuffer("");
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(f)));
			for (String line = null; (line = reader.readLine()) != null;) {
				content.append(line).append("\n");
			}
		} catch (IOException e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
			e.printStackTrace();
		} finally {
			closeQuietly(reader);
		}
		return content.toString().trim();
	}

	/**
	 * Reads a spreadsheet through {@code ExcelReader}, one line at a time,
	 * until the reader returns {@code null}.
	 *
	 * @param f        the spreadsheet file
	 * @param fileType the file type passed through to {@code ExcelReader}
	 * @return the lines, each terminated by '\n'; empty or partial if
	 *         reading failed
	 */
	public static String readExcel(File f,String fileType){
		StringBuffer content = new StringBuffer("");
		ExcelReader er = null;
		try {
			er = new ExcelReader(f, fileType);
			// Stop before appending the terminating null; appending it
			// would put the literal word "null" into the indexed content.
			for (String line = er.readLine(); line != null; line = er.readLine()) {
				content.append(line).append("\n");
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
			e.printStackTrace();
		} finally {
			if (er != null) {
				try {
					er.close();
				} catch (Exception e) {
					System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
					e.printStackTrace();
				}
			}
		}
		return content.toString();
	}

	/**
	 * Classifies a file by the extension of its name.
	 *
	 * @param f the file to classify
	 * @return "dir" when the name contains no '.', the matching entry of
	 *         {@code Constants.EXTENSION} (compared ignoring case) when the
	 *         extension is known, otherwise "otherType"
	 */
	public static String validateFile(File f) {
		String fileName = f.getName();
		int dot = fileName.lastIndexOf('.');
		if (dot == -1) {
			return "dir";
		}
		String suffix = fileName.substring(dot + 1, fileName.length());
		String[] extension = Constants.EXTENSION;
		for (int i = 0; i < extension.length; i++) {
			if (suffix.equalsIgnoreCase(extension[i])) {
				return extension[i];
			}
		}
		return "otherType";
	}

	/**
	 * Builds the Lucene document for a file with the fields "name"
	 * (analyzed), "size" and "path" (not analyzed) and "content" (analyzed).
	 * The content is the extracted text for txt/pdf/doc/htm/xls files and
	 * the file name for every other type. All fields are stored.
	 *
	 * @param f the file to index
	 * @return the document; it may lack some fields if building it failed
	 */
	public static Document indexFile(File f) {
		Document doc = new Document();
		try {
			doc.add(new Field("name", f.getName(), Store.YES, Index.ANALYZED));
			doc.add(new Field("size", NumberTools.longToString(f.length()),
					Store.YES, Index.NOT_ANALYZED));
			doc.add(new Field("path", f.getAbsolutePath(), Store.YES,
					Index.NOT_ANALYZED));
			doc.add(new Field("content", readContent(f, validateFile(f)),
					Store.YES, Index.ANALYZED));
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
			e.printStackTrace();
		}
		return doc;
	}

	/** Picks the reader matching {@code fileType}; unknown types yield the file name. */
	private static String readContent(File f, String fileType) {
		if (fileType.equals("txt")) {
			return readTxt(f);
		}
		if (fileType.equals("pdf")) {
			return readPdf(f);
		}
		if (fileType.equals("doc")) {
			return readWord(f);
		}
		if (fileType.equals("htm")) {
			return readHtml(f);
		}
		if (fileType.equals("xls")) {
			return readExcel(f, fileType);
		}
		return f.getName();
	}

	/** Closes {@code resource} if it is non-null, printing (not propagating) any failure. */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

 

Java代码
  1. public   class  ExcelReader {  
  2.     // 创建文件输入流   
  3.     private  BufferedReader reader =  null ;  
  4.   
  5.     // 文件类型   
  6.     private  String filetype;  
  7.   
  8.     // 文件二进制输入流   
  9.     private  InputStream is =  null ;  
  10.   
  11.     // 当前的Sheet   
  12.     private   int  currSheet;  
  13.   
  14.     // 当前位置   
  15.     private   int  currPosition;  
  16.   
  17.     // Sheet数量   
  18.     private   int  numOfSheets;  
  19.   
  20.     // HSSFWorkbook   
  21.     HSSFWorkbook workbook = null ;  
  22.   
  23.     // 设置Cell之间以空格分割   
  24.     private   static  String EXCEL_LINE_DELIMITER =  " " ;  
  25.   
  26.     // 设置最大列数   
  27. //  private static int MAX_EXCEL_COLUMNS = 64;   
  28.   
  29.     // 构造函数创建一个ExcelReader   
  30.     public
分享到:
评论

你可能感兴趣的:(F#,Excel,Lucene,J#)