为了方便写论文,也方便往论文里复制粘贴,我把自己的学习过程像做笔记一样记录在这里,与大家共勉!
第一步是预处理:如果文本(或其他数据)过大,需要先把它切分成若干个较小的文件。切分代码如下:
package mych2_demo.lucenedemo.preprocess; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.util.HashMap; import java.util.Map; public class FilePreprocess { /** * * 两参数,一个是被处理的源文件,一个是处理后的文件输出路径 * */ public static void preprocess(File file,String outputDir){ try{ splitToSmallFiles(charactorProcess(file,outputDir+"output.all"),outputDir); }catch(Exception e){ e.printStackTrace(); } } /** * 对文件字符进行全角/半角处理 * */ public static File charactorProcess(File file,String destFile ) throws Exception{ BufferedWriter writer= new BufferedWriter(new FileWriter(destFile)); BufferedReader reader= new BufferedReader(new FileReader(file)); String line = reader.readLine(); while(line!=null){ if(!line.equals("\r\n")){ String newline=replace(line); writer.write(newline); writer.newLine(); } line=reader.readLine(); } reader.close(); writer.close(); return new File(destFile); } /** * 拆分成小文件 */ public static void splitToSmallFiles(File file,String outputpath) throws Exception{ int filePointer=0; int MAX_SIZE=10240; BufferedWriter writer=null; BufferedReader reader=new BufferedReader(new FileReader(file)); StringBuffer buffer =new StringBuffer(); String line = reader.readLine(); while(line!=null){ buffer.append(line).append("\r\n"); if(buffer.toString().getBytes().length>=MAX_SIZE){ writer=new BufferedWriter(new FileWriter(outputpath+"output"+filePointer+".txt")); writer.write(buffer.toString()); writer.close(); filePointer++; buffer=new StringBuffer(); } line = reader.readLine(); } writer = new BufferedWriter(new FileWriter(outputpath+"output"+filePointer+".txt")); writer.write(buffer.toString()); writer.close(); } /** * 全角半角的转换 */ private static String replace(String line){ Map map=new HashMap(); map.put(",", ","); map.put("。", "."); map.put("〈", "<"); map.put("〉", ">"); map.put("‖", "|"); map.put("《", "<"); map.put("》", ">"); map.put("〔", "["); map.put("〕", "]"); map.put("﹖", "?"); 
map.put("?", "?"); map.put("“", "\""); map.put("”", "\""); map.put(":", ":"); map.put("、", ","); map.put("(", "("); map.put(")", ")"); map.put("【", "["); map.put("】", "]"); map.put("—", "-"); map.put("~", "~"); map.put("!", "!"); map.put("‵", "'"); map.put("①", "1"); map.put("②", "2"); map.put("③", "3"); map.put("④", "4"); map.put("⑤", "5"); map.put("⑥", "6"); map.put("⑦", "7"); map.put("⑧", "8"); map.put("⑨", "9"); int length=line.length(); for(int i=0;i<length;i++){ String charat=line.substring(i,i+1); if(map.get(charat)!=null){ line=line.replace(charat, (String)map.get(charat)); } } return line; } // public static void main(String[] args){ // String inputFile="f:\\book.txt"; // String outputDir="f:\\testfoler\\"; // // if(!new File(outputDir).exists()){ // new File(outputDir).mkdirs(); // } // // FilePreprocess filepreprocess=new FilePreprocess(); // filepreprocess.preprocess(new File(inputFile), outputDir); // } }
从网上下载的一部 txt 格式的小说:
切分后如下:
借助Lucene开源框架建立索引:
代码如下:
package mych2_demo.lucenedemo.process; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import jeasy.analysis.MMAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; public class IndexProcesser { private String INDEX_STROE_PATH="f:\\index"; //创建索引 public void createIndex(String inputDir){ try{ IndexWriter writer=new IndexWriter(INDEX_STROE_PATH, new MMAnalyzer(), true); File filesDir=new File(inputDir); //取得所有需要建立索引的文件数组 File[] files = filesDir.listFiles(); //遍历数组 for(int i=0;i<files.length;i++){ String fileName=files[i].getName(); if(fileName.substring(fileName.lastIndexOf(".")).equals(".txt")){ Document doc =new Document(); //为文件名创建一个field Field field =new Field("filename",files[i].getName(),Field.Store.YES,Field.Index.TOKENIZED); doc.add(field); //为文件内容创建一个field field = new Field("content", loadFileToString(files[i]), Field.Store.NO, Field.Index.TOKENIZED); doc.add(field); //吧document加入IndexWriter writer.addDocument(doc); } } writer.close(); }catch (Exception e){ e.printStackTrace(); } } public String loadFileToString(File file) { try{ BufferedReader br= new BufferedReader(new FileReader(file)); StringBuffer sb=new StringBuffer(); String line=br.readLine(); while(line!=null){ sb.append(line); line=br.readLine(); } br.close(); return sb.toString(); }catch(Exception e){ e.printStackTrace(); return null; } } public static void main(String[] args){ IndexProcesser processer = new IndexProcesser(); processer.createIndex("f:\\testfolder"); } }测试所建立后生成的索引:
建立索引后,我们分别用 Lucene 索引查询和 Java 自带的 String 匹配进行搜索,并比较两者的耗时。
代码如下:
package mych2_demo.lucenedemo.process; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.util.Date; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import java.util.StringTokenizer; import javax.rmi.CORBA.Tie; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; public class Search { private String INDEX_STORE_PATH="f:\\index"; //利用Lucene的搜索 public void indexSearch(String searchType, String searchKey){ try{ System.out.println("****使用索引方式搜索**********"); System.out.println("----------------------------"); //根据索引位置建立索引IndexSearcher IndexSearcher searcher=new IndexSearcher(INDEX_STORE_PATH); //建立索引单元,searchType代表搜索的Filed,SearchKey代表关键字 Term t=new Term(searchType, searchKey); //由term生成一个Query Query q=new TermQuery(t); //搜索开始时间 Date beginTime=new Date(); //获取一个<document,frequency>的枚举对象TermDocs TermDocs termDocs=searcher.getIndexReader().termDocs(t); while(termDocs.next()){ //输出在文档中出现关键词的次数 System.out.print("find "+termDocs.freq()+" matches in"); //输出搜索到关键词的文档 System.out.println(searcher.getIndexReader().document(termDocs.doc()) .getField("filename").stringValue()); } //搜索完成时间 Date endTime=new Date(); //搜索所耗时间 long timeofSearch=endTime.getTime()-beginTime.getTime(); System.out.println("使用索引方式所花时间 "+timeofSearch+" ms"); }catch(Exception e){ e.printStackTrace(); } } /** * 利用字符串的索引 */ public void stringSearch(String keyword,String searchDir){ System.out.println("****利用字符串的索引**********"); System.out.println("----------------------------"); File filesDir=new File(searchDir); File[] files=filesDir.listFiles(); Map rs = new LinkedHashMap(); Date beginTime=new Date(); for(int i=0;i<files.length;i++){ //初始化匹配次数 int hits=0; try{ BufferedReader br=new BufferedReader(new FileReader(files[i])); StringBuffer sb=new StringBuffer(); String line=br.readLine(); 
while(line!=null){ sb.append(line); line=br.readLine(); } br.close(); String stringToSearch=sb.toString(); //初始化fromIndex int fromIndex=-keyword.length(); //逐个匹配关键词 while((fromIndex = stringToSearch.indexOf(keyword,fromIndex+keyword.length()))!=-1){ hits++; } //将文件名和匹配次数加入hashMap rs.put(files[i].getName(),new Integer(hits)); }catch(Exception e){ e.printStackTrace(); } } Iterator it=rs.keySet().iterator(); while(it.hasNext()){ String fileName=(String)it.next(); Integer hits=(Integer)rs.get(fileName); System.out.println("find "+hits.intValue()+" matches in"+fileName); } Date endTime=new Date(); long timeofSearch = endTime.getTime()-beginTime.getTime(); System.out.println("使用字符串匹配所耗用时间 "+timeofSearch+" ms"); } }
索引:
字符串:
对比两种方式的耗时,可以看出建立索引带来的优势。