Lucene 3.0.2 + 多文件夹微博数据(时间,微博)构建索引

 

Lucene 3.0.2 + 多文件夹微博数据(时间,微博)构建索引
package lia.meetlucene;



import java.io.File;

import java.io.IOException;

import java.util.LinkedList;



import javax.xml.parsers.DocumentBuilder;

import javax.xml.parsers.DocumentBuilderFactory;



import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.CorruptIndexException;

//import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.w3c.dom.Document;

import org.w3c.dom.Element;

import org.w3c.dom.NodeList;



/**
 * Builds a Lucene index from a directory tree of XML files.
 *
 * <p>Every sub-directory of the data directory is scanned; each file in it is
 * parsed as XML and walked depth-first. Whenever both a {@code timestamp}
 * element and a {@code text} element have been seen, one Lucene document is
 * written with the fields {@code context} (analyzed text) and {@code time}
 * (stored verbatim, not analyzed).
 *
 * <p>The class keeps the half-collected pair in static fields, so it is not
 * safe to use from more than one thread at a time.
 */
public class Unicode1 {

    /** Input root used when no command-line argument is supplied. */
    private static final String DEFAULT_DATA_DIR = "E:/xdj/tengxun";

    /** Index location used when fewer than two command-line arguments are supplied. */
    private static final String DEFAULT_INDEX_DIR = "E:/xdj/tengxunsuoying";

    /** True while {@link #timeTmp} holds a timestamp that has not been indexed yet. */
    static boolean numTime = false;

    /** True while {@link #textTmp} holds a text that has not been indexed yet. */
    static boolean numText = false;

    /** Content of the most recently seen {@code timestamp} element. */
    static String timeTmp = null;

    /** Content of the most recently seen {@code text} element. */
    static String textTmp = null;

    /**
     * Writes the currently collected text/timestamp pair as one document.
     *
     * @param writer index writer that receives the document
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the document cannot be written
     */
    static void indexer(IndexWriter writer) throws CorruptIndexException, IOException {
        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
        doc.add(new Field("context", textTmp, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("time", timeTmp, Field.Store.YES, Field.Index.NOT_ANALYZED));
        writer.addDocument(doc);
    }

    /**
     * Walks the given nodes and all of their descendants depth-first, indexing
     * a document each time both a {@code timestamp} and a {@code text} element
     * have been collected.
     *
     * <p>A document that cannot be written is reported on standard error and
     * dropped; the walk then continues with the remaining nodes.
     *
     * @param nodecur nodes to visit
     * @param writer index writer that receives the documents
     */
    static void Dfs(NodeList nodecur, IndexWriter writer) {
        for (int j = 0; j < nodecur.getLength(); j++) {
            org.w3c.dom.Node node = nodecur.item(j);
            String name = node.getNodeName();

            if ("timestamp".equals(name)) {
                timeTmp = node.getTextContent();
                numTime = true;
            } else if ("text".equals(name)) {
                textTmp = node.getTextContent();
                numText = true;
            }

            if (numText && numTime) {
                // Clear the flags before writing: if the write fails, the same
                // pair must not be re-submitted for every node that follows.
                numText = false;
                numTime = false;
                try {
                    indexer(writer);
                } catch (IOException e) {
                    // CorruptIndexException is an IOException, so this covers both.
                    System.err.println("Failed to index document with time " + timeTmp);
                    e.printStackTrace();
                }
            }

            Dfs(node.getChildNodes(), writer);
        }
    }

    /**
     * Parses one XML file and indexes its content via {@link #Dfs}.
     *
     * @param xmlFile file to parse
     * @param dbf factory used to create the XML parser
     * @param writer index writer that receives the documents
     * @return {@code true} if the file was parsed and walked, {@code false} if it failed
     */
    private static boolean indexFile(File xmlFile, DocumentBuilderFactory dbf, IndexWriter writer) {
        System.out.println(xmlFile.getAbsolutePath());

        // A timestamp or text left over from the previous file must not be
        // paired with a field of this file.
        numTime = false;
        numText = false;

        try {
            // NOTE(review): the parser runs with the factory's default settings;
            // if the input files are not trusted, consider disabling DOCTYPE
            // declarations and external entities on the factory.
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document dt = db.parse(xmlFile);
            Element element = dt.getDocumentElement();
            System.out.println("根元素:" + element.getNodeName());

            Dfs(element.getChildNodes(), writer);
            return true;
        } catch (Exception e) {
            // Best effort: one unreadable or malformed file must not stop the run.
            System.err.println("Failed to index file " + xmlFile.getAbsolutePath());
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Indexes every file found in the sub-directories of the data directory.
     *
     * <p>Files located directly in the data directory are ignored. Directories
     * nested deeper are scanned as well. After each directory the elapsed
     * milliseconds and the number of successfully processed files are printed.
     *
     * @param args optional: {@code args[0]} is the data directory and
     *             {@code args[1]} the index directory; the built-in defaults
     *             are used for any argument that is missing
     * @throws IOException if the data directory cannot be listed or the index
     *                     cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        long a = System.currentTimeMillis();

        File dataDir = new File(args.length > 0 ? args[0] : DEFAULT_DATA_DIR);
        String indexDir = args.length > 1 ? args[1] : DEFAULT_INDEX_DIR;

        // Check the input before the writer is opened: the writer is created
        // with create=true, which replaces any index already at indexDir.
        File[] topLevel = dataDir.listFiles();
        if (topLevel == null) {
            throw new IOException("Cannot list data directory: " + dataDir.getAbsolutePath());
        }

        LinkedList<File> list = new LinkedList<File>();
        for (int i = 0; i < topLevel.length; i++) {
            if (topLevel[i].isDirectory()) {
                list.add(topLevel[i]);
            }
        }

        // The factory lookup is loop-invariant; only the builder is created per file.
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            IndexWriter writer = new IndexWriter(dir,
                    new SmartChineseAnalyzer(Version.LUCENE_20),
                    true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                int num = 0;
                while (!list.isEmpty()) {
                    File folder = list.removeFirst();
                    File[] entries = folder.listFiles();
                    if (entries == null) {
                        System.err.println("Skipping unreadable directory: "
                                + folder.getAbsolutePath());
                        continue;
                    }

                    for (int i = 0; i < entries.length; i++) {
                        File entry = entries[i];
                        if (entry.isDirectory()) {
                            // Queue nested directories instead of handing them to the XML parser.
                            list.add(entry);
                        } else if (indexFile(entry, dbf, writer)) {
                            num++;
                        }
                    }

                    System.out.println(System.currentTimeMillis() - a + "    " + num);
                }
            } finally {
                // Always close the writer so the index is committed and its lock released.
                writer.close();
            }
        } finally {
            dir.close();
        }
    }
}
View Code

 

你可能感兴趣的:(Lucene)