solr_对富文本(pdf等)建立索引

<p>solrconfig.xml 中需要定义如下 requestHandler:</p>
<p>&lt;requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"&gt;<br> &lt;lst name="defaults"&gt;<br> &lt;str name="fmap.Last-Modified"&gt;last_modified&lt;/str&gt;<br> &lt;str name="uprefix"&gt;ignored_&lt;/str&gt;<br> &lt;/lst&gt;<br> &lt;!--Optional. Specify a path to a tika configuration file. See the Tika docs for details.--&gt;<br> &lt;str name="tika.config"&gt;/my/path/to/tika.config&lt;/str&gt;<br> &lt;!-- Optional. Specify one or more date formats to parse. See DateUtil.DEFAULT_DATE_FORMATS for default date formats --&gt;<br> &lt;lst name="date.formats"&gt;<br> &lt;str&gt;yyyy-MM-dd&lt;/str&gt;<br> &lt;/lst&gt;<br> &lt;/requestHandler&gt;</p>
<p></p>
<p></p>
<p>需要额外jar包支持:</p>
<p>apache-solr-cell-1.4.0.jar</p>
<p></p>
<p></p>
<p>请求的URL样例:</p>
<p><a href="http://16.158.149.182:8080/solr/update/extract?literal.id=22&amp;literal.name=alern2&amp;literal.fullname=alern2.txt&amp;literal.type=file&amp;literal.size=16&amp;literal.datecreated=1281429683820&amp;literal.datelastmodify=1281429683820&amp;literal.userid=1&amp;literal.location=My">http://16.158.149.182:8080/solr/update/extract?literal.id=22&amp;literal.name=alern2&amp;literal.fullname=alern2.txt&amp;literal.type=file&amp;literal.size=16&amp;literal.datecreated=1281429683820&amp;literal.datelastmodify=1281429683820&amp;literal.userid=1&amp;literal.location=My</a> Files&amp;commit=true&amp;stream.url=http://localhost:8080/kumulus_data/kumulus_local/uid1/1281429683820?AWSAccessKeyId=AKIAJB36UFEVFFJ3P4TQ&amp;Expires=1281444093&amp;Signature=zn2D1frVIv1aDV7drby5Iu7iLng%3D&amp;oh=s3.amazonaws.com</p>
<p></p>
<p>/solr/update/extract 请求url</p>
<p>literal.id=22 相当于XML中定义的&lt;field name="id"&gt;22&lt;/field&gt;</p>
<p>commit=true 直接提交事务</p>
<p>stream.url 指向富文本文件的URL。注意:上面样例中的URL是有问题的,stream.url 的值里含有未经URL编码的 &amp; 符号,提交会失败;该值必须先经过 URLEncoder.encode 编码。</p>
<p>相对的还有stream.file </p>
<p></p>
<p>详细看WIKI</p>
<p><a href="http://wiki.apache.org/solr/ExtractingRequestHandler">http://wiki.apache.org/solr/ExtractingRequestHandler</a></p>
<p></p>
<p>简单样例:</p>
<p></p>
<p>下面代码中只有变量 url 是可用的;其他URL(url2、url3)中的 stream.url 值都没有经过 URLEncoder.encode 编码,都有问题。</p>
<p></p>
<p>这里的例子都使用 HttpURLConnection 建立连接,层次比较低,可以考虑使用 Apache 的 HttpClient,它能帮助解决多线程安全等问题,更安全可靠。</p>
<p><textarea cols="101" rows="15" name="code" class="java:collapse">public class simpleClassFotTest {
/**
 * Sends one request to the Solr /update/extract handler and prints the response body.
 *
 * Only {@code url} is actually requested. {@code uri}, {@code url2}, {@code url3} and
 * {@code param} are built but never used by the request.
 *
 * @param args command-line arguments (not read)
 * @throws IOException declared, but every exception is caught inside the method
 * @throws URISyntaxException declared, but every exception is caught inside the method
 */
public static void main(String[] args) throws IOException, URISyntaxException {
try{
// Built but not used anywhere below.
URI uri = new URI("http://localhost:8080/SolrDemo/indexDir/1280998963663_aldern.txt");
// The request that is really sent: the literal.* parameters supply field values and the
// stream.url value is percent-encoded so it travels as a single query parameter.
String url = "http://16.158.149.182:8080/solr/update/extract?literal.id=21" +
"&amp;literal.name=aldern&amp;literal.fullname=aldern.txt&amp;literal.type=file&amp;literal.size=32" +
"&amp;literal.datecreated=1281433365403&amp;literal.datelastmodify=1281433365403&amp;literal.userid=1" +
"&amp;literal.location=MyFiles&amp;commit=true" +
"&amp;stream.url=http%3A%2F%2Flocalhost%3A8080%2Fkumulus_data%2Fkumulus_local%2Fuid1%2F1281433365403%3FAWSAccessKeyId%3DAKIAJB36UFEVFFJ3P4TQ%26Expires%3D1281447779%26Signature%3DL6BlDV%252BviBuL78vF2V5QPq3HTDU%253D%26oh%3Ds3.amazonaws.com";
// Counter-example, never requested: the stream.url value is embedded without
// percent-encoding and literal.location contains a space.
String url2 = "http://16.158.149.182:8080/solr/update/extract?literal.id=22&amp;literal.name=alern2&amp;literal.fullname=alern2.txt&amp;literal.type=file&amp;literal.size=16&amp;literal.datecreated=1281429683820&amp;literal.datelastmodify=1281429683820&amp;literal.userid=1&amp;literal.location=My Files&amp;commit=true&amp;stream.url=http://localhost:8080/kumulus_data/kumulus_local/uid1/1281429683820?AWSAccessKeyId=AKIAJB36UFEVFFJ3P4TQ&amp;Expires=1281444093&amp;Signature=zn2D1frVIv1aDV7drby5Iu7iLng%3D&amp;oh=s3.amazonaws.com";
// Same prefix as url2, ending right after "stream.url="; never requested.
String url3 = "http://16.158.149.182:8080/solr/update/extract?literal.id=22&amp;literal.name=alern2&amp;literal.fullname=alern2.txt&amp;literal.type=file&amp;literal.size=16&amp;literal.datecreated=1281429683820&amp;literal.datelastmodify=1281429683820&amp;literal.userid=1&amp;literal.location=My Files&amp;commit=true&amp;stream.url=";
// Raw stream URL; it is percent-encoded on the next line, but the encoded value is
// not appended to url3 or used afterwards.
String param = "http://localhost:8080/kumulus_data/kumulus_local/uid1/1281429683820?AWSAccessKeyId=AKIAJB36UFEVFFJ3P4TQ&amp;Expires=1281444093&amp;Signature=zn2D1frVIv1aDV7drby5Iu7iLng%3D&amp;oh=s3.amazonaws.com";
param = URLEncoder.encode(param,"ASCII");
HttpURLConnection urlc = null;
System.out.println(url);
// Open the connection for url with both output and input enabled.
urlc = (HttpURLConnection) new URL(url).openConnection();
urlc.setDoOutput(true);
        urlc.setDoInput(true);

        // Read the whole response body as text and print it.
        InputStream in = urlc.getInputStream();
        String result = convertStreamToString(in);
System.out.println(result);
}catch (Exception e) {
// Any failure (bad URI, connection error, ...) is only reported on the console.
e.printStackTrace();
}
}
public static String convertStreamToString(InputStream is) {
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
StringBuilder sb = new StringBuilder();

String line = null;
try {
while ((line = reader.readLine()) != null) {
sb.append(line + "/n");
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return sb.toString();
}  
}
</textarea></p>
<p></p>
<p></p>
<p>使用传统XML提交索引例子:</p>
<p>/update</p>
<p><textarea cols="103" rows="15" name="code" class="java">public class simpleClassForTest2 {
public static void main(String[] args) throws MalformedURLException, IOException{
String url = "http://localhost:8080/solr/update";
Document dom = DocumentHelper.createDocument();
Element add =dom.addElement("add");
Element doc = add.addElement("doc");
Element field_id = doc.addElement("field");
field_id.addAttribute("name", "id");
field_id.setText("50");

//Element field_1 = doc.addElement("field");
//field_1.addAttribute("name", "name");
//field_1.setText("ddt");
Element field_3 = doc.addElement("field");
field_3.addAttribute("name", "size");
field_3.setText("11");
Element field_2 = doc.addElement("field");
field_2.addAttribute("name", "text");
field_2.setText("ddt");
String xml = "";
xml = dom.asXML();
xml = xml.replace("&lt;?xml version=/"1.0/" encoding=/"UTF-8/"?&gt;", "");
StringWriter sw = new StringWriter();

HttpURLConnection urlc = (HttpURLConnection) new URL(url).openConnection();

urlc.setRequestMethod("GET");
urlc.setDoOutput(true);
        urlc.setDoInput(true);
        urlc.setUseCaches(false);
        urlc.setAllowUserInteraction(false);
        urlc.setRequestProperty("Content-type", "text/html; charset=UTF-8");
        ByteArrayInputStream stream = new ByteArrayInputStream(xml.getBytes());
postFile(stream, sw);
       
        System.out.println("reuslt :" + sw.toString());
        postData(new StringReader("&lt;commit/&gt;"), sw);
System.out.println("commit : "+sw.toString());

}
public static void postFile(InputStream stream, Writer output)
    throws FileNotFoundException, UnsupportedEncodingException{
    Reader reader = new InputStreamReader(stream, "UTF-8");
    try
    {
        postData(reader, output);
    }
    finally
    {
        try
        {
            if(reader != null)
                reader.close();
        }
        catch(IOException e)
        {
            e.printStackTrace();//throw new PostException("IOException while closing file", e);
        }
    }
}
public static String convertStreamToString(InputStream is) {
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
StringBuilder sb = new StringBuilder();

String line = null;
try {
while ((line = reader.readLine()) != null) {
sb.append(line + "/n");
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}

return sb.toString();
}  
public static void postData(Reader data, Writer output)
    {
        HttpURLConnection urlc = null;
        try
        {
        URL u = new URL("http://localhost:8080/solr/update");
            urlc = (HttpURLConnection)u.openConnection();
            try
            {
                urlc.setRequestMethod("POST");
            }
            catch(ProtocolException e)
            {
                e.printStackTrace();
            }
            urlc.setDoOutput(true);//允许输出
            urlc.setDoInput(true);//允许输入
            urlc.setUseCaches(false);//不使用缓存(如浏览器中的缓存优化 )
            urlc.setAllowUserInteraction(false);
            urlc.setRequestProperty("Content-type", "text/xml; charset=UTF-8");
            OutputStream out = urlc.getOutputStream();
            try
            {
                Writer writer = new OutputStreamWriter(out, "UTF-8");
                pipe(data, writer);
                writer.close();
            }
            catch(IOException e)
            {
            e.printStackTrace();
            }
            finally
            {
                if(out != null)
                    out.close();
            }
            InputStream in = urlc.getInputStream();
            try
            {
                Reader reader = new InputStreamReader(in);
                pipe(reader, output);
                reader.close();
            }
            catch(IOException e)
            {
            e.printStackTrace();
            }
            finally
            {
                if(in != null)
                    in.close();
            }
        }
        catch(IOException e)
        {
        e.printStackTrace();
        }
        finally
        {
            if(urlc != null)
                urlc.disconnect();
        }
    }
private static void pipe(Reader reader, Writer writer)throws IOException   {
    char buf[] = new char[1024];
    for(int read = 0; (read = reader.read(buf)) &gt;= 0;)
        writer.write(buf, 0, read);

    writer.flush();
    }
}
</textarea></p>

你可能感兴趣的:(Solr)