solr_对富文本(pdf等)建立索引

solrconfig.xml 中定义的请求处理器(ExtractingRequestHandler):

<!-- Registers the ExtractingRequestHandler class under the request path /update/extract. -->
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
    <!-- Parameters listed under "defaults"; presumably used when a request does not supply its own value (confirm against the Solr docs). -->
    <lst name="defaults">
      <!-- fmap.Last-Modified: value is the field name last_modified, i.e. the extracted "Last-Modified" entry is presumably stored under that field. -->
      <str name="fmap.Last-Modified">last_modified</str>
      <!-- uprefix: the prefix "ignored_"; presumably prepended to extracted fields the schema does not define (confirm against the Solr docs). -->
      <str name="uprefix">ignored_</str>
    </lst>
    <!--Optional.  Specify a path to a tika configuration file.  See the Tika docs for details.-->
    <!-- NOTE(review): /my/path/to/tika.config looks like a placeholder; point it at a real file or remove the entry (confirm). -->
    <str name="tika.config">/my/path/to/tika.config</str>
    <!-- Optional. Specify one or more date formats to parse.  See DateUtil.DEFAULT_DATE_FORMATS for default date formats -->
    <lst name="date.formats">
      <str>yyyy-MM-dd</str>
    </lst>
  </requestHandler>

 

 

需要额外jar包支持:

apache-solr-cell-1.4.0.jar

 

 

请求的URL样例:

http://16.158.149.182:8080/solr/update/extract?literal.id=22&literal.name=alern2&literal.fullname=alern2.txt&literal.type=file&literal.size=16&literal.datecreated=1281429683820&literal.datelastmodify=1281429683820&literal.userid=1&literal.location=My Files&commit=true&stream.url=http://localhost:8080/kumulus_data/kumulus_local/uid1/1281429683820?AWSAccessKeyId=AKIAJB36UFEVFFJ3P4TQ&Expires=1281444093&Signature=zn2D1frVIv1aDV7drby5Iu7iLng%3D&oh=s3.amazonaws.com

 

/solr/update/extract 请求url

literal.id=22 相当于XML中定义的<field name="id">22</field>

commit=true 直接提交事务

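stream.url 指向富文本文件的 URL。注意:上面样例中的 URL 是有问题的——stream.url 的值本身带有查询串,其中的 ?、&、= 等字符没有经过 URL 编码,& 之后的部分会被当成 Solr 请求自身的参数,提交会失败;literal.location=My Files 中的空格同样需要编码。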

相对的还有stream.file

 

详细看WIKI

http://wiki.apache.org/solr/ExtractingRequestHandler

 

简单样例:

 

代码中实际请求的 url 是可用的(stream.url 的值已整体做过 URL 编码);其余没有经过 URL 编码(URLEncoder.encode)的 URL 都有问题。

 

这里的例子都直接使用 HttpURLConnection 建立连接,比较底层;可以考虑使用 Apache 的 HttpClient,它能帮助解决多线程安全等问题,更安全可靠。

/**
 * Minimal demo: asks Solr's /update/extract handler to fetch a document through
 * {@code stream.url}, index it with the given {@code literal.*} fields, and prints
 * the response body.
 */
public class simpleClassFotTest {

    /**
     * Sends the extract request and prints Solr's response to stdout.
     *
     * @param args unused
     * @throws IOException declared for compatibility; I/O failures are caught and reported here
     * @throws URISyntaxException declared for compatibility; not thrown by this implementation
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // The value of stream.url is itself a URL with its own query string, so it is
        // percent-encoded as a whole (':' -> %3A, '/' -> %2F, '?' -> %3F, '&' -> %26,
        // '=' -> %3D, and the '%' of already-escaped characters -> %25). Left raw, its
        // '&'-separated parameters would be read as parameters of the Solr request, and
        // raw spaces in literal values would break the request line.
        String url = "http://16.158.149.182:8080/solr/update/extract?literal.id=21"
                + "&literal.name=aldern&literal.fullname=aldern.txt&literal.type=file&literal.size=32"
                + "&literal.datecreated=1281433365403&literal.datelastmodify=1281433365403&literal.userid=1"
                + "&literal.location=MyFiles&commit=true"
                + "&stream.url=http%3A%2F%2Flocalhost%3A8080%2Fkumulus_data%2Fkumulus_local%2Fuid1%2F1281433365403%3FAWSAccessKeyId%3DAKIAJB36UFEVFFJ3P4TQ%26Expires%3D1281447779%26Signature%3DL6BlDV%252BviBuL78vF2V5QPq3HTDU%253D%26oh%3Ds3.amazonaws.com";

        HttpURLConnection urlc = null;
        try {
            System.out.println(url);
            urlc = (HttpURLConnection) new URL(url).openConnection();
            urlc.setDoOutput(true);
            urlc.setDoInput(true);

            String result = convertStreamToString(urlc.getInputStream());
            System.out.println(result);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the connection even when the request fails.
            if (urlc != null) {
                urlc.disconnect();
            }
        }
    }

    /**
     * Reads the whole stream as UTF-8 text and closes it.
     *
     * <p>Every line read is followed by a single newline character in the result,
     * regardless of the line terminator used in the stream.
     *
     * @param is stream to drain; always closed before returning
     * @return the text read so far; read errors are printed and the partial text returned
     */
    public static String convertStreamToString(InputStream is) {
        StringBuilder sb = new StringBuilder();
        try {
            // Decode explicitly as UTF-8 instead of relying on the platform default charset.
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }
}

 

 

使用传统XML提交索引例子:

/update

public class simpleClassForTest2 { public static void main(String[] args) throws MalformedURLException, IOException{ String url = "http://localhost:8080/solr/update"; Document dom = DocumentHelper.createDocument(); Element add =dom.addElement("add"); Element doc = add.addElement("doc"); Element field_id = doc.addElement("field"); field_id.addAttribute("name", "id"); field_id.setText("50"); //Element field_1 = doc.addElement("field"); //field_1.addAttribute("name", "name"); //field_1.setText("ddt"); Element field_3 = doc.addElement("field"); field_3.addAttribute("name", "size"); field_3.setText("11"); Element field_2 = doc.addElement("field"); field_2.addAttribute("name", "text"); field_2.setText("ddt"); String xml = ""; xml = dom.asXML(); xml = xml.replace("<?xml version=/"1.0/" encoding=/"UTF-8/"?>", ""); StringWriter sw = new StringWriter(); HttpURLConnection urlc = (HttpURLConnection) new URL(url).openConnection(); urlc.setRequestMethod("GET"); urlc.setDoOutput(true); urlc.setDoInput(true); urlc.setUseCaches(false); urlc.setAllowUserInteraction(false); urlc.setRequestProperty("Content-type", "text/html; charset=UTF-8"); ByteArrayInputStream stream = new ByteArrayInputStream(xml.getBytes()); postFile(stream, sw); System.out.println("reuslt :" + sw.toString()); postData(new StringReader("<commit/>"), sw); System.out.println("commit : "+sw.toString()); } public static void postFile(InputStream stream, Writer output) throws FileNotFoundException, UnsupportedEncodingException{ Reader reader = new InputStreamReader(stream, "UTF-8"); try { postData(reader, output); } finally { try { if(reader != null) reader.close(); } catch(IOException e) { e.printStackTrace();//throw new PostException("IOException while closing file", e); } } } public static String convertStreamToString(InputStream is) { BufferedReader reader = new BufferedReader(new InputStreamReader(is)); StringBuilder sb = new StringBuilder(); String line = null; try { while ((line = reader.readLine()) != null) { 
sb.append(line + "/n"); } } catch (IOException e) { e.printStackTrace(); } finally { try { is.close(); } catch (IOException e) { e.printStackTrace(); } } return sb.toString(); } public static void postData(Reader data, Writer output) { HttpURLConnection urlc = null; try { URL u = new URL("http://localhost:8080/solr/update"); urlc = (HttpURLConnection)u.openConnection(); try { urlc.setRequestMethod("POST"); } catch(ProtocolException e) { e.printStackTrace(); } urlc.setDoOutput(true);//允许输出 urlc.setDoInput(true);//允许输入 urlc.setUseCaches(false);//不使用缓存(如浏览器中的缓存优化 ) urlc.setAllowUserInteraction(false); urlc.setRequestProperty("Content-type", "text/xml; charset=UTF-8"); OutputStream out = urlc.getOutputStream(); try { Writer writer = new OutputStreamWriter(out, "UTF-8"); pipe(data, writer); writer.close(); } catch(IOException e) { e.printStackTrace(); } finally { if(out != null) out.close(); } InputStream in = urlc.getInputStream(); try { Reader reader = new InputStreamReader(in); pipe(reader, output); reader.close(); } catch(IOException e) { e.printStackTrace(); } finally { if(in != null) in.close(); } } catch(IOException e) { e.printStackTrace(); } finally { if(urlc != null) urlc.disconnect(); } } private static void pipe(Reader reader, Writer writer)throws IOException { char buf[] = new char[1024]; for(int read = 0; (read = reader.read(buf)) >= 0;) writer.write(buf, 0, read); writer.flush(); } }

你可能感兴趣的:(String,Stream,null,Solr,url,output)