package com.jiepu.tika_demo; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import org.apache.tika.Tika; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.Icu4jEncodingDetector; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; /** * Hello world! * */ public class App { public static void main(String[] args) throws Exception { Tika tika = new Tika(); //System.out.println(tika.detect("http://127.0.0.1:8080/xd/index.txt")); /*System.out.println(tika.detect("x.html")); System.out.println(tika.detect("110.mp3")); System.out.println(tika.detect("110.apk")); System.out.println(tika.detect("110.ipa")); System.out.println(tika.detect("110.exe")); System.out.println(tika.detect("110.eml"));*/ //String content = tika.parseToString(new File("G:\\测试数据\\test-documents\\EmbeddedDocument.docx")); //System.out.println(content); //System.out.println(tika.translate("fuck", "en")); //System.out.println(tika.detect(new File("G:\\测试数据\\test\\guangxi105.pdf"))); //tika.translate(text, sourceLanguage, targetLanguage) File dir=new File("g:\\测试数据\\天猫\\"); for (File file: dir.listFiles()) { System.out.print(file.getAbsolutePath()+" "); InputStream fileInputStream=new FileInputStream(file); String type=tika.detect(file.getAbsolutePath()); System.out.println(type); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE,type); //CharsetDetector charsetDetector=new CharsetDetector(); //HtmlEncodingDetector UniversalEncodingDetector Icu4jEncodingDetector EncodingDetector encodingDetector=new Icu4jEncodingDetector(); Charset encode=encodingDetector.detect(new BufferedInputStream(fileInputStream), new Metadata()); System.out.println(encode.name()); metadata.set(Metadata.CONTENT_ENCODING, encode.name()); //String content = tika.parseToString(fileInputStream,metadata); String content = tika.parseToString(file); if(content.equals("")) { System.out.println("content==null"); }else{ System.out.println(content); } //System.out.println(read(file.getAbsolutePath(), "gb2312")); } //test001(); //testdoc(); System.out.println("Hello World!"); } public static String read(String fileName, String encoding) { String string = ""; try { BufferedReader in = new BufferedReader(new InputStreamReader( new FileInputStream(fileName), encoding)); String str = ""; while ((str = in.readLine()) != null) { string += str + "\n"; } in.close(); } catch (Exception ex) { ex.printStackTrace(); } return string; } public static void write(String fileName, String encoding, String str) { try { Writer out = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(fileName), encoding)); out.write(str); out.close(); } catch (Exception ex) { ex.printStackTrace(); } } private static void testdoc() { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); try { InputStream stream = new FileInputStream("G:\\测试数据\\test-documents\\EmbeddedDocument.docx"); parser.parse( stream,handler, metadata, new ParseContext()); System.out.println(handler.toString()); } catch (Exception e) { e.printStackTrace(); } finally { } } private static void test001() { Parser parser = new AutoDetectParser(); // Should auto-detect! ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); try { InputStream stream = new FileInputStream( "G:/测试数据/test-documents/testMP4.m4a"); parser.parse(stream, handler, metadata, new ParseContext()); System.out.println(handler.toString()); stream.close(); } catch (Exception e) { e.printStackTrace(); } finally { } } }
package com.jiepu.tika_demo; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; import org.apache.tika.parser.txt.UniversalEncodingDetector; /** * 本类使用ICU4J包进行文档编码获取 * */ public class EncodeDetector { /** * 获取编码 * @throws IOException * @throws Exception */ public static String getEncode(byte[] data,String url){ CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch match = detector.detect(); String encoding = match.getName(); System.out.println("The Content in " + match.getName()); CharsetMatch[] matches = detector.detectAll(); System.out.println("All possibilities"); for (CharsetMatch m : matches) { //System.out.println("CharsetName:" + m.getName() + " Confidence:"+ m.getConfidence()); } return encoding; } public static String getEncode(InputStream data,String url) throws IOException{ CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch match = detector.detect(); String encoding = match.getName(); System.out.println("The Content in " + match.getName()); CharsetMatch[] matches = detector.detectAll(); System.out.println("All possibilities"); for (CharsetMatch m : matches) { // System.out.println("CharsetName:" + m.getName() + " Confidence:"+ m.getConfidence()); } return encoding; } public static void main(String[] args) throws Exception { String encode=getEncode(new BufferedInputStream(new FileInputStream("G:\\测试数据\\天猫\\002.txt")), ""); System.out.println(encode); File file=new File("G:\\测试数据\\天猫\\002.txt"); InputStream stream=null; try { stream=new FileInputStream(file); EncodingDetector detector=new UniversalEncodingDetector(); Charset charset = detector.detect(new BufferedInputStream(stream), new Metadata()); System.out.println("编码:"+charset.name()); } finally { if (stream != null) stream.close(); } } }
http://www.cnblogs.com/chenying99/archive/2013/03/07/2947296.html
http://www.pipetips.com/names/2013/03/20/246647.html