/**
 * Delegating parser that profiles the text produced while parsing and, when
 * the resulting language guess is reasonably certain, stores that language in
 * the document metadata.
 */
public class LanguageDetectingParser extends DelegatingParser {

    private static final long serialVersionUID = 1L;

    /**
     * Parses the document through the superclass while also feeding the
     * content events to a {@link ProfilingHandler}.
     *
     * @param stream   the document stream, passed on to the superclass
     * @param handler  the caller's content handler
     * @param metadata the document metadata; {@code Metadata.LANGUAGE} is set
     *                 on it when the language guess is reasonably certain
     * @param context  the parse context, passed on to the superclass
     * @throws SAXException  propagated from the underlying parse
     * @throws IOException   propagated from the underlying parse
     * @throws TikaException propagated from the underlying parse
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            final Metadata metadata, ParseContext context)
            throws SAXException, IOException, TikaException {
        ProfilingHandler languageProfiler = new ProfilingHandler();

        // The tee handler is built from both the caller's handler and the
        // profiler so that a single parse pass serves the two of them.
        super.parse(
                stream,
                new TeeContentHandler(handler, languageProfiler),
                metadata,
                context);

        LanguageIdentifier guess = languageProfiler.getLanguage();
        if (!guess.isReasonablyCertain()) {
            // Leave the metadata untouched rather than record a weak guess.
            return;
        }
        metadata.set(Metadata.LANGUAGE, guess.getLanguage());
    }

    /**
     * Looks up the parser to delegate to in the parse context, passing a
     * fresh {@link AutoDetectParser} as the fallback value of the lookup.
     *
     * @param context the parse context to query
     * @return the result of the context lookup
     */
    protected Parser getDelegateParser(ParseContext context) {
        Parser fallback = new AutoDetectParser();
        return context.get(Parser.class, fallback);
    }

}
public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp); // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata); metadata.set(Metadata.CONTENT_TYPE, type.toString()); // TIKA-216: Zip bomb prevention SecureContentHandler sch = new SecureContentHandler(handler, tis); try { // Parse the document super.parse(tis, sch, metadata, context); } catch (SAXException e) { // Convert zip bomb exceptions to TikaExceptions sch.throwIfCauseOf(e); throw e; } } finally { tmp.dispose(); } }
public static void main(String[] args) throws IOException, TikaException { // TODO Auto-generated method stub File file=new File("E:\\watiao.htm"); InputStream stream=TikaInputStream.get(file); try { EncodingDetector detector=new UniversalEncodingDetector(); Charset charset = detector.detect(stream, new Metadata()); System.out.println("编码2:"+charset.name()); //进一步解析 } finally { if (stream != null) stream.close(); } }
本系列 Tika 源码解析文章系本人原创,写作时参考了《Tika in Action》英文版,以后如有心得再继续补充。
转载请注明出处:博客园 刺猬的温驯
本文链接:http://www.cnblogs.com/chenying99/archive/2013/03/11/2953365.html