关于Apache Tika解析txt文件乱码的研究

Tika 提取正文时通常不会乱码,但当正文内容特别少时(比如只有一个汉字)就会出现乱码。推测它的编码识别是一种基于正文内容进行计算的策略,所以当内容特别少时,编码计算容易失败。估计它是使用统计学和启发式方法对网页源码(文本内容)进行编码探测;由 IBM 提供的 ICU4J 就是基于这种探测方式的类库。Tika 支持的编码检测器有:HtmlEncodingDetector、UniversalEncodingDetector、Icu4jEncodingDetector。
package com.jiepu.tika_demo;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;

import org.apache.tika.Tika;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Demo driver for Apache Tika media-type detection, charset detection and
 * plain-text extraction.
 *
 * <p>{@link #main} walks a hard-coded directory and, for every regular file in
 * it, prints the media type reported by {@link Tika#detect(String)}, the
 * charset reported by {@link Icu4jEncodingDetector} and the text returned by
 * {@link Tika#parseToString(File)}. The remaining methods are small helpers
 * used while experimenting.
 */
public class App {
	/**
	 * Scans the sample directory and prints type, charset and extracted text
	 * for each regular file, then prints {@code Hello World!}.
	 *
	 * @param args unused
	 * @throws Exception if detection or parsing of a file fails
	 */
	public static void main(String[] args) throws Exception {
		Tika tika = new Tika();

		File dir = new File("g:\\测试数据\\天猫\\");
		File[] files = dir.listFiles();
		if (files == null) {
			// listFiles() returns null when the path is missing or not a directory.
			System.out.println("Not a readable directory: " + dir.getAbsolutePath());
			files = new File[0];
		}

		for (File file : files) {
			if (!file.isFile()) {
				// Sub-directories cannot be opened as a FileInputStream.
				continue;
			}
			System.out.print(file.getAbsolutePath() + " ");

			String type = tika.detect(file.getAbsolutePath());
			System.out.println(type);

			// NOTE: this metadata is only filled in here; it is not passed to
			// parseToString(File) below. It is kept for experiments with the
			// parseToString(InputStream, Metadata) variant.
			Metadata metadata = new Metadata();
			metadata.set(Metadata.CONTENT_TYPE, type);

			Charset encode = detectEncoding(file);
			if (encode == null) {
				System.out.println("encoding not detected");
			} else {
				System.out.println(encode.name());
				metadata.set(Metadata.CONTENT_ENCODING, encode.name());
			}

			String content = tika.parseToString(file);
			if (content.isEmpty()) {
				System.out.println("content==null");
			} else {
				System.out.println(content);
			}
		}
		System.out.println("Hello World!");
	}

	/**
	 * Runs {@link Icu4jEncodingDetector} over the given file.
	 *
	 * <p>The stream is buffered so the detector can mark/reset it, and it is
	 * always closed before this method returns.
	 *
	 * @param file file to inspect
	 * @return the detected charset, or {@code null} if the detector returned none
	 * @throws java.io.IOException if the file cannot be opened or read
	 */
	private static Charset detectEncoding(File file) throws java.io.IOException {
		// Other detectors worth trying: HtmlEncodingDetector, UniversalEncodingDetector.
		EncodingDetector encodingDetector = new Icu4jEncodingDetector();
		try (InputStream raw = new FileInputStream(file);
				InputStream in = new BufferedInputStream(raw)) {
			return encodingDetector.detect(in, new Metadata());
		}
	}

	/**
	 * Reads a whole text file using the given charset name.
	 *
	 * <p>Every line is terminated with {@code "\n"} in the result, regardless
	 * of the line separator used in the file. On any failure the stack trace
	 * is printed and whatever was read so far is returned.
	 *
	 * @param fileName path of the file to read
	 * @param encoding charset name used to decode the file
	 * @return the decoded content, possibly empty or partial after an error
	 */
	public static String read(String fileName, String encoding) {
		StringBuilder text = new StringBuilder();
		// Separate resources so the file stream is closed even when the
		// charset name is rejected by InputStreamReader.
		try (InputStream raw = new FileInputStream(fileName);
				BufferedReader in = new BufferedReader(new InputStreamReader(raw, encoding))) {
			String line;
			while ((line = in.readLine()) != null) {
				text.append(line).append('\n');
			}
		} catch (Exception ex) {
			ex.printStackTrace();
		}
		return text.toString();
	}

	/**
	 * Writes a string to a file using the given charset name, replacing any
	 * existing content. On failure the stack trace is printed.
	 *
	 * @param fileName path of the file to write
	 * @param encoding charset name used to encode the text
	 * @param str      text to write
	 */
	public static void write(String fileName, String encoding, String str) {
		try (FileOutputStream raw = new FileOutputStream(fileName);
				Writer out = new BufferedWriter(new OutputStreamWriter(raw, encoding))) {
			out.write(str);
		} catch (Exception ex) {
			ex.printStackTrace();
		}
	}

	/**
	 * Parses a sample .docx file with {@link AutoDetectParser} and prints the
	 * body text collected by a {@link BodyContentHandler}.
	 */
	private static void testdoc() {
		Parser parser = new AutoDetectParser();
		Metadata metadata = new Metadata();
		ContentHandler handler = new BodyContentHandler();
		try (InputStream stream = new FileInputStream(
				"G:\\测试数据\\test-documents\\EmbeddedDocument.docx")) {
			parser.parse(stream, handler, metadata, new ParseContext());
			System.out.println(handler.toString());
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses a sample .m4a file with {@link AutoDetectParser} and prints the
	 * body text collected by a {@link BodyContentHandler}.
	 */
	private static void test001() {
		Parser parser = new AutoDetectParser(); // Should auto-detect!
		ContentHandler handler = new BodyContentHandler();
		Metadata metadata = new Metadata();
		try (InputStream stream = new FileInputStream(
				"G:/测试数据/test-documents/testMP4.m4a")) {
			parser.parse(stream, handler, metadata, new ParseContext());
			System.out.println(handler.toString());
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
package com.jiepu.tika_demo;


import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.parser.txt.UniversalEncodingDetector;



/**
 * Detects the character encoding of a document using the ICU4J-based
 * {@link CharsetDetector} bundled with Tika.
 */
public class EncodeDetector {
    /**
     * Detects the encoding of an in-memory document.
     *
     * @param data raw bytes of the document
     * @param url  unused; kept so existing callers keep compiling
     * @return name of the best-matching charset, or {@code null} if the
     *         detector produced no candidate
     */
    public static String getEncode(byte[] data, String url) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(data);
        return report(detector);
    }

    /**
     * Detects the encoding of a document read from a stream.
     *
     * <p>The detector needs mark/reset support; a stream without it is wrapped
     * in a {@link BufferedInputStream} first (in that case the caller's stream
     * is left partially consumed).
     *
     * @param data stream positioned at the start of the document
     * @param url  unused; kept so existing callers keep compiling
     * @return name of the best-matching charset, or {@code null} if the
     *         detector produced no candidate
     * @throws IOException if reading from the stream fails
     */
    public static String getEncode(InputStream data, String url) throws IOException {
        InputStream markable = data.markSupported() ? data : new BufferedInputStream(data);
        CharsetDetector detector = new CharsetDetector();
        detector.setText(markable);
        return report(detector);
    }

    /**
     * Runs detection once, prints the best match followed by every candidate
     * with its confidence, and returns the best match's name.
     */
    private static String report(CharsetDetector detector) {
        // detectAll() is ordered best match first, so one call serves both
        // the "best" result and the candidate listing.
        CharsetMatch[] matches = detector.detectAll();
        if (matches == null || matches.length == 0) {
            System.out.println("No charset could be detected");
            return null;
        }
        String encoding = matches[0].getName();
        System.out.println("The Content in " + encoding);
        System.out.println("All possibilities");
        for (CharsetMatch m : matches) {
            System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
        }
        return encoding;
    }

    /**
     * Prints the encoding of a sample file twice: once via {@link #getEncode}
     * and once via {@link UniversalEncodingDetector}.
     *
     * @param args unused
     * @throws Exception if the sample file cannot be read
     */
    public static void main(String[] args) throws Exception {
        File file = new File("G:\\测试数据\\天猫\\002.txt");

        try (InputStream raw = new FileInputStream(file);
                InputStream in = new BufferedInputStream(raw)) {
            String encode = getEncode(in, "");
            System.out.println(encode);
        }

        try (InputStream raw = new FileInputStream(file);
                InputStream in = new BufferedInputStream(raw)) {
            EncodingDetector detector = new UniversalEncodingDetector();
            Charset charset = detector.detect(in, new Metadata());
            // The detector may return null when it cannot decide.
            System.out.println("编码:" + (charset == null ? "unknown" : charset.name()));
        }
    }

}

http://www.cnblogs.com/chenying99/archive/2013/03/07/2947296.html

http://www.pipetips.com/names/2013/03/20/246647.html



你可能感兴趣的:(关于Apache Tika解析txt文件乱码的研究)