Java tesseract-ocr 图文识别技术 Java代码实现

Java文字识别程序的关键是寻找一个可以调用的OCR引擎。

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;

import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import javax.imageio.stream.ImageOutputStream;

import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;

/**
 * Helper that re-encodes an image as an uncompressed TIFF file so that it can
 * be handed to an external OCR tool.
 */
public class ImageIOHelper {

	/**
	 * Reads the first image of {@code imageFile} and writes it as an
	 * uncompressed TIFF next to the original (see {@link #tempImageFile(File)}
	 * for the naming rule).
	 *
	 * @param imageFile   the source image
	 * @param imageFormat informal ImageIO format name of the source, e.g. {@code "png"}
	 * @return the newly written TIFF file, or {@code null} if an I/O error
	 *         occurred (the error is printed to standard error)
	 * @throws java.util.NoSuchElementException if no ImageIO reader exists for
	 *         {@code imageFormat} or no TIFF writer is installed
	 */
	public static File createImage(File imageFile, String imageFormat) {
		Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
		if (!readers.hasNext()) {
			throw new java.util.NoSuchElementException(
					"No ImageIO reader for format: " + imageFormat);
		}
		Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
		if (!writers.hasNext()) {
			throw new java.util.NoSuchElementException("No ImageIO writer for format: tiff");
		}
		ImageReader reader = readers.next();
		ImageWriter writer = writers.next();

		File tempFile = null;
		ImageInputStream iis = null;
		ImageOutputStream ios = null;
		boolean written = false;
		try {
			// createImageInputStream signals "cannot open" by returning null.
			iis = ImageIO.createImageInputStream(imageFile);
			if (iis == null) {
				throw new IOException("Cannot open image for reading: " + imageFile);
			}
			reader.setInput(iis);
			IIOMetadata streamMetadata = reader.getStreamMetadata();

			// Write the TIFF without compression.
			TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.CHINESE);
			tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

			BufferedImage bi = reader.read(0);
			IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

			tempFile = tempImageFile(imageFile);
			// ImageIO file output streams do not truncate an existing file, so a
			// stale, larger file would leave trailing garbage behind the new image.
			tempFile.delete();
			ios = ImageIO.createImageOutputStream(tempFile);
			if (ios == null) {
				throw new IOException("Cannot open file for writing: " + tempFile);
			}
			writer.setOutput(ios);
			writer.write(streamMetadata, image, tiffWriteParam);
			// Close explicitly so that a failing flush is reported, not swallowed.
			ios.close();
			ios = null;
			written = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(ios);
			closeQuietly(iis);
			writer.dispose();
			reader.dispose();
			// Never leave (or hand out) a partially written file.
			if (!written && tempFile != null) {
				tempFile.delete();
			}
		}
		return written ? tempFile : null;
	}

	/**
	 * Derives the output file name: the input path with a {@code 0} appended
	 * to the base name and the extension replaced by {@code tif}, e.g.
	 * {@code dir/4.png -> dir/40.tif}. A path without an extension simply gets
	 * {@code 0.tif} appended.
	 */
	private static File tempImageFile(File imageFile) {
		String path = imageFile.getPath();
		int dot = path.lastIndexOf('.');
		int separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
		// A dot only marks an extension when it belongs to the file name itself,
		// not to one of the parent directories.
		String stem = dot > separator ? path.substring(0, dot) : path;
		return new File(stem + "0.tif");
	}

	/** Closes the stream if it is open; a failure while closing is ignored. */
	private static void closeQuietly(ImageInputStream stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException ignored) {
			// Best-effort cleanup; the primary error has already been reported.
		}
	}

}


tesseract-ocr就是一个这样的OCR引擎:它在1985年到1995年间由HP实验室开发,现在由Google维护。tesseract-ocr 3.0已经发布,并开始支持中文。不过tesseract-ocr 3.0本身不是图形化界面的客户端,别人写的FreeOCR图形化客户端也还不支持导入新的3.0 traineddata。但这标志着,现在有自由的中文OCR软件了。

 

    java中使用tesseract-ocr3.01的步骤如下:

1.下载安装tesseract-ocr-setup-3.01-1.exe(3.0以上版本才增加了中文识别)

2.在安装向导中可以选择需要下载的语言包。

3.到网上搜索下载java图形处理所需的2个包:jai_imageio-1.1-alpha.jar,swingx-1.6.1.jar

4.java程序清单:

ImageIOHelper 类:

[html] view plain copy

  1. import java.awt.image.BufferedImage;  
  2. import java.io.File;  
  3. import java.io.IOException;  
  4. import java.util.Iterator;  
  5. import java.util.Locale;  
  6.   
  7. import javax.imageio.IIOImage;  
  8. import javax.imageio.ImageIO;  
  9. import javax.imageio.ImageReader;  
  10. import javax.imageio.ImageWriteParam;  
  11. import javax.imageio.ImageWriter;  
  12. import javax.imageio.metadata.IIOMetadata;  
  13. import javax.imageio.stream.ImageInputStream;  
  14. import javax.imageio.stream.ImageOutputStream;  
  15.   
  16. import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;  
  17.   
  18. public class ImageIOHelper {    
  19.         
  20.     public static File createImage(File imageFile, String imageFormat) {    
  21.         File tempFile = null;    
  22.         try {    
  23.             Iterator readers = ImageIO.getImageReadersByFormatName(imageFormat);    
  24.             ImageReader reader = readers.next();    
  25.             
  26.             ImageInputStream iis = ImageIO.createImageInputStream(imageFile);    
  27.             reader.setInput(iis);    
  28.             //Read the stream metadata    
  29.             IIOMetadata streamMetadata = reader.getStreamMetadata();    
  30.                 
  31.             //Set up the writeParam    
  32.             TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.CHINESE);    
  33.             tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);    
  34.                 
  35.             //Get tif writer and set output to file    
  36.             Iterator writers = ImageIO.getImageWritersByFormatName("tiff");    
  37.             ImageWriter writer = writers.next();    
  38.                 
  39.             BufferedImage bi = reader.read(0);    
  40.             IIOImage image = new IIOImage(bi,null,reader.getImageMetadata(0));    
  41.             tempFile = tempImageFile(imageFile);    
  42.             ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile);    
  43.             writer.setOutput(ios);    
  44.             writer.write(streamMetadata, image, tiffWriteParam);    
  45.             ios.close();    
  46.                 
  47.             writer.dispose();    
  48.             reader.dispose();    
  49.                 
  50.         } catch (IOException e) {    
  51.             e.printStackTrace();    
  52.         }    
  53.         return tempFile;    
  54.     }    
  55.     
  56.     private static File tempImageFile(File imageFile) {    
  57.         String path = imageFile.getPath();    
  58.         StringBuffer strB = new StringBuffer(path);    
  59.         strB.insert(path.lastIndexOf('.'),0);    
  60.         return new File(strB.toString().replaceFirst("(?<=//.)(//w+)$", "tif"));    
  61.     }    
  62.     
  63. }  


 

OCR 类:

[html] view plain copy

  1. package com.hhp.util;  
  2.   
  3. import java.io.BufferedReader;    
  4. import java.io.File;    
  5. import java.io.FileInputStream;    
  6. import java.io.InputStreamReader;    
  7. import java.util.ArrayList;    
  8. import java.util.List;    
  9. import org.jdesktop.swingx.util.OS;    
  10.     
  11. public class OCR {    
  12.     private final String LANG_OPTION = "-l";  //英文字母小写l,并非数字1    
  13.     private final String EOL = System.getProperty("line.separator");    
  14.     private String tessPath = "C://Program Files (x86)//Tesseract-OCR";    
  15.     //private String tessPath = new File("tesseract").getAbsolutePath();    
  16.         
  17.     public String recognizeText(File imageFile,String imageFormat)throws Exception{    
  18.         File tempImage = ImageIOHelper.createImage(imageFile,imageFormat);    
  19.         File outputFile = new File(imageFile.getParentFile(),"output");    
  20.         StringBuffer strB = new StringBuffer();    
  21.         List cmd = new ArrayList();    
  22.         if(OS.isWindowsXP()){    
  23.             cmd.add(tessPath+"//tesseract");    
  24.         }else if(OS.isLinux()){    
  25.             cmd.add("tesseract");    
  26.         }else{    
  27.             cmd.add(tessPath+"//tesseract");    
  28.         }    
  29.         cmd.add("");    
  30.         cmd.add(outputFile.getName());    
  31.         cmd.add(LANG_OPTION);    
  32.         cmd.add("chi_sim");    
  33.         //cmd.add("eng");    
  34.             
  35.         ProcessBuilder pb = new ProcessBuilder();    
  36.         pb.directory(imageFile.getParentFile());    
  37.             
  38.         cmd.set(1, tempImage.getName());    
  39.         pb.command(cmd);    
  40.         pb.redirectErrorStream(true);    
  41.             
  42.         Process process = pb.start();    
  43.         //tesseract.exe 1.jpg 1 -l chi_sim    
  44.         int w = process.waitFor();    
  45.             
  46.         //删除临时正在工作文件    
  47.         tempImage.delete();    
  48.             
  49.         if(w==0){    
  50.             BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath()+".txt"),"UTF-8"));    
  51.                 
  52.             String str;    
  53.             while((str = in.readLine())!=null){    
  54.                 strB.append(str).append(EOL);    
  55.             }    
  56.             in.close();    
  57.         }else{    
  58.             String msg;    
  59.             switch(w){    
  60.                 case 1:    
  61.                     msg = "Errors accessing files.There may be spaces in your image's filename.";    
  62.                     break;    
  63.                 case 29:    
  64.                     msg = "Cannot recongnize the image or its selected region.";    
  65.                     break;    
  66.                 case 31:    
  67.                     msg = "Unsupported image format.";    
  68.                     break;    
  69.                 default:    
  70.                     msg = "Errors occurred.";    
  71.             }    
  72.             tempImage.delete();    
  73.             throw new RuntimeException(msg);    
  74.         }    
  75.         new File(outputFile.getAbsolutePath()+".txt").delete();    
  76.         return strB.toString();    
  77.     }    
  78. }    
  79.   
  80.   
  81.    


测试类OcrTest:

 

[html] view plain copy

  1. import java.io.File;  
  2. import java.io.IOException;  
  3.   
  4. import com.hhp.util.OCR;  
  5.   
  6. public class OcrTest {  
  7.   
  8.  public static void main(String[] args) {  
  9.         String path = "C://temp//OCRcode//4.png";       
  10.         System.out.println("ORC Test Begin......");  
  11.         try {       
  12.             String valCode = new OCR().recognizeText(new File(path), "png");       
  13.             System.out.println(valCode);       
  14.         } catch (IOException e) {       
  15.             e.printStackTrace();       
  16.         } catch (Exception e) {    
  17.             e.printStackTrace();    
  18.         }         
  19.         System.out.println("ORC Test End......");  
  20.     }    
  21.   
  22. }  


经过测试,tesseract-ocr 3.01的文字识别率很高,对于网站中常见的验证码识别率也很高。

你可能感兴趣的:(Java tesseract-ocr 图文识别技术 Java代码实现)