基于Tesseract-OCR实现的JAVA WEB版OCR(图片转文字)

首先在Tesseract-OCR官网下载Tesseract-OCR 3.02,以及中文数据包chi_sim.traineddata(简体)

接下来就是新建一个JAVA EE项目,把Tesseract-OCR放在项目WebRoot下。

下面是主要代码:

接收客户端上传的图片,使用Tesseract-OCR识别后,将识别结果返回至前台。

package servlet;

import java.io.IOException;

import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import util.FileUtil;
import util.OCRUtil;

import com.jspsmart.upload.File;
import com.jspsmart.upload.SmartUpload;
import com.jspsmart.upload.SmartUploadException;


public class OCRServlet extends HttpServlet {

	/**
	 * Receives an uploaded image, runs OCR on it and forwards to {@code /index.jsp}
	 * with the recognized text stored in the request attribute {@code "txt"}.
	 *
	 * <p>The image is saved into a per-client working directory under the web root
	 * ({@code upload\<remote address without dots>\}); that directory is removed
	 * again before this method returns, whether or not OCR succeeded.
	 *
	 * <p>Failures are reported to the container instead of being swallowed: a request
	 * without a file yields HTTP 400, and upload/OCR errors are rethrown as
	 * {@link ServletException} with the original cause attached.
	 *
	 * @param request  the multipart upload request
	 * @param response the response used for forwarding or error reporting
	 * @throws ServletException if the upload or the OCR step fails, or the web root
	 *                          cannot be resolved to a file-system path
	 * @throws IOException      if an I/O error occurs while handling the request
	 */
	public void doPost(HttpServletRequest request, HttpServletResponse response)
			throws ServletException, IOException {
		response.setCharacterEncoding("gbk");
		SmartUpload upload = new SmartUpload();
		ServletConfig sc = this.getServletConfig();
		upload.initialize(sc, request, response);
		long size = 5*1024*1024;
		upload.setAllowedFilesList("gif,jpg,bmp,png");
		upload.setMaxFileSize(size);
		upload.setCharset("GBK");

		// Set only once the working directory has been created, so that the
		// finally block never tries to clean up something that was never there.
		String workDir = null;
		try {
			upload.upload();
			File file = upload.getFiles().getFile(0);
			if(file.isMissing()){
				// Nothing was uploaded: tell the client instead of returning a blank page.
				response.sendError(HttpServletResponse.SC_BAD_REQUEST);
				return;
			}

			// ServletContext.getRealPath replaces the deprecated request.getRealPath.
			String realPath = getServletContext().getRealPath("/");
			if(realPath == null){
				throw new ServletException("Web root is not available as a file-system path");
			}

			// NOTE(review): only dots are stripped from the address, so different
			// clients can map to the same directory and concurrent requests from one
			// client share it -- confirm whether a unique per-request name is needed.
			String userPath = "upload\\"+request.getRemoteAddr().replaceAll("\\.", "")+"\\";
			String svpath = userPath+file.getFileName();

			FileUtil.creatPath(realPath+userPath);
			workDir = realPath+userPath;
			file.saveAs(svpath,SmartUpload.SAVE_VIRTUAL);

			try {
				OCRUtil.runOCR(realPath, realPath+svpath, realPath+userPath+"ocr",true);
				request.setAttribute("txt", FileUtil.read(realPath+userPath+"ocr.txt").trim());
				request.getRequestDispatcher("/index.jsp").forward(request, response);
			} catch (ServletException e) {
				throw e;
			} catch (IOException e) {
				throw e;
			} catch (Exception e) {
				throw new ServletException("OCR failed for uploaded image", e);
			}
		} catch (SmartUploadException e) {
			throw new ServletException("Image upload failed", e);
		} finally {
			// Always remove the per-client working directory, even when OCR failed.
			if(workDir != null && new java.io.File(workDir).isDirectory()){
				FileUtil.delete(workDir);
			}
		}
	}

}
package util;

public class OCRUtil {
	/** Language code passed to Tesseract's {@code -l} option when {@code isChi} is true. */
	public static String chiSIM = "chi_sim";
	
	/**
	 * Runs the Tesseract executable located at {@code realPath + "Tesseract-OCR\tesseract.exe"}
	 * on one image and blocks until the process has finished.
	 *
	 * <p>Arguments are passed as a list rather than a concatenated command string, so
	 * paths containing spaces or quotes cannot be split or reinterpreted. The
	 * {@code -l} option is only added when a language is actually requested.
	 *
	 * @param realPath  directory (with trailing separator) that contains the
	 *                  {@code Tesseract-OCR} folder
	 * @param imagePath path of the image to recognize
	 * @param outPath   output base path handed to Tesseract (the caller in this
	 *                  project reads {@code outPath + ".txt"} afterwards)
	 * @param isChi     {@code true} to recognize with the {@link #chiSIM} language data
	 * @throws java.io.IOException  if the process cannot be started or exits with a
	 *                              non-zero status
	 * @throws InterruptedException if the calling thread is interrupted while waiting
	 * @throws Exception            declared for backward compatibility with existing callers
	 */
	public static void runOCR(String realPath,String imagePath,String outPath,boolean isChi) throws Exception{
		ProcessBuilder builder = new ProcessBuilder(
				realPath+"Tesseract-OCR\\tesseract.exe", imagePath, outPath);
		if(isChi){
			builder.command().add("-l");
			builder.command().add(chiSIM);
		}
		// Merge stderr into stdout so a single drain loop is enough.
		builder.redirectErrorStream(true);

		Process process = builder.start();
		try {
			// Drain the output: a child that fills the pipe buffer would block forever.
			java.io.InputStream output = process.getInputStream();
			try {
				byte[] sink = new byte[4096];
				while(output.read(sink) != -1){
					// output is intentionally discarded
				}
			} finally {
				output.close();
			}

			int exitCode = process.waitFor();
			if(exitCode != 0){
				throw new java.io.IOException(
						"tesseract exited with code "+exitCode+" for image "+imagePath);
			}
		} catch (InterruptedException e) {
			// Preserve the interrupt status for callers further up the stack.
			Thread.currentThread().interrupt();
			throw e;
		} finally {
			// No-op when the process already ended; kills it if we bail out early.
			process.destroy();
		}
	}
}
package util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class FileUtil {
	/**
	 * Upper bound on how long {@link #read(String)} waits for the file to appear.
	 * NOTE(review): the original code declared {@code 30*60} without ever checking it;
	 * it is interpreted here as 30 minutes -- confirm the intended limit.
	 */
	private static final long WAIT_TIMEOUT_MS = 30L*60*1000;

	/** Pause between two existence checks in {@link #read(String)}. */
	private static final long POLL_INTERVAL_MS = 100;

	/**
	 * Waits until {@code path} exists as a regular file, then returns its content
	 * decoded as UTF-8. Line terminators are dropped: all lines are concatenated
	 * without a separator.
	 *
	 * @param path path of the file to read
	 * @return the file content with line breaks removed
	 * @throws IOException if the file does not appear within the timeout, the
	 *                     waiting thread is interrupted, or reading fails
	 */
	public static String read(String path) throws IOException{
		File file = new File(path);
		long waited = 0;
		while(!file.isFile()){
			if(waited >= WAIT_TIMEOUT_MS){
				throw new java.io.FileNotFoundException("Timed out waiting for file: "+path);
			}
			try {
				Thread.sleep(POLL_INTERVAL_MS);
			} catch (InterruptedException e) {
				// Restore the interrupt status and stop waiting instead of looping on.
				Thread.currentThread().interrupt();
				throw new IOException("Interrupted while waiting for file: "+path, e);
			}
			waited += POLL_INTERVAL_MS;
		}

		StringBuilder txt = new StringBuilder();
		BufferedReader bReader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), "UTF-8"));
		try {
			String line;
			while((line=bReader.readLine())!=null){
				txt.append(line);
			}
		} finally {
			// Closing the outermost reader also closes the wrapped streams.
			bReader.close();
		}
		return txt.toString();
	}
	
	/**
	 * Creates the directory {@code path}, including any missing parent directories.
	 *
	 * @param path directory to create
	 * @throws IOException if the directory does not exist afterwards
	 */
	public static void creatPath(String path) throws IOException{
		File dir = new File(path);
		// mkdirs returns false when the directory already exists, so re-check.
		if(!dir.mkdirs() && !dir.isDirectory()){
			throw new IOException("Could not create directory: "+path);
		}
	}
	
	/**
	 * Deletes the direct children of {@code path} and then {@code path} itself.
	 * Deletion is best-effort: entries that cannot be removed (for example
	 * non-empty sub-directories) are left in place without an error.
	 *
	 * @param path file or directory to delete; a trailing separator is optional
	 * @throws IOException declared for backward compatibility with existing callers
	 */
	public static void delete(String path) throws IOException{
		File target = new File(path);
		// list() returns null when path is not a directory or cannot be read.
		String[] children = target.list();
		if(children != null){
			for(String child : children){
				// Resolve against the parent so no trailing separator is required.
				new File(target, child).delete();
			}
		}
		target.delete();
	}
}

下面是JSP代码:

<%@ page language="java" import="java.util.*" pageEncoding="GBK"%>



  
    在线OCR--By Lee
    
	
	
	    
	
	
	

  
  
  
  
选择文件:
上传文件:

效果图:


在图片没做任何处理的情况下,识别率还是挺低的。

你可能感兴趣的:(java,EE)