2019独角兽企业重金招聘Python工程师标准>>>
安装 Tesseract-OCR
Windows 版本
- tesseract-ocr-setup-xx.xx.exe
- chi_sim.traineddata.gz 中文语言包
这两个请自行百度即可,然后我们将其安装在D:下,其中将语言包放在安装目录下的tessdata的目录下。
Linux 版本
我使用的是centos7,下面给出安装tesseract的前提条件。
- 安装编译库
yum install autoconf automake libtool
yum install libjpeg-devel libpng-devel libtiff-devel zlib-devel
请注意这里是两条命令,需要分别执行;这些库一般系统中都已经存在。
-
安装依赖的leptonica库
wget http://www.leptonica.com/source/leptonica-1.72.tar.gz
tar -xvf leptonica-1.72.tar.gz
cd leptonica-1.72
./configure --with-libpng && make && make install
这个依赖库要注意,一定是1.71以上的版本。
-
下载编译tesseract
wget https://github.com/tesseract-ocr/tesseract/archive/3.04.00.tar.gz
mv 3.04.00.tar.gz Tesseract3.04.00.tar.gz
tar -xvf Tesseract3.04.00.tar.gz
cd tesseract-3.04.00/
./autogen.sh
./configure
make && make install
ldconfig
tesseract我安装在了 /usr/local 这个目录下,名称为 tesseract-3.04.00 。如果你使用的是3.01的版本,需要在./autogen.sh 后面执行mkdir m4;这条命令,否则他会提示m4这个目录不存在。
- 下载识别库(语言包)
wget --no-check-certificate https://github.com/tesseract-ocr/tessdata/raw/master/eng.traineddata
wget --no-check-certificate https://github.com/tesseract-ocr/tessdata/raw/master/chi_sim.traineddata
wget --no-check-certificate https://github.com/tesseract-ocr/tessdata/raw/master/chi_tra.traineddata
wget http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eng.tar.gz
注意此处的语言包以及解压出的语言包都要放在 /usr/local/share/tessdata/ 目录下。
Java 读取数据
- 启动命令程序
package com.zefun.common.utils; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; import org.jdesktop.swingx.util.OS; /** * ocr 读取命令程序 * @author 高国藩 * @date 2016年12月5日 上午10:27:16 */ public class Ocr { /** 英文字母小写l,并非数字1 */ private static final String LANG_OPTION = "-l"; /** 系统换行符 */ private static final String EOL = System.getProperty("line.separator"); /** 系统目录符 */ private static final String GNL = System.getProperty("file.separator"); /** log */ private Logger logger = Logger.getLogger(Ocr.class); /** 目标,下面分别为Linux和Windows下的程序配备,Linux不需要此处的路径了 */ // private String tessPath = GNL + "usr" + GNL + "local" + GNL + "tesseract-3.04.00"; private String tessPath = new File("D:\\Tesseract-OCR").getAbsolutePath(); /** * 解析图片 * @author 高国藩 * @date 2016年12月5日 上午10:28:47 * @param imageFile image file * @param imageFormat 转码路径 * @return ver_code * @throws Exception 异常处理啊 */ public String recognizeText(File imageFile, String imageFormat) throws Exception { File tempImage = ImageIOHelper.createImage(imageFile, imageFormat); File outputFile = new File(imageFile.getParentFile(), "output"); StringBuffer strB = new StringBuffer(); List
cmd = new ArrayList (); if (OS.isWindowsXP()) { cmd.add(tessPath + "//tesseract"); } else if (OS.isLinux()) { cmd.add("tesseract"); } else { cmd.add(tessPath + "//tesseract"); } cmd.add(""); cmd.add(outputFile.getName()); cmd.add(LANG_OPTION); // cmd.add("chi_sim"); 更换语言包 cmd.add("eng"); ProcessBuilder pb = new ProcessBuilder(); pb.directory(imageFile.getParentFile()); cmd.set(1, tempImage.getName()); pb.command(cmd); pb.redirectErrorStream(true); Process process = pb.start(); logger.info(cmd.toString()); // tesseract.exe 1.jpg 1 -l chi_sim int w = process.waitFor(); // 删除临时正在工作文件 tempImage.delete(); if (w == 0) { BufferedReader in = new BufferedReader(new InputStreamReader( new FileInputStream(outputFile.getAbsolutePath() + ".txt"), "UTF-8")); String str; while ((str = in.readLine()) != null) { strB.append(str).append(EOL); } in.close(); } else { String msg; switch (w) { case 1: msg = "Errors accessing files.There may be spaces in your image's filename."; break; case 29: msg = "Cannot recongnize the image or its selected region."; break; case 31: msg = "Unsupported image format."; break; default: msg = "Errors occurred."; } tempImage.delete(); throw new RuntimeException(msg); } new File(outputFile.getAbsolutePath() + ".txt").delete(); logger.info("图形识别结果 ====>>> " + strB.toString()); return strB.toString(); } } 此处要注意一下tesseract的命令目录,Windows和Linux的目录不同,尤其分隔符。
- 解析图片程序
package com.zefun.common.utils; import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.Locale; import javax.imageio.IIOImage; import javax.imageio.ImageIO; import javax.imageio.ImageReader; import javax.imageio.ImageWriteParam; import javax.imageio.ImageWriter; import javax.imageio.metadata.IIOMetadata; import javax.imageio.stream.ImageInputStream; import javax.imageio.stream.ImageOutputStream; import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam; /** * ver_image 图片解析器 * @author 高国藩 * @date 2016年12月5日 上午10:31:09 */ public class ImageIOHelper { /** * 图片文件转换为tif格式 * @param imageFile 文件路径 * @param imageFormat 文件扩展名 * @return 路径 */ public static File createImage(File imageFile, String imageFormat) { File tempFile = null; try { Iterator
readers = ImageIO .getImageReadersByFormatName(imageFormat); ImageReader reader = readers.next(); ImageInputStream iis = ImageIO.createImageInputStream(imageFile); reader.setInput(iis); // Read the stream metadata IIOMetadata streamMetadata = reader.getStreamMetadata(); // Set up the writeParam TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam( Locale.CHINESE); tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); // Get tif writer and set output to file Iterator writers = ImageIO .getImageWritersByFormatName("tiff"); ImageWriter writer = writers.next(); BufferedImage bi = reader.read(0); IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0)); tempFile = tempImageFile(imageFile); ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile); writer.setOutput(ios); writer.write(streamMetadata, image, tiffWriteParam); ios.close(); writer.dispose(); reader.dispose(); } catch (IOException e) { e.printStackTrace(); } return tempFile; } /** * 格式化图片 * @author 高国藩 * @date 2016年12月5日 上午10:31:41 * @param imageFile imageFile * @return File */ private static File tempImageFile(File imageFile) { String path = imageFile.getPath(); StringBuffer strB = new StringBuffer(path); strB.insert(path.lastIndexOf('.'), 0); return new File(strB.toString().replaceFirst("(?<=//.)(//w+)$", "tif")); } } 改程序会将图片首先解析为tif类型文件,在其中读取出数据。
-
测试加载
// Resolve the sample image to an absolute file, recognize it as a JPEG, and log the result.
File sampleImage = new File(path4).getAbsoluteFile();
String valCode = new Ocr().recognizeText(sampleImage, "jpg");
logger.info(valCode);
注意在测试中的文件路径问题,Linux和Windows区别很大。
- Maven 包管理
-
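<dependency>
    <groupId>net.java.dev.jna</groupId>
    <artifactId>jna</artifactId>
    <version>4.2.1</version>
</dependency>
<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>2.0.1</version>
    <exclusions>
        <exclusion>
            <groupId>com.sun.jna</groupId>
            <artifactId>jna</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>com.kenai.nbpwr</groupId>
    <artifactId>org-jdesktop-swingx</artifactId>
    <version>1.6-201002261215</version>
</dependency>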
使用Java模拟系统登录
package com.zefun.wechat.controller;
import java.io.File;
import java.io.InputStream;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.servlet.http.HttpServletRequest;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.servlet.ModelAndView;
import com.zefun.common.consts.Url;
import com.zefun.common.utils.HttpClientUtil;
import com.zefun.common.utils.Ocr;
import com.zefun.web.controller.BaseController;
import net.sf.json.JSONObject;
/**
 * Controller that downloads a captcha image, recognizes it with {@link Ocr}, and posts a
 * login form containing the recognized code.
 *
 * @author 高国藩
 * @date 2016-12-05 12:01:32
 */
@Controller
public class ImageUtilsController extends BaseController {
    /** logger */
    private Logger logger = Logger.getLogger(ImageUtilsController.class);

    /**
     * Fetches the captcha, runs OCR on it, and submits the login request with the cookies
     * captured from the captcha response. Any failure is logged; the temporary captcha file
     * is always removed.
     *
     * @author 高国藩
     * @date 2016-12-06 18:49:10
     * @param request request
     * @return always {@code null}; no view is produced by this handler
     */
    @RequestMapping(value = Url.MessagePushMember.VIEW_IMAGE, method = RequestMethod.GET)
    public ModelAndView appointView(HttpServletRequest request) {
        String verPath = "http://vip1.sentree.com.cn/shair/vc";
        String loginAction = "http://vip1.sentree.com.cn/shair/loginAction!ajaxLogin.action";
        File tempImagePathFile = null;
        try {
            BasicCookieStore cookieStore = new BasicCookieStore();
            String valCode;

            // Step 1: download the captcha and keep the cookies issued with it.
            try (CloseableHttpClient httpImageClientStore = HttpClientBuilder.create().build()) {
                HttpResponse imageResponse = httpImageClientStore.execute(new HttpGet(verPath));
                cookieStore = HttpClientUtil.setCookieStore(imageResponse, cookieStore, "vip1.sentree.com.cn");

                // A temp file in the platform temp directory works on both Windows and Linux
                // and cannot collide the way a timestamp-based name on D:/ can.
                tempImagePathFile = File.createTempFile("captcha", ".jpg");
                try (InputStream is = imageResponse.getEntity().getContent()) {
                    FileUtils.copyInputStreamToFile(is, tempImagePathFile);
                }
                valCode = new Ocr().recognizeText(tempImagePathFile, "jpg");
            }

            // Step 2: post the login form using the captured cookies.
            try (CloseableHttpClient httpClientLogin =
                    HttpClients.custom().setDefaultCookieStore(cookieStore).build()) {
                HttpPost httpPost = new HttpPost(loginAction);
                // NOTE(review): left as a raw Map because the parameter type of
                // HttpClientUtil.geneNameValPairs is not visible here — confirm and parameterize.
                Map params = new HashMap<>();
                // NOTE(review): credentials are hard-coded in source; move them to configuration.
                params.put("login", "fs");
                params.put("passwd", "ab82443397");
                params.put("rand", valCode.trim());
                List<NameValuePair> pairs = HttpClientUtil.geneNameValPairs(params);
                httpPost.setEntity(new UrlEncodedFormEntity(pairs, "UTF-8"));
                RequestConfig reqConf = RequestConfig.DEFAULT;
                httpPost.setConfig(reqConf);

                HttpResponse loginResult = httpClientLogin.execute(httpPost);
                String loginCode = EntityUtils.toString(loginResult.getEntity());
                // Constant-first comparison: a response without "code" is simply "not 7".
                if ("7".equals(String.valueOf(JSONObject.fromObject(loginCode).get("code")))) {
                    logger.info("系统侵入成功 ...");
                }
            }
        }
        catch (Exception e) {
            logger.error("Simulated login failed", e);
        }
        finally {
            // deleteQuietly tolerates null and missing files.
            FileUtils.deleteQuietly(tempImagePathFile);
        }
        return null;
    }
}