yum install -y autoconf automake libtool libjpeg libpng libtiff zlib libjpeg-devel libpng-devel libtiff-devel zlib-devel
下载:wget http://www.leptonica.org/source/leptonica-1.76.0.tar.gz
解压:tar -zxvf leptonica-1.76.0.tar.gz
安装:
cd leptonica-1.76.0
./configure
make && make install
下载:wget -O tesseract-4.0.0-beta.3.tar.gz https://github.com/tesseract-ocr/tesseract/archive/4.0.0-beta.3.tar.gz
解压:tar -zxvf tesseract-4.0.0-beta.3.tar.gz
安装:cd tesseract-4.0.0-beta.3 && ./autogen.sh
提示错误信息"Missing autoconf-archive. Check the build requirements"
解决办法:yum install autoconf-archive
./configure
提示错误信息"error: Leptonica 1.74 or higher is required. Try to install libleptonica-dev package"
参考csdn地址:https://blog.csdn.net/xjmxym/article/details/79040514
按照上述文档操作之后,执行:
./configure --with-extra-includes=/usr/local/include --with-extra-libraries=/usr/local/lib && make && make install
切换Tesseract-OCR 指令安装目录:/usr/local/bin/tesseract --list-langs
github下载全套tessdata_fast并上传至/usr/local/share/文件夹下,将tessdata_fast改名为tessdata,(建议下载需要的语言包:eng.traineddata、chi_sim.traineddata)
执行如下指令:/usr/local/bin/tesseract 识别图像路径 识别结果输出地址 -l chi_sim
Demo: /usr/local/bin/tesseract /ftp/pub/0002-0001.jpg /ftp/pub/1 -l chi_sim
ocr.py
import io
import tesserocr
from PIL import Image
import base64
# Convert a base64-encoded image to text
def base64_to_text(base64_text):
    """Run OCR on a base64-encoded image and return the recognised text.

    Args:
        base64_text: Base64 string (or bytes) holding an encoded image file.

    Returns:
        The text tesserocr reports for the cleaned image, with every
        newline character removed.
    """
    raw_bytes = base64.b64decode(base64_text)  # base64 decode
    # clear_noise reads three colour channels from every pixel, so
    # normalise grayscale / palette / RGBA inputs to RGB before cleaning.
    image = Image.open(io.BytesIO(raw_bytes)).convert('RGB')
    clear_noise_img = clear_noise(image)
    result = tesserocr.image_to_text(clear_noise_img)
    return result.replace('\n', '')
# Remove noise: blank the border, the gray background and dark pixels
def clear_noise(imgBinImg):
    """Clean an image in place before OCR and return it.

    Every pixel is classified from its original colour, first match wins:

    * border pixel                          -> white
    * gray (each channel within 85 of 200)  -> white
    * blue (r < 100, g < 100, b > 200)      -> pure blue (0, 0, 255)
    * dark (r, g and b all below 80)        -> white
    * anything else                         -> left unchanged

    Args:
        imgBinImg: Image-like object exposing ``size``, ``getpixel`` and
            ``putpixel``. Pixels must be sequences with at least three
            channels (assumed to be R, G, B -- e.g. a PIL image in RGB mode).

    Returns:
        The same ``imgBinImg`` object, modified in place.
    """
    w, h = imgBinImg.size  # image width and height
    border = 2  # border thickness
    white = (255, 255, 255)
    for x in range(w):
        for y in range(h):
            # Read the pixel once; only the first three channels are used.
            r, g, b = imgBinImg.getpixel((x, y))[:3]
            # Strip the border. The `<=` on the left/top edge clears
            # border + 1 pixels while the right/bottom edge clears `border`
            # pixels; that asymmetry is kept as it was.
            if x <= border or x >= w - border or y <= border or y >= h - border:
                # Border pixels always end up white: the colour rules below
                # must not repaint them.
                imgBinImg.putpixel((x, y), white)
            # Strip the gray background: all three channels close to 200.
            elif abs(r - 200) < 85 and abs(g - 200) < 85 and abs(b - 200) < 85:
                imgBinImg.putpixel((x, y), white)
            # Normalise blue pixels to pure blue.
            elif r < 100 and g < 100 and b > 200:
                imgBinImg.putpixel((x, y), (0, 0, 255))
            # Dark pixels are blanked to white.
            elif r < 80 and g < 80 and b < 80:
                imgBinImg.putpixel((x, y), white)
    return imgBinImg
test.py
from ocr import base64_to_text
base64_text = '/9j/4AAQSkZJRgABAgAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAA8AIwDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDt7W1ga1hLQRklFJJQc8VyXxN0u9bwo99o80trPZN50gt2KF4/4s49Ov4Guzs/+POD/rmv8qnaNZY2jdQyMCrKRkEHqKiMY8q0IjGPKtDm/BGrWfijwta6gYIPtAHlXChBxIvX8+D9DXSCytf+faH/AL9ivIfCrt8P/ifd+G52I03UiDbMx4yc+Wf5ofU4r2YCnyx7D5Y9iEWNp/z6w/8AfsU8WFp/z6wf9+xUF1q+nWMqxXV7BFI3RHcAn8Pwq7FIk0ayRurowyGU5BqnSsruOgcsexGLCz/59IP+/Yp40+y/59IP+/Y/wqccDJpwx61PLHsHLHsQjTrL/nzt/wDv0v8AhTxp1j/z52//AH6X/CpwKeKOWPYOWPYrjTbH/nyt/wDv0v8AhTxplh/z5W3/AH6X/CrAFPAo5Y9g5Y9isNLsP+fG2/79L/hTxpen/wDPjbf9+V/wqyBTxRyx7Byx7FYaVp3/AD4Wv/flf8KcNJ07/nwtf+/K/wCFWgKeKOWPYOWPYqjSdN/6B9r/AN+V/wAK4zxfbQWurRJbwxxIYASsahRnc3PFegCuE8b/APIah/691/8AQmrnxUUqeiMMTFKnoiaz/wCPOD/rmv8AKrIFV7P/AI8oP+ua/wAqbqepW2j6Xc6jePst7dC7n+g9z0Hua6I/CjePwo5X4jeCZvFlja3GnSJDqdkxaJmO3euM7cjocgYPbn1ri7z4pa1fafBpEFu9prCEQ3TY+ZnBwdo7ZP5Vs6L8ZrKe3B13TbmxWQkR3EKmSJh/PI9s/hWG/ga81fzvFvhrWLW9nSQzxJGrBnIP3SGAwfY9a9HL6lGjU9tVSly/Zd9f+G310CabVkV9W+Hut2WljVL65jaZ2GYtxLk4z19cA/lXTfC7X7pNNu7ecvJFGv7osc4I6gDPQA5PtWV4m+KFvrfhuGwS0ltdULf6RHIMeUwBB2565yfpXUfDTwzZroLzTTRXTSO3CNkJlQCPyr6PFY2piMoc8Uldy92y2t+XzMYxSqe6cxqfjjWtS1WW206R/LhJYByMZB+9784AGPeszUPFHirTXX7ZvRSxxuyQeckZBz9eeenTivULb4f2Ok6o1/ZSYiLB3glPy5HQ5/E/nWL8Tr/TjbxaaWiWRkwrDnHzY/hB6cnHf
ipwmNwVTEU6FGgpR6u2vmwlGSTbZ03w+8RyeIfDxubkgSxsQ5J/Xkk/57VuWXiDTryRI0mCyOxQKxHXtyCRzg457etcf4P02XQfBNxN5bMj2+/Zn5nZhnIGSOhHYk+w4ryjTr/Xv7Zu4tPkL3BZmbcBlRnnr0PA/LjpXBDKoY3EV3Rkoxi9L7WuXz8qVz6eFPArwFPGfi/wpq6pr32jac/I+MNgYyMcHt/k1r6p8UNa15V07wzZSGdxhpI0LHG3nHp35qJcO4tSXK04vXmT937w9tE9pDru27hn0zThJH5hTeu8dVzyK+YBrHijQPEkEmoXV1HcLMC6yvnPOSD/AN9V0vxev5nv9C1C3d4GmtS+5GIIYNjOfpWsuHZrEUqPtE1UTaa1WiuL2ys3bY+gBTgK+frX4u+JI9HtbexsjcSQRATXMytIzN6k1LpHxv1uPUok1W1tpLdnCybEKMo9RzUPhnMEm0k7eauP20D6AArgvHH/ACGof+vdf/QmruraZLm2iuIzmOVA6n1BGRXDeOP+Q1D/ANe6/wDoTV8ti/4ZGK/hk9l/x5Qf9c1/lXP+O/Cd14v0aKwttR+yKkokdWTKyD0OOeOT9fzrobL/AI8rf/rmv8qtCt4/CjaPwopafo1jp2jQaTDbo1nDGIxG6hgw9T6knk+5rF1D4faPO5udK83RL/GFudObyvwZBhSP85rqhXj3xJvfEeja8ZIL+5j0+XDQ7DgA4GRx716OXYB46t7GMlF26hOXKrm5c/B3QTZzyS3V9cXTuHe4nmBIG4FyMADpmvP/AAVqviHw5rktlp0ltIpP7yO8JCYAPO4dOSOfp2rpbv4v3M+kta22nBbmQMm8tkAHpgd+Ko/D3wnqWoayl/dQSx2wxuZuNw7j3BHBHcE+1fQ0ctqYfB1nmD5e2ur+7ddk/Uxc05LkO7vfFWqNp1zaa34R1KOKSMo82mSpdBQR97AII9eRXkdyNOuLy4vpNZWC8ldpUilieLDHDYyRgDlxweu3sTXTeIhqHhLxiJPtMhtJJN6h3cLjOdh56ANtyP4SD1rr/EfjrSJvCtwkgWado9sYDZIYjg8Dr354IrmwtDG4Jwng3zRqeX5/qVJxlfm6Gb8M/FUOo6i+mSTKX8qSOL5icoCCozvwcZIHykgcbvXkvA8zxeNpYRE7O0u5ljHJ2PuKjjABIGSf4Qw6kVe+GPgSx1+8ubvWLBJrRUICMHTLHvkAD9c+1Y+oeCLjTvG91DBDqFrpsV2ds8GWMUZOVOSegyOT+tdkrUsRiaC5dYrrbVa6X3eu1yd0mel/F208/RoZGSFGwFWZmyccs4wOnKxjP+0aj+CccEWlXhCZuHdQzJyAoXcMnsfmI/4DUHjDwn4lt/C8r2/i24vLQRbRbXNom4g4GAy4xkcc/TvXLfCvxBq2kXFxHBobaoiKxYWkyiVFJXcQhI38hMenPrXmUsRKeUTo2dk16f8AA/Itq1RMsfGy28rxXbTopVHthuA5G/c3PsSAv5UnxCVr3wX4UviMeVYxxk+rkYcfUFVP/Aqb8VvE2ma6bPyILzT7tMm4t7+3aF3PRevHy/N3/iqTUY31T4JWUzcy2d35k0vJyDlFH5YB91HrXsYHFRlQwcuZXjJxffW6/EzlHWR6T8IEt28AWe2NPODOJTjr8xx+mBXmXxs0m303xjbz20KxJd2odwowC4ZgT+W2u1+BF553hnULVjl4bkEf7rKMD8w351R+P9jus9EvwPuSSwsf94KR/wCgtXPgZyw/EEoSfxOS++7X6DlrSPS/BFyLzwNoc4OSbGJWPuFAP6g1z/jn/kNw/wDXsv8A6E1O+Dd79s+G9ihOWtpJYW/76LD9GFJ46/5DcP8A17L/AOhNXyOc0/ZVqtPtJr8Sa7vRuWbL/jxt/wDrmv8AKrIrmItbuYYkjVIiEUKMg9vxqT/hILv/AJ5w/wDfJ/xrmjWjZGkasbI6YVDeafaalbNb3tvHPCw5V1yKwP8Ah
Irv/nnB/wB8n/Gl/wCEkvP+eUH/AHyf8auOIUXdXTH7WJoWfhHQLGbzrfSrZZOoYpkjnPGelbiIqKFVQo9AMVyn/CS3n/PK3/75P+NL/wAJPej/AJZW/wD3y3+NVUxTqO822/MSqxWyNzWtBsdf097S9iDK2Cr45RhnBH5n8CR3riYPg9pyXyTPOTDj5ox2PPTIIwfQjI7HvW5/wlN8P+WVv/3y3+NL/wAJXff88rb/AL5b/GuihmuIw8HClNpPoJzg9WjptM0y10qzjtbSJI40GMIgUfkOO5/OpLvS7DUWja9sre4aM5RpYwxQ+xPSuW/4Sy//AOeNt/3y3+NL/wAJdfj/AJY23/fLf/FVzfWfe5ru5XtYnVyadbXGmNp00Ye1ePymjxgFMYxxjjHHFYegeANH8Pag15ZCRX8xnVdx2jI2gEd8At/30ao/8JhqA/5Y2v8A3y3/AMVS/wDCY6gP+WNr/wB8t/8AFVUcZOMXCLaT3XcXtY9js7qxtL+AwXtrDcwnrHNGHU/geK4+8+FejYlk0K4u9DmlzvFo+6CT2eFsqw9uKb/wmeo/88bX/vlv/iqX/hNdSH/LC0/74b/4qs/bIftombpOneJPAF1c3H/COWesWs+POuNHPkTEDOCbdjtJ5PCYrdi8SeCfHXlaZfNEbmGUSDTtRQwyrIAQPkbG44Y8DI5qt/wm2pD/AJYWn/fDf/FVQ1HW01d4X1HR9KungcPE80BZkIOQQc5HNafW5OfPd379fvF7WO1j0u0tLaygWC0t4reFfuxxIEUfQDiuF8d/8huD/r2X/wBCaj/hOtTH/LCz/wC+G/8Aiqx9W1afWbpbi4SNXVAgEYIGASe5PrXJiaqnAxxFRShZH//Z'
# Run OCR on the sample image above and print the recognised text
ocr_text = base64_to_text(base64_text)
print(ocr_text)
返回结果
33mn