1、下载tesseract
https://digi.bib.uni-mannheim.de/tesseract/
安装
系统变量中配置path
配置系统变量:TESSDATA_PREFIX
检查配置是否正常
Python3.7安装pytesseract(Tesseract OCR的Python封装)
python37 -m pip install pillow
python37 -m pip install pytesseract
在目录C:\Python37\Lib\site-packages\pytesseract下打开pytesseract.py,大约第33行,修改:tesseract_cmd = r'C:\Python37\Scripts\tesseract.exe'
到这里,Python在Windows上通过pytesseract调用Tesseract OCR的配置已经完成。
https://github.com/tesseract-ocr/tessdata
下载支持中文的字库:chi_sim.traineddata
直接读取当前目录下的图片没有问题,但是读取其他目录下的图片就会报错:
File "C:\Python37\lib\site-packages\PIL\Image.py", line 2944, in open
"cannot identify image file %r" % (filename if filename else fp)
PIL.UnidentifiedImageError: cannot identify image file 'd:\\test_temp\\105506_.jpg'
(后来检测为python脚本问题)
python37 -m pip install --upgrade pillow
通过代理下载模块
pip37 install pillow==7.0.0 --proxy 127.0.0.1:1080
测试:
(1)简单图片测试:
测试图片:
from PIL import Image
import pytesseract
def read_text(text_path):
    """Run OCR on an image file and return the recognized text.

    The image is converted to 8-bit grayscale, binarized with a fixed
    threshold, and passed to Tesseract with the ``chi_sim`` language data.

    :param text_path: path of the image file (e.g. jpg or png) to read
    :return: the string returned by ``pytesseract.image_to_string``
    """
    # Lookup table for binarization: grayscale levels below the threshold
    # map to 0, all other levels map to 1.
    threshold = 140
    table = [0 if level < threshold else 1 for level in range(256)]
    # Image.open leaves the underlying file open; the context manager
    # closes it as soon as the grayscale copy has been produced.
    with Image.open(text_path) as im:
        imgry = im.convert('L')
    out = imgry.point(table, '1')
    # Recognize the text using the simplified-Chinese model and page
    # segmentation mode 6.
    text = pytesseract.image_to_string(out, lang="chi_sim", config='--psm 6')
    return text
def start(image_path="D:\\test_temp\\105018.jpg"):
    """Print the first two lines of text recognized in an image.

    :param image_path: image file to run OCR on; defaults to the sample
        path this script was written against
    """
    res = read_text(image_path)
    # The OCR result may hold fewer than two lines; slicing prints what is
    # available instead of raising IndexError on a missing second line.
    for line in res.split('\n')[:2]:
        print(line)
def main():
    """Script entry point: run the single-image OCR demo."""
    start()


# Only run the demo when this file is executed as a script.
if __name__ == '__main__':
    main()
(2)网页图片OCR测试:
# coding:utf-8
__author__='vulsee.com'
from PIL import Image
import pytesseract
import requests
import sys
import ssl
import urllib3
# Replace the ssl module's default HTTPS context factory with the
# unverified-context factory.
# NOTE(review): this patches private ssl attributes and presumably turns off
# certificate verification for any code relying on the default context --
# confirm this is intended before reusing the script.
ssl._create_default_https_context = ssl._create_unverified_context
# Silence urllib3 warnings.
urllib3.disable_warnings()
# Same call, made through the urllib3 package exposed by requests.
requests.packages.urllib3.disable_warnings()
def read_text(text_path):
    """Run OCR on an image file and return the recognized text, stripped.

    The image is converted to 8-bit grayscale, binarized with a fixed
    threshold, and passed to Tesseract with the ``chi_sim`` language data.

    :param text_path: path of the image file (e.g. jpg or png) to read
    :return: the recognized text with leading/trailing whitespace removed
    """
    # Lookup table for binarization: grayscale levels below the threshold
    # map to 0, all other levels map to 1.
    threshold = 140
    table = [0 if level < threshold else 1 for level in range(256)]
    # Image.open leaves the underlying file open; the context manager
    # closes it as soon as the grayscale copy has been produced.
    with Image.open(text_path) as im:
        imgry = im.convert('L')
    out = imgry.point(table, '1')
    # Recognize the text using the simplified-Chinese model and page
    # segmentation mode 6 (pass lang="eng" instead for English text).
    text = pytesseract.image_to_string(out, lang="chi_sim", config='--psm 6')
    return text.strip()
def saveImg(url, id):
    """Download an image, save it as ``<id>.jpg`` and print its OCR text.

    The recognized text is printed with spaces removed and line breaks
    replaced by ``-``.

    :param url: address the image is fetched from
    :param id: value whose ``str`` form is the base name of the saved file
    :raises requests.HTTPError: if the server answers with an error status
    """
    # A timeout keeps a stalled server from blocking the script forever.
    response = requests.get(url, timeout=30)
    # Fail here on an error status rather than saving an error page as a
    # .jpg that the image loader cannot identify later.
    response.raise_for_status()
    filename = str(id) + '.jpg'
    with open(filename, 'wb') as f:
        f.write(response.content)
    # OCR runs after the with-block so the file is closed and fully written.
    res = read_text(filename)
    res = res.replace(' ', '').replace('\n', '-')
    print(res)
def start(id, parentId):
    """Build the image URL for one record and hand it to ``saveImg``.

    :param id: record id, formatted into the URL with ``%d`` and also used
        by ``saveImg`` to name the saved file
    :param parentId: parent id, inserted into the URL as a string
    """
    query_values = (id, str(parentId))
    target = 'https://****.***/api/api/****t?id=%d&parentId=%s' % query_values
    saveImg(target, id)
def main():
    """Fetch and OCR a run of consecutive records.

    For each offset from 1 to 19 the record id and the parent id string are
    derived from fixed base values, printed, and passed to ``start``.
    """
    # Base values are the same on every iteration, so compute them once.
    base_id = 105506 - 500
    base_parent = 534726 - 500
    upper_bound = 20
    for offset in range(1, upper_bound):
        record_id = base_id + offset
        parent_ref = '159980316' + str(base_parent + offset)
        print(offset, record_id, parent_ref)
        start(record_id, parent_ref)


# Only run the batch when this file is executed as a script.
if __name__ == "__main__":
    main()