357 lines
15 KiB
Diff
357 lines
15 KiB
Diff
commit c4bac00441363fcaeb074682d8226ca523614ea2
|
|
Author: Guillaume Girol <symphorien+git@xlumurb.eu>
|
|
Date: Sat Aug 20 17:48:01 2022 +0200
|
|
|
|
Fix finding tesseract and cuneiform
|
|
|
|
diff --git a/src/pyocr/cuneiform.py b/src/pyocr/cuneiform.py
|
|
index 2e5b717..35647e2 100644
|
|
--- a/src/pyocr/cuneiform.py
|
|
+++ b/src/pyocr/cuneiform.py
|
|
@@ -25,13 +25,9 @@ from . import builders
|
|
from .error import CuneiformError
|
|
|
|
|
|
-# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
|
|
-CUNEIFORM_CMD = 'cuneiform'
|
|
+CUNEIFORM_CMD = '@cuneiform@/bin/cuneiform'
|
|
|
|
-CUNEIFORM_DATA_POSSIBLE_PATHS = [
|
|
- "/usr/local/share/cuneiform",
|
|
- "/usr/share/cuneiform",
|
|
-]
|
|
+CUNEIFORM_DATA_POSSIBLE_PATHS = ['@cuneiform@/share/cuneiform']
|
|
|
|
LANGUAGES_LINE_PREFIX = "Supported languages: "
|
|
LANGUAGES_SPLIT_RE = re.compile("[^a-z]")
|
|
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
|
|
index 2002614..9ebea5c 100644
|
|
--- a/src/pyocr/libtesseract/tesseract_raw.py
|
|
+++ b/src/pyocr/libtesseract/tesseract_raw.py
|
|
@@ -2,7 +2,6 @@ import ctypes
|
|
import locale
|
|
import logging
|
|
import os
|
|
-import sys
|
|
|
|
from ..error import TesseractError
|
|
|
|
@@ -10,51 +9,16 @@ from ..error import TesseractError
|
|
logger = logging.getLogger(__name__)
|
|
|
|
TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None)
|
|
-libnames = []
|
|
+if TESSDATA_PREFIX is None:
|
|
+ TESSDATA_PREFIX = '@tesseract@/share/tessdata'
|
|
+ os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
|
|
+
|
|
+
|
|
# 70 is the minimum credible dpi for tesseract and force it to compute an
|
|
# estimate of the image dpi
|
|
DPI_DEFAULT = 70
|
|
|
|
-
|
|
-if getattr(sys, 'frozen', False): # pragma: no cover
|
|
- # Pyinstaller integration
|
|
- libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")]
|
|
- libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")]
|
|
- tessdata = os.path.join(sys._MEIPASS, "data")
|
|
- if not os.path.exists(os.path.join(tessdata, "tessdata")):
|
|
- logger.warning(
|
|
- "Running from container, but no tessdata ({}) found !".format(
|
|
- tessdata
|
|
- )
|
|
- )
|
|
- else:
|
|
- TESSDATA_PREFIX = os.path.join(tessdata, "tessdata")
|
|
-
|
|
-
|
|
-if sys.platform[:3] == "win": # pragma: no cover
|
|
- libnames += [
|
|
- # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on
|
|
- # Windows ?
|
|
- "../vs2010/DLL_Release/libtesseract302.dll",
|
|
- # prefer the most recent first
|
|
- "libtesseract305.dll",
|
|
- "libtesseract304.dll",
|
|
- "libtesseract303.dll",
|
|
- "libtesseract302.dll",
|
|
- "libtesseract400.dll", # Tesseract 4 is still in alpha stage
|
|
- "libtesseract.dll",
|
|
- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll",
|
|
- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll",
|
|
- ]
|
|
-else:
|
|
- libnames += [
|
|
- "libtesseract.so.5",
|
|
- "libtesseract.so.4",
|
|
- "libtesseract.so.3",
|
|
- "libtesseract.5.dylib",
|
|
- "libtesseract.4.dylib",
|
|
- ]
|
|
-
|
|
+libnames = [ "@tesseract@/lib/libtesseract.so" ]
|
|
|
|
g_libtesseract = None
|
|
|
|
@@ -367,12 +331,12 @@ def init(lang=None):
|
|
try:
|
|
if lang:
|
|
lang = lang.encode("utf-8")
|
|
- prefix = None
|
|
- if TESSDATA_PREFIX: # pragma: no cover
|
|
- prefix = TESSDATA_PREFIX.encode("utf-8")
|
|
+
|
|
+ prefix = TESSDATA_PREFIX
|
|
+
|
|
g_libtesseract.TessBaseAPIInit3(
|
|
ctypes.c_void_p(handle),
|
|
- ctypes.c_char_p(prefix),
|
|
+ ctypes.c_char_p(prefix.encode('utf-8')),
|
|
ctypes.c_char_p(lang)
|
|
)
|
|
g_libtesseract.TessBaseAPISetVariable(
|
|
diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
|
|
index 0fe0d20..c1fdd27 100644
|
|
--- a/src/pyocr/tesseract.py
|
|
+++ b/src/pyocr/tesseract.py
|
|
@@ -28,8 +28,7 @@ from .builders import DigitBuilder # backward compatibility
|
|
from .error import TesseractError # backward compatibility
|
|
from .util import digits_only
|
|
|
|
-# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
|
|
-TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract'
|
|
+TESSERACT_CMD = '@tesseract@/bin/tesseract'
|
|
|
|
TESSDATA_EXTENSION = ".traineddata"
|
|
|
|
diff --git a/tests/tests_cuneiform.py b/tests/tests_cuneiform.py
|
|
index 45b7f6a..95f55c6 100644
|
|
--- a/tests/tests_cuneiform.py
|
|
+++ b/tests/tests_cuneiform.py
|
|
@@ -21,7 +21,7 @@ class TestCuneiform(BaseTest):
|
|
# XXX is it useful?
|
|
which.return_value = True
|
|
self.assertTrue(cuneiform.is_available())
|
|
- which.assert_called_once_with("cuneiform")
|
|
+ which.assert_called_once_with("@cuneiform@/bin/cuneiform")
|
|
|
|
@patch("subprocess.Popen")
|
|
def test_version(self, popen):
|
|
@@ -54,7 +54,7 @@ class TestCuneiform(BaseTest):
|
|
self.assertIn("eng", langs)
|
|
self.assertIn("fra", langs)
|
|
popen.assert_called_once_with(
|
|
- ["cuneiform", "-l"],
|
|
+ ["@cuneiform@/bin/cuneiform", "-l"],
|
|
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
|
|
)
|
|
|
|
@@ -109,7 +109,7 @@ class TestCuneiformTxt(BaseTest):
|
|
output = cuneiform.image_to_string(self.image)
|
|
self.assertEqual(output, self._get_file_content("text").strip())
|
|
popen.assert_called_once_with(
|
|
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
|
|
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT
|
|
)
|
|
@@ -125,7 +125,7 @@ class TestCuneiformTxt(BaseTest):
|
|
builder=self.builder)
|
|
self.assertEqual(output, self._get_file_content("text").strip())
|
|
popen.assert_called_once_with(
|
|
- ["cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
|
|
+ ["@cuneiform@/bin/cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
|
|
"-"],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT
|
|
@@ -142,7 +142,7 @@ class TestCuneiformTxt(BaseTest):
|
|
builder=self.builder)
|
|
self.assertEqual(output, self._get_file_content("text").strip())
|
|
popen.assert_called_once_with(
|
|
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
|
|
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT
|
|
)
|
|
@@ -173,7 +173,7 @@ class TestCuneiformTxt(BaseTest):
|
|
output = cuneiform.image_to_string(image, builder=self.builder)
|
|
self.assertEqual(output, self._get_file_content("text").strip())
|
|
popen.assert_called_once_with(
|
|
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
|
|
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT
|
|
)
|
|
@@ -227,7 +227,7 @@ class TestCuneiformWordBox(BaseTest):
|
|
output = cuneiform.image_to_string(self.image,
|
|
builder=self.builder)
|
|
popen.assert_called_once_with(
|
|
- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
|
|
+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT
|
|
)
|
|
@@ -280,7 +280,7 @@ class TestCuneiformLineBox(BaseTest):
|
|
output = cuneiform.image_to_string(self.image,
|
|
builder=self.builder)
|
|
popen.assert_called_once_with(
|
|
- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
|
|
+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT
|
|
)
|
|
diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
|
|
index a5d46d8..8b9e315 100644
|
|
--- a/tests/tests_libtesseract.py
|
|
+++ b/tests/tests_libtesseract.py
|
|
@@ -165,7 +165,8 @@ class TestLibTesseractRaw(BaseTest):
|
|
args = libtess.TessBaseAPIInit3.call_args[0]
|
|
self.assertEqual(len(args), 3)
|
|
self.assertEqual(args[0].value, self.handle)
|
|
- self.assertEqual(args[1].value, None)
|
|
+ # we hardcode tesseract data, so we don't get None
|
|
+ #self.assertEqual(args[1].value, None)
|
|
self.assertEqual(args[2].value, lang.encode() if lang else None)
|
|
|
|
self.assertEqual(
|
|
@@ -201,7 +202,8 @@ class TestLibTesseractRaw(BaseTest):
|
|
args = libtess.TessBaseAPIInit3.call_args[0]
|
|
self.assertEqual(len(args), 3)
|
|
self.assertEqual(args[0].value, self.handle)
|
|
- self.assertEqual(args[1].value, None)
|
|
+ # we hardcode tesseract data, so we don't get None
|
|
+ #self.assertEqual(args[1].value, None)
|
|
self.assertEqual(args[2].value, lang.encode() if lang else None)
|
|
|
|
self.assertEqual(
|
|
diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
|
|
index 18d01ef..593cf94 100644
|
|
--- a/tests/tests_tesseract.py
|
|
+++ b/tests/tests_tesseract.py
|
|
@@ -36,7 +36,7 @@ class TestTesseract(BaseTest):
|
|
def test_available(self, which):
|
|
which.return_value = True
|
|
self.assertTrue(tesseract.is_available())
|
|
- which.assert_called_once_with("tesseract")
|
|
+ which.assert_called_once_with("@tesseract@/bin/tesseract")
|
|
|
|
@patch("subprocess.Popen")
|
|
def test_version_error(self, popen):
|
|
@@ -162,7 +162,7 @@ class TestTesseract(BaseTest):
|
|
for lang in ("eng", "fra", "jpn", "osd"):
|
|
self.assertIn(lang, langs)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "--list-langs"],
|
|
+ ["@tesseract@/bin/tesseract", "--list-langs"],
|
|
startupinfo=None, creationflags=0,
|
|
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
|
|
)
|
|
@@ -177,7 +177,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(te.exception.status, 1)
|
|
self.assertEqual("unable to get languages", te.exception.message)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "--list-langs"],
|
|
+ ["@tesseract@/bin/tesseract", "--list-langs"],
|
|
startupinfo=None, creationflags=0,
|
|
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
|
|
)
|
|
@@ -254,7 +254,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(status, 0)
|
|
self.assertEqual(error, message)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "output"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "output"],
|
|
cwd=tmpdir,
|
|
startupinfo=None,
|
|
creationflags=0,
|
|
@@ -277,7 +277,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(status, 0)
|
|
self.assertEqual(error, message)
|
|
popen.assert_called_with(
|
|
- ["tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
|
|
+ ["@tesseract@/bin/tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
|
|
cwd=tmpdir,
|
|
startupinfo=None,
|
|
creationflags=0,
|
|
@@ -308,7 +308,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(result["angle"], 90)
|
|
self.assertEqual(result["confidence"], 9.30)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|
|
@@ -344,7 +344,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(result["angle"], 90)
|
|
self.assertEqual(result["confidence"], 9.30)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|
|
@@ -377,7 +377,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(result["angle"], 90)
|
|
self.assertEqual(result["confidence"], 9.30)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout",
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout",
|
|
"--psm", "0", "-l", "osd"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
@@ -405,7 +405,7 @@ class TestTesseract(BaseTest):
|
|
with self.assertRaises(tesseract.TesseractError) as te:
|
|
tesseract.detect_orientation(self.image)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|
|
@@ -439,7 +439,7 @@ class TestTesseract(BaseTest):
|
|
with self.assertRaises(tesseract.TesseractError) as te:
|
|
tesseract.detect_orientation(self.image)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|
|
@@ -473,7 +473,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(result["angle"], 90)
|
|
self.assertEqual(result["confidence"], 9.30)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|
|
@@ -506,7 +506,7 @@ class TestTesseract(BaseTest):
|
|
self.assertEqual(result["angle"], 90)
|
|
self.assertEqual(result["confidence"], 9.30)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|
|
@@ -533,7 +533,7 @@ class TestTesseract(BaseTest):
|
|
with self.assertRaises(tesseract.TesseractError) as te:
|
|
tesseract.detect_orientation(self.image)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|
|
@@ -567,7 +567,7 @@ class TestTesseract(BaseTest):
|
|
with self.assertRaises(tesseract.TesseractError) as te:
|
|
tesseract.detect_orientation(self.image)
|
|
popen.assert_called_once_with(
|
|
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
|
|
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
|
|
stdin=subprocess.PIPE,
|
|
shell=False,
|
|
startupinfo=None,
|