diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..9b1d7da
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,28 @@
+[metadata]
+name = pytessy
+author = hyperrixel
+url = https://github.com/hyperrixel/pytessy
+project_urls =
+    Bug Tracker = https://github.com/hyperrixel/pytessy/issues
+    Documentation = https://pytessy.readthedocs.io/
+    Source Code = https://github.com/hyperrixel/pytessy
+
+
+license = Boost Software License 1.0
+long_description = file: README.md
+long_description_content_type = text/markdown; charset=UTF-8
+platform = any
+license_files =
+    LICENSE
+
+[options]
+name = pytessy
+python_requires = >=3.8
+packages = pytessy
+package_dir =
+    pytessy=source
+
+setup_requires =
+    setuptools >=38.3.0
+    pip >= 20.0
+    setuptools_scm
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..d5d43d7
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup
+
+setup(use_scm_version=True)
diff --git a/source/pytessy.py b/source/pytessy.py
index 6137fc4..927f330 100644
--- a/source/pytessy.py
+++ b/source/pytessy.py
@@ -33,7 +33,7 @@
 import __main__
 import ctypes
 import ctypes.util
-from os import chdir, environ
+from os import chdir, environ, getcwd
 from os.path import abspath, dirname, isabs, isdir, isfile, join
 from sys import platform
 
@@ -102,6 +102,8 @@ def get_text(self):
         result = self._lib.TessBaseAPIGetUTF8Text(self._api)
         if result:
             return result.decode('utf-8')
+        else:
+            return ""
 
 
 
@@ -137,6 +139,17 @@ def set_image(self, imagedata, width, height, bytes_per_pixel, bytes_per_line,
                                       imagedata, width, height,
                                       bytes_per_pixel, bytes_per_line)
         self._lib.TessBaseAPISetSourceResolution(self._api, resolution)
+
+
+    def set_variable(self, key, val):
+        """
+        Sets a Tesseract variable and returns whether it was accepted
+        -------------------------------------------------------------
+        @Params: key    (bytes)     Name of the variable to set.
+                 val    (bytes)     New value of the variable.
+        """
+        self._check_setup()
+        return bool(self._lib.TessBaseAPISetVariable(self._api, key, val))
 
 
 
@@ -172,7 +185,11 @@ def setup_lib(cls, lib_path=None):
                                             ctypes.c_int,   # height
                                             ctypes.c_int,   # bytes_per_pixel
                                             ctypes.c_int)   # bytes_per_line
-
+
+        lib.TessBaseAPISetVariable.argtypes = (cls.TessBaseAPI,
+                                               ctypes.c_char_p,
+                                               ctypes.c_char_p)
+
         lib.TessBaseAPIGetUTF8Text.restype = ctypes.c_char_p        # text
         lib.TessBaseAPIGetUTF8Text.argtypes = (cls.TessBaseAPI, )   # handle
 
@@ -227,7 +244,8 @@ class PyTessy(object):
 
 
     def __init__(self, tesseract_path=None, api_version=None, lib_path=None,
-                 data_path=None, language='eng', verbose_search=False):
+                 data_path=None, language='eng', verbose_search=False,
+                 oem=1, psm=7, char_whitelist=None):
         """
         Initializes PyTessy instance
         ----------------------------
@@ -258,7 +276,6 @@ def __init__(self, tesseract_path=None, api_version=None, lib_path=None,
                                             search process.
                  FileNotFoundError          If cannot found "tessdata" directory.
         """
 
-        run_path = dirname(abspath(__main__.__file__))
         no_lib = True
         if lib_path is not None:
@@ -317,10 +334,22 @@ def __init__(self, tesseract_path=None, api_version=None, lib_path=None,
                     break
         if data_path is None:
             raise FileNotFoundError('PyTessy: Couldn\'t find "tessdata" directory.')
-        chdir(tess_path)
         self._tess = TesseractHandler(lib_path=lib_path, data_path=data_path,
                                       language=language)
-        chdir(run_path)
+        # Tesseract variables travel as C strings, so send the decimal text of
+        # each mode; bytes(n) would build n NUL bytes, i.e. an empty C string.
+        self._tess.set_variable(b"tessedit_pageseg_mode",
+                                str(psm).encode('ascii'))
+        # NOTE(review): confirm the engine mode still takes effect when it is
+        # set after the API has been initialised.
+        self._tess.set_variable(b"tessedit_ocr_engine_mode",
+                                str(oem).encode('ascii'))
+        if char_whitelist:
+            # argtypes declare c_char_p, which rejects str: encode text first.
+            if isinstance(char_whitelist, str):
+                char_whitelist = char_whitelist.encode('utf-8')
+            self._tess.set_variable(b"tessedit_char_whitelist", char_whitelist)
+
 
 
 