k2pdfopt: Fix build and clean up

This commit is contained in:
Daniel Fullmer 2019-09-13 15:53:03 -04:00 committed by Matthieu Coudron
parent 9128fe19cc
commit 740d4c22ec
5 changed files with 1991 additions and 158 deletions

View file

@ -36,67 +36,19 @@ stdenv.mkDerivation rec {
buildInputs =
let
# The patches below were constructed by taking the files from k2pdfopt in
# the {mupdf,leptonica,tesseract}_mod/ directories, replacing the
# corresponding files in the respective source trees, resolving any errors
# with more recent versions of these dependencies, and running diff.
mupdf_modded = mupdf.overrideAttrs (attrs: {
# Excluded the pdf-*.c files, since they mostly just broke the #includes
prePatch = ''
cp ${src}/mupdf_mod/{font,stext-device,string}.c source/fitz/
cp ${src}/mupdf_mod/font-win32.c source/pdf/
'';
patches = attrs.patches ++ [ ./mupdf.patch ]; # Last verified with mupdf 1.14.0
});
leptonica_modded = leptonica.overrideAttrs (attrs: {
name = "leptonica-1.74.4";
# Modified source files apply to this particular version of leptonica
version = "1.74.4";
src = fetchurl {
url = "http://www.leptonica.org/source/leptonica-1.74.4.tar.gz";
sha256 = "0fw39amgyv8v6nc7x8a4c7i37dm04i6c5zn62d24bgqnlhk59hr9";
};
prePatch = ''
cp ${src}/leptonica_mod/{allheaders.h,dewarp2.c,leptwin.c} src/
'';
patches = [
# stripped down copy of upstream commit b88c821f8d347bce0aea86d606c710303919f3d2
./leptonica-CVE-2018-3836.patch
(fetchpatch {
# CVE-2018-7186
url = "https://github.com/DanBloomberg/leptonica/commit/"
+ "ee301cb2029db8a6289c5295daa42bba7715e99a.patch";
sha256 = "0cgb7mvz2px1rg5i80wk1wxxjvzjga617d8q6j7qygkp7jm6495d";
})
(fetchpatch {
# CVE-2018-7247
url = "https://github.com/DanBloomberg/leptonica/commit/"
+ "c1079bb8e77cdd426759e466729917ca37a3ed9f.patch";
sha256 = "1z4iac5gwqggh7aa8cvyp6nl9fwd1v7wif26caxc9y5qr3jj34qf";
})
(fetchpatch {
# CVE-2018-7440
url = "https://github.com/DanBloomberg/leptonica/commit/"
+ "49ecb6c2dfd6ed5078c62f4a8eeff03e3beced3b.patch";
sha256 = "1hjmva98iaw9xj7prg7aimykyayikcwnk4hk0380007hqb35lqmy";
})
];
patches = [ ./leptonica.patch ]; # Last verified with leptonica 1.78.0
});
tesseract_modded = tesseract4.override {
tesseractBase = tesseract4.tesseractBase.overrideAttrs (_: {
prePatch = ''
cp ${src}/tesseract_mod/baseapi.{h,cpp} src/api/
cp ${src}/tesseract_mod/ccutil.{h,cpp} src/ccutil/
cp ${src}/tesseract_mod/genericvector.h src/ccutil/
cp ${src}/tesseract_mod/input.cpp src/lstm/
cp ${src}/tesseract_mod/lstmrecognizer.cpp src/lstm/
cp ${src}/tesseract_mod/mainblk.cpp src/ccutil/
cp ${src}/tesseract_mod/params.cpp src/ccutil/
cp ${src}/tesseract_mod/serialis.{h,cpp} src/ccutil/
cp ${src}/tesseract_mod/tesscapi.cpp src/api/
cp ${src}/tesseract_mod/tessdatamanager.cpp src/ccstruct/
cp ${src}/tesseract_mod/tessedit.cpp src/ccmain/
cp ${src}/include_mod/{tesseract.h,leptonica.h} src/api/
'';
patches = [ ./tesseract.patch ];
patches = [ ./tesseract.patch ]; # Last verified with tesseract 1.4
});
};
in

View file

@ -1,95 +0,0 @@
--- a/src/allheaders.h
+++ b/src/allheaders.h
@@ -2600,6 +2600,7 @@
LEPT_DLL extern char * stringReverse ( const char *src );
LEPT_DLL extern char * strtokSafe ( char *cstr, const char *seps, char **psaveptr );
LEPT_DLL extern l_int32 stringSplitOnToken ( char *cstr, const char *seps, char **phead, char **ptail );
+LEPT_DLL extern l_int32 stringCheckForChars ( const char *src, const char *chars, l_int32 *pfound );
LEPT_DLL extern char * stringRemoveChars ( const char *src, const char *remchars );
LEPT_DLL extern l_int32 stringFindSubstr ( const char *src, const char *sub, l_int32 *ploc );
LEPT_DLL extern char * stringReplaceSubstr ( const char *src, const char *sub1, const char *sub2, l_int32 *pfound, l_int32 *ploc );
--- a/src/gplot.c
+++ b/src/gplot.c
@@ -141,9 +141,10 @@
const char *xlabel,
const char *ylabel)
{
-char *newroot;
-char buf[L_BUF_SIZE];
-GPLOT *gplot;
+char *newroot;
+char buf[L_BUF_SIZE];
+l_int32 badchar;
+GPLOT *gplot;
PROCNAME("gplotCreate");
@@ -152,6 +153,9 @@
if (outformat != GPLOT_PNG && outformat != GPLOT_PS &&
outformat != GPLOT_EPS && outformat != GPLOT_LATEX)
return (GPLOT *)ERROR_PTR("outformat invalid", procName, NULL);
+ stringCheckForChars(rootname, "`;&|><\"?*", &badchar);
+ if (badchar) /* danger of command injection */
+ return (GPLOT *)ERROR_PTR("invalid rootname", procName, NULL);
if ((gplot = (GPLOT *)LEPT_CALLOC(1, sizeof(GPLOT))) == NULL)
return (GPLOT *)ERROR_PTR("gplot not made", procName, NULL);
--- a/src/utils2.c
+++ b/src/utils2.c
@@ -42,6 +42,7 @@
* l_int32 stringSplitOnToken()
*
* Find and replace string and array procs
+ * l_int32 stringCheckForChars()
* char *stringRemoveChars()
* l_int32 stringFindSubstr()
* char *stringReplaceSubstr()
@@ -701,6 +702,48 @@
/*--------------------------------------------------------------------*
* Find and replace procs *
*--------------------------------------------------------------------*/
+/*!
+ * \brief stringCheckForChars()
+ *
+ * \param[in] src input string; can be of zero length
+ * \param[in] chars string of chars to be searched for in %src
+ * \param[out] pfound 1 if any characters are found; 0 otherwise
+ * \return 0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ * (1) This can be used to sanitize an operation by checking for
+ * special characters that don't belong in a string.
+ * </pre>
+ */
+l_int32
+stringCheckForChars(const char *src,
+ const char *chars,
+ l_int32 *pfound)
+{
+char ch;
+l_int32 i, n;
+
+ PROCNAME("stringCheckForChars");
+
+ if (!pfound)
+ return ERROR_INT("&found not defined", procName, 1);
+ *pfound = FALSE;
+ if (!src || !chars)
+ return ERROR_INT("src and chars not both defined", procName, 1);
+
+ n = strlen(src);
+ for (i = 0; i < n; i++) {
+ ch = src[i];
+ if (strchr(chars, ch)) {
+ *pfound = TRUE;
+ break;
+ }
+ }
+ return 0;
+}
+
+
/*!
* \brief stringRemoveChars()
*

View file

@ -0,0 +1,254 @@
From 8c11a20925686855023df90ed477957c7d7fe91e Mon Sep 17 00:00:00 2001
From: Daniel Fullmer <danielrf12@gmail.com>
Date: Fri, 13 Sep 2019 15:54:21 -0400
Subject: [PATCH] Willus mod for k2pdfopt
---
src/allheaders.h | 4 ++
src/dewarp2.c | 106 ++++++++++++++++++++++++++++++++++++++++++-----
src/leptwin.c | 6 ++-
3 files changed, 104 insertions(+), 12 deletions(-)
diff --git a/src/allheaders.h b/src/allheaders.h
index e68eff1..b3cc729 100644
--- a/src/allheaders.h
+++ b/src/allheaders.h
@@ -669,6 +669,10 @@ LEPT_DLL extern L_DEWARPA * dewarpaReadMem ( const l_uint8 *data, size_t size );
LEPT_DLL extern l_ok dewarpaWrite ( const char *filename, L_DEWARPA *dewa );
LEPT_DLL extern l_ok dewarpaWriteStream ( FILE *fp, L_DEWARPA *dewa );
LEPT_DLL extern l_ok dewarpaWriteMem ( l_uint8 **pdata, size_t *psize, L_DEWARPA *dewa );
+/* WILLUS MOD */
+ LEPT_DLL extern l_int32 dewarpBuildPageModel_ex ( L_DEWARP *dew, const char *debugfile,l_int32 fit_order );
+ LEPT_DLL extern l_int32 dewarpFindVertDisparity_ex ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag,l_int32 fit_order );
+ LEPT_DLL extern l_int32 dewarpBuildLineModel_ex ( L_DEWARP *dew, l_int32 opensize, const char *debugfile,l_int32 fit_order );
LEPT_DLL extern l_ok dewarpBuildPageModel ( L_DEWARP *dew, const char *debugfile );
LEPT_DLL extern l_ok dewarpFindVertDisparity ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag );
LEPT_DLL extern l_ok dewarpFindHorizDisparity ( L_DEWARP *dew, PTAA *ptaa );
diff --git a/src/dewarp2.c b/src/dewarp2.c
index 220eec1..2e29500 100644
--- a/src/dewarp2.c
+++ b/src/dewarp2.c
@@ -144,9 +144,17 @@ static const l_float32 L_ALLOWED_W_FRACT = 0.05; /* no bigger */
* longest textlines.
* </pre>
*/
+/* WILLUS MOD */
l_ok
-dewarpBuildPageModel(L_DEWARP *dew,
- const char *debugfile)
+dewarpBuildPageModel(L_DEWARP *dew,const char *debugfile)
+{
+return(dewarpBuildPageModel_ex(dew,debugfile,2));
+}
+
+l_ok
+dewarpBuildPageModel_ex(L_DEWARP *dew,
+ const char *debugfile,
+ l_int32 fit_order)
{
l_int32 linecount, topline, botline, ret;
PIX *pixs, *pix1, *pix2, *pix3;
@@ -225,7 +233,7 @@ PTAA *ptaa1, *ptaa2;
/* Get the sampled vertical disparity from the textline centers.
* The disparity array will push pixels vertically so that each
* textline is flat and centered at the y-position of the mid-point. */
- if (dewarpFindVertDisparity(dew, ptaa2, 0) != 0) {
+ if (dewarpFindVertDisparity_ex(dew, ptaa2, 0, fit_order) != 0) {
L_WARNING("vertical disparity not built\n", procName);
ptaaDestroy(&ptaa2);
return 1;
@@ -290,13 +298,24 @@ PTAA *ptaa1, *ptaa2;
* a pdf. Non-pix debug output goes to /tmp.
* </pre>
*/
+/* WILLUS MOD */
l_ok
dewarpFindVertDisparity(L_DEWARP *dew,
PTAA *ptaa,
l_int32 rotflag)
{
+return(dewarpFindVertDisparity_ex(dew,ptaa,rotflag,2));
+}
+/* WILLUS MOD -- add cubic and quartic fits and ..._ex functions */
+l_int32
+dewarpFindVertDisparity_ex(L_DEWARP *dew,
+ PTAA *ptaa,
+ l_int32 rotflag,
+ l_int32 fit_order)
+{
l_int32 i, j, nlines, npts, nx, ny, sampling;
-l_float32 c0, c1, c2, x, y, midy, val, medval, meddev, minval, maxval;
+/* WILLUS MOD */
+l_float32 c0, c1, c2, c3, c4, x, y, midy, val, medval, meddev, minval, maxval;
l_float32 *famidys;
NUMA *nax, *nafit, *nacurve0, *nacurve1, *nacurves;
NUMA *namidy, *namidys, *namidysi;
@@ -304,11 +323,22 @@ PIX *pix1, *pix2, *pixcirc, *pixdb;
PTA *pta, *ptad, *ptacirc;
PTAA *ptaa0, *ptaa1, *ptaa2, *ptaa3, *ptaa4, *ptaa5, *ptaat;
FPIX *fpix;
+/* WILLUS MOD */
+l_int32 fit_order1,fit_order2;
PROCNAME("dewarpFindVertDisparity");
if (!dew)
return ERROR_INT("dew not defined", procName, 1);
+/* WILLUS MOD */
+ if (fit_order < 10)
+ fit_order1 = fit_order2 = fit_order;
+ else
+ {
+ fit_order1=fit_order % 10;
+ fit_order2=fit_order / 10;
+ fit_order2=fit_order2 % 10;
+ }
dew->vsuccess = 0;
if (!ptaa)
return ERROR_INT("ptaa not defined", procName, 1);
@@ -331,12 +361,32 @@ FPIX *fpix;
pixdb = (rotflag) ? pixRotateOrth(dew->pixs, 1) : pixClone(dew->pixs);
for (i = 0; i < nlines; i++) { /* for each line */
pta = ptaaGetPta(ptaa, i, L_CLONE);
- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
- numaAddNumber(nacurve0, c2);
+/* WILLUS MOD */
+if (fit_order1>3)
+ {
+ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
+ numaAddNumber(nacurve0, c4);
+ }
+else if (fit_order1==3)
+ {
+ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
+ numaAddNumber(nacurve0, c3);
+ }
+else
+ {
+ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
+ numaAddNumber(nacurve0, c2);
+ }
ptad = ptaCreate(nx);
for (j = 0; j < nx; j++) { /* uniformly sampled in x */
x = j * sampling;
- applyQuadraticFit(c2, c1, c0, x, &y);
+/* WILLUS MOD */
+if (fit_order1>3)
+ applyQuarticFit(c4, c3, c2, c1, c0, x, &y);
+else if (fit_order1==3)
+ applyCubicFit(c3, c2, c1, c0, x, &y);
+else
+ applyQuadraticFit(c2, c1, c0, x, &y);
ptaAddPt(ptad, x, y);
}
ptaaAddPta(ptaa0, ptad, L_INSERT);
@@ -350,7 +400,13 @@ FPIX *fpix;
for (i = 0; i < nlines; i++) {
pta = ptaaGetPta(ptaa, i, L_CLONE);
ptaGetArrays(pta, &nax, NULL);
- ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
+/* WILLUS MOD */
+if (fit_order1>3)
+ptaGetQuarticLSF(pta, NULL, NULL, NULL, NULL, NULL, &nafit);
+else if (fit_order1==3)
+ptaGetCubicLSF(pta, NULL, NULL, NULL, NULL, &nafit);
+else
+ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
ptad = ptaCreateFromNuma(nax, nafit);
ptaaAddPta(ptaat, ptad, L_INSERT);
ptaDestroy(&pta);
@@ -494,11 +550,24 @@ FPIX *fpix;
ptaa5 = ptaaCreate(nx); /* uniformly sampled across full height of image */
for (j = 0; j < nx; j++) { /* for each column */
pta = ptaaGetPta(ptaa4, j, L_CLONE);
- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
+/* WILLUS MOD */
+/* Order higher than 2 can cause a little craziness here. */
+if (fit_order2>3)
+ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
+else if (fit_order2==3)
+ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
+else
+ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
ptad = ptaCreate(ny);
for (i = 0; i < ny; i++) { /* uniformly sampled in y */
y = i * sampling;
- applyQuadraticFit(c2, c1, c0, y, &val);
+/* WILLUS MOD */
+if (fit_order2>3)
+ applyQuarticFit(c4, c3, c2, c1, c0, y, &val);
+else if (fit_order2==3)
+ applyCubicFit(c3, c2, c1, c0, y, &val);
+else
+ applyQuadraticFit(c2, c1, c0, y, &val);
ptaAddPt(ptad, y, val);
}
ptaaAddPta(ptaa5, ptad, L_INSERT);
@@ -1602,11 +1671,21 @@ FPIX *fpix;
* See notes there.
* </pre>
*/
+/* WILLUS MOD */
l_ok
dewarpBuildLineModel(L_DEWARP *dew,
l_int32 opensize,
const char *debugfile)
{
+return(dewarpBuildLineModel_ex(dew,opensize,debugfile,2));
+}
+
+l_int32
+dewarpBuildLineModel_ex(L_DEWARP *dew,
+ l_int32 opensize,
+ const char *debugfile,
+ l_int32 fit_order)
+{
char buf[64];
l_int32 i, j, bx, by, ret, nlines;
BOXA *boxa;
@@ -1695,6 +1774,8 @@ PTAA *ptaa1, *ptaa2;
/* Remove all lines that are not at least 0.75 times the length
* of the longest line. */
+/* WILLUS MOD */
+/*
ptaa2 = dewarpRemoveShortLines(pix, ptaa1, 0.75, DEBUG_SHORT_LINES);
if (debugfile) {
pix1 = pixConvertTo32(pix);
@@ -1704,6 +1785,8 @@ PTAA *ptaa1, *ptaa2;
pixDestroy(&pix1);
pixDestroy(&pix2);
}
+*/
+ptaa2=ptaa1;
ptaaDestroy(&ptaa1);
nlines = ptaaGetCount(ptaa2);
if (nlines < dew->minlines) {
@@ -1717,7 +1800,8 @@ PTAA *ptaa1, *ptaa2;
* centers. The disparity array will push pixels vertically
* so that each line is flat and centered at the y-position
* of the mid-point. */
- ret = dewarpFindVertDisparity(dew, ptaa2, 1 - i);
+/* WILLUS MOD */
+ ret = dewarpFindVertDisparity_ex(dew, ptaa2, 1 - i, fit_order);
/* If i == 0, move the result to the horizontal disparity,
* rotating it back by -90 degrees. */
diff --git a/src/leptwin.c b/src/leptwin.c
index 72643a0..573d33e 100644
--- a/src/leptwin.c
+++ b/src/leptwin.c
@@ -364,5 +364,9 @@ PIXCMAP *cmap;
return hBitmap;
}
-
+#else
+/* willus mod: Avoid weird issue with OS/X library archiver when there are no symbols */
+int leptwin_my_empty_func(void);
+int leptwin_my_empty_func(void)
+{return(0);}
#endif /* _WIN32 */
--
2.22.0

File diff suppressed because it is too large Load diff

View file

@ -1,13 +1,675 @@
From 39aa8502eee7bb669a29d1a9b3bfe5c9595ad960 Mon Sep 17 00:00:00 2001
From: Daniel Fullmer <danielrf12@gmail.com>
Date: Fri, 13 Sep 2019 13:45:05 -0400
Subject: [PATCH] Willus mod changes from k2pdfopt
---
src/api/Makefile.am | 1 +
src/api/baseapi.cpp | 87 +++++++++++
src/api/baseapi.h | 3 +
src/api/tesscapi.cpp | 311 +++++++++++++++++++++++++++++++++++++
src/api/tesseract.h | 29 ++++
src/ccmain/tessedit.cpp | 5 +-
src/ccutil/ccutil.h | 7 +
src/ccutil/genericvector.h | 21 ++-
src/ccutil/mainblk.cpp | 17 +-
src/ccutil/params.cpp | 3 +-
src/ccutil/serialis.cpp | 3 +
src/ccutil/serialis.h | 2 +
src/lstm/input.cpp | 3 +
13 files changed, 488 insertions(+), 4 deletions(-)
create mode 100644 src/api/tesscapi.cpp
create mode 100644 src/api/tesseract.h
diff --git a/src/api/Makefile.am b/src/api/Makefile.am
index d8c1e54..46ead13 100644
index d9b76eb6..cd2dc30f 100644
--- a/src/api/Makefile.am
+++ b/src/api/Makefile.am
@@ -42,7 +42,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
endif
-libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp
+libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp tesscapi.cpp
@@ -39,6 +39,7 @@ libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
libtesseract_api_la_SOURCES += pdfrenderer.cpp
libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
libtesseract_api_la_SOURCES += renderer.cpp
+libtesseract_api_la_SOURCES += tesscapi.cpp
lib_LTLIBRARIES += libtesseract.la
libtesseract_la_LDFLAGS =
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
index 9245d07c..ea964ee6 100644
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@@ -215,6 +215,14 @@ TessBaseAPI::TessBaseAPI()
// Use the current locale if building debug code.
std::locale::global(std::locale(""));
#endif
+ const char *locale;
+ locale = std::setlocale(LC_ALL, nullptr);
+/* willus mod Remove assertions--taken care of in tesscapi.cpp */
+// ASSERT_HOST(!strcmp(locale, "C"));
+ locale = std::setlocale(LC_CTYPE, nullptr);
+// ASSERT_HOST(!strcmp(locale, "C"));
+ locale = std::setlocale(LC_NUMERIC, nullptr);
+// ASSERT_HOST(!strcmp(locale, "C"));
}
TessBaseAPI::~TessBaseAPI() {
@@ -1333,6 +1341,85 @@ static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
text->add_str_int("\t", bottom - top);
}
+/* willus mod */
+int TessBaseAPI::GetOCRWords(int **x00,int **y00,int **x11,int **y11,int **ybaseline0,
+                             char **utf8words)
+    {
+    /*
+    ** Return box, baseline and UTF-8 text of every word in the current results.
+    ** (*x00),(*y00),(*x11),(*y11),(*ybaseline0) receive per-word left, top,
+    ** right, bottom and mean baseline y; all five are slices of ONE malloc()
+    ** block starting at (*x00).  (*utf8words) receives a second malloc() block
+    ** holding the words back to back, each '\0'-terminated.
+    ** Returns the word count.  With no iterator, no words or no memory, 0 is
+    ** returned and all six outputs are NULL.
+    */
+    int iword,nwords,totlen,it8;
+    int *x0,*y0,*x1,*y1,*ybaseline;
+    char *tutf8;
+
+    (*x00)=(*y00)=(*x11)=(*y11)=(*ybaseline0)=NULL;
+    (*utf8words)=NULL;
+    /* GetIterator() returns a new iterator the caller owns; unique_ptr frees it. */
+    std::unique_ptr<ResultIterator> res_it(GetIterator());
+    if (res_it==nullptr)
+        return(0);
+    /* Pass 1:  count the words and the bytes needed for their text */
+    nwords=0;
+    totlen=0;
+    while (!res_it->Empty(RIL_BLOCK))
+        {
+        if (!res_it->Empty(RIL_WORD))
+            {
+            STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
+            totlen+=strlen(textstr.string())+1;
+            nwords++;
+            }
+        res_it->Next(RIL_WORD);
+        }
+    x0 = nwords>0 ? (int *)malloc(sizeof(int)*5*nwords) : NULL;
+    tutf8 = nwords>0 ? (char *)malloc(totlen) : NULL;
+    if (x0==NULL || tutf8==NULL)
+        {
+        free(x0);
+        free(tutf8);
+        return(0);
+        }
+    (*x00)=x0;
+    y0=(*y00)=&x0[nwords];
+    x1=(*x11)=&y0[nwords];
+    y1=(*y11)=&x1[nwords];
+    ybaseline=(*ybaseline0)=&y1[nwords];
+    (*utf8words)=tutf8;
+    /* Pass 2:  store each word's text and geometry */
+    iword=0;
+    it8=0;
+    res_it->Begin();
+    while (!res_it->Empty(RIL_BLOCK))
+        {
+        if (!res_it->Empty(RIL_WORD))
+            {
+            int left,top,right,bottom,u1,v1,u2,v2;
+            STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
+            strcpy(&tutf8[it8],textstr.string());
+            it8 += strlen(&tutf8[it8])+1;
+            res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+            res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2);
+            x0[iword]=left;
+            x1[iword]=right;
+            y0[iword]=top;
+            y1[iword]=bottom;
+            ybaseline[iword]=(v1+v2)/2;
+            iword++;
+            }
+        res_it->Next(RIL_WORD);
+        }
+    return(iword);
+    }
+
+/* willus mod */
+int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
+
/**
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
diff --git a/src/api/baseapi.h b/src/api/baseapi.h
index 3724dd92..23be5920 100644
--- a/src/api/baseapi.h
+++ b/src/api/baseapi.h
@@ -575,6 +575,9 @@ class TESS_API TessBaseAPI {
*/
char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
+/* willus mod */
+int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
+
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
diff --git a/src/api/tesscapi.cpp b/src/api/tesscapi.cpp
new file mode 100644
index 00000000..1752fafe
--- /dev/null
+++ b/src/api/tesscapi.cpp
@@ -0,0 +1,311 @@
+/*
+** tesscapi.cpp willus.com attempt at C wrapper for tesseract.
+** (Butchered from tesseractmain.cpp)
+** Last updated 9-1-12
+**
+** Copyright (C) 2012 http://willus.com
+**
+** This program is free software: you can redistribute it and/or modify
+** it under the terms of the GNU Affero General Public License as
+** published by the Free Software Foundation, either version 3 of the
+** License, or (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU Affero General Public License for more details.
+**
+** You should have received a copy of the GNU Affero General Public License
+** along with this program. If not, see <http://www.gnu.org/licenses/>.
+**
+*/
+
+/*
+#include "mfcpch.h"
+*/
+// #define USE_VLD //Uncomment for Visual Leak Detector.
+#if (defined _MSC_VER && defined USE_VLD)
+#include <vld.h>
+#endif
+
+// Include automatically generated configuration file if running autoconf
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+#include <locale.h>
+#ifdef USING_GETTEXT
+#include <libintl.h>
+#define _(x) gettext(x)
+#else
+#define _(x) (x)
+#endif
+
+#include "allheaders.h"
+#include "baseapi.h"
+#include "strngs.h"
+#include "params.h"
+#include "blobs.h"
+#include "simddetect.h"
+#include "tesseractclass.h"
+/*
+#include "notdll.h"
+*/
+
+/* C Wrappers */
+#include "tesseract.h"
+
+// static tesseract::TessBaseAPI api[4];
+
+/*
+** ocr_type=0: OEM_DEFAULT
+** ocr_type=1: OEM_TESSERACT_ONLY
+** ocr_type=2: OEM_LSTM_ONLY
+** ocr_type=3: OEM_TESSERACT_LSTM_COMBINED
+*/
+/*
+** Create and initialize a Tesseract engine.  Returns an opaque handle for the
+** other tess_capi_* functions, or NULL if TessBaseAPI::Init() fails.
+**
+** datapath  passed to Init(); also shown in the initialization message
+**           (TESSDATA_PREFIX is shown instead when datapath is NULL).
+** language  passed to Init(); NULL or "" selects "eng".
+** ocr_type  currently ignored--the engine mode is forced to OEM_DEFAULT.
+** out       if not NULL, the initialization message is printed to it.
+** initstr   if not NULL, receives at most maxlen-1 bytes of that message.
+** status    receives the return value of Init() (0 on success).
+*/
+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
+                     char *initstr,int maxlen,int *status)
+
+    {
+    char original_locale[256];
+    const char *cur_locale;
+    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI;
+/*
+printf("@tess_capi_init\n");
+printf(" datapath='%s'\n",datapath);
+printf(" language='%s'\n",language);
+printf(" ocr_type=%d\n",ocr_type);
+*/
+#ifdef USE_NLS
+    setlocale (LC_ALL, "");
+    bindtextdomain (PACKAGE, LOCALEDIR);
+    textdomain (PACKAGE);
+#endif
+    /* willus mod, 11-24-16 */
+    /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
+    /* setlocale() can return NULL; remember "C" then so the restore stays valid. */
+    cur_locale=setlocale(LC_ALL,NULL);
+    strncpy(original_locale,cur_locale!=NULL ? cur_locale : "C",255);
+    original_locale[255]='\0';
+/*
+printf("original_locale='%s'\n",original_locale);
+*/
+    setlocale(LC_ALL,"C");
+    // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
+    // Make the order of args a bit more forgiving than it used to be.
+    const char* lang = "eng";
+    tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK;
+    if (language!=NULL && language[0]!='\0')
+        lang = language;
+    /*
+    if (output == NULL)
+    {
+    fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
+    "[-psm pagesegmode] [configfile...]\n"), argv[0]);
+    fprintf(stderr,
+    _("pagesegmode values are:\n"
+    "0 = Orientation and script detection (OSD) only.\n"
+    "1 = Automatic page segmentation with OSD.\n"
+    "2 = Automatic page segmentation, but no OSD, or OCR\n"
+    "3 = Fully automatic page segmentation, but no OSD. (Default)\n"
+    "4 = Assume a single column of text of variable sizes.\n"
+    "5 = Assume a single uniform block of vertically aligned text.\n"
+    "6 = Assume a single uniform block of text.\n"
+    "7 = Treat the image as a single text line.\n"
+    "8 = Treat the image as a single word.\n"
+    "9 = Treat the image as a single word in a circle.\n"
+    "10 = Treat the image as a single character.\n"));
+    fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
+    "configfile.\n"));
+    exit(1);
+    }
+    */
+/*
+printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
+printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
+*/
+/*
+v4.00 loads either TESSERACT engine, LSTM engine, or both. No CUBE.
+*/
+    ocr_type=0; /* Ignore specified and use default */
+    api->SetOutputName(NULL);
+    (*status)=api->Init(datapath,lang,
+                 ocr_type==0 ? tesseract::OEM_DEFAULT :
+                    (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY :
+                       (ocr_type==2 ? tesseract::OEM_LSTM_ONLY :
+                                      (tesseract::OEM_TESSERACT_LSTM_COMBINED))));
+    if ((*status)!=0)
+        {
+        /* willus mod, 11-24-16 */
+        setlocale(LC_ALL,original_locale);
+        api->End();
+        delete api;
+        return(NULL);
+        }
+    /*
+    api.Init("tesscapi",lang,tesseract::OEM_DEFAULT,
+    &(argv[arg]), argc - arg, NULL, NULL, false);
+    */
+    // We have 2 possible sources of pagesegmode: a config file and
+    // the command line. For backwards compatibility reasons, the
+    // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
+    // default for this program is tesseract::PSM_AUTO. We will let
+    // the config file take priority, so the command-line default
+    // can take priority over the tesseract default, so we use the
+    // value from the command line only if the retrieved mode
+    // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
+    // in any config file. Therefore the only way to force
+    // tesseract::PSM_SINGLE_BLOCK is from the command line.
+    // It would be simpler if we could set the value before Init,
+    // but that doesn't work.
+    if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
+        api->SetPageSegMode(pagesegmode);
+
+    /*
+    ** Initialization message
+    */
+        {
+        char istr[1024];
+        const char *folder;
+        int sse,avx;
+
+// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode);
+        /* Each append is bounded: datapath and the language list have no fixed length. */
+        snprintf(istr,sizeof(istr),"%s",api->Version());
+        sse=tesseract::SIMDDetect::IsSSEAvailable();
+        avx=tesseract::SIMDDetect::IsAVXAvailable();
+        if (sse || avx)
+            snprintf(&istr[strlen(istr)],sizeof(istr)-strlen(istr)," [%s]",
+                     sse&&avx?"SSE+AVX":(sse?"SSE":"AVX"));
+        folder = datapath==NULL ? getenv("TESSDATA_PREFIX") : datapath;
+        if (folder==NULL) /* NULL for %s is undefined behavior; print it explicitly */
+            folder="(null)";
+        snprintf(&istr[strlen(istr)],sizeof(istr)-strlen(istr),
+                 "\n Tesseract data folder = '%s'",folder);
+        snprintf(&istr[strlen(istr)],sizeof(istr)-strlen(istr),"%s",
+                 "\n Tesseract languages: ");
+
+        for (int i=0;i<=api->tesseract()->num_sub_langs();i++)
+            {
+            tesseract::Tesseract *lang1;
+            int eng;
+            lang1 = i==0 ? api->tesseract() : api->tesseract()->get_sub_lang(i-1);
+            eng=(int)lang1->tessedit_ocr_engine_mode;
+            snprintf(&istr[strlen(istr)],sizeof(istr)-strlen(istr),"%s%s [%s]",
+                     i==0?"":", ",lang1->lang.string(),
+                     eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));
+            }
+
+        /*
+        if (ocr_type==0 || ocr_type==3)
+        sprintf(&istr[strlen(istr)],"[LSTM+] (lang=");
+        else if (ocr_type==2)
+        sprintf(&istr[strlen(istr)],"[LSTM] (lang=");
+        strncpy(&istr[strlen(istr)],language,253-strlen(istr));
+        istr[253]='\0';
+        strcat(istr,")");
+        */
+        if (out!=NULL)
+            fprintf(out,"%s\n",istr);
+        if (initstr!=NULL && maxlen>0)
+            {
+            strncpy(initstr,istr,maxlen-1);
+            initstr[maxlen-1]='\0';
+            }
+        }
+
+
+    /* Turn off LSTM debugging output */
+    api->SetVariable("lstm_debug_level","0");
+#if (WILLUSDEBUG & 1)
+    api->SetVariable("lstm_debug_level","9");
+    api->SetVariable("paragraph_debug_level","9");
+    api->SetVariable("tessdata_manager_debug_level","9");
+    api->SetVariable("tosp_debug_level","9");
+    api->SetVariable("wordrec_debug_level","9");
+    api->SetVariable("segsearch_debug_level","9");
+#endif
+    /* willus mod, 11-24-16 */
+    setlocale(LC_ALL,original_locale);
+    return((void *)api);
+    }
+
+
+/* OCR pix with the handle from tess_capi_init(); the UTF-8 result is copied to
+** outstr, truncated to maxlen-1 bytes.  Returns 0 if OK, -1 on failure. */
+int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
+
+    {
+    tesseract::TessBaseAPI *api=(tesseract::TessBaseAPI *)vapi;
+    char *text;
+
+    /* Set the mode on every call: a static "last mode" goes stale when another */
+    /* handle, or tess_capi_get_ocr_multiword(), has changed it in between. */
+    api->SetPageSegMode((tesseract::PageSegMode)segmode);
+    text=api->ProcessPage(pix,0,NULL,NULL,0,NULL) ? api->GetUTF8Text() : NULL;
+    if (text==NULL)
+        {
+        if (out!=NULL)
+            fprintf(out,"tesscapi: Error during bitmap processing.\n");
+        api->Clear();
+        return(-1);
+        }
+    strncpy(outstr,text,maxlen-1);
+    outstr[maxlen-1]='\0';
+    delete [] text; /* GetUTF8Text() returns a new[] buffer the caller owns */
+    api->Clear();
+    return(0);
+    }
+
+
+/* OCR pix with the handle from tess_capi_init() and return per-word results.
+** On success (return 0) *nw is the word count and *left,*top,*right,*bottom,
+** *ybase,*text receive the buffers filled in by TessBaseAPI::GetOCRWords().
+** If page processing fails, *nw is set to 0 and -1 is returned. */
+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
+                                int **left,int **top,int **right,int **bottom,
+                                int **ybase,char **text,int *nw,
+                                FILE *out)
+
+    {
+    tesseract::TessBaseAPI *api=(tesseract::TessBaseAPI *)vapi;
+
+    /* Set the mode on every call.  A function-static "last mode" goes stale
+    ** when tess_capi_get_ocr() or another handle changed the mode since. */
+    api->SetPageSegMode((tesseract::PageSegMode)segmode);
+    if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
+        {
+        if (out!=NULL)
+            fprintf(out,"tesscapi: Error during bitmap processing.\n");
+        api->Clear();
+        (*nw)=0;
+        return(-1);
+        }
+    (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text);
+    api->Clear();
+    return(0);
+    }
+
+
+/* Shut down and free a handle returned by tess_capi_init(); NULL is a no-op. */
+void tess_capi_end(void *vapi)
+    {
+    tesseract::TessBaseAPI *handle=(tesseract::TessBaseAPI *)vapi;
+
+    if (handle!=NULL)
+        {
+        handle->End();
+        delete handle;
+        }
+    }
diff --git a/src/api/tesseract.h b/src/api/tesseract.h
new file mode 100644
index 00000000..575948cc
--- /dev/null
+++ b/src/api/tesseract.h
@@ -0,0 +1,29 @@
+/*
+** Willus.com's Tesseract C Wrappers (implemented in tesscapi.cpp)
+**
+** 6-8-12
+** Includes nothing itself: FILE and PIX must be declared before this header.
+*/
+
+#ifndef _TESSERACT_H_
+#define _TESSERACT_H_
+
+//#include <leptonica.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
+ char *initstr,int maxlen,int *status);
+int tess_capi_get_ocr(void *api,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out);
+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
+ int **left,int **top,int **right,int **bottom,
+ int **ybase,char **text,int *nw,
+ FILE *out);
+void tess_capi_end(void *api);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TESSERACT_H_ */
diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp
index 17f0951b..7af94ee2 100644
--- a/src/ccmain/tessedit.cpp
+++ b/src/ccmain/tessedit.cpp
@@ -101,6 +101,10 @@ bool Tesseract::init_tesseract_lang_data(
" to your \"tessdata\" directory.\n");
return false;
}
+ /* willus mod */
+ TFile fp;
+ strncpy(fp.tfile_filename,tessdata_path.string(),511);
+ fp.tfile_filename[511]='\0';
#ifndef DISABLED_LEGACY_ENGINE
if (oem == OEM_DEFAULT) {
// Set the engine mode from availability, which can then be overridden by
@@ -116,7 +120,6 @@ bool Tesseract::init_tesseract_lang_data(
#endif // ndef DISABLED_LEGACY_ENGINE
// If a language specific config file (lang.config) exists, load it in.
- TFile fp;
if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
this->params());
diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h
index 71e89c60..bdeccc14 100644
--- a/src/ccutil/ccutil.h
+++ b/src/ccutil/ccutil.h
@@ -80,6 +80,13 @@ class CCUtil {
// Member parameters.
// These have to be declared and initialized after params_ member, since
// params_ should be initialized before parameters are added to it.
+/* willus mod */
+/*
+ #ifdef _WIN32
+ STRING_VAR_H(tessedit_module_name, WINDLLNAME,
+ "Module colocated with tessdata dir");
+ #endif
+*/
INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities");
BOOL_VAR_H(use_definite_ambigs_for_classifier, false,
"Use definite ambiguities when running character classifier");
diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h
index 3556d153..3a5e8662 100644
--- a/src/ccutil/genericvector.h
+++ b/src/ccutil/genericvector.h
@@ -382,7 +382,26 @@ inline bool LoadDataFromFile(const char* filename, GenericVector<char>* data) {
// reserve an extra byte in case caller wants to append a '\0' character
data->reserve(size + 1);
data->resize_no_init(size);
- result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
+ /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */
+ /* Can't read entire file at once -- need to break up into smaller blocksize reads */
+ {
+ int frs,n;
+ int blocksize;
+ blocksize=1024*1024;
+ for (n=0;1;)
+ {
+ int bs;
+ bs= size-n > blocksize ? blocksize : size-n;
+ frs=(int)fread(&(*data)[n],1,bs,fp);
+ n+=frs;
+ if (frs<bs || bs<blocksize || n>=size)
+ break;
+ }
+ result = static_cast<long>((long)n==size);
+ }
+ /*
+ result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
+ */
}
fclose(fp);
}
diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp
index 52b04b04..80b26044 100644
--- a/src/ccutil/mainblk.cpp
+++ b/src/ccutil/mainblk.cpp
@@ -55,8 +55,22 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
#if defined(_WIN32)
} else if (datadir == nullptr || _access(datadir.string(), 0) != 0) {
/* Look for tessdata in directory of executable. */
+ /*
+ char drive[_MAX_DRIVE];
+ char dir[_MAX_DIR];
+ */
char path[_MAX_PATH];
- DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
+ int i;
+ /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path));
+ /* willus mod--avoid _splitpath_s -- not in XP */
+ for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--);
+ if (i>=0)
+ {
+ path[i]='\0';
+ datadir=path;
+ datadir += "/tessdata";
+ }
+ /*
if (length > 0 && length < sizeof(path)) {
char* separator = std::strrchr(path, '\\');
if (separator != nullptr) {
@@ -65,6 +79,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
datadir += "/tessdata";
}
}
+ */
#endif /* _WIN32 */
#if defined(TESSDATA_PREFIX)
} else {
diff --git a/src/ccutil/params.cpp b/src/ccutil/params.cpp
index 00bf2563..486c5ce0 100644
--- a/src/ccutil/params.cpp
+++ b/src/ccutil/params.cpp
@@ -82,7 +82,8 @@ bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
if (!foundit) {
anyerr = true; // had an error
- tprintf("Warning: Parameter not found: %s\n", line);
+ /* willus mod */
+ tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename);
}
}
}
diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp
index 7def011f..6107a494 100644
--- a/src/ccutil/serialis.cpp
+++ b/src/ccutil/serialis.cpp
@@ -201,6 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) {
offset_ = 0;
is_writing_ = false;
swap_ = false;
+ /* willus mod */
+ strncpy(tfile_filename,filename.string(),511);
+ tfile_filename[511]='\0';
if (reader == nullptr)
return LoadDataFromFile(filename, data_);
else
diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h
index 095b9227..4cc8251e 100644
--- a/src/ccutil/serialis.h
+++ b/src/ccutil/serialis.h
@@ -77,6 +77,8 @@ class TFile {
public:
TFile();
~TFile();
+ /* willus mod */
+ char tfile_filename[512];
// All the Open methods load the whole file into memory for reading.
// Opens a file with a supplied reader, or nullptr to use the default.
diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp
index 73b584b3..0b0b54c3 100644
--- a/src/lstm/input.cpp
+++ b/src/lstm/input.cpp
@@ -93,8 +93,11 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
return nullptr;
}
if (width < min_width || height < min_width) {
+ /* willus mod -- no warning */
+ /*
tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width,
height, min_width);
+ */
pixDestroy(&pix);
return nullptr;
}
--
2.22.0