diff options
author | Nick White <git@njw.me.uk> | 2011-11-07 20:09:55 +0000 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2011-11-07 20:09:55 +0000 |
commit | fca6360e428cfd63c0d8f9fc30b83d27a611bf4a (patch) | |
tree | 369fc52946e4f5e770461d4db92573456f87984c /extras | |
parent | 2fc2aed916f9d2a5c02f434d2532e80d04f2821a (diff) |
Dramatically improve mkocrpdf script
Diffstat (limited to 'extras')
-rw-r--r-- | extras/mkocrpdf.sh | 19 | ||||
-rwxr-xr-x | extras/mkpdf.sh | 1 |
2 files changed, 12 insertions, 8 deletions
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
index 165059a..3a01559 100644
--- a/extras/mkocrpdf.sh
+++ b/extras/mkocrpdf.sh
@@ -1,22 +1,25 @@
 #!/bin/sh
+# See COPYING file for copyright and license details.
 #
 # Makes a PDF with embedded text extracted by tesseract
-#
-# Requires imagemagick, pdftk, hocr2pdf and tesseract
-#
-# Also requires this tesseract configuration:
-# echo 'tessedit_create_hocr 1' > /usr/local/share/tessdata/configs/hocr
+# Requires imagemagick, pdftk, hocr2pdf and tesseract 3
+
+echo 'tessedit_create_hocr 1' > hocr
 
 for i in `ls *png`
 do
 	a=`basename $i .png`
 	echo processing $a
 
-	tesseract $i $a hocr 2>/dev/null
+	# unfortunately tesseract seems to work much better with a
+	# resized larger image
+	convert $i -geometry 1000x $a.big.png
+	tesseract $a.big.png $a hocr 2>/dev/null
 
 	# hocr2pdf has a habit of segfaulting, so fall back to convert
-	hocr2pdf -i $i -o $a.pdf < $a.html || convert $i $a.pdf
-	rm -f $a.html
+	hocr2pdf -i $a.big.png -o $a.pdf < $a.html || convert $a.big.png $a.pdf
+	rm -f $a.html $a.big.png
 done
 
 pdftk *pdf cat output book.pdf
+rm -f [0-9]*pdf hocr
diff --git a/extras/mkpdf.sh b/extras/mkpdf.sh
index 80e776f..cbfcd1b 100755
--- a/extras/mkpdf.sh
+++ b/extras/mkpdf.sh
@@ -1,5 +1,6 @@
 #!/bin/sh
 # See COPYING file for copyright and license details.
+#
 # Requires imagemagick
 
 convert *.png book.pdf