diff options
author | Nick White <git@njw.me.uk> | 2011-10-30 12:29:45 +0000 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2011-10-30 12:29:45 +0000 |
commit | cd0c0a821361f5ee7c52ee60fb0ed5b758e53620 (patch) | |
tree | d9bc0c72dc2447c9ac3b4edc68081348b36ba64e /extras/mkocrpdf.sh | |
parent | 82908257a64d4fd67785c76ab33b0392bc9d9724 (diff) |
Add ocr pdf script
Diffstat (limited to 'extras/mkocrpdf.sh')
-rw-r--r-- | extras/mkocrpdf.sh | 22 |
1 files changed, 22 insertions, 0 deletions
#!/bin/sh
#
# Makes a PDF with embedded text extracted by tesseract
#
# Requires imagemagick, pdftk, hocr2pdf and tesseract
#
# Also requires this tesseract configuration:
# echo 'tessedit_create_hocr 1' > /usr/local/share/tessdata/configs/hocr
#
# Usage: run with no arguments in a directory of page images named *.png.
# One PDF is written next to each image, and the PDFs produced by this run
# are joined, in glob order, into book.pdf.
#
# Exit status: 0 when every page was converted and joined; non-zero when
# there were no pages, a page could not be converted, or pdftk failed.

# ocr_page IMAGE BASE
#
# Runs tesseract on IMAGE and writes BASE.pdf with the recognised text
# embedded. The intermediate hOCR file is removed afterwards.
# Returns 0 if BASE.pdf was produced by hocr2pdf or by the convert fallback,
# otherwise the exit status of convert.
ocr_page() {
  # tesseract's diagnostics are deliberately discarded; a failed run is
  # detected below because the hOCR file will be missing.
  tesseract "$1" "$2" hocr 2>/dev/null

  # Older tesseract releases write BASE.html, newer ones write BASE.hocr.
  if [ -f "$2.hocr" ]; then
    ocr_hocr=$2.hocr
  else
    ocr_hocr=$2.html
  fi

  # hocr2pdf has a habit of segfaulting, so fall back to convert.
  # A missing hOCR file makes the redirection fail, which also triggers
  # the fallback (the page is then image-only, without embedded text).
  hocr2pdf -i "$1" -o "$2.pdf" < "$ocr_hocr" || convert "$1" "$2.pdf"
  ocr_status=$?

  rm -f "$2.html" "$2.hocr"
  return "$ocr_status"
}

# main
#
# Converts every ./*.png in the current directory and joins the results.
# Arguments are ignored, as in the original script.
main() {
  # The positional parameters are reused as the list of generated PDFs:
  # POSIX sh has no arrays, and "$@" keeps names with spaces intact.
  set --
  status=0

  # Glob directly instead of parsing ls; the ./ prefix keeps names that
  # begin with '-' from being read as options by the tools.
  for img in ./*.png; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$img" ] || continue

    base=${img%.png}
    printf 'processing %s\n' "${base#./}"

    if ocr_page "$img" "$base"; then
      set -- "$@" "$base.pdf"
    else
      printf 'mkocrpdf: could not create a PDF for %s\n' "${img#./}" >&2
      status=1
    fi
  done

  if [ "$#" -eq 0 ]; then
    printf 'mkocrpdf: no pages were converted, book.pdf not written\n' >&2
    return 1
  fi

  # Join only the PDFs generated above. Globbing *pdf here would also pick
  # up unrelated PDFs and, on a second run, book.pdf itself.
  pdftk "$@" cat output book.pdf || status=$?
  return "$status"
}

main "$@"