blob: 3a01559cdf7d31c9260ba0e476080b7c01bd92be (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
#!/bin/sh
# See COPYING file for copyright and license details.
#
# Makes a PDF with embedded text extracted by tesseract
# Requires imagemagick, pdftk, hocr2pdf and tesseract 3
# Converts every *.png page image in the current directory into a
# searchable PDF page and joins the pages into book.pdf.
#
# Files written in the current directory:
#   hocr              tesseract config file (removed at the end)
#   NAME.big.png      enlarged copy of each page (removed per page)
#   NAME.html         hOCR output of tesseract (removed per page)
#   NAME.pdf          one PDF per page (removed after merging)
#   book.pdf          the merged result

# tesseract config file that switches on hOCR output
echo 'tessedit_create_hocr 1' > hocr

# The positional parameters collect the page PDFs built below, so that
# only files made by this run are merged and deleted afterwards
# (a stale book.pdf or unrelated PDFs in the directory are left alone).
set --

# Glob directly instead of parsing ls, so names containing whitespace
# or glob characters are handled as single words.
for i in *.png
do
	# with no PNG present the pattern stays literal; skip it
	[ -f "$i" ] || continue
	a=${i%.png}
	echo "processing $a"
	# unfortunately tesseract seems to work much better with a
	# resized larger image
	if convert "$i" -geometry 1000x "${a}.big.png"
	then
		# tesseract's diagnostics are deliberately discarded
		tesseract "${a}.big.png" "$a" hocr 2>/dev/null
		# hocr2pdf has a habit of segfaulting, so fall back to convert
		# (the fallback also runs when ${a}.html cannot be opened)
		if hocr2pdf -i "${a}.big.png" -o "${a}.pdf" < "${a}.html" ||
			convert "${a}.big.png" "${a}.pdf"
		then
			set -- "$@" "${a}.pdf"
		fi
	else
		echo "cannot resize $i, skipping" >&2
	fi
	rm -f "${a}.html" "${a}.big.png"
done

if [ "$#" -gt 0 ]
then
	# pages are joined in glob (sorted) order
	pdftk "$@" cat output book.pdf
	rm -f "$@"
else
	echo 'no pages converted, book.pdf not written' >&2
fi
rm -f hocr
|