#!/bin/sh # # Makes a PDF with embedded text extracted by tesseract # # Requires imagemagick, pdftk, hocr2pdf and tesseract # # Also requires this tesseract configuration: # echo 'tessedit_create_hocr 1' > /usr/local/share/tessdata/configs/hocr for i in `ls *png` do a=`basename $i .png` echo processing $a tesseract $i $a hocr 2>/dev/null # hocr2pdf has a habit of segfaulting, so fall back to convert hocr2pdf -i $i -o $a.pdf < $a.html || convert $i $a.pdf rm -f $a.html done pdftk *pdf cat output book.pdf