diff options
Diffstat (limited to 'extras')
-rw-r--r-- | extras/mkocrpdf.sh | 22 | ||||
-rw-r--r-- | extras/mkocrtxt.sh | 2 |
2 files changed, 15 insertions, 9 deletions
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh index 9113963..1ec9a06 100644 --- a/extras/mkocrpdf.sh +++ b/extras/mkocrpdf.sh @@ -13,31 +13,37 @@ echo 'tessedit_create_hocr 1' > hocr for i in `ls *png` do + echo "$i" + # create a much bigger version of the page image width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'` bigwidth=`expr $width \* 4` - convert "$i" -geometry ${bigwidth}x "$i.big.png" + convert "$i" -geometry ${bigwidth}x "$i.big.tif" # scan the page image - tesseract "$i.big.png" "$i.big.png" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d' + tesseract "$i.big.tif" "$i.big.tif" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d' # this reduces all bbox information to match the original image size - sedrule=`cat "$i.big.png.html" \ + sedrule=`cat "$i.big.tif.html" \ | sed -e 's/</\n/g' \ | sed -e '/bbox/!d' -e 's/.*bbox//g' -e 's/".*//g' -e "s/'.*//g" \ | awk '{ printf("s/bbox %d %d %d %d/bbox",$1,$2,$3,$4); - for(i=1;i<5;i++) printf(" %d", $i/4); + for(a=1;a<5;a++) printf(" %d", $a/4); printf("/g\n")}'` - sed -e 's/\.big\.png//g' -e "$sedrule" < "$i.big.png.html" > "$i.html" + sed -e 's/\.big\.tif//g' -e "$sedrule" < "$i.big.tif.html" > "$i.html" # combine the image and hocr into a pdf page - # Note: hocr2pdf has a habit of segfaulting, so fall back to convert - hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" || convert "$i" "$i.pdf" + # Note: hocr2pdf has a habit of segfaulting, so fall back to convert. + # also, it tends to complain about the quality of tesseract's + # hocr output, which it's best to silence here. + hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" >/dev/null 2>&1 || convert "$i" "$i.pdf" # remove working files - rm -f "$i.big.png" "$i.big.png.html" "$i.html" + rm -f "$i.big.tif" "$i.big.tif.html" "$i.html" done +echo book.pdf + # cat the pdf pages together pdftk *pdf cat output book.pdf rm -f [0-9]*pdf hocr diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh index e683459..497f8a7 100644 --- a/extras/mkocrtxt.sh +++ b/extras/mkocrtxt.sh @@ -11,7 +11,7 @@ # converted to tiff format, so that they're readable by # any version of tesseract. -for i in `ls *tif` +for i in `ls *png` do echo "$i" |