From 9458d263dc8275b5d487a004166111f0775455e3 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Thu, 17 Nov 2011 19:57:32 +0000
Subject: Clean up ocr scripts further

---
 extras/mkocrpdf.sh | 22 ++++++++++++++-------
 extras/mkocrtxt.sh | 2 +-
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'extras')

diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
index 9113963..1ec9a06 100644
--- a/extras/mkocrpdf.sh
+++ b/extras/mkocrpdf.sh
@@ -13,31 +13,37 @@ echo 'tessedit_create_hocr 1' > hocr
 
 for i in `ls *png`
 do
+	echo "$i"
+
 	# create a much bigger version of the page image
 	width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
 	bigwidth=`expr $width \* 4`
-	convert "$i" -geometry ${bigwidth}x "$i.big.png"
+	convert "$i" -geometry ${bigwidth}x "$i.big.tif"
 
 	# scan the page image
-	tesseract "$i.big.png" "$i.big.png" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+	tesseract "$i.big.tif" "$i.big.tif" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
 
 	# this reduces all bbox information to match the original image size
-	sedrule=`cat "$i.big.png.html" \
+	sedrule=`cat "$i.big.tif.html" \
 	| sed -e 's/ "$i.html"
+	sed -e 's/\.big\.tif//g' -e "$sedrule" < "$i.big.tif.html" > "$i.html"
 
 	# combine the image and hocr into a pdf page
-	# Note: hocr2pdf has a habit of segfaulting, so fall back to convert
-	hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" || convert "$i" "$i.pdf"
+	# Note: hocr2pdf has a habit of segfaulting, so fall back to convert.
+	# also, it tends to complain about the quality of tesseract's
+	# hocr output, which it's best to silence here.
+	hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" >/dev/null 2>&1 || convert "$i" "$i.pdf"
 
 	# remove working files
-	rm -f "$i.big.png" "$i.big.png.html" "$i.html"
+	rm -f "$i.big.tif" "$i.big.tif.html" "$i.html"
 done
 
+echo book.pdf
+
 # cat the pdf pages together
 pdftk *pdf cat output book.pdf
 rm -f [0-9]*pdf hocr
diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
index e683459..497f8a7 100644
--- a/extras/mkocrtxt.sh
+++ b/extras/mkocrtxt.sh
@@ -11,7 +11,7 @@
 # converted to tiff format, so that they're readable by
 # any version of tesseract.
 
-for i in `ls *tif`
+for i in `ls *png`
 do
 	echo "$i"
 
-- 
cgit v1.2.3