Clean up ocr scripts further

author: Nick White <git@njw.me.uk> 2011-11-17 19:57:32 +0000
committer: Nick White <git@njw.me.uk> 2011-11-17 19:57:32 +0000
commit: 9458d263dc8275b5d487a004166111f0775455e3 (patch)
tree: 3aa7f6b42295a118a8d9c4a7161fc3791049d5a8
parent: 92f10a6fbcc486045b810c1929ea4a932adfe68e (diff)
2 files changed, 15 insertions, 9 deletions
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
index 9113963..1ec9a06 100644
--- a/extras/mkocrpdf.sh
+++ b/extras/mkocrpdf.sh
@@ -13,31 +13,37 @@ echo 'tessedit_create_hocr 1' > hocr
 
 for i in `ls *png`
 do
+	echo "$i"
+
 	# create a much bigger version of the page image
 	width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
 	bigwidth=`expr $width \* 4`
-	convert "$i" -geometry ${bigwidth}x "$i.big.png"
+	convert "$i" -geometry ${bigwidth}x "$i.big.tif"
 
 	# scan the page image
-	tesseract "$i.big.png" "$i.big.png" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+	tesseract "$i.big.tif" "$i.big.tif" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
 
 	# this reduces all bbox information to match the original image size
-	sedrule=`cat "$i.big.png.html" \
+	sedrule=`cat "$i.big.tif.html" \
 	         | sed -e 's/</\n/g' \
 	         | sed -e '/bbox/!d' -e 's/.*bbox//g' -e 's/".*//g' -e "s/'.*//g" \
 	         | awk '{ printf("s/bbox %d %d %d %d/bbox",$1,$2,$3,$4);
-	                  for(i=1;i<5;i++) printf(" %d", $i/4);
+	                  for(a=1;a<5;a++) printf(" %d", $a/4);
 	                  printf("/g\n")}'`
-	sed -e 's/\.big\.png//g' -e "$sedrule" < "$i.big.png.html" > "$i.html"
+	sed -e 's/\.big\.tif//g' -e "$sedrule" < "$i.big.tif.html" > "$i.html"
 
 	# combine the image and hocr into a pdf page
-	# Note: hocr2pdf has a habit of segfaulting, so fall back to convert
-	hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" || convert "$i" "$i.pdf"
+	# Note: hocr2pdf has a habit of segfaulting, so fall back to convert.
+	#       Also, it tends to complain about the quality of tesseract's
+	#       hocr output, which is best silenced here.
+	hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" >/dev/null 2>&1 || convert "$i" "$i.pdf"
 
 	# remove working files
-	rm -f "$i.big.png" "$i.big.png.html" "$i.html"
+	rm -f "$i.big.tif" "$i.big.tif.html" "$i.html"
 done
 
+echo book.pdf
+
 # cat the pdf pages together
 pdftk *pdf cat output book.pdf
 rm -f [0-9]*pdf hocr
diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
index e683459..497f8a7 100644
--- a/extras/mkocrtxt.sh
+++ b/extras/mkocrtxt.sh
@@ -11,7 +11,7 @@
 #       converted to tiff format, so that they're readable by
 #       any version of tesseract.
 
-for i in `ls *tif`
+for i in `ls *png`
 do
 	echo "$i"
author	Nick White <git@njw.me.uk>	2011-11-17 19:57:32 +0000
committer	Nick White <git@njw.me.uk>	2011-11-17 19:57:32 +0000
commit	9458d263dc8275b5d487a004166111f0775455e3 (patch)
tree	3aa7f6b42295a118a8d9c4a7161fc3791049d5a8
parent	92f10a6fbcc486045b810c1929ea4a932adfe68e (diff)