Make mkocrtxt work with tesseract 2, and be more verbose

author: Nick White <git@njw.me.uk> 2011-11-17 19:27:52 +0000
committer: Nick White <git@njw.me.uk> 2011-11-17 19:27:52 +0000
commit: 92f10a6fbcc486045b810c1929ea4a932adfe68e (patch)
tree: 09c4242c6f7707d00df06ef1d81182c3ea99438e
parent: 05094aa978ab43c9f731a1851ac3d9bf78903697 (diff)
1 file changed, 12 insertions, 5 deletions
diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
index 02f7146..e683459 100644
--- a/extras/mkocrtxt.sh
+++ b/extras/mkocrtxt.sh
@@ -2,25 +2,32 @@
 # See COPYING file for copyright and license details.
 #
 # Makes a text file with text extracted by tesseract
+# Requires imagemagick and tesseract
 #
 # Note: Unfortunately tesseract works much better if one first
 #       makes the image to be OCRed significantly larger. This
 #       script therefore temporarily creates a larger file to
-#       feed to tesseract.
+#       feed to tesseract. These temporary large files are also
+#       converted to tiff format, so that they're readable by
+#       any version of tesseract.
 
-for i in `ls *png`
+for i in `ls *tif`
 do
+	echo "$i"
+
 	# create a much bigger version of the page image
 	width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
 	bigwidth=`expr $width \* 4`
-	convert "$i" -geometry ${bigwidth}x "$i.big.png"
+	convert "$i" -geometry ${bigwidth}x "$i.big.tif"
 
 	# scan the page image
-	tesseract "$i.big.png" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+	tesseract "$i.big.tif" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
 
 	# combine the page text with the rest of the book
 	cat "$i.txt" >> book.txt
 
 	# remove working files
-	rm -f "$i.big.png" "$i.txt"
+	rm -f "$i.big.tif" "$i.txt"
 done
+
+echo book.txt
author	Nick White <git@njw.me.uk>	2011-11-17 19:27:52 +0000
committer	Nick White <git@njw.me.uk>	2011-11-17 19:27:52 +0000
commit	92f10a6fbcc486045b810c1929ea4a932adfe68e (patch)
tree	09c4242c6f7707d00df06ef1d81182c3ea99438e
parent	05094aa978ab43c9f731a1851ac3d9bf78903697 (diff)