summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick White <git@njw.me.uk> 2011-11-17 19:27:52 +0000
committer: Nick White <git@njw.me.uk> 2011-11-17 19:27:52 +0000
commit: 92f10a6fbcc486045b810c1929ea4a932adfe68e (patch)
tree: 09c4242c6f7707d00df06ef1d81182c3ea99438e
parent: 05094aa978ab43c9f731a1851ac3d9bf78903697 (diff)
Make mkocrtxt work with tesseract 2, and be more verbose
-rw-r--r-- extras/mkocrtxt.sh 17
1 files changed, 12 insertions, 5 deletions
diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
index 02f7146..e683459 100644
--- a/extras/mkocrtxt.sh
+++ b/extras/mkocrtxt.sh
@@ -2,25 +2,32 @@
# See COPYING file for copyright and license details.
#
# Makes a text file with text extracted by tesseract
+# Requires imagemagick and tesseract
#
# Note: Unfortunately tesseract works much better if one first
# makes the image to be OCRed significantly larger. This
# script therefore temporarily creates a larger file to
-# feed to tesseract.
+# feed to tesseract. These temporary large files are also
+# converted to tiff format, so that they're readable by
+# any version of tesseract.
-for i in `ls *png`
+for i in `ls *tif`
do
+ echo "$i"
+
# create a much bigger version of the page image
width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
bigwidth=`expr $width \* 4`
- convert "$i" -geometry ${bigwidth}x "$i.big.png"
+ convert "$i" -geometry ${bigwidth}x "$i.big.tif"
# scan the page image
- tesseract "$i.big.png" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+ tesseract "$i.big.tif" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
# combine the page text with the rest of the book
cat "$i.txt" >> book.txt
# remove working files
- rm -f "$i.big.png" "$i.txt"
+ rm -f "$i.big.tif" "$i.txt"
done
+
+echo book.txt