From 92f10a6fbcc486045b810c1929ea4a932adfe68e Mon Sep 17 00:00:00 2001
From: Nick White
Date: Thu, 17 Nov 2011 19:27:52 +0000
Subject: Make mkocrtxt work with tesseract 2, and be more verbose

---
 extras/mkocrtxt.sh | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'extras')

diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
index 02f7146..e683459 100644
--- a/extras/mkocrtxt.sh
+++ b/extras/mkocrtxt.sh
@@ -2,25 +2,32 @@
 # See COPYING file for copyright and license details.
 #
 # Makes a text file with text extracted by tesseract
+# Requires imagemagick and tesseract
 #
 # Note: Unfortunately tesseract works much better if one first
 # makes the image to be OCRed significantly larger. This
 # script therefore temporarily creates a larger file to
-# feed to tesseract.
+# feed to tesseract. These temporary large files are also
+# converted to tiff format, so that they're readable by
+# any version of tesseract.
 
-for i in `ls *png`
+for i in `ls *tif`
 do
+	echo "$i"
+
 	# create a much bigger version of the page image
 	width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
 	bigwidth=`expr $width \* 4`
-	convert "$i" -geometry ${bigwidth}x "$i.big.png"
+	convert "$i" -geometry ${bigwidth}x "$i.big.tif"
 
 	# scan the page image
-	tesseract "$i.big.png" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+	tesseract "$i.big.tif" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
 
 	# combine the page text with the rest of the book
 	cat "$i.txt" >> book.txt
 
 	# remove working files
-	rm -f "$i.big.png" "$i.txt"
+	rm -f "$i.big.tif" "$i.txt"
 done
+
+echo book.txt
-- 
cgit v1.2.3