diff options
| author | Nick White <git@njw.me.uk> | 2011-11-17 19:27:52 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.me.uk> | 2011-11-17 19:27:52 +0000 | 
| commit | 92f10a6fbcc486045b810c1929ea4a932adfe68e (patch) | |
| tree | 09c4242c6f7707d00df06ef1d81182c3ea99438e | |
| parent | 05094aa978ab43c9f731a1851ac3d9bf78903697 (diff) | |
Make mkocrtxt work with tesseract 2, and be more verbose
| -rw-r--r-- | extras/mkocrtxt.sh | 17 | 
1 files changed, 12 insertions, 5 deletions
```diff
diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
index 02f7146..e683459 100644
--- a/extras/mkocrtxt.sh
+++ b/extras/mkocrtxt.sh
@@ -2,25 +2,32 @@
 # See COPYING file for copyright and license details.
 #
 # Makes a text file with text extracted by tesseract
+# Requires imagemagick and tesseract
 #
 # Note: Unfortunately tesseract works much better if one first
 #       makes the image to be OCRed significantly larger. This
 #       script therefore temporarily creates a larger file to
-#       feed to tesseract.
+#       feed to tesseract. These temporary large files are also
+#       converted to tiff format, so that they're readable by
+#       any version of tesseract.
 
-for i in `ls *png`
+for i in `ls *tif`
 do
+	echo "$i"
+
 	# create a much bigger version of the page image
 	width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
 	bigwidth=`expr $width \* 4`
-	convert "$i" -geometry ${bigwidth}x "$i.big.png"
+	convert "$i" -geometry ${bigwidth}x "$i.big.tif"
 
 	# scan the page image
-	tesseract "$i.big.png" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+	tesseract "$i.big.tif" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
 
 	# combine the page text with the rest of the book
 	cat "$i.txt" >> book.txt
 
 	# remove working files
-	rm -f "$i.big.png" "$i.txt"
+	rm -f "$i.big.tif" "$i.txt"
 done
+
+echo book.txt
```
