diff options
author | Nick White <git@njw.me.uk> | 2011-11-16 21:46:59 +0000 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2011-11-16 21:46:59 +0000 |
commit | 58521d1c5461ab98bc63415ad26e91d54612e43e (patch) | |
tree | 94fd42741cd221f139bf88efb428434e996ba147 | |
parent | 048c334b5fa81355cedef4799da3a3c646a4a321 (diff) |
Vastly improve ocrpdf script, and create ocrtxt script
-rw-r--r-- | extras/mkocrpdf.sh | 36 | ||||
-rw-r--r-- | extras/mkocrtxt.sh | 26 |
2 files changed, 53 insertions, 9 deletions
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
index 3a01559..9113963 100644
--- a/extras/mkocrpdf.sh
+++ b/extras/mkocrpdf.sh
@@ -3,23 +3,41 @@
 #
 # Makes a PDF with embedded text extracted by tesseract
 # Requires imagemagick, pdftk, hocr2pdf and tesseract 3
+#
+# Note: Unfortunately tesseract works much better if one first
+# makes the image to be OCRed significantly larger. This
+# script does that, then reduces the results back down
+# to create a reasonable size PDF.
 
 echo 'tessedit_create_hocr 1' > hocr
 
 for i in `ls *png`
 do
-	a=`basename $i .png`
-	echo processing $a
+	# create a much bigger version of the page image
+	width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
+	bigwidth=`expr $width \* 4`
+	convert "$i" -geometry ${bigwidth}x "$i.big.png"
+
+	# scan the page image
+	tesseract "$i.big.png" "$i.big.png" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+
+	# this reduces all bbox information to match the original image size
+	sedrule=`cat "$i.big.png.html" \
+		| sed -e 's/</\n/g' \
+		| sed -e '/bbox/!d' -e 's/.*bbox//g' -e 's/".*//g' -e "s/'.*//g" \
+		| awk '{ printf("s/bbox %d %d %d %d/bbox",$1,$2,$3,$4);
+		for(i=1;i<5;i++) printf(" %d", $i/4);
+		printf("/g\n")}'`
+	sed -e 's/\.big\.png//g' -e "$sedrule" < "$i.big.png.html" > "$i.html"
 
-	# unfortunately tesseract seems to work much better with a
-	# resized larger image
-	convert $i -geometry 1000x $a.big.png
-	tesseract $a.big.png $a hocr 2>/dev/null
+	# combine the image and hocr into a pdf page
+	# Note: hocr2pdf has a habit of segfaulting, so fall back to convert
+	hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" || convert "$i" "$i.pdf"
 
-	# hocr2pdf has a habit of segfaulting, so fall back to convert
-	hocr2pdf -i $a.big.png -o $a.pdf < $a.html || convert $a.big.png $a.pdf
-	rm -f $a.html $a.big.png
+	# remove working files
+	rm -f "$i.big.png" "$i.big.png.html" "$i.html"
 done
 
+# cat the pdf pages together
 pdftk *pdf cat output book.pdf
 rm -f [0-9]*pdf hocr
diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
new file mode 100644
index 0000000..02f7146
--- /dev/null
+++ b/extras/mkocrtxt.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# See COPYING file for copyright and license details.
+#
+# Makes a text file with text extracted by tesseract
+#
+# Note: Unfortunately tesseract works much better if one first
+# makes the image to be OCRed significantly larger. This
+# script therefore temporarily creates a larger file to
+# feed to tesseract.
+
+for i in `ls *png`
+do
+	# create a much bigger version of the page image
+	width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
+	bigwidth=`expr $width \* 4`
+	convert "$i" -geometry ${bigwidth}x "$i.big.png"
+
+	# scan the page image
+	tesseract "$i.big.png" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+
+	# combine the page text with the rest of the book
+	cat "$i.txt" >> book.txt
+
+	# remove working files
+	rm -f "$i.big.png" "$i.txt"
+done