summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick White <git@njw.me.uk> 2011-11-16 21:46:59 +0000
committer Nick White <git@njw.me.uk> 2011-11-16 21:46:59 +0000
commit 58521d1c5461ab98bc63415ad26e91d54612e43e (patch)
tree 94fd42741cd221f139bf88efb428434e996ba147
parent 048c334b5fa81355cedef4799da3a3c646a4a321 (diff)
Vastly improve ocrpdf script, and create ocrtxt script
-rw-r--r-- extras/mkocrpdf.sh 36
-rw-r--r-- extras/mkocrtxt.sh 26
2 files changed, 53 insertions, 9 deletions
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
index 3a01559..9113963 100644
--- a/extras/mkocrpdf.sh
+++ b/extras/mkocrpdf.sh
@@ -3,23 +3,41 @@
#
# Makes a PDF with embedded text extracted by tesseract
# Requires imagemagick, pdftk, hocr2pdf and tesseract 3
+#
+# Note: Unfortunately tesseract works much better if one first
+# makes the image to be OCRed significantly larger. This
+# script does that, then reduces the results back down
+# to create a reasonable size PDF.
echo 'tessedit_create_hocr 1' > hocr
for i in `ls *png`
do
- a=`basename $i .png`
- echo processing $a
+ # create a much bigger version of the page image
+ width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
+ bigwidth=`expr $width \* 4`
+ convert "$i" -geometry ${bigwidth}x "$i.big.png"
+
+ # scan the page image
+ tesseract "$i.big.png" "$i.big.png" hocr 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+
+ # this reduces all bbox information to match the original image size
+ sedrule=`cat "$i.big.png.html" \
+ | sed -e 's/</\n/g' \
+ | sed -e '/bbox/!d' -e 's/.*bbox//g' -e 's/".*//g' -e "s/'.*//g" \
+ | awk '{ printf("s/bbox %d %d %d %d/bbox",$1,$2,$3,$4);
+ for(i=1;i<5;i++) printf(" %d", $i/4);
+ printf("/g\n")}'`
+ sed -e 's/\.big\.png//g' -e "$sedrule" < "$i.big.png.html" > "$i.html"
- # unfortunately tesseract seems to work much better with a
- # resized larger image
- convert $i -geometry 1000x $a.big.png
- tesseract $a.big.png $a hocr 2>/dev/null
+ # combine the image and hocr into a pdf page
+ # Note: hocr2pdf has a habit of segfaulting, so fall back to convert
+ hocr2pdf -i "$i" -o "$i.pdf" < "$i.html" || convert "$i" "$i.pdf"
- # hocr2pdf has a habit of segfaulting, so fall back to convert
- hocr2pdf -i $a.big.png -o $a.pdf < $a.html || convert $a.big.png $a.pdf
- rm -f $a.html $a.big.png
+ # remove working files
+ rm -f "$i.big.png" "$i.big.png.html" "$i.html"
done
+# cat the pdf pages together
pdftk *pdf cat output book.pdf
rm -f [0-9]*pdf hocr
diff --git a/extras/mkocrtxt.sh b/extras/mkocrtxt.sh
new file mode 100644
index 0000000..02f7146
--- /dev/null
+++ b/extras/mkocrtxt.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# See COPYING file for copyright and license details.
+#
+# Makes a text file with text extracted by tesseract
+#
+# Note: Unfortunately tesseract works much better if one first
+# makes the image to be OCRed significantly larger. This
+# script therefore temporarily creates a larger file to
+# feed to tesseract.
+
+for i in `ls *png`
+do
+ # create a much bigger version of the page image
+ width=`identify "$i" | awk '{print $3}' | sed 's/x.*//'`
+ bigwidth=`expr $width \* 4`
+ convert "$i" -geometry ${bigwidth}x "$i.big.png"
+
+ # scan the page image
+ tesseract "$i.big.png" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+
+ # combine the page text with the rest of the book
+ cat "$i.txt" >> book.txt
+
+ # remove working files
+ rm -f "$i.big.png" "$i.txt"
+done