From cd0c0a821361f5ee7c52ee60fb0ed5b758e53620 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 30 Oct 2011 12:29:45 +0000
Subject: Add ocr pdf script

---
Review note: the script bodies below were revised after this commit was
exported, so the blob ids on the index lines describe the original text.

 extras/mkocrpdf-cuneform.sh | 43 +++++++++++++++++++++++++++++++++++++++++++
 extras/mkocrpdf.sh          | 44 ++++++++++++++++++++++++++++++++++++++++++++
 extras/mkpdf.sh             |  8 ++++++++
 3 files changed, 95 insertions(+)
 create mode 100644 extras/mkocrpdf-cuneform.sh
 create mode 100644 extras/mkocrpdf.sh
 create mode 100755 extras/mkpdf.sh

(limited to 'extras')

diff --git a/extras/mkocrpdf-cuneform.sh b/extras/mkocrpdf-cuneform.sh
new file mode 100644
index 0000000..94e10eb
--- /dev/null
+++ b/extras/mkocrpdf-cuneform.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+# Makes a pdf with embedded text as extracted by cuneiform
+#
+# Requires imagemagick, pdftk, hocr2pdf and cuneiform
+#
+# Run in a directory of page images named *.png; a pdf is left next to
+# each page and the pages are joined, in glob order, into book.pdf
+
+# Page pdfs are gathered in "$@" so that names containing spaces survive
+# and a book.pdf left by an earlier run is never handed back to pdftk
+set --
+
+for i in *.png
+do
+	# an unmatched glob stays as the literal pattern, so skip non-files
+	[ -f "$i" ] || continue
+
+	a=${i%.png}
+	echo "processing $a"
+
+	# cuneiform is given a bmp copy of the page, removed afterwards
+	convert "$i" "$a.bmp"
+	cuneiform -f hocr -o "$a.html" "$a.bmp"
+	rm -f "$a.bmp"
+
+	# hocr2pdf has a habit of segfaulting, so fall back to convert
+	if hocr2pdf -i "$i" -o "$a.pdf" < "$a.html" || convert "$i" "$a.pdf"
+	then
+		set -- "$@" "$a.pdf"
+	else
+		echo "skipping $i: no pdf could be made" >&2
+	fi
+	rm -f "$a.html"
+done
+
+if [ $# -eq 0 ]
+then
+	echo "no pages to join" >&2
+	exit 1
+fi
+
+pdftk "$@" cat output book.pdf
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
new file mode 100644
index 0000000..165059a
--- /dev/null
+++ b/extras/mkocrpdf.sh
@@ -0,0 +1,44 @@
+#!/bin/sh
+#
+# Makes a PDF with embedded text extracted by tesseract
+#
+# Requires imagemagick, pdftk, hocr2pdf and tesseract
+#
+# Also requires this tesseract configuration:
+# echo 'tessedit_create_hocr 1' > /usr/local/share/tessdata/configs/hocr
+#
+# Run in a directory of page images named *.png; a pdf is left next to
+# each page and the pages are joined, in glob order, into book.pdf
+
+# Page pdfs are gathered in "$@" so that names containing spaces survive
+# and a book.pdf left by an earlier run is never handed back to pdftk
+set --
+
+for i in *.png
+do
+	# an unmatched glob stays as the literal pattern, so skip non-files
+	[ -f "$i" ] || continue
+
+	a=${i%.png}
+	echo "processing $a"
+
+	# tesseract's messages are discarded; its hocr is expected in $a.html
+	tesseract "$i" "$a" hocr 2>/dev/null
+
+	# hocr2pdf has a habit of segfaulting, so fall back to convert
+	if hocr2pdf -i "$i" -o "$a.pdf" < "$a.html" || convert "$i" "$a.pdf"
+	then
+		set -- "$@" "$a.pdf"
+	else
+		echo "skipping $i: no pdf could be made" >&2
+	fi
+	rm -f "$a.html"
+done
+
+if [ $# -eq 0 ]
+then
+	echo "no pages to join" >&2
+	exit 1
+fi
+
+pdftk "$@" cat output book.pdf
diff --git a/extras/mkpdf.sh b/extras/mkpdf.sh
new file mode 100755
index 0000000..80e776f
--- /dev/null
+++ b/extras/mkpdf.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# See COPYING file for copyright and license details.
+#
+# Makes book.pdf from every *.png in the current directory
+#
+# Requires imagemagick
+
+convert *.png book.pdf
--
cgit v1.2.3