summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.me.uk>2011-10-30 12:29:45 +0000
committerNick White <git@njw.me.uk>2011-10-30 12:29:45 +0000
commitcd0c0a821361f5ee7c52ee60fb0ed5b758e53620 (patch)
treed9bc0c72dc2447c9ac3b4edc68081348b36ba64e
parent82908257a64d4fd67785c76ab33b0392bc9d9724 (diff)
Add ocr pdf script
-rw-r--r--Makefile9
-rw-r--r--TODO3
-rw-r--r--extras/mkocrpdf-cuneform.sh21
-rw-r--r--extras/mkocrpdf.sh22
-rwxr-xr-xextras/mkpdf.sh (renamed from makebookpdf.sh)0
5 files changed, 48 insertions, 7 deletions
diff --git a/Makefile b/Makefile
index c87e90d..0428b51 100644
--- a/Makefile
+++ b/Makefile
@@ -5,8 +5,9 @@ NAME = getxbook
SRC = getgbook.c getabook.c
LIB = util.o
-SCRIPTS = makebookpdf.sh getxbookgui
+GUI = getxbookgui
DOC = README COPYING INSTALL LEGAL
+EXTRAS = extras/mkpdf.sh extras/mkocrpdf.sh
BIN = $(SRC:.c=)
MAN = $(SRC:.c=.1)
@@ -32,12 +33,12 @@ util.a: $(LIB)
install: all
mkdir -p $(DESTDIR)$(PREFIX)/bin
- cp -f $(BIN) $(SCRIPTS) $(DESTDIR)$(PREFIX)/bin
+ cp -f $(BIN) $(GUI) $(DESTDIR)$(PREFIX)/bin
mkdir -p $(DESTDIR)$(MANPREFIX)/man1
for f in $(MAN); do sed "s/VERSION/$(VERSION)/g" < $$f > $(DESTDIR)$(MANPREFIX)/man1/$$f; done
uninstall:
- cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) $(SCRIPTS)
+ cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) $(GUI)
cd $(DESTDIR)$(MANPREFIX)/man1 && rm -f $(MAN)
clean:
@@ -45,7 +46,7 @@ clean:
dist:
mkdir -p $(NAME)-$(VERSION)
- cp $(SRC) $(SCRIPTS) $(DOC) util.h util.c Makefile config.mk $(NAME)-$(VERSION)
+ cp $(SRC) $(GUI) $(EXTRAS) $(DOC) util.h util.c Makefile config.mk $(NAME)-$(VERSION)
tar c $(NAME)-$(VERSION) | bzip2 -c > $(NAME)-$(VERSION).tar.bz2
gpg -b < $(NAME)-$(VERSION).tar.bz2 > $(NAME)-$(VERSION).tar.bz2.sig
rm -rf $(NAME)-$(VERSION)
diff --git a/TODO b/TODO
index 4c79489..43c7b19 100644
--- a/TODO
+++ b/TODO
@@ -4,9 +4,6 @@ before 1.0: create bn tool, fix http bugs, be unicode safe, package for osx & wi
# other todos
-improve 2pdf script to use ocr; use tesseract to output hocr & hocr2pdf (from exact-image pkg) - see http://www.exactcode.de/site/open_source/exactimage/hocr2pdf/ https://tfischernet.wordpress.com/2008/11/26/searchable-pdfs-with-linux/ http://code.google.com/p/tesseract-ocr/
- create 2epub script if simple
-
use the correct file extension depending on the image type (for google and amazon
the first page is a jpg, all the others are png)
diff --git a/extras/mkocrpdf-cuneform.sh b/extras/mkocrpdf-cuneform.sh
new file mode 100644
index 0000000..94e10eb
--- /dev/null
+++ b/extras/mkocrpdf-cuneform.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+#
+# Makes a pdf with embedded text as extracted by cuneiform
+# Works on the *.png files in the current directory and writes book.pdf
+# Requires imagemagick, pdftk, hocr2pdf and cuneiform
+
+for i in *.png
+do
+ [ -e "$i" ] || continue # no png files: the glob stays unexpanded
+ a=`basename "$i" .png`
+ echo processing "$a"
+ convert "$i" "$a.bmp"
+ cuneiform -f hocr -o "$a.html" "$a.bmp"
+ rm -f "$a.bmp"
+
+ # hocr2pdf has a habit of segfaulting, so fall back to convert
+ hocr2pdf -i "$i" -o "$a.pdf" < "$a.html" || convert "$i" "$a.pdf"
+ rm -f "$a.html"
+done
+# remove a previous book.pdf so the *pdf glob does not feed it back in
+rm -f book.pdf && pdftk *pdf cat output book.pdf
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
new file mode 100644
index 0000000..165059a
--- /dev/null
+++ b/extras/mkocrpdf.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Makes a PDF with embedded text extracted by tesseract
+# Works on the *.png files in the current directory and writes book.pdf
+# Requires imagemagick, pdftk, hocr2pdf and tesseract
+#
+# Also requires this tesseract configuration:
+# echo 'tessedit_create_hocr 1' > /usr/local/share/tessdata/configs/hocr
+
+for i in *.png
+do
+ [ -e "$i" ] || continue # no png files: the glob stays unexpanded
+ a=`basename "$i" .png`
+ echo processing "$a"
+ tesseract "$i" "$a" hocr 2>/dev/null
+ # the hocr config above is expected to make tesseract write $a.html
+ # hocr2pdf has a habit of segfaulting, so fall back to convert
+ hocr2pdf -i "$i" -o "$a.pdf" < "$a.html" || convert "$i" "$a.pdf"
+ rm -f "$a.html"
+done
+# remove a previous book.pdf so the *pdf glob does not feed it back in
+rm -f book.pdf && pdftk *pdf cat output book.pdf
diff --git a/makebookpdf.sh b/extras/mkpdf.sh
index 80e776f..80e776f 100755
--- a/makebookpdf.sh
+++ b/extras/mkpdf.sh