Add ocr pdf script

author: Nick White <git@njw.me.uk> 2011-10-30 12:29:45 +0000
committer: Nick White <git@njw.me.uk> 2011-10-30 12:29:45 +0000
commit: cd0c0a821361f5ee7c52ee60fb0ed5b758e53620 (patch)
tree: d9bc0c72dc2447c9ac3b4edc68081348b36ba64e
parent: 82908257a64d4fd67785c76ab33b0392bc9d9724 (diff)
5 files changed, 48 insertions, 7 deletions
diff --git a/Makefile b/Makefile
index c87e90d..0428b51 100644
--- a/Makefile
+++ b/Makefile
@@ -5,8 +5,9 @@ NAME = getxbook
 
 SRC = getgbook.c getabook.c
 LIB = util.o
-SCRIPTS = makebookpdf.sh getxbookgui
+GUI = getxbookgui
 DOC = README COPYING INSTALL LEGAL
+EXTRAS = extras/mkpdf.sh extras/mkocrpdf.sh
 
 BIN = $(SRC:.c=)
 MAN = $(SRC:.c=.1)
@@ -32,12 +33,12 @@ util.a: $(LIB)
 
 install: all
 	mkdir -p $(DESTDIR)$(PREFIX)/bin
-	cp -f $(BIN) $(SCRIPTS) $(DESTDIR)$(PREFIX)/bin
+	cp -f $(BIN) $(GUI) $(DESTDIR)$(PREFIX)/bin
 	mkdir -p $(DESTDIR)$(MANPREFIX)/man1
 	for f in $(MAN); do sed "s/VERSION/$(VERSION)/g" < $$f > $(DESTDIR)$(MANPREFIX)/man1/$$f; done
 
 uninstall:
-	cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) $(SCRIPTS)
+	cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) $(GUI)
 	cd $(DESTDIR)$(MANPREFIX)/man1 && rm -f $(MAN)
 
 clean:
@@ -45,7 +46,10 @@ clean:
 
 dist:
 	mkdir -p $(NAME)-$(VERSION)
-	cp $(SRC) $(SCRIPTS) $(DOC) util.h util.c Makefile config.mk $(NAME)-$(VERSION)
+# EXTRAS keep their extras/ subdirectory so the paths in EXTRAS still exist in the unpacked tarball
+	mkdir -p $(NAME)-$(VERSION)/extras
+	cp $(SRC) $(GUI) $(DOC) util.h util.c Makefile config.mk $(NAME)-$(VERSION)
+	cp $(EXTRAS) $(NAME)-$(VERSION)/extras
 	tar c $(NAME)-$(VERSION) | bzip2 -c > $(NAME)-$(VERSION).tar.bz2
 	gpg -b < $(NAME)-$(VERSION).tar.bz2 > $(NAME)-$(VERSION).tar.bz2.sig
 	rm -rf $(NAME)-$(VERSION)
diff --git a/TODO b/TODO
index 4c79489..43c7b19 100644
--- a/TODO
+++ b/TODO
@@ -4,9 +4,6 @@ before 1.0: create bn tool, fix http bugs, be unicode safe, package for osx & wi
 
 # other todos
 
-improve 2pdf script to use ocr; use tesseract to output hocr & hocr2pdf (from exact-image pkg) - see http://www.exactcode.de/site/open_source/exactimage/hocr2pdf/ https://tfischernet.wordpress.com/2008/11/26/searchable-pdfs-with-linux/ http://code.google.com/p/tesseract-ocr/
- create 2epub script if simple
-
 use the correct file extension depending on the image type (for google and amazon
 the first page is a jpg, all the others are png)
 
diff --git a/extras/mkocrpdf-cuneform.sh b/extras/mkocrpdf-cuneform.sh
new file mode 100644
index 0000000..94e10eb
--- /dev/null
+++ b/extras/mkocrpdf-cuneform.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+#
+# Makes book.pdf, with embedded text as extracted by cuneiform, from ./*.png
+#
+# Requires imagemagick, pdftk, hocr2pdf and cuneiform
+
+set --	# gather page pdfs in "$@" so a stale book.pdf is never fed back to pdftk
+for i in *.png
+do
+	[ -e "$i" ] || continue	# the glob matched nothing
+	a=`basename "$i" .png`
+	echo "processing $a"
+	convert "$i" "$a.bmp"	# cuneiform is given a bmp copy of the page
+	cuneiform -f hocr -o "$a.html" "$a.bmp"
+	rm -f "$a.bmp"
+	# hocr2pdf has a habit of segfaulting, so fall back to convert
+	hocr2pdf -i "$i" -o "$a.pdf" < "$a.html" || convert "$i" "$a.pdf"
+	rm -f "$a.html"
+	[ -e "$a.pdf" ] && set -- "$@" "$a.pdf"	# only join pages that exist
+done
+[ $# -gt 0 ] && pdftk "$@" cat output book.pdf
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
new file mode 100644
index 0000000..165059a
--- /dev/null
+++ b/extras/mkocrpdf.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Makes book.pdf, with embedded text extracted by tesseract, from ./*.png
+#
+# Requires imagemagick, pdftk, hocr2pdf and tesseract
+#
+# Also requires this tesseract configuration:
+# echo 'tessedit_create_hocr 1' > /usr/local/share/tessdata/configs/hocr
+
+set --	# gather page pdfs in "$@" so a stale book.pdf is never fed back to pdftk
+for i in *.png
+do
+	[ -e "$i" ] || continue	# the glob matched nothing
+	a=`basename "$i" .png`
+	echo "processing $a"
+	tesseract "$i" "$a" hocr 2>/dev/null	# expected to leave its hocr in $a.html
+	# hocr2pdf has a habit of segfaulting, so fall back to convert
+	hocr2pdf -i "$i" -o "$a.pdf" < "$a.html" || convert "$i" "$a.pdf"
+	rm -f "$a.html"
+	[ -e "$a.pdf" ] && set -- "$@" "$a.pdf"	# only join pages that exist
+done
+[ $# -gt 0 ] && pdftk "$@" cat output book.pdf
diff --git a/makebookpdf.sh b/extras/mkpdf.sh
index 80e776f..80e776f 100755
--- a/makebookpdf.sh
+++ b/extras/mkpdf.sh
author	Nick White <git@njw.me.uk>	2011-10-30 12:29:45 +0000
committer	Nick White <git@njw.me.uk>	2011-10-30 12:29:45 +0000
commit	cd0c0a821361f5ee7c52ee60fb0ed5b758e53620 (patch)
tree	d9bc0c72dc2447c9ac3b4edc68081348b36ba64e
parent	82908257a64d4fd67785c76ab33b0392bc9d9724 (diff)