diff options
author | Nick White <git@njw.me.uk> | 2011-10-30 12:29:45 +0000 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2011-10-30 12:29:45 +0000 |
commit | cd0c0a821361f5ee7c52ee60fb0ed5b758e53620 (patch) | |
tree | d9bc0c72dc2447c9ac3b4edc68081348b36ba64e | |
parent | 82908257a64d4fd67785c76ab33b0392bc9d9724 (diff) |
Add ocr pdf script
-rw-r--r-- | Makefile | 9 | ||||
-rw-r--r-- | TODO | 3 | ||||
-rw-r--r-- | extras/mkocrpdf-cuneform.sh | 21 | ||||
-rw-r--r-- | extras/mkocrpdf.sh | 22 | ||||
-rwxr-xr-x | extras/mkpdf.sh (renamed from makebookpdf.sh) | 0 |
5 files changed, 48 insertions, 7 deletions
@@ -5,8 +5,9 @@ NAME = getxbook
 
 SRC = getgbook.c getabook.c
 LIB = util.o
-SCRIPTS = makebookpdf.sh getxbookgui
+GUI = getxbookgui
 DOC = README COPYING INSTALL LEGAL
+EXTRAS = extras/mkpdf.sh extras/mkocrpdf.sh
 
 BIN = $(SRC:.c=)
 MAN = $(SRC:.c=.1)
@@ -32,12 +33,12 @@ util.a: $(LIB)
 
 install: all
 	mkdir -p $(DESTDIR)$(PREFIX)/bin
-	cp -f $(BIN) $(SCRIPTS) $(DESTDIR)$(PREFIX)/bin
+	cp -f $(BIN) $(GUI) $(DESTDIR)$(PREFIX)/bin
 	mkdir -p $(DESTDIR)$(MANPREFIX)/man1
 	for f in $(MAN); do sed "s/VERSION/$(VERSION)/g" < $$f > $(DESTDIR)$(MANPREFIX)/man1/$$f; done
 
 uninstall:
-	cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) $(SCRIPTS)
+	cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) $(GUI)
 	cd $(DESTDIR)$(MANPREFIX)/man1 && rm -f $(MAN)
 
 clean:
@@ -45,7 +46,7 @@ clean:
 
 dist:
 	mkdir -p $(NAME)-$(VERSION)
-	cp $(SRC) $(SCRIPTS) $(DOC) util.h util.c Makefile config.mk $(NAME)-$(VERSION)
+	cp $(SRC) $(GUI) $(EXTRAS) $(DOC) util.h util.c Makefile config.mk $(NAME)-$(VERSION)
 	tar c $(NAME)-$(VERSION) | bzip2 -c > $(NAME)-$(VERSION).tar.bz2
 	gpg -b < $(NAME)-$(VERSION).tar.bz2 > $(NAME)-$(VERSION).tar.bz2.sig
 	rm -rf $(NAME)-$(VERSION)
@@ -4,9 +4,6 @@ before 1.0: create bn tool, fix http bugs, be unicode safe, package for osx & wi
 
 # other todos
 
-improve 2pdf script to use ocr; use tesseract to output hocr & hocr2pdf (from exact-image pkg) - see http://www.exactcode.de/site/open_source/exactimage/hocr2pdf/ https://tfischernet.wordpress.com/2008/11/26/searchable-pdfs-with-linux/ http://code.google.com/p/tesseract-ocr/
-
 create 2epub script if simple
-
 use the correct file extension depending on the image type (for google and amazon the first page is a jpg, all the others are png)
 
diff --git a/extras/mkocrpdf-cuneform.sh b/extras/mkocrpdf-cuneform.sh
new file mode 100644
index 0000000..94e10eb
--- /dev/null
+++ b/extras/mkocrpdf-cuneform.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+#
+# Makes a pdf with embedded text as extracted by cuneiform
+#
+# Requires imagemagick, pdftk, hocr2pdf and cuneiform
+
+for i in `ls *png`
+do
+	a=`basename $i .png`
+	echo processing $a
+
+	convert $i $a.bmp
+	cuneiform -f hocr -o $a.html $a.bmp
+	rm -f $a.bmp
+
+	# hocr2pdf has a habit of segfaulting, so fall back to convert
+	hocr2pdf -i $i -o $a.pdf < $a.html || convert $i $a.pdf
+	rm -f $a.html
+done
+
+pdftk *pdf cat output book.pdf
diff --git a/extras/mkocrpdf.sh b/extras/mkocrpdf.sh
new file mode 100644
index 0000000..165059a
--- /dev/null
+++ b/extras/mkocrpdf.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Makes a PDF with embedded text extracted by tesseract
+#
+# Requires imagemagick, pdftk, hocr2pdf and tesseract
+#
+# Also requires this tesseract configuration:
+# echo 'tessedit_create_hocr 1' > /usr/local/share/tessdata/configs/hocr
+
+for i in `ls *png`
+do
+	a=`basename $i .png`
+	echo processing $a
+
+	tesseract $i $a hocr 2>/dev/null
+
+	# hocr2pdf has a habit of segfaulting, so fall back to convert
+	hocr2pdf -i $i -o $a.pdf < $a.html || convert $i $a.pdf
+	rm -f $a.html
+done
+
+pdftk *pdf cat output book.pdf
diff --git a/makebookpdf.sh b/extras/mkpdf.sh
index 80e776f..80e776f 100755
--- a/makebookpdf.sh
+++ b/extras/mkpdf.sh