From e1fada376c4b883d53d6293babb899aa1c510b65 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Thu, 2 Feb 2012 00:34:03 +0000
Subject: Add djvu creation scripts

---
Notes (this section is not part of the commit message):
 - mkdjvu.sh and mkocrdjvu.sh glob for page files rather than parsing ls
   output, skip anything that is not a regular file, and remove a stale
   book.djvu before starting so that reruns do not fold it back in.
 - mkocrdjvu.sh escapes backslashes as well as double quotes when building
   the djvused text, and asks identify for the page size directly.
 - The blob ids on the "index" lines of the two new scripts predate these
   revisions.

 Makefile            |  2 +-
 TODO                |  2 --
 extras/mkdjvu.sh    | 31 +++++++++++++++++++++++++++++++
 extras/mkocrdjvu.sh | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 extras/mkpdf.sh     |  1 +
 5 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 extras/mkdjvu.sh
 create mode 100644 extras/mkocrdjvu.sh

diff --git a/Makefile b/Makefile
index 68a300c..b2c3979 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ SRC = getgbook.c getabook.c getbnbook.c
 LIB = util.o
 GUI = getxbookgui.tcl
 DOC = README COPYING INSTALL LEGAL
-EXTRAS = extras/mkpdf.sh extras/mkocrpdf.sh extras/mkocrtxt.sh
+EXTRAS = extras/mkpdf.sh extras/mkocrpdf.sh extras/mkdjvu.sh extras/mkocrtxt.sh extras/mkocrdjvu.sh
 
 BIN = $(SRC:.c=)
 MAN = $(SRC:.c=.1)
diff --git a/TODO b/TODO
index 8f01eec..3c51448 100644
--- a/TODO
+++ b/TODO
@@ -8,8 +8,6 @@ in getgbook, check that downloaded page doesn't match 'page not available' image
 
 package for osx - https://github.com/kennethreitz/osx-gcc-installer
 
-add djvu convert script
-
 use something smarter than update in gui to stop freezing
 
 add https support to get (getabook can use it everywhere, others cannot)
diff --git a/extras/mkdjvu.sh b/extras/mkdjvu.sh
new file mode 100644
index 0000000..6e89235
--- /dev/null
+++ b/extras/mkdjvu.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# See COPYING file for copyright and license details.
+#
+# Makes a DjVu
+# Requires imagemagick and djvulibre
+
+test $# -ne 1 && echo "Usage: $0 bookdir" && exit 1
+cd "$1" || exit 1
+
+# remove the result of an earlier run so it is not taken for a page
+rm -f book.djvu
+
+# glob instead of parsing ls output, which mangles unusual names
+for i in *
+do
+	# skips directories, and the literal '*' left by an empty bookdir
+	test -f "$i" || continue
+
+	echo "$i"
+
+	convert "$i" "$i.ppm"
+	c44 "$i.ppm" "$i.djvu"
+
+	rm -f "$i.ppm"
+done
+
+djvm -c book.djvu *.djvu
+
+rm -f [0-9]*djvu
+
+echo "$1/book.djvu"
diff --git a/extras/mkocrdjvu.sh b/extras/mkocrdjvu.sh
new file mode 100644
index 0000000..c1207f5
--- /dev/null
+++ b/extras/mkocrdjvu.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+# See COPYING file for copyright and license details.
+#
+# Makes a DjVu with embedded text extracted by tesseract
+# Requires imagemagick, djvulibre and tesseract 3
+#
+# Note that this doesn't use bounding box info, so that text
+# reflows much better.
+
+test $# -ne 1 && echo "Usage: $0 bookdir" && exit 1
+cd "$1" || exit 1
+
+# remove the result of an earlier run so it is not taken for a page
+rm -f book.djvu
+
+# glob instead of parsing ls output, which mangles unusual names
+for i in *
+do
+	# skips directories, and the literal '*' left by an empty bookdir
+	test -f "$i" || continue
+
+	echo "$i"
+
+	# create djvu compressed version
+	convert "$i" "$i.ppm"
+	c44 "$i.ppm" "$i.djvu"
+
+	# create a much bigger version of the page image, for better
+	# tesseract accuracy
+	width=`identify -format '%w' "$i"`
+	height=`identify -format '%h' "$i"`
+	bigwidth=`expr "$width" \* 4`
+	convert "$i" -geometry "${bigwidth}x" "$i.tif"
+
+	tesseract "$i.tif" "$i" 2>&1 | sed '/Tesseract Open Source OCR Engine/d'
+
+	# convert tesseract output into djvused input; the text goes inside
+	# a double-quoted string, so escape backslashes first, then quotes
+	(printf '(page 0 0 %s %s "' "$width" "$height"; sed 's/\\/\\\\/g;s/"/\\"/g' < "$i.txt"; printf '")\n') > "$i.djvutxt"
+	djvused "$i.djvu" -e "select 1; set-txt $i.djvutxt" -s
+
+	rm -f "$i.ppm" "$i.tif" "$i.txt" "$i.djvutxt"
+done
+
+djvm -c book.djvu *.djvu
+
+rm -f [0-9]*djvu
+
+echo "$1/book.djvu"
diff --git a/extras/mkpdf.sh b/extras/mkpdf.sh
index a2ba2c7..6ea7e08 100755
--- a/extras/mkpdf.sh
+++ b/extras/mkpdf.sh
@@ -1,6 +1,7 @@
 #!/bin/sh
 # See COPYING file for copyright and license details.
 #
+# Makes a PDF
 # Requires imagemagick
 
 test $# -ne 1 && echo "Usage: $0 bookdir" && exit 1
-- 
cgit v1.2.3