From 063a0de3c10d38741c939297bb19d8284757e00c Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 1 Aug 2011 18:26:56 +0100 Subject: Lots of tightening and cleanups --- Makefile | 2 +- TODO | 28 +++++++++++----------------- config.mk | 10 +++++----- getgbook.c | 49 ++++++++++++++++++++++++++++--------------------- getgmissing.sh | 12 ++++++++++++ util.c | 21 ++++++++++----------- util.h | 3 ++- 7 files changed, 69 insertions(+), 56 deletions(-) create mode 100644 getgmissing.sh diff --git a/Makefile b/Makefile index 8cb19fd..df6b1a4 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -# See COPYING file for copyright, license and warranty details. +# See COPYING file for copyright and license details. include config.mk NAME = getxbook diff --git a/TODO b/TODO index 43cb56a..a1acd78 100644 --- a/TODO +++ b/TODO @@ -1,12 +1,6 @@ -Note: looks like google allows around 3 page requests per cookie session, and about 40 per ip per [some time period]. If I knew the time period, and once stdin retry is working, could make a script that gets all it can, gets a list of failures, waits, then tries failures, etc. Note these would also have to stop at some point; some pages just aren't available +list all binaries in readme and what they do -make sure i'm checking all lib calls that could fail - -make sure all arrays are used within bounds - -strace to check paths taken are sensible - -use defined constants rather than e.g. 1024 +# other utils getgbooktxt (different program as it gets from html pages, which getgbook doesn't any more) @@ -14,9 +8,7 @@ getabook getbnbook -openlibrary.org? 
- -# once it is basically working # +# other todos try supporting 3xx in get, if it can be done in a few lines by getting Location line, freeing buf, and returning a new @@ -24,14 +16,16 @@ try supporting 3xx in get, if it can be done in a few lines add https support to get -to be fast and efficient it's best to crank through all the json 1st, filling in an array of page structs as we go - this requires slightly fuller json support - could consider making a json reading module, ala confoo, to make ad-hoc memory structures from json - -write helper scripts like trymissing - write some little tests +## getgbook + have file extension be determined by file type, rather than assuming png think about whether default functionality should be dl all, rather than -a + +to be fast and efficient it's best to crank through all the json 1st, filling in an array of page structs as we go + this requires slightly fuller json support + could consider making a json reading module, ala confoo, to make ad-hoc memory structures from json + +Note: looks like google allows around 3 page requests per cookie session, and about 40 per ip per [some time period]. If I knew the time period, could make a script that gets all it can, gets a list of failures, waits, then tries failures, etc. Note these would also have to stop at some point; some pages just aren't available diff --git a/config.mk b/config.mk index 321a1ec..500e53d 100644 --- a/config.mk +++ b/config.mk @@ -1,4 +1,4 @@ -# See COPYING file for copyright, license and warranty details. +# See COPYING file for copyright and license details. 
VERSION = prealpha # paths @@ -8,11 +8,11 @@ CFLAGS = -ansi -pedantic -Wall -Wextra -Werror -g -D_POSIX_C_SOURCE=200112L \ -DVERSION=\"$(VERSION)\" # musl static -CC = musl-gcc -LDFLAGS = -static #-s +#CC = musl-gcc +#LDFLAGS = -static #-s # glibc dynamic -#CC = cc -#LDFLAGS = +CC = cc +LDFLAGS = LD = $(CC) diff --git a/getgbook.c b/getgbook.c index f947a82..1f98fc1 100644 --- a/getgbook.c +++ b/getgbook.c @@ -1,4 +1,4 @@ -/* See COPYING file for copyright, license and warranty details. */ +/* See COPYING file for copyright and license details. */ #include #include #include @@ -11,19 +11,22 @@ " otherwise, all pages in stdin will be downloaded\n" #define URLMAX 1024 +#define STRMAX 1024 +#define PGCODELEN 3 +#define RETRYNUM 5 typedef struct { int num; char url[URLMAX]; - char name[80]; + char name[STRMAX]; } Page; -char pagecodes[][3] = { "PP", "PR", "PA", "PT", "\0" }; +char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" }; Page *getpagedetail(char *bookid, char *pg, char *cookie) { - char url[URLMAX]; - char *buf, *c, *d, m[80], *p; + char url[URLMAX], m[STRMAX]; + char *c, *d, *p, *buf = NULL; Page *page; snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3", bookid, pg); @@ -31,12 +34,12 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie) if(!get("books.google.com", url, cookie, NULL, &buf)) return NULL; - snprintf(m, 80, "\"pid\":\"%s\"", pg); + snprintf(m, STRMAX, "\"pid\":\"%s\"", pg); if(!(c = strstr(buf,m))) return NULL; page = malloc(sizeof(Page)); - strncpy(page->name, pg, 80); + strncpy(page->name, pg, STRMAX); page->url[0] = '\0'; page->num = -1; @@ -68,7 +71,8 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie) int main(int argc, char *argv[]) { - char *bookid, *tmp, pg[16], buf[1024], n[80], code[3], cookie[COOKIEMAX]; + char *bookid, *tmp, *code; + char pg[STRMAX], buf[BUFSIZ], n[STRMAX], cookie[COOKIEMAX] = ""; int i, c, retry; Page *page; @@ -81,13 +85,13 @@ int main(int argc, char *argv[]) bookid = argv[argc-1]; 
if(argv[1][0] == '-') { - strncpy(code, pagecodes[0], 3); + code = pagecodes[0]; c = i = retry = 0; while(++i) { - snprintf(pg, 15, "%s%d", code, i); + snprintf(pg, STRMAX, "%s%d", code, i); if(!(page = getpagedetail(bookid, pg, cookie))) { /* no more pages with that code */ - strncpy(code, pagecodes[++c], 3); + code = pagecodes[++c]; if(code[0] == '\0') break; i=0; continue; @@ -95,7 +99,7 @@ int main(int argc, char *argv[]) if(!page->url[0]) { free(page); /* try with fresh cookie */ - if(retry < 5) { + if(retry < RETRYNUM) { get("books.google.com", "/", NULL, cookie, &tmp); free(tmp); retry++; @@ -109,11 +113,13 @@ int main(int argc, char *argv[]) retry=0; if(argv[1][1] == 'a') { if(page->num != -1) - snprintf(n, 80, "%05d.png", page->num); + snprintf(n, STRMAX, "%04d.png", page->num); + else + snprintf(n, STRMAX, "%s.png", page->name); + if(gettofile("books.google.com", page->url, cookie, NULL, n)) + fprintf(stderr, "%s failed\n", pg); else - snprintf(n, 80, "%s.png", page->name); - gettofile("books.google.com", page->url, cookie, NULL, n); - printf("Downloaded page %d\n", page->num); + printf("Downloaded page %d\n", page->num); } else { printf("%s ", page->name); if(page->num != -1) printf("%d", page->num); @@ -122,20 +128,21 @@ int main(int argc, char *argv[]) free(page); } } else { - while(fgets(buf, 1024, stdin)) { + while(fgets(buf, BUFSIZ, stdin)) { sscanf(buf, "%15s", pg); - for(retry = 0; retry < 5; retry++) { + for(retry = 0; retry < RETRYNUM; retry++) { get("books.google.com", "/", NULL, cookie, &tmp); if((page = getpagedetail(bookid, pg, cookie)) && page->url[0]) { - snprintf(n, 80, "%05d.png", page->num); - gettofile("books.google.com", page->url, cookie, NULL, n); + snprintf(n, STRMAX, "%04d.png", page->num); + if(gettofile("books.google.com", page->url, cookie, NULL, n)) + continue; printf("Downloaded page %d\n", page->num); free(page); break; } if(page) free(page); } - if(retry == 5) + if(retry == RETRYNUM) fprintf(stderr, "%s failed\n", pg); } } 
diff --git a/getgmissing.sh b/getgmissing.sh new file mode 100644 index 0000000..d936425 --- /dev/null +++ b/getgmissing.sh @@ -0,0 +1,12 @@ +#!/bin/sh +# See COPYING file for copyright and license details. + +test $# -ne 1 && echo "usage: $0 bookid" && exit + +getgbook -p $1 2>/dev/null | while read i +do + code=`echo $i|awk '{print $1}'` + num=`echo $i|awk '{print $2}'` + test -n "$num" && num=`printf '%04d' $num` || num=$code + test -f $num.png || echo $code | getgbook $1 +done diff --git a/util.c b/util.c index 21bc598..719c8ff 100644 --- a/util.c +++ b/util.c @@ -1,4 +1,4 @@ -/* See COPYING file for copyright, license and warranty details. */ +/* See COPYING file for copyright and license details. */ #include #include #include @@ -39,31 +39,30 @@ int dial(char *host, char *port) { int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf) { size_t l, res; int fd, i, p; - char h[1024] = "\0"; - char c[1024] = ""; + char h[HEADERMAX] = "\0"; + char c[COOKIEMAX] = ""; FILE *srv; - if(savecookie) savecookie[0] = 0; /* TEMP TO PLEASE GCC */ if((fd = dial(host, "80")) == -1) return 0; srv = fdopen(fd, "r+"); if(sendcookie) - snprintf(c, COOKIEMAX-1, "\r\nCookie: %s", sendcookie); - fprintf(srv, "GET %s HTTP/1.0\r\nUser-Agent: getgbook-"VERSION \ + snprintf(c, COOKIEMAX, "\r\nCookie: %s", sendcookie); + fprintf(srv, "GET %s HTTP/1.0\r\nUser-Agent: getxbook-"VERSION \ " (not mozilla)\r\nHost: %s%s\r\n\r\n", path, host, c); fflush(srv); while(h[0] != '\r') { - fgets(h, 1024, srv); + fgets(h, HEADERMAX, srv); if(sscanf(h, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200) return 0; if(savecookie != NULL && sscanf(h, "Set-Cookie: %s;", c)) - strncat(savecookie, c, COOKIEMAX-1); + strncat(savecookie, c, COOKIEMAX); } - *buf = malloc(sizeof(char *) * 4096); - for(l=0; (res = fread(*buf+l, 1, 4096, srv)) > 0; l+=res) - *buf = realloc(*buf, sizeof(char *) * (l+4096)); + *buf = malloc(sizeof(char *) * BUFSIZ); + for(l=0; (res = fread(*buf+l, 1, BUFSIZ, srv)) > 0;
l+=res) + *buf = realloc(*buf, sizeof(char *) * (l+BUFSIZ)); fclose(srv); return l; diff --git a/util.h b/util.h index 82c0a29..11677af 100644 --- a/util.h +++ b/util.h @@ -1,5 +1,6 @@ -/* See COPYING file for copyright, license and warranty details. */ +/* See COPYING file for copyright and license details. */ #define COOKIEMAX 1024 +#define HEADERMAX 1024 int dial(char *host, char *port); int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf); int gettofile(char *host, char *url, char *sendcookie, char *savecookie, char *savepath); -- cgit v1.2.3