From 3afe70f3cd0a19465ef9f8bbaf6a0961d9eb6d3a Mon Sep 17 00:00:00 2001
From: Nick White
Date: Wed, 17 Aug 2011 18:57:02 +0100
Subject: Started rewrite (not there yet)

---
 TODO       |  2 ++
 getgbook.c | 56 +++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/TODO b/TODO
index 558b8d8..8a1deb7 100644
--- a/TODO
+++ b/TODO
@@ -4,6 +4,8 @@ getabook
 
 getbnbook
 
+use "" rather than "\0" in headermax
+
 # other todos
 
 use HTTP/1.1 with "Connection: close" header

diff --git a/getgbook.c b/getgbook.c
index 5f0a0ae..52eb82a 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -5,15 +5,15 @@
 #include "util.h"
 
 #define usage "getgbook " VERSION " - a google books downloader\n" \
-              "usage: getgbook [-p|-a] bookid\n" \
-              "  -p	print all available pages\n" \
-              "  -a	download all available pages\n" \
-              "  otherwise, all pages in stdin will be downloaded\n"
+              "usage: getgbook [-] bookid\n" \
+              "  -	download pages from stdin\n" \
+              "  otherwise, all available pages will be downloaded\n"
 
 #define URLMAX 1024
 #define STRMAX 1024
 #define PGCODELEN 3
 #define RETRYNUM 5
+#define COOKIENUM 5
 
 typedef struct {
 	int num;
@@ -23,6 +23,13 @@ typedef struct {
 
 char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
 
+int getpagelist(char *bookid, Page *pages)
+{
+	/* TODO */
+	/*http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover*/
+	return 1;
+}
+
 Page *getpagedetail(char *bookid, char *pg, char *cookie)
 {
 	char url[URLMAX], m[STRMAX];
@@ -71,20 +78,48 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie)
 
 int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, *code;
+	char *bookid, *tmp, *code, cookies[COOKIENUM][COOKIEMAX];
 	char pg[STRMAX], buf[BUFSIZ], n[STRMAX], cookie[COOKIEMAX] = "";
 	int i, c, retry;
-	Page *page;
 
-	if(argc < 2 || argc > 3 ||
-	   (argv[1][0]=='-' && ((argv[1][1]!='p' && argv[1][1]!='a') || argc < 3))) {
+	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
 		fputs(usage, stdout);
 		return 1;
 	}
 
+	/* get cookies */
+	for(i=0;i<COOKIENUM;i++) {
-- 
cgit v1.2.3
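The rest of this patch is lost in this copy: everything from the body of the new cookie loop to the end of the hunk is missing. Judging from the context lines of the next patch, which fixes exactly this loop, the priming code it introduced plausibly looked like the sketch below; the exact lines are an assumption, not recovered text.

	/* get cookies: fetch the front page once per slot so that get()
	 * stores each Set-Cookie value into cookies[i]; the page body
	 * returned in tmp is discarded */
	for(i=0;i<COOKIENUM;i++) {
		get("books.google.com", "/", NULL, cookies[i], &tmp);
		free(tmp);	/* hazard: if get() fails (e.g. the site
				   blocks the request), tmp was never set
				   and this free can segfault */
	}

The unconditional free of a possibly-unset pointer is the "segfault if site blocked" bug the following commit addresses.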
From: Nick White
Date: Thu, 18 Aug 2011 23:08:25 +0100
Subject: Fix cookie bug causing segfault if site blocked

---
 getgbook.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index 52eb82a..f944d9f 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -89,8 +89,8 @@ int main(int argc, char *argv[])
 
 	/* get cookies */
 	for(i=0;i<COOKIENUM;i++) {
-- 
cgit v1.2.3

From: Nick White
Date: Thu, 18 Aug 2011 23:08:57 +0100
Subject: getpagelist working, though no further functionality yet

---
 getgbook.c | 118 +++++++++++++++++++++----------------------------------------
 1 file changed, 41 insertions(+), 77 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index f944d9f..a9c0165 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -23,11 +23,40 @@ typedef struct {
 
 char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
 
-int getpagelist(char *bookid, Page *pages)
+int getpagelist(char *bookid, Page **pages)
 {
-	/* TODO */
-	/*http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover*/
-	return 1;
+	char url[URLMAX];
+	char *buf;
+	char *s;
+	int i;
+	Page *p;
+
+	snprintf(url, URLMAX, "/books?id=%s&printsec=frontcover", bookid);
+
+	if(!get("books.google.com", url, NULL, NULL, &buf))
+		return -1;
+
+	if((s = strstr(buf, "_OC_Run({\"page\":[")) == NULL)
+		return -1;
+	s+=strlen("_OC_Run({\"page\":[");
+
+	for(i=0, p=pages[0];*s; s++) {
+		if(*s == ']')
+			break;
+		if(!strncmp(s, "\"pid\"", 5)) {
+			sscanf(s+6, "\"%[^\"]\",", p->name);
+			for(;*s; s++) {
+				if(*s == '}')
+					break;
+				if(!strncmp(s, "\"order\"", 7)) {
+					sscanf(s+8, "%d,", &(p->num));
+				}
+			}
+			p=pages[++i];
+		}
+	}
+
+	return i;
 }
 
 Page *getpagedetail(char *bookid, char *pg, char *cookie)
@@ -78,9 +107,8 @@ int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, *code, cookies[COOKIENUM][COOKIEMAX];
-	char pg[STRMAX], buf[BUFSIZ], n[STRMAX], cookie[COOKIEMAX] = "";
-	int i, c, retry;
+	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
+	int i, a;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
 		fputs(usage, stdout);
@@ -111,79 +139,15 @@ int main(int argc, char *argv[])
 	   - maybe: when retry is 5, quit as it looks like we won't get anything more from any cookies
 	*/
 
-	Page page[10000];
-	if(!getpagelist(bookid, page)) {
+	Page **page;
+	page = malloc(sizeof(Page) * 1000);
+	for(i=0; i<1000; i++) page[i] = malloc(sizeof(*page));
+	if(!(i = getpagelist(bookid, page))) {
 		fprintf(stderr, "Could not find pages for %s\n", bookid);
 		return 1;
 	}
-
-
-
-	/* OLD CODE */
-	code = pagecodes[0];
-	c = i = retry = 0;
-	while(++i) {
-		snprintf(pg, STRMAX, "%s%d", code, i);
-		if(!(page = getpagedetail(bookid, pg, cookie))) {
-			/* no more pages with that code */
-			code = pagecodes[++c];
-			if(code[0] == '\0') break;
-			i=0;
-			continue;
-		}
-		if(!page->url[0]) {
-			free(page);
-			/* try with fresh cookie */
-			if(retry < RETRYNUM) {
-				get("books.google.com", "/", NULL, cookie, &tmp);
-				free(tmp);
-				retry++;
-				i--;
-			} else {
-				fprintf(stderr, "%s not available\n", pg);
-				retry=0;
-			}
-			continue;
-		}
-		retry=0;
-		if(argv[1][1] == 'a') {
-			if(page->num != -1)
-				snprintf(n, STRMAX, "%04d.png", page->num);
-			else
-				snprintf(n, STRMAX, "%s.png", page->name);
-			if(gettofile("books.google.com", page->url, cookie, NULL, n))
-				fprintf(stderr, "%s failed\n", pg);
-			else
-				printf("Downloaded page %d\n", page->num);
-		} else {
-			printf("%s ", page->name);
-			if(page->num != -1) printf("%d", page->num);
-			printf("\n");
-			fflush(stdout);
-		}
-		free(page);
-	}
-	} else {
-		/* download pages from stdin */
-		/* TODO: rewrite using cookies as above */
-		Page *page;
-		while(fgets(buf, BUFSIZ, stdin)) {
-			sscanf(buf, "%15s", pg);
-			for(retry = 0; retry < RETRYNUM; retry++) {
-				get("books.google.com", "/", NULL, cookie, &tmp);
-				if((page = getpagedetail(bookid, pg, cookie)) && page->url[0]) {
-					snprintf(n, STRMAX, "%04d.png", page->num);
-					if(gettofile("books.google.com", page->url, cookie, NULL, n))
-						continue;
-					printf("Downloaded page %d\n", page->num);
-					free(page);
-					break;
-				}
-				if(page) free(page);
-			}
-			if(retry == RETRYNUM)
-				fprintf(stderr, "%s failed\n", pg);
-		}
+	for(a=0; a<i; a++) printf("%s %d\n", page[a]->name, page[a]->num);
 	}
 
 	return EXIT_SUCCESS;
-- 
cgit v1.2.3
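The new getpagelist scans the javascript blob google embeds in the book's front-cover page instead of making one click3 request per page. The technique is plain byte-walking with strncmp/sscanf anchors over the "_OC_Run({"page":[...]" array. A self-contained illustration of the same idea, runnable as-is; the input string is invented, the real blob comes back from /books?id=...&printsec=frontcover, and the %1023[ width anticipates the bounds-tightening a later patch in this series applies:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[] = "_OC_Run({\"page\":[{\"pid\":\"PP1\",\"order\":1},"
		             "{\"pid\":\"PA3\",\"order\":5}]";
		char name[1024];
		int num = -1;
		char *s;

		if((s = strstr(buf, "_OC_Run({\"page\":[")) == NULL)
			return 1;
		s += strlen("_OC_Run({\"page\":[");

		for(; *s; s++) {
			if(*s == ']')	/* end of the page array */
				break;
			if(!strncmp(s, "\"pid\"", 5)) {
				/* s+6 skips "pid": and lands on the value */
				sscanf(s+6, "\"%1023[^\"]\",", name);
				/* scan the rest of this object for "order" */
				for(; *s && *s != '}'; s++)
					if(!strncmp(s, "\"order\"", 7))
						sscanf(s+8, "%d,", &num);
				printf("%s %d\n", name, num);
			}
		}
		return 0;
	}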
From 496e3d7171044a381b91a9a077e7ecc53cd19f97 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Thu, 18 Aug 2011 23:37:25 +0100
Subject: Fix memory allocation bugs

---
 getgbook.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index a9c0165..29f7a19 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -140,8 +140,8 @@ int main(int argc, char *argv[])
 
 	Page **page;
-	page = malloc(sizeof(Page) * 1000);
-	for(i=0; i<1000; i++) page[i] = malloc(sizeof(*page));
+	page = malloc(sizeof(*page) * 1000);
+	for(i=0; i<1000; i++) page[i] = malloc(sizeof(**page));
 	if(!(i = getpagelist(bookid, page))) {
 		fprintf(stderr, "Could not find pages for %s\n", bookid);
 		return 1;
-- 
cgit v1.2.3

From 12c8ed14117a68358f4b680c1fe1f08d61970fec Mon Sep 17 00:00:00 2001
From: Nick White
Date: Fri, 19 Aug 2011 00:50:30 +0100
Subject: Simplify a teeny bit

---
 getgbook.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index 29f7a19..d685279 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -48,9 +48,8 @@ int getpagelist(char *bookid, Page **pages)
 			for(;*s; s++) {
 				if(*s == '}')
 					break;
-				if(!strncmp(s, "\"order\"", 7)) {
+				if(!strncmp(s, "\"order\"", 7))
 					sscanf(s+8, "%d,", &(p->num));
-				}
 			}
 			p=pages[++i];
 		}
-- 
cgit v1.2.3
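The allocation fix above is worth a gloss, because the two sizeof expressions read alike. sizeof(*page) is the size of a Page pointer, right for each slot of the pointer array; sizeof(**page) is the size of a whole Page, right for each element. The buggy version handed each element only pointer-sized storage and then wrote a multi-kilobyte struct through it, a heap overflow. A compact, runnable restatement, with a stand-in Page type:

	#include <stdlib.h>

	typedef struct { int num; char name[1024]; } Page;	/* stand-in */

	int main(void)
	{
		Page **page;
		int i;

		/* array of 1000 Page pointers, then one Page per slot */
		page = malloc(sizeof(*page) * 1000);
		for(i = 0; i < 1000; i++)
			page[i] = malloc(sizeof(**page));

		for(i = 0; i < 1000; i++)
			free(page[i]);
		free(page);
		return 0;
	}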
From b686e031da6622b329d43c361b196eda46ea5154 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 16:53:55 +0100
Subject: Get all pages works reasonably

---
 getgbook.c | 124 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 65 insertions(+), 59 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index d685279..5f1e381 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -5,24 +5,22 @@
 #include "util.h"
 
 #define usage "getgbook " VERSION " - a google books downloader\n" \
-              "usage: getgbook [-] bookid\n" \
-              "  -	download pages from stdin\n" \
+              "usage: getgbook [-c|-n] bookid\n" \
+              "  -c	download pages from codes in stdin (TODO)\n" \
+              "  -n	download pages from numbers in stdin (TODO)\n" \
               "  otherwise, all available pages will be downloaded\n"
 
 #define URLMAX 1024
 #define STRMAX 1024
-#define PGCODELEN 3
-#define RETRYNUM 5
 #define COOKIENUM 5
 
 typedef struct {
 	int num;
 	char url[URLMAX];
 	char name[STRMAX];
+	char cookie[COOKIEMAX];
 } Page;
 
-char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
-
 int getpagelist(char *bookid, Page **pages)
 {
 	char url[URLMAX];
@@ -41,6 +39,7 @@ int getpagelist(char *bookid, Page **pages)
 	s+=strlen("_OC_Run({\"page\":[");
 
 	for(i=0, p=pages[0];*s; s++) {
+		p->url[0] = '\0';
 		if(*s == ']')
 			break;
 		if(!strncmp(s, "\"pid\"", 5)) {
@@ -58,56 +57,52 @@ int getpagelist(char *bookid, Page **pages)
 	return i;
 }
 
-Page *getpagedetail(char *bookid, char *pg, char *cookie)
-{
-	char url[URLMAX], m[STRMAX];
+int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
+	char url[URLMAX], code[STRMAX];
 	char *c, *d, *p, *buf = NULL;
-	Page *page;
+	int i;
 
-	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pg);
+	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
 
 	if(!get("books.google.com", url, cookie, NULL, &buf))
-		return NULL;
-
-	snprintf(m, STRMAX, "\"pid\":\"%s\"", pg);
-	if(!(c = strstr(buf,m)))
-		return NULL;
-
-	page = malloc(sizeof(*page));
-	strncpy(page->name, pg, STRMAX);
-	page->url[0] = '\0';
-	page->num = -1;
-
-	if(!strncmp(c+strlen(m)+1, "\"src\"", 5)) {
-		for(p=page->url, d=c+strlen(m)+8; *d && *d != '"'; d++, p++) {
-			if(!strncmp(d, "\\u0026", 6)) {
-				*p = '&';
-				d+=5;
-			} else
-				*p = *d;
-		}
-		strncpy(p, "&q=subject:a", 12);
-	} else
-		d=c;
+		return 1;
 
-	for(; *d; d++) {
-		if(*d == '}') {
-			break;
-		}
-		if(!strncmp(d, "\"order\"", 7)) {
-			sscanf(d+8, "%d,", &(page->num));
+	c = buf;
+	while(*c && (c = strstr(c, "\"pid\":"))) {
+		if(!sscanf(c, "\"pid\":\"%[^\"]\"", code))
+			break;
+		for(; *c; c++) {
+			if(*c == '}') {
+				break;
+			}
+			if(!strncmp(c, "\"src\"", 5)) {
+				for(i=0; i<totalpages; i++)
+					if(!strncmp(pages[i]->name, code, STRMAX))
+						break;
+				for(p=pages[i]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+					if(!strncmp(d, "\\u0026", 6)) {
+						*p = '&';
+						d+=5;
+					} else
+						*p = *d;
+				}
+				strncpy(p, "&q=subject:a", 13);
+				strncpy(pages[i]->cookie, cookie, COOKIEMAX);
+				break;
+			}
 		}
 	}
 
 	free(buf);
-	return page;
+	return 0;
 }
 
 int main(int argc, char *argv[])
 {
 	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
-	int i, a;
+	char pgpath[STRMAX];
+	int a, i, j, totalpages;
+	FILE *f;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
 		fputs(usage, stdout);
@@ -123,30 +118,41 @@ int main(int argc, char *argv[])
 	bookid = argv[argc-1];
 
 	if(argc == 2) {
-		/* download all pages */
-		/* - fill page struct with names & nums
-		 * - loop through each struct
-		 * - if there's not a file matching num, try downloading, if dl failure, try with a different cookie */
-		/* - cookie management:
-		   - use up to 5 cookies. (number might change)
-		     complexity comes with a page which is not available; that shouldn't cause us to use up all the cookies
-		     so:
-		   - save 5 cookies immediately
-		   - use first until it fails
-		   - then use next. if it succeeds, drop previous. if not, try next, etc. if all failed, don't drop any, and continue to next page, and +1 to retry
-		   - maybe: when retry is 5, quit as it looks like we won't get anything more from any cookies
-		*/
-
 		Page **page;
 		page = malloc(sizeof(*page) * 1000);
 		for(i=0; i<1000; i++) page[i] = malloc(sizeof(**page));
-		if(!(i = getpagelist(bookid, page))) {
+		if(!(totalpages = getpagelist(bookid, page))) {
 			fprintf(stderr, "Could not find pages for %s\n", bookid);
 			return 1;
 		}
-		for(a=0; a<i; a++) printf("%s %d\n", page[a]->name, page[a]->num);
+		for(i=0; i<totalpages; i++) {
+			snprintf(pgpath, STRMAX, "%04d.png", page[i]->num);
+			if((f = fopen(pgpath, "r")) != NULL) {
+				fclose(f);
+				continue;
+			}
+			if(page[i]->url[0] == '\0') {
+				for(j=0; j<COOKIENUM; j++) {
+					getpageurls(bookid, page, totalpages, page[i]->name, cookies[j]);
+					if(page[i]->url[0] != '\0') {
+						/* invalidate old cookies if one succeeded */
+						for(a=0; a<j; a++)
+							cookies[a][0] = '\0';
+						break;
+					}
+				}
+			}
+			if(page[i]->url[0] == '\0')
+				fprintf(stderr, "%s not found\n", page[i]->name);
+			else {
+				if(gettofile("books.google.com", page[i]->url, page[i]->cookie, NULL, pgpath))
+					fprintf(stderr, "%s failed\n", page[i]->name);
+				else
+					printf("%d downloaded\n", page[i]->num);
+			}
+		}
 	}
 
 	return EXIT_SUCCESS;
-- 
cgit v1.2.3
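getpageurls copies each image url out of the click3 response while decoding the \u0026 escapes google uses for '&'. The decoding loop in isolation, runnable as-is with an invented sample url (in the C source, "\\u0026" is the six literal bytes \u0026):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char src[] = "/books/content?id=X&pg=PA3\\u0026img=1\\u0026zoom=3";
		char url[1024], *p, *d;

		/* copy until the closing quote, collapsing each literal
		 * \u0026 sequence to a single '&' */
		for(p = url, d = src; *d && *d != '"'; d++, p++) {
			if(!strncmp(d, "\\u0026", 6)) {
				*p = '&';
				d += 5;	/* the loop's d++ steps over the final '6' */
			} else
				*p = *d;
		}
		*p = '\0';
		puts(url);	/* /books/content?id=X&pg=PA3&img=1&zoom=3 */
		return 0;
	}

Note also the companion fix in this patch: the "&q=subject:a" suffix is now copied with strncpy(p, ..., 13) rather than 12, so the terminating NUL is included.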
From be77fe85042dfcc4a943c4c979ba7b990d6a124f Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 17:00:00 +0100
Subject: Tighten sscanf usage, add TODOs

---
 TODO       | 24 ++++++++----------------
 getgbook.c | 10 ++++++----
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/TODO b/TODO
index 8a1deb7..4703148 100644
--- a/TODO
+++ b/TODO
@@ -8,6 +8,10 @@ use "" rather than "\0" in headermax
 
 # other todos
 
+use wide string functions when dealing with stuff returned over http; it's known utf8
+
+bug in get(): if the \r\n\r\n after http headers is cut off between recv buffers
+
 use HTTP/1.1 with "Connection: close" header
 
 try supporting 3xx in get, if it can be done in a few lines
@@ -25,23 +29,11 @@ have websummary.sh print the date of release, e.g.
 
 ## getgbook
 
+mkdir of bookid and save pages in there
+
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything. If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-So, if no more than 5 useable cookies can be gotten, and many more than this cause an ip block, a strategy could be to not bother getting more than 5 cookies, and bail once the 5th starts failing. of course, this doesn't address getting more pages, and moreover it doesn't address knowing which pages are available.
-
-all pages available (includes page code & order (even when not available from main click3 part) (& title sometimes, & height), though not url): curl 'http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover' | sed -e '/OC_Run\(/!d' -e 's/.*_OC_Run\({"page"://g' -e 's/}].*//g'
-
-TODO, THEN:
-at start (if in -p or -a mode), fill a Page struct (don't hold url in struct any more)
-in -a, go through Page struct, if file exists, skip, otherwise get the url for the page (don't bother about re-getting order etc). this means that getgfailed and getgmissing can go away
-in -p, just go through Page struct and print each entry
-when 5 cookies have been exhausted, quit, saying no more cookies available for now (and recommending a time period to retry)
-have -a be default, and stdin be -
-
-so, usage should be
-getgbook [-] bookid
-if - is given, read page codes from stdin
-otherwise, just download everything (skipping already
-downloaded pages)
+NOTE!!: the method of getting all pages from book page does miss some; they aren't all listed
+* these pages can often be requested, though

diff --git a/getgbook.c b/getgbook.c
index 5f1e381..62faf46 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -23,7 +23,7 @@ typedef struct {
 
 int getpagelist(char *bookid, Page **pages)
 {
-	char url[URLMAX];
+	char url[URLMAX], m[STRMAX];
 	char *buf;
 	char *s;
 	int i;
@@ -43,7 +43,8 @@ int getpagelist(char *bookid, Page **pages)
 		if(*s == ']')
 			break;
 		if(!strncmp(s, "\"pid\"", 5)) {
-			sscanf(s+6, "\"%[^\"]\",", p->name);
+			snprintf(m, STRMAX, "\"%%%d[^\"]\"", STRMAX-1);
+			sscanf(s+6, m, p->name);
 			for(;*s; s++) {
 				if(*s == '}')
 					break;
@@ -58,7 +59,7 @@ int getpagelist(char *bookid, Page **pages)
 }
 
 int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
-	char url[URLMAX], code[STRMAX];
+	char url[URLMAX], code[STRMAX], m[STRMAX];
 	char *c, *d, *p, *buf = NULL;
 	int i;
 
@@ -69,7 +70,8 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 
 	c = buf;
 	while(*c && (c = strstr(c, "\"pid\":"))) {
-		if(!sscanf(c, "\"pid\":\"%[^\"]\"", code))
+		snprintf(m, STRMAX, "\"pid\":\"%%%d[^\"]\"", STRMAX-1);
+		if(!sscanf(c, m, code))
 			break;
 		for(; *c; c++) {
 			if(*c == '}') {
-- 
cgit v1.2.3

From df8c5735b2d71374385baf288e13d6e88a17840a Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 17:11:43 +0100
Subject: More sscanf tightening

---
 util.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/util.c b/util.c
index 6e341a5..4d2b04c 100644
--- a/util.c
+++ b/util.c
@@ -41,6 +41,7 @@ int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf)
 	int fd, i, p;
 	char h[HEADERMAX] = "\0";
 	char c[COOKIEMAX] = "";
+	char m[256];
 	FILE *srv;
 
 	if((fd = dial(host, "80")) == -1) return 0;
@@ -52,11 +53,13 @@ int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf)
 	        " (not mozilla)\r\nHost: %s%s\r\n\r\n", path, host, c);
 	fflush(srv);
 
+	snprintf(m, 256, "Set-Cookie: %%%ds;", COOKIEMAX-1);
+
 	while(h[0] != '\r') {
 		if(!fgets(h, HEADERMAX, srv))
 			return 0;
 		if(sscanf(h, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200)
 			return 0;
-		if(savecookie != NULL && sscanf(h, "Set-Cookie: %s;", c))
+		if(savecookie != NULL && sscanf(h, m, c))
 			strncat(savecookie, c, COOKIEMAX);
 	}
-- 
cgit v1.2.3
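Both patches above close the same hole: a %s or %[...] conversion without a field width lets sscanf write past the destination buffer. The width has to be a literal inside the format string, so the format itself is built with snprintf first. The pattern in isolation, runnable as-is with invented buffer sizes:

	#include <stdio.h>

	int main(void)
	{
		char m[32], c[64];

		/* build "Set-Cookie: %63s;" so sscanf can never write more
		 * than 63 characters plus the NUL into c[64] */
		snprintf(m, sizeof m, "Set-Cookie: %%%ds;", (int)sizeof c - 1);

		if(sscanf("Set-Cookie: ID=abc123; path=/", m, c))
			puts(c);	/* prints "ID=abc123;" - %s stops at
					   whitespace, not at the ';' */
		return 0;
	}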
From 0fedff7492d97609cdfc5a02a883bdfd693f4dbb Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 17:11:54 +0100
Subject: Set max pages explicitly

---
 getgbook.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index 62faf46..e60316f 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -12,6 +12,7 @@
 
 #define URLMAX 1024
 #define STRMAX 1024
+#define MAXPAGES 9999
 #define COOKIENUM 5
 
 typedef struct {
@@ -121,8 +122,8 @@ int main(int argc, char *argv[])
 
 	if(argc == 2) {
 		Page **page;
-		page = malloc(sizeof(*page) * 1000);
-		for(i=0; i<1000; i++) page[i] = malloc(sizeof(**page));
+		page = malloc(sizeof(*page) * MAXPAGES);
+		for(i=0; i<MAXPAGES; i++) page[i] = malloc(sizeof(**page));
-- 
cgit v1.2.3

From: Nick White
Date: Sun, 21 Aug 2011 17:22:35 +0100
Subject: Fix reporting of no pages available

---
 TODO       | 9 +++++++--
 getgbook.c | 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/TODO b/TODO
index 4703148..4eb35e4 100644
--- a/TODO
+++ b/TODO
@@ -31,9 +31,14 @@ have websummary.sh print the date of release, e.g.
 
 mkdir of bookid and save pages in there
 
+add cmdline arguments for stdin parsing
+
+merge pageinfo branch
+
+### notes
+
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything. If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-NOTE!!: the method of getting all pages from book page does miss some; they aren't all listed
-* these pages can often be requested, though
+The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though.

diff --git a/getgbook.c b/getgbook.c
index e60316f..3fbdf47 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -33,10 +33,10 @@ int getpagelist(char *bookid, Page **pages)
 	snprintf(url, URLMAX, "/books?id=%s&printsec=frontcover", bookid);
 
 	if(!get("books.google.com", url, NULL, NULL, &buf))
-		return -1;
+		return 0;
 
 	if((s = strstr(buf, "_OC_Run({\"page\":[")) == NULL)
-		return -1;
+		return 0;
 	s+=strlen("_OC_Run({\"page\":[");
 
 	for(i=0, p=pages[0];*s; s++) {
@@ -125,7 +125,7 @@ int main(int argc, char *argv[])
 		page = malloc(sizeof(*page) * MAXPAGES);
 		for(i=0; i<MAXPAGES; i++) page[i] = malloc(sizeof(**page));
-- 
cgit v1.2.3

From: Nick White
Date: Sun, 21 Aug 2011 17:58:28 +0100
Subject: Fix usage printing

---
 getgbook.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/getgbook.c b/getgbook.c
index 3fbdf47..8073194 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -107,7 +107,9 @@ int main(int argc, char *argv[])
 	int a, i, j, totalpages;
 	FILE *f;
 
-	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
+	if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
+	   || (argv[1][1] != 'c' && argv[1][1] != 'n')))
+	   || (argc >= 2 && argv[1][0] == '-' && argv[1][1] == 'h')) {
 		fputs(usage, stdout);
 		return 1;
 	}
-- 
cgit v1.2.3
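The return-value change in "Fix reporting of no pages available" is subtle enough to spell out: the caller tests failure with !, and !(-1) is 0 in C, so a -1 failure sentinel from a count-returning function sails straight past the check and the error message never prints. A runnable illustration, with hypothetical stand-ins for the pre- and post-patch behaviour:

	#include <stdio.h>

	static int getpagelist_old(void) { return -1; }	/* failure, old style */
	static int getpagelist_new(void) { return 0; }	/* failure, new style */

	int main(void)
	{
		int n;

		if(!(n = getpagelist_old()))
			puts("old: no pages found");	/* never runs: !(-1) is 0 */
		if(!(n = getpagelist_new()))
			puts("new: no pages found");	/* runs as intended */
		return 0;
	}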
From 6b059ae1888b0cf8d38c7fe9b4f5c10ec28ab7b6 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 21:14:24 +0100
Subject: Restructure getgbook code

---
 Makefile       |   2 +-
 TODO           |   6 +--
 getgbook.c     | 132 ++++++++++++++++++++++++++++++++++++++++---------------
 getgfailed.sh  |  13 ------
 getgmissing.sh |  17 --------
 5 files changed, 99 insertions(+), 71 deletions(-)
 delete mode 100755 getgfailed.sh
 delete mode 100755 getgmissing.sh

diff --git a/Makefile b/Makefile
index 4ec498d..d7e947c 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ NAME = getxbook
 SRC = getgbook.c
 LIB = util.o
 
-SCRIPTS = getgmissing.sh getgfailed.sh makebookpdf.sh
+SCRIPTS = makebookpdf.sh
 
 DOC = README COPYING LEGAL
 BIN = $(SRC:.c=)

diff --git a/TODO b/TODO
index 4eb35e4..6b08e9f 100644
--- a/TODO
+++ b/TODO
@@ -31,14 +31,10 @@ have websummary.sh print the date of release, e.g.
 
 mkdir of bookid and save pages in there
 
-add cmdline arguments for stdin parsing
-
-merge pageinfo branch
-
 ### notes
 
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything. If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though.
+The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though at present getgbook can't: if a page isn't in its initial structure it won't save the url, even if it's presented.

diff --git a/getgbook.c b/getgbook.c
index 8073194..d1d6e4a 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -22,10 +22,15 @@ typedef struct {
 	char cookie[COOKIEMAX];
 } Page;
 
-int getpagelist(char *bookid, Page **pages)
+Page **pages;
+int totalpages;
+char cookies[COOKIENUM][COOKIEMAX];
+char *bookid;
+
+int getpagelist()
 {
 	char url[URLMAX], m[STRMAX];
-	char *buf;
+	char *buf = NULL;
 	char *s;
 	int i;
 	Page *p;
@@ -56,13 +61,14 @@ int getpagelist(char *bookid, Page **pages)
 		}
 	}
 
+	free(buf);
 	return i;
 }
 
-int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
+int getpageurls(char *pagecode, char *cookie) {
 	char url[URLMAX], code[STRMAX], m[STRMAX];
 	char *c, *d, *p, *buf = NULL;
-	int i;
+	int i, j;
 
 	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
 
@@ -78,11 +84,17 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 			if(*c == '}') {
 				break;
 			}
+			j = -1;
 			if(!strncmp(c, "\"src\"", 5)) {
-				for(i=0; i<totalpages; i++)
-					if(!strncmp(pages[i]->name, code, STRMAX))
+				for(i=0; i<totalpages; i++) {
+					if(!strncmp(pages[i]->name, code, STRMAX)) {
+						j = i;
 						break;
-				for(p=pages[i]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+					}
+				}
+				if(j == -1)	/* TODO: it would be good to add new page on the end */
+					break;	/* of structure rather than throw it away. */
+				for(p=pages[j]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
 					if(!strncmp(d, "\\u0026", 6)) {
 						*p = '&';
 						d+=5;
@@ -90,7 +102,7 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 						*p = *d;
 				}
 				strncpy(p, "&q=subject:a", 13);
-				strncpy(pages[i]->cookie, cookie, COOKIEMAX);
+				strncpy(pages[j]->cookie, cookie, COOKIEMAX);
 				break;
 			}
 		}
@@ -100,11 +112,50 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 
 	free(buf);
 	return 0;
 }
 
+int getpage(Page *page)
+{
+	char path[STRMAX];
+	snprintf(path, STRMAX, "%04d.png", page->num);
+
+	if(page->url[0] == '\0') {
+		fprintf(stderr, "%s not found\n", page->name);
+		return 1;
+	}
+
+	if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) {
+		fprintf(stderr, "%s failed\n", page->name);
+		return 1;
+	}
+
+	printf("%d downloaded\n", page->num);
+	return 0;
+}
+
+void searchpage(Page *page) {
+	int i, j;
+
+	if(page->url[0] != '\0')
+		return;
+
+	for(i=0; i<COOKIENUM; i++) {
+		getpageurls(page->name, cookies[i]);
+		if(page->url[0] != '\0') {
+			/* invalidate old cookies if one succeeded */
+			for(j=0; j<i; j++)
+				cookies[j][0] = '\0';
+			break;
+		}
+	}
+}
+
 int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
+	char *tmp;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
@@ -122,43 +173,54 @@ int main(int argc, char *argv[])
 
 	bookid = argv[argc-1];
 
+	pages = malloc(sizeof(*pages) * MAXPAGES);
+	for(i=0; i<MAXPAGES; i++) pages[i] = malloc(sizeof(**pages));
 	if(argc == 2) {
 		for(i=0; i<totalpages; i++) {
-			snprintf(pgpath, STRMAX, "%04d.png", page[i]->num);
 			snprintf(pgpath, STRMAX, "%04d.png", pages[i]->num);
 			if((f = fopen(pgpath, "r")) != NULL) {
 				fclose(f);
 				continue;
 			}
-			if(page[i]->url[0] == '\0') {
-				for(j=0; j<COOKIENUM; j++) {
-					getpageurls(bookid, page, totalpages, page[i]->name, cookies[j]);
-					if(page[i]->url[0] != '\0') {
-						/* invalidate old cookies if one succeeded */
-						for(a=0; a<j; a++)
-							cookies[a][0] = '\0';
-						break;
-					}
-				}
-			}
-			if(page[i]->url[0] == '\0')
-				fprintf(stderr, "%s not found\n", page[i]->name);
-			else {
-				if(gettofile("books.google.com", page[i]->url, page[i]->cookie, NULL, pgpath))
-					fprintf(stderr, "%s failed\n", page[i]->name);
-				else
-					printf("%d downloaded\n", page[i]->num);
-			}
+			searchpage(pages[i]);
+			getpage(pages[i]);
+		}
+	} else {
+		/* download pages from stdin */
+		char in[STRMAX];
+		int n;
+		while(fgets(in, STRMAX, stdin)) {
+			i = -1;
+			if(argv[1][1] == 'c') {
+				for(a=0; a<totalpages; a++) {
+					if(strncmp(pages[a]->name, in, STRMAX) == 0) {
+						i = a;
+						break;
+					}
+				}
+			} else if(argv[1][1] == 'n') {
+				sscanf(in, "%d", &n);
+				for(a=0; a<totalpages; a++) {
+					if(pages[a]->num == n) {
+						i = a;
+						break;
+					}
+				}
+			}
+			if(i == -1) {
+				fprintf(stderr, "%s not found\n", in);
+				continue;
+			}
+			searchpage(pages[i]);
+			getpage(pages[i]);
 		}
 	}
 
+	for(i=0; i<totalpages; i++)
+		free(pages[i]);
+
 	return EXIT_SUCCESS;

diff --git a/getgfailed.sh b/getgfailed.sh
deleted file mode 100755
--- a/getgfailed.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# Download pages from a fail log (getgbook -a bookid 2> faillog)
-
-test $# -ne 2 && echo "usage: $0 bookid faillog" && exit
-
-sort < $2 | sort | shuf | head -n 5 | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	echo $code | getgbook $1
-done

diff --git a/getgmissing.sh b/getgmissing.sh
deleted file mode 100755
index e8198d8..0000000
--- a/getgmissing.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# This gets any pages listed as available that have not been
-# downloaded. Note that at present this is not too useful, as
-# an IP block will be imposed after the first x pages each run,
-# just for checking availaility.
-
-test $# -ne 1 && echo "usage: $0 bookid" && exit
-
-getgbook -p $1 2>/dev/null | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	num=`echo $i|awk '{print $2}'`
-	test -n "$num" && num=`printf '%04d' $num` || num=$code
-	test -f $num.png || echo $code | getgbook $1
-done
-- 
cgit v1.2.3
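The one structural TODO this series leaves open is "mkdir of bookid and save pages in there". With the globals and helpers the final patch introduces, it slots in naturally right after the argument check; the sketch below is an assumption, not code from this series, and enterbookdir is a hypothetical helper name:

	#include <errno.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/types.h>
	#include <unistd.h>

	/* hypothetical helper: create the per-book directory if it
	 * doesn't already exist, then chdir into it so the %04d.png
	 * pages land inside it */
	int enterbookdir(char *bookid)
	{
		if(mkdir(bookid, 0777) == -1 && errno != EEXIST) {
			perror(bookid);
			return 1;
		}
		return chdir(bookid) == -1;
	}

Called once before the download loop, this would also let repeated runs of getgbook resume a partial book cleanly, since the existing-file check already skips pages that are present on disk.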