From 2c87bf5e7c229e7c7c85bb6bc0cae03e989ad388 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 17 Jul 2011 19:26:33 +0100
Subject: Roll over all page types, and thereby remove need for page total code

---
 TODO       |  7 ------
 getgbook.c | 73 +++++++++++++++++++++++---------------------------------------
 2 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/TODO b/TODO
index d8e96a6..2ee5e68 100644
--- a/TODO
+++ b/TODO
@@ -1,5 +1,3 @@
-order is available even for pages that don't have src. check this, and if so rewrite getpagedetails to set it even if no src (rather than just bailing)
-
 use order to be able to use real page numbers (getpagecode)
   to find this advance through click3 letter by letter until either } (none found) or strcmp "order"
 
@@ -11,8 +9,3 @@ to be fast and efficient it's best to crank through all the json 1st, filling in
 could consider making a json reading module, ala confoo, to make ad-hoc memory structures from json
 
 to be super fast we could have 2 threads, one filling the pages structs and one consuming them. this would complicate the code rather, though
-
-NOTE: there's now a new api that returns json.
-it requires https, which we don't yet support.
-https://www.googleapis.com/books/v1/volumes?q=isbn:1589235126
-https://www.googleapis.com/books/v1/volumes/jglfL_eVG4cC
diff --git a/getgbook.c b/getgbook.c
index df744eb..1442e8e 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -19,26 +19,7 @@ typedef struct {
 	char name[80];
 } Page;
 
-int gettotalpages(char *bookid)
-{
-	char url[URLMAX];
-	char *buf, *c;
-	int total;
-
-	snprintf(url, URLMAX, "/books/feeds/volumes/%s", bookid);
-
-	bookid = malloc(sizeof(char *) * BOOKID_LEN);
-
-	if(!get("books.google.com", url, &buf))
-		return 0;
-
-	if(!(c = strstr(buf," pages")))
-		return 0;
-	while(*c && *c != '>') c--;
-	sscanf(c+1, "%d ", &total);
-
-	return total;
-}
+char pagecodes[][3] = { "PP", "PR", "PA", "PT", "\0" };
 
 Page *getpagedetail(char *bookid, char *pg)
 {
@@ -58,20 +39,18 @@ Page *getpagedetail(char *bookid, char *pg)
 	page = malloc(sizeof(Page));
 	strncpy(page->name, pg, 80);
 	page->url[0] = '\0';
-	page->num = 0;
+	page->num = -1;
 
-	if(strncmp(c+strlen(m)+1, "\"src\"", 5)) {
-		free(buf); return page;
-	}
-
-	for(p=page->url, d=c+strlen(m)+8; *d && *d != '"'; d++, p++) {
-		if(!strncmp(d, "\\u0026", 6)) {
-			*p = '&';
-			d+=5;
-		} else
-			*p = *d;
+	if(!strncmp(c+strlen(m)+1, "\"src\"", 5)) {
+		for(p=page->url, d=c+strlen(m)+8; *d && *d != '"'; d++, p++) {
+			if(!strncmp(d, "\\u0026", 6)) {
+				*p = '&';
+				d+=5;
+			} else
+				*p = *d;
+		}
+		*p = '\0';
 	}
-	*p = '\0';
 
 	for(; *d; d++) {
 		if(*d == '}') {
@@ -89,8 +68,8 @@
 
 int main(int argc, char *argv[])
 {
-	char *bookid, pg[16], buf[1024], n[80];
-	int totalpages, i;
+	char *bookid, pg[16], buf[1024], n[80], code[3];
+	int i, c;
 	Page *page;
 
 	if(argc < 2 || argc > 3 ||
@@ -100,17 +79,19 @@ int main(int argc, char *argv[])
 	bookid = argv[argc-1];
 
 	if(argv[1][0] == '-') {
-		/* note this isn't the best way, not least because it misses the
-		 * non PA pages. best is to crawl around the json grabbing everything
-		 * available, by starting on PP1, and filling in by going through
-		 * all pages in totalpages. */
-		if(!(totalpages = gettotalpages(bookid)))
-			die("Book has no pages\n");
-
-		for(i=1; i<=totalpages; i++) {
-			snprintf(pg, 16, "%s%d", "PA", i);
-			if(!(page = getpagedetail(bookid, pg)) || !page->url[0]) {
-				fprintf(stderr, "%s failed\n", pg);
+		strncpy(code, pagecodes[0], 3);
+		c = i =0;
+		while(++i) {
+			snprintf(pg, 16, "%s%d", code, i);
+			if(!(page = getpagedetail(bookid, pg))) {
+				/* no more pages with that code */
+				strncpy(code, pagecodes[++c], 3);
+				if(code[0] == '\0') break;
+				i=0;
+				continue;
+			}
+			if(!page->url[0]) {
+				fprintf(stderr, "%s not available\n", pg);
 				free(page);
 				continue;
 			}
@@ -118,7 +99,7 @@
 			snprintf(n, 80, "%05d.png", page->num);
 			gettofile("books.google.com", page->url, n);
 			printf("Downloaded page %d\n", page->num);
-		} else
+		} else if(page->num != -1)
 			printf("%d\n", page->num);
 		free(page);
 	}
-- 
cgit v1.2.3
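
For illustration, a minimal standalone sketch of the rollover loop this patch
introduces. It reuses the patch's pagecodes table; page_exists() is a
hypothetical stand-in for the real getpagedetail() call, which needs the HTTP
helpers from getgbook.c. A failed lookup is taken to mean "no more pages with
this prefix", so the counter restarts at 1 on the next code, and the
empty-string sentinel at the end of pagecodes ends the loop.

	/* sketch: iterate PP1, PP2, ... until a code runs out of pages,
	 * then roll over to the next code, stopping after "PT" */
	#include <stdio.h>
	#include <string.h>

	char pagecodes[][3] = { "PP", "PR", "PA", "PT", "\0" };

	/* hypothetical stand-in for getpagedetail(): pretend only
	 * PP1-PP4 and PA1-PA9 exist */
	int page_exists(const char *pg)
	{
		int n;
		if(sscanf(pg, "PP%d", &n) == 1) return n <= 4;
		if(sscanf(pg, "PA%d", &n) == 1) return n <= 9;
		return 0;
	}

	int main(void)
	{
		char code[3], pg[16];
		int c, i;

		strncpy(code, pagecodes[0], 3);
		c = i = 0;
		while(++i) {
			snprintf(pg, 16, "%s%d", code, i);
			if(!page_exists(pg)) {
				/* no more pages with that code; next code,
				 * counter back to 1 on the next ++i */
				strncpy(code, pagecodes[++c], 3);
				if(code[0] == '\0') break;
				i = 0;
				continue;
			}
			printf("would fetch %s\n", pg);
		}
		return 0;
	}

Against the stub data this prints PP1 through PP4 and then PA1 through PA9,
which is why the commit no longer needs a page-total lookup: exhaustion of
each prefix is detected directly.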
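The reworked getpagedetail() hunk also moves the src-URL copy inside the
strncmp test, so a page without a "src" key keeps its empty URL (and, with
num now initialised to -1, an absent order is detectable) instead of bailing
early. The copy itself decodes the JSON escape \u0026 back to '&'. A small
sketch of just that decoding step, using a made-up input string:

	/* sketch: copy a JSON-escaped src value up to the closing quote,
	 * translating the escape sequence \u0026 into a literal '&' */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *src = "http://books.google.com/x?id=1\\u0026pg=PA2\"rest";
		char buf[128], *p;
		const char *d;

		for(p = buf, d = src; *d && *d != '"'; d++, p++) {
			if(!strncmp(d, "\\u0026", 6)) {
				*p = '&';
				d += 5;	/* skip the remainder of the escape */
			} else
				*p = *d;
		}
		*p = '\0';
		printf("%s\n", buf);	/* http://books.google.com/x?id=1&pg=PA2 */
		return 0;
	}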