author    | Nick White <git@njw.me.uk> | 2011-08-21 21:14:24 +0100
committer | Nick White <git@njw.me.uk> | 2011-08-21 21:14:24 +0100
commit    | 6b059ae1888b0cf8d38c7fe9b4f5c10ec28ab7b6 (patch)
tree      | 05e45a7b7f53277b6877f4c029e3d13ac45d281a
parent    | fc43d1cacbb62fd854960901688e1b9b9752e7cd (diff)
Restructure getgbook code
-rw-r--r-- | Makefile       |   2
-rw-r--r-- | TODO           |   6
-rw-r--r-- | getgbook.c     | 132
-rwxr-xr-x | getgfailed.sh  |  13
-rwxr-xr-x | getgmissing.sh |  17
5 files changed, 99 insertions, 71 deletions
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ NAME = getxbook
 SRC = getgbook.c
 LIB = util.o
-SCRIPTS = getgmissing.sh getgfailed.sh makebookpdf.sh
+SCRIPTS = makebookpdf.sh
 DOC = README COPYING LEGAL
 BIN = $(SRC:.c=)
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -31,14 +31,10 @@ have websummary.sh print the date of release, e.g.
 
 mkdir of bookid and save pages in there
 
-add cmdline arguments for stdin parsing
-
-merge pageinfo branch
-
 ### notes
 
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything.
 
 If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though.
+The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though at present getgbook can't: if a page isn't in its initial structure it won't save the url, even if it's presented.
diff --git a/getgbook.c b/getgbook.c
--- a/getgbook.c
+++ b/getgbook.c
@@ -22,10 +22,15 @@ typedef struct {
 	char cookie[COOKIEMAX];
 } Page;
 
-int getpagelist(char *bookid, Page **pages)
+Page **pages;
+int totalpages;
+char cookies[COOKIENUM][COOKIEMAX];
+char *bookid;
+
+int getpagelist()
 {
 	char url[URLMAX], m[STRMAX];
-	char *buf;
+	char *buf = NULL;
 	char *s;
 	int i;
 	Page *p;
@@ -56,13 +61,14 @@ int getpagelist(char *bookid, Page **pages)
 		}
 	}
 
+	free(buf);
 	return i;
 }
 
-int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
+int getpageurls(char *pagecode, char *cookie) {
 	char url[URLMAX], code[STRMAX], m[STRMAX];
 	char *c, *d, *p, *buf = NULL;
-	int i;
+	int i, j;
 
 	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
@@ -78,11 +84,17 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
 		if(*c == '}') {
 			break;
 		}
+		j = -1;
 		if(!strncmp(c, "\"src\"", 5)) {
-			for(i=0; i<totalpages; i++)
-				if(!strncmp(pages[i]->name, code, STRMAX))
+			for(i=0; i<totalpages; i++) {
+				if(!strncmp(pages[i]->name, code, STRMAX)) {
+					j = i;
 					break;
-			for(p=pages[i]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+				}
+			}
+			if(j == -1) /* TODO: it would be good to add new page on the end */
+				break;  /* of structure rather than throw it away. */
+			for(p=pages[j]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
 				if(!strncmp(d, "\\u0026", 6)) {
 					*p = '&';
 					d+=5;
@@ -90,7 +102,7 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
 				*p = *d;
 			}
 			strncpy(p, "&q=subject:a", 13);
-			strncpy(pages[i]->cookie, cookie, COOKIEMAX);
+			strncpy(pages[j]->cookie, cookie, COOKIEMAX);
 			break;
 		}
 	}
@@ -100,11 +112,50 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
 	return 0;
 }
 
+int getpage(Page *page)
+{
+	char path[STRMAX];
+	snprintf(path, STRMAX, "%04d.png", page->num);
+
+	if(page->url[0] == '\0') {
+		fprintf(stderr, "%s not found\n", page->name);
+		return 1;
+	}
+
+	if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) {
+		fprintf(stderr, "%s failed\n", page->name);
+		return 1;
+	}
+
+	printf("%d downloaded\n", page->num);
+	return 0;
+}
+
+void searchpage(Page *page) {
+	int i, j;
+
+	if(page->url[0] != '\0')
+		return;
+
+	for(i=0; i<COOKIENUM; i++) {
+		if(cookies[i][0] == '\0') /* dead cookie */
+			continue;
+		getpageurls(page->name, cookies[i]);
+		if(page->url[0] != '\0') {
+			/* invalidate old cookies if one succeeded */
+			for(j=0; j<i; j++)
+				cookies[j][0] = '\0';
+			break;
+		}
+	}
+}
+
 int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
-	char pgpath[STRMAX];
-	int a, i, j, totalpages;
+	char *tmp;
+	char buf[BUFSIZ], pgpath[STRMAX];
+	char in[16];
+	int a, i, n;
 	FILE *f;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
@@ -122,43 +173,54 @@ int main(int argc, char *argv[])
 
 	bookid = argv[argc-1];
 
+	pages = malloc(sizeof(*pages) * MAXPAGES);
+	for(i=0; i<MAXPAGES; i++) pages[i] = malloc(sizeof(**pages));
+	if(!(totalpages = getpagelist(bookid, pages))) {
+		fprintf(stderr, "Could not find any pages for %s\n", bookid);
+		return 1;
+	}
+
 	if(argc == 2) {
-		Page **page;
-		page = malloc(sizeof(*page) * MAXPAGES);
-		for(i=0; i<MAXPAGES; i++) page[i] = malloc(sizeof(**page));
-		if(!(totalpages = getpagelist(bookid, page))) {
-			fprintf(stderr, "Could not find any pages for %s\n", bookid);
-			return 1;
-		}
 		for(i=0; i<totalpages; i++) {
-			snprintf(pgpath, STRMAX, "%04d.png", page[i]->num);
+			snprintf(pgpath, STRMAX, "%04d.png", pages[i]->num);
 			if((f = fopen(pgpath, "r")) != NULL) {
 				fclose(f);
 				continue;
 			}
-			if(page[i]->url[0] == '\0') {
-				for(j=0; j<COOKIENUM; j++) {
-					if(cookies[j][0] == '\0') /* dead cookie */
-						continue;
-					getpageurls(bookid, page, totalpages, page[i]->name, cookies[j]);
-					if(page[i]->url[0] != '\0') {
-						/* invalidate old cookies if one succeeded */
-						for(a=0; a<j; a++)
-							cookies[a][0] = '\0';
+			searchpage(pages[i]);
+			getpage(pages[i]);
+		}
+	} else if(argv[1][0] == '-') {
+		while(fgets(buf, BUFSIZ, stdin)) {
+			sscanf(buf, "%15s", in);
+			i = -1;
+			if(argv[1][1] == 'c') {
+				for(a=0; a<totalpages; a++) {
+					if(strncmp(pages[a]->name, in, STRMAX) == 0) {
+						i = a;
+						break;
+					}
+				}
+			} else if(argv[1][1] == 'n') {
+				sscanf(in, "%d", &n);
+				for(a=0; a<totalpages; a++) {
+					if(pages[a]->num == n) {
+						i = a;
 						break;
 					}
 				}
 			}
-			if(page[i]->url[0] == '\0')
-				fprintf(stderr, "%s not found\n", page[i]->name);
-			else {
-				if(gettofile("books.google.com", page[i]->url, page[i]->cookie, NULL, pgpath))
-					fprintf(stderr, "%s failed\n", page[i]->name);
-				else
-					printf("%d downloaded\n", page[i]->num);
+			if(i == -1) {
+				fprintf(stderr, "%s not found\n", in);
+				continue;
 			}
+			searchpage(pages[i]);
+			getpage(pages[i]);
 		}
 	}
 
+	for(i=0; i<MAXPAGES; i++) free(pages[i]);
+	free(pages);
+
 	return EXIT_SUCCESS;
 }
diff --git a/getgfailed.sh b/getgfailed.sh
deleted file mode 100755
index 9ecd9e3..0000000
--- a/getgfailed.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# Tries to download each page listed in a fail log (from a
-# previous run of getgbook -a bookid > faillog)
-
-test $# -ne 2 && echo "usage: $0 bookid faillog" && exit
-
-sort < $2 | sort | shuf | head -n 5 | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	echo $code | getgbook $1
-done
diff --git a/getgmissing.sh b/getgmissing.sh
deleted file mode 100755
index e8198d8..0000000
--- a/getgmissing.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# This gets any pages listed as available that have not been
-# downloaded. Note that at present this is not too useful, as
-# an IP block will be imposed after the first x pages each run,
-# just for checking availaility.
-
-test $# -ne 1 && echo "usage: $0 bookid" && exit
-
-getgbook -p $1 2>/dev/null | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	num=`echo $i|awk '{print $2}'`
-	test -n "$num" && num=`printf '%04d' $num` || num=$code
-	test -f $num.png || echo $code | getgbook $1
-done
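The two deleted scripts drove getgbook one page at a time; after this restructuring the same work can be pushed through the new stdin modes instead (-c reads page codes, -n reads page numbers, with the book id as the last argument). Below is a rough sketch of what a getgfailed.sh replacement could look like, assuming a fail log that starts each line with the page code; the script name, the awk field choice and the 5-page cap are carried over from the deleted script rather than required by getgbook itself.

#!/bin/sh
# retryfailed.sh (hypothetical) - retry a few failed pages by feeding
# their codes to the new "getgbook -c bookid" stdin mode
test $# -ne 2 && echo "usage: $0 bookid faillog" && exit 1

# the first field of each fail log line is taken as the page code; pick
# 5 at random (as the old getgfailed.sh did) and pass them all to a
# single getgbook invocation, which matches each code against its page list
awk '{print $1}' "$2" | shuf | head -n 5 | getgbook -c "$1"

getgmissing.sh has no direct replacement in this commit, but the plain "getgbook bookid" invocation now skips any page whose NNNN.png already exists before trying to fetch it, which covers much of the same ground in one run instead of one getgbook process per page.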