author     Nick White <git@njw.me.uk>    2011-08-21 22:23:12 +0100
committer  Nick White <git@njw.me.uk>    2011-08-21 22:23:12 +0100
commit     85750ee58dea89bae829d4d30f41e83d99abc654 (patch)
tree       2093b6b28657e92bb86516d41f1335a979cce83c
parent     ba96802ba13f022047e93dfa96caddf4fff42146 (diff)
parent     6b059ae1888b0cf8d38c7fe9b4f5c10ec28ab7b6 (diff)
Merge work using pages much more efficiently
-rw-r--r--   Makefile           2
-rw-r--r--   TODO              25
-rw-r--r--   getgbook.c       277
-rwxr-xr-x   getgfailed.sh     13
-rwxr-xr-x   getgmissing.sh    17
-rw-r--r--   util.c             5
6 files changed, 189 insertions, 150 deletions
diff --git a/Makefile b/Makefile
index dbd5753..f09cdb8 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ NAME = getxbook
SRC = getgbook.c
LIB = util.o
-SCRIPTS = getgmissing.sh getgfailed.sh makebookpdf.sh
+SCRIPTS = makebookpdf.sh
DOC = README COPYING LEGAL
BIN = $(SRC:.c=)
diff --git a/TODO b/TODO
index f4903c4..eb8b65d 100644
--- a/TODO
+++ b/TODO
@@ -6,6 +6,10 @@ getbnbook
# other todos
+use wide string functions when dealing with data returned over http; it's known to be utf8
+
+bug in get(): if the \r\n\r\n after the http headers is split across recv buffers, the header/body boundary is missed (a sketch of a fix follows this file's diff)
+
use HTTP/1.1 with "Connection: close" header
try supporting 3xx in get, if it can be done in a few lines
@@ -21,23 +25,12 @@ have websummary.sh print the date of release, e.g.
## getgbook
-Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything.
+mkdir of bookid and save pages in there
-If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
+### notes
-So, if no more than 5 useable cookies can be gotten, and many more than this cause an ip block, a strategy could be to not bother getting more than 5 cookies, and bail once the 5th starts failing. of course, this doesn't address getting more pages, and moreover it doesn't address knowing which pages are available.
-
-all pages available (includes page code & order (even when not available from main click3 part) (& title sometimes, & height), though not url): curl 'http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover' | sed -e '/OC_Run\(/!d' -e 's/.*_OC_Run\({"page"://g' -e 's/}].*//g'
+Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything.
-TODO, THEN:
- at start (if in -p or -a mode), fill a Page struct (don't hold url in struct any more)
- in -a, go through Page struct, if file exists, skip, otherwise get the url for the page (don't bother about re-getting order etc). this means that getgfailed and getgmissing can go away
- in -p, just go through Page struct and print each entry
- when 5 cookies have been exhausted, quit, saying no more cookies available for now (and recommending a time period to retry)
- have -a be default, and stdin be -
+If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
- so, usage should be
- getgbook [-] bookid
- if - is given, read page codes from stdin
- otherwise, just download everything (skipping already
- downloaded pages)
+The method of getting all pages from the book webpage does miss some; they aren't all listed. These pages can often still be requested, though at present getgbook can't do so: if a page isn't in its initial structure, it won't save the url even when one is presented.
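
The recv-boundary bug noted in the TODO above can be seen in util.c's get(), which searches each recv() chunk separately for the header terminator. Below is a minimal standalone sketch, not code from this repository, of one way to avoid it: accumulate everything received into a single growing buffer and search that buffer instead. read_response() and its interface are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Read an entire HTTP response from fd into one malloc'd, NUL-terminated
 * buffer and report where the body starts via *bodyoff. Because the
 * accumulated buffer is searched for "\r\n\r\n" (not just the latest chunk),
 * a terminator split across two recv() calls is still found. */
char *read_response(int fd, size_t *bodyoff)
{
	char chunk[BUFSIZ], *buf = NULL, *tmp, *sep;
	size_t len = 0;
	ssize_t n;
	int found = 0;

	*bodyoff = 0;
	while((n = recv(fd, chunk, sizeof chunk, 0)) > 0) {
		if((tmp = realloc(buf, len + n + 1)) == NULL) {
			free(buf);
			return NULL;
		}
		buf = tmp;
		memcpy(buf + len, chunk, n);
		len += n;
		buf[len] = '\0';
		/* search the accumulated buffer, not just the latest chunk */
		if(!found && (sep = strstr(buf, "\r\n\r\n")) != NULL) {
			*bodyoff = (size_t)(sep - buf) + 4;
			found = 1;
		}
	}
	return buf;
}
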
diff --git a/getgbook.c b/getgbook.c
index cd12c86..d1d6e4a 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -5,149 +5,222 @@
#include "util.h"
#define usage "getgbook " VERSION " - a google books downloader\n" \
- "usage: getgbook [-p|-a] bookid\n" \
- " -p print all available pages\n" \
- " -a download all available pages\n" \
- " otherwise, all pages in stdin will be downloaded\n"
+ "usage: getgbook [-c|-n] bookid\n" \
+ " -c download pages from codes in stdin (TODO)\n" \
+ " -n download pages from numbers in stdin (TODO)\n" \
+ " otherwise, all available pages will be downloaded\n"
#define URLMAX 1024
#define STRMAX 1024
-#define PGCODELEN 3
-#define RETRYNUM 5
+#define MAXPAGES 9999
+#define COOKIENUM 5
typedef struct {
int num;
char url[URLMAX];
char name[STRMAX];
+ char cookie[COOKIEMAX];
} Page;
-char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
+Page **pages;
+int totalpages;
+char cookies[COOKIENUM][COOKIEMAX];
+char *bookid;
-Page *getpagedetail(char *bookid, char *pg, char *cookie)
+int getpagelist()
{
char url[URLMAX], m[STRMAX];
- char *c, *d, *p, *buf = NULL;
- Page *page;
+ char *buf = NULL;
+ char *s;
+ int i;
+ Page *p;
- snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pg);
+ snprintf(url, URLMAX, "/books?id=%s&printsec=frontcover", bookid);
- if(!get("books.google.com", url, cookie, NULL, &buf))
- return NULL;
-
- snprintf(m, STRMAX, "\"pid\":\"%s\"", pg);
- if(!(c = strstr(buf,m)))
- return NULL;
-
- page = malloc(sizeof(*page));
- strncpy(page->name, pg, STRMAX);
- page->url[0] = '\0';
- page->num = -1;
-
- if(!strncmp(c+strlen(m)+1, "\"src\"", 5)) {
- for(p=page->url, d=c+strlen(m)+8; *d && *d != '"'; d++, p++) {
- if(!strncmp(d, "\\u0026", 6)) {
- *p = '&';
- d+=5;
- } else
- *p = *d;
- }
- strncpy(p, "&q=subject:a", 12);
- *(p+12) = '\0';
- } else
- d=c;
+ if(!get("books.google.com", url, NULL, NULL, &buf))
+ return 0;
+
+ if((s = strstr(buf, "_OC_Run({\"page\":[")) == NULL)
+ return 0;
+ s+=strlen("_OC_Run({\"page\":[");
- for(; *d; d++) {
- if(*d == '}') {
+ for(i=0, p=pages[0];*s; s++) {
+ p->url[0] = '\0';
+ if(*s == ']')
break;
+ if(!strncmp(s, "\"pid\"", 5)) {
+ snprintf(m, STRMAX, "\"%%%d[^\"]\"", STRMAX-1);
+ sscanf(s+6, m, p->name);
+ for(;*s; s++) {
+ if(*s == '}')
+ break;
+ if(!strncmp(s, "\"order\"", 7))
+ sscanf(s+8, "%d,", &(p->num));
+ }
+ p=pages[++i];
}
- if(!strncmp(d, "\"order\"", 7)) {
- sscanf(d+8, "%d,", &(page->num));
+ }
+
+ free(buf);
+ return i;
+}
+
+int getpageurls(char *pagecode, char *cookie) {
+ char url[URLMAX], code[STRMAX], m[STRMAX];
+ char *c, *d, *p, *buf = NULL;
+ int i, j;
+
+ snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
+
+ if(!get("books.google.com", url, cookie, NULL, &buf))
+ return 1;
+
+ c = buf;
+ while(*c && (c = strstr(c, "\"pid\":"))) {
+ snprintf(m, STRMAX, "\"pid\":\"%%%d[^\"]\"", STRMAX-1);
+ if(!sscanf(c, m, code))
break;
+ for(; *c; c++) {
+ if(*c == '}') {
+ break;
+ }
+ j = -1;
+ if(!strncmp(c, "\"src\"", 5)) {
+ for(i=0; i<totalpages; i++) {
+ if(!strncmp(pages[i]->name, code, STRMAX)) {
+ j = i;
+ break;
+ }
+ }
+ if(j == -1) /* TODO: it would be good to add new page on the end */
+ break; /* of structure rather than throw it away. */
+ for(p=pages[j]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+ if(!strncmp(d, "\\u0026", 6)) {
+ *p = '&';
+ d+=5;
+ } else
+ *p = *d;
+ }
+ strncpy(p, "&q=subject:a", 13);
+ strncpy(pages[j]->cookie, cookie, COOKIEMAX);
+ break;
+ }
}
}
free(buf);
- return page;
+ return 0;
}
-int main(int argc, char *argv[])
+int getpage(Page *page)
{
- char *bookid, *tmp, *code;
- char pg[STRMAX], buf[BUFSIZ], n[STRMAX], cookie[COOKIEMAX] = "";
- int i, c, retry;
- Page *page;
+ char path[STRMAX];
+ snprintf(path, STRMAX, "%04d.png", page->num);
+
+ if(page->url[0] == '\0') {
+ fprintf(stderr, "%s not found\n", page->name);
+ return 1;
+ }
- if(argc < 2 || argc > 3 ||
- (argv[1][0]=='-' && ((argv[1][1]!='p' && argv[1][1]!='a') || argc < 3))) {
+ if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) {
+ fprintf(stderr, "%s failed\n", page->name);
+ return 1;
+ }
+
+ printf("%d downloaded\n", page->num);
+ return 0;
+}
+
+void searchpage(Page *page) {
+ int i, j;
+
+ if(page->url[0] != '\0')
+ return;
+
+ for(i=0; i<COOKIENUM; i++) {
+ if(cookies[i][0] == '\0') /* dead cookie */
+ continue;
+ getpageurls(page->name, cookies[i]);
+ if(page->url[0] != '\0') {
+ /* invalidate old cookies if one succeeded */
+ for(j=0; j<i; j++)
+ cookies[j][0] = '\0';
+ break;
+ }
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ char *tmp;
+ char buf[BUFSIZ], pgpath[STRMAX];
+ char in[16];
+ int a, i, n;
+ FILE *f;
+
+ if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
+ || (argv[1][1] != 'c' && argv[1][1] != 'n')))
+ || (argc >= 2 && argv[1][0] == '-' && argv[1][1] == 'h')) {
fputs(usage, stdout);
return 1;
}
+ /* get cookies */
+ for(i=0;i<COOKIENUM;i++) {
+ if(get("books.google.com", "/", NULL, cookies[i], &tmp))
+ free(tmp);
+ }
+
bookid = argv[argc-1];
- if(argv[1][0] == '-') {
- code = pagecodes[0];
- c = i = retry = 0;
- while(++i) {
- snprintf(pg, STRMAX, "%s%d", code, i);
- if(!(page = getpagedetail(bookid, pg, cookie))) {
- /* no more pages with that code */
- code = pagecodes[++c];
- if(code[0] == '\0') break;
- i=0;
- continue;
- }
- if(!page->url[0]) {
- free(page);
- /* try with fresh cookie */
- if(retry < RETRYNUM) {
- get("books.google.com", "/", NULL, cookie, &tmp);
- free(tmp);
- retry++;
- i--;
- } else {
- fprintf(stderr, "%s not available\n", pg);
- retry=0;
- }
+ pages = malloc(sizeof(*pages) * MAXPAGES);
+ for(i=0; i<MAXPAGES; i++) pages[i] = malloc(sizeof(**pages));
+ if(!(totalpages = getpagelist(bookid, pages))) {
+ fprintf(stderr, "Could not find any pages for %s\n", bookid);
+ return 1;
+ }
+
+ if(argc == 2) {
+ for(i=0; i<totalpages; i++) {
+ snprintf(pgpath, STRMAX, "%04d.png", pages[i]->num);
+ if((f = fopen(pgpath, "r")) != NULL) {
+ fclose(f);
continue;
}
- retry=0;
- if(argv[1][1] == 'a') {
- if(page->num != -1)
- snprintf(n, STRMAX, "%04d.png", page->num);
- else
- snprintf(n, STRMAX, "%s.png", page->name);
- if(gettofile("books.google.com", page->url, cookie, NULL, n))
- fprintf(stderr, "%s failed\n", pg);
- else
- printf("Downloaded page %d\n", page->num);
- } else {
- printf("%s ", page->name);
- if(page->num != -1) printf("%d", page->num);
- printf("\n");
- fflush(stdout);
- }
- free(page);
+ searchpage(pages[i]);
+ getpage(pages[i]);
}
- } else {
+ } else if(argv[1][0] == '-') {
while(fgets(buf, BUFSIZ, stdin)) {
- sscanf(buf, "%15s", pg);
- for(retry = 0; retry < RETRYNUM; retry++) {
- get("books.google.com", "/", NULL, cookie, &tmp);
- if((page = getpagedetail(bookid, pg, cookie)) && page->url[0]) {
- snprintf(n, STRMAX, "%04d.png", page->num);
- if(gettofile("books.google.com", page->url, cookie, NULL, n))
- continue;
- printf("Downloaded page %d\n", page->num);
- free(page);
- break;
+ sscanf(buf, "%15s", in);
+ i = -1;
+ if(argv[1][1] == 'c') {
+ for(a=0; a<totalpages; a++) {
+ if(strncmp(pages[a]->name, in, STRMAX) == 0) {
+ i = a;
+ break;
+ }
+ }
+ } else if(argv[1][1] == 'n') {
+ sscanf(in, "%d", &n);
+ for(a=0; a<totalpages; a++) {
+ if(pages[a]->num == n) {
+ i = a;
+ break;
+ }
}
- if(page) free(page);
}
- if(retry == RETRYNUM)
- fprintf(stderr, "%s failed\n", pg);
+ if(i == -1) {
+ fprintf(stderr, "%s not found\n", in);
+ continue;
+ }
+ searchpage(pages[i]);
+ getpage(pages[i]);
}
}
+ for(i=0; i<MAXPAGES; i++) free(pages[i]);
+ free(pages);
+
return EXIT_SUCCESS;
}
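
The in-code TODO in getpageurls() above notes that a page returned by the click3 request but missing from the initial page list is currently thrown away. The following is a sketch of the appending approach it suggests, assuming the pages, totalpages and MAXPAGES globals from the new getgbook.c; addpage() is a hypothetical helper, not part of the commit.

/* Hypothetical helper (not in the commit): append a page code that
 * getpagelist() did not list, so getpageurls() can store its url instead of
 * discarding it. All MAXPAGES entries are already allocated in main(), so
 * the next free slot can simply be claimed. */
static Page *addpage(const char *code)
{
	Page *p;

	if(totalpages >= MAXPAGES)
		return NULL;
	p = pages[totalpages];
	strncpy(p->name, code, STRMAX);
	p->name[STRMAX-1] = '\0';
	p->num = totalpages;  /* no "order" known; fall back to list position */
	p->url[0] = '\0';
	p->cookie[0] = '\0';
	totalpages++;
	return p;
}

getpageurls() could then call addpage(code) where it currently breaks on j == -1 and fill in the new entry's url as usual.
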
diff --git a/getgfailed.sh b/getgfailed.sh
deleted file mode 100755
index 9ecd9e3..0000000
--- a/getgfailed.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# Tries to download each page listed in a fail log (from a
-# previous run of getgbook -a bookid > faillog)
-
-test $# -ne 2 && echo "usage: $0 bookid faillog" && exit
-
-sort < $2 | sort | shuf | head -n 5 | while read i
-do
- code=`echo $i|awk '{print $1}'`
- echo $code | getgbook $1
-done
diff --git a/getgmissing.sh b/getgmissing.sh
deleted file mode 100755
index e8198d8..0000000
--- a/getgmissing.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# This gets any pages listed as available that have not been
-# downloaded. Note that at present this is not too useful, as
-# an IP block will be imposed after the first x pages each run,
-# just for checking availability.
-
-test $# -ne 1 && echo "usage: $0 bookid" && exit
-
-getgbook -p $1 2>/dev/null | while read i
-do
- code=`echo $i|awk '{print $1}'`
- num=`echo $i|awk '{print $2}'`
- test -n "$num" && num=`printf '%04d' $num` || num=$code
- test -f $num.png || echo $code | getgbook $1
-done
diff --git a/util.c b/util.c
index e030be6..04e80b4 100644
--- a/util.c
+++ b/util.c
@@ -56,6 +56,7 @@ int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf)
char c[COOKIEMAX] = "";
char t[BUFSIZ];
char *t2;
+ char m[256];
if((fd = dial(host, "80")) == -1) return 0;
@@ -67,10 +68,12 @@ int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf)
*buf = NULL;
l = 0;
+ snprintf(m, 256, "Set-Cookie: %%%ds;", COOKIEMAX-1);
while((res = recv(fd, t, BUFSIZ, 0)) > 0) {
if(sscanf(t, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200)
return 0;
- if(savecookie != NULL && sscanf(t, "Set-Cookie: %s;", c))
+ if(savecookie != NULL &&
+ (t2 = strstr(t, "Set-Cookie: ")) != NULL && sscanf(t2, m, c))
strncat(savecookie, c, COOKIEMAX);
if((t2 = strstr(t, "\r\n\r\n")) != NULL) {
t2+=4;
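
For reference, the Set-Cookie change above relies on building the sscanf format at run time so the conversion width always matches COOKIEMAX. Here is a standalone illustration of that pattern; it is not repository code, and the header string and COOKIEMAX value are made up for the example.

#include <stdio.h>
#include <string.h>

#define COOKIEMAX 512  /* stand-in for the value in util.h */

int main(void)
{
	const char *hdr = "HTTP/1.0 200 OK\r\nSet-Cookie: NID=abc123; path=/\r\n";
	char m[256], c[COOKIEMAX] = "";
	char *t2;

	/* snprintf expands "%%%ds" to "%511s", i.e. a width of COOKIEMAX-1,
	 * so sscanf can never write past the end of c. %s stops at the first
	 * whitespace, so the captured value keeps its trailing ';'. */
	snprintf(m, sizeof m, "Set-Cookie: %%%ds;", COOKIEMAX-1);
	if((t2 = strstr(hdr, "Set-Cookie: ")) != NULL && sscanf(t2, m, c))
		printf("cookie: %s\n", c);  /* prints "cookie: NID=abc123;" */
	return 0;
}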