From 3afe70f3cd0a19465ef9f8bbaf6a0961d9eb6d3a Mon Sep 17 00:00:00 2001
From: Nick White
Date: Wed, 17 Aug 2011 18:57:02 +0100
Subject: Started rewrite (not there yet)

---
 TODO       |  2 ++
 getgbook.c | 56 +++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/TODO b/TODO
index 558b8d8..8a1deb7 100644
--- a/TODO
+++ b/TODO
@@ -4,6 +4,8 @@ getabook
 
 getbnbook
 
+use "" rather than "\0" in headermax
+
 # other todos
 
 use HTTP/1.1 with "Connection: close" header

diff --git a/getgbook.c b/getgbook.c
index 5f0a0ae..52eb82a 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -5,15 +5,15 @@
 #include "util.h"
 
 #define usage "getgbook " VERSION " - a google books downloader\n" \
-              "usage: getgbook [-p|-a] bookid\n" \
-              "  -p	print all available pages\n" \
-              "  -a	download all available pages\n" \
-              "  otherwise, all pages in stdin will be downloaded\n"
+              "usage: getgbook [-] bookid\n" \
+              "  -	download pages from stdin\n" \
+              "  otherwise, all available pages will be downloaded\n"
 
 #define URLMAX 1024
 #define STRMAX 1024
 #define PGCODELEN 3
 #define RETRYNUM 5
+#define COOKIENUM 5
 
 typedef struct {
 	int num;
@@ -23,6 +23,13 @@ typedef struct {
 
 char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
 
+int getpagelist(char *bookid, Page *pages)
+{
+	/* TODO */
+	/*http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover*/
+	return 1;
+}
+
 Page *getpagedetail(char *bookid, char *pg, char *cookie)
 {
 	char url[URLMAX], m[STRMAX];
@@ -71,20 +78,48 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie)
 
 int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, *code;
+	char *bookid, *tmp, *code, cookies[COOKIENUM][COOKIEMAX];
 	char pg[STRMAX], buf[BUFSIZ], n[STRMAX], cookie[COOKIEMAX] = "";
 	int i, c, retry;
-	Page *page;
 
-	if(argc < 2 || argc > 3 ||
-	   (argv[1][0]=='-' && ((argv[1][1]!='p' && argv[1][1]!='a') || argc < 3))) {
+	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
 		fputs(usage, stdout);
 		return 1;
 	}
 
+	/* get cookies */
+	for(i=0;i<COOKIENUM;i++) {
-- 
cgit v1.2.3
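The rest of this patch is lost in this copy: everything from the body of the new cookie loop to the end of the hunk is missing. Judging from the context lines of the next patch, which fixes exactly this loop, the priming code it introduced plausibly looked like the sketch below; the exact lines are an assumption, not recovered text.

	/* get cookies: fetch the front page once per slot so that get()
	 * stores each Set-Cookie value into cookies[i]; the page body
	 * returned in tmp is discarded */
	for(i=0;i<COOKIENUM;i++) {
		get("books.google.com", "/", NULL, cookies[i], &tmp);
		free(tmp);	/* hazard: if get() fails (e.g. the site
				   blocks the request), tmp was never set
				   and this free can segfault */
	}

The unconditional free of a possibly-unset pointer is the "segfault if site blocked" bug the following commit addresses.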
From: Nick White
Date: Thu, 18 Aug 2011 23:08:25 +0100
Subject: Fix cookie bug causing segfault if site blocked

---
 getgbook.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index 52eb82a..f944d9f 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -89,8 +89,8 @@ int main(int argc, char *argv[])
 
 	/* get cookies */
 	for(i=0;i<COOKIENUM;i++) {
-- 
cgit v1.2.3

From: Nick White
Date: Thu, 18 Aug 2011 23:08:57 +0100
Subject: getpagelist working, though no further functionality yet

---
 getgbook.c | 118 +++++++++++++++++++++----------------------------------------
 1 file changed, 41 insertions(+), 77 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index f944d9f..a9c0165 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -23,11 +23,40 @@ typedef struct {
 
 char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
 
-int getpagelist(char *bookid, Page *pages)
+int getpagelist(char *bookid, Page **pages)
 {
-	/* TODO */
-	/*http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover*/
-	return 1;
+	char url[URLMAX];
+	char *buf;
+	char *s;
+	int i;
+	Page *p;
+
+	snprintf(url, URLMAX, "/books?id=%s&printsec=frontcover", bookid);
+
+	if(!get("books.google.com", url, NULL, NULL, &buf))
+		return -1;
+
+	if((s = strstr(buf, "_OC_Run({\"page\":[")) == NULL)
+		return -1;
+	s+=strlen("_OC_Run({\"page\":[");
+
+	for(i=0, p=pages[0];*s; s++) {
+		if(*s == ']')
+			break;
+		if(!strncmp(s, "\"pid\"", 5)) {
+			sscanf(s+6, "\"%[^\"]\",", p->name);
+			for(;*s; s++) {
+				if(*s == '}')
+					break;
+				if(!strncmp(s, "\"order\"", 7)) {
+					sscanf(s+8, "%d,", &(p->num));
+				}
+			}
+			p=pages[++i];
+		}
+	}
+
+	return i;
 }
 
 Page *getpagedetail(char *bookid, char *pg, char *cookie)
@@ -78,9 +107,8 @@ int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, *code, cookies[COOKIENUM][COOKIEMAX];
-	char pg[STRMAX], buf[BUFSIZ], n[STRMAX], cookie[COOKIEMAX] = "";
-	int i, c, retry;
+	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
+	int i, a;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
 		fputs(usage, stdout);
@@ -111,79 +139,15 @@ int main(int argc, char *argv[])
 	   - maybe: when retry is 5, quit as it looks like we won't get anything more from any cookies
 	*/
 
-	Page page[10000];
-	if(!getpagelist(bookid, page)) {
+	Page **page;
+	page = malloc(sizeof(Page) * 1000);
+	for(i=0; i<1000; i++) page[i] = malloc(sizeof(*page));
+	if(!(i = getpagelist(bookid, page))) {
 		fprintf(stderr, "Could not find pages for %s\n", bookid);
 		return 1;
 	}
-
-
-
-	/* OLD CODE */
-	code = pagecodes[0];
-	c = i = retry = 0;
-	while(++i) {
-		snprintf(pg, STRMAX, "%s%d", code, i);
-		if(!(page = getpagedetail(bookid, pg, cookie))) {
-			/* no more pages with that code */
-			code = pagecodes[++c];
-			if(code[0] == '\0') break;
-			i=0;
-			continue;
-		}
-		if(!page->url[0]) {
-			free(page);
-			/* try with fresh cookie */
-			if(retry < RETRYNUM) {
-				get("books.google.com", "/", NULL, cookie, &tmp);
-				free(tmp);
-				retry++;
-				i--;
-			} else {
-				fprintf(stderr, "%s not available\n", pg);
-				retry=0;
-			}
-			continue;
-		}
-		retry=0;
-		if(argv[1][1] == 'a') {
-			if(page->num != -1)
-				snprintf(n, STRMAX, "%04d.png", page->num);
-			else
-				snprintf(n, STRMAX, "%s.png", page->name);
-			if(gettofile("books.google.com", page->url, cookie, NULL, n))
-				fprintf(stderr, "%s failed\n", pg);
-			else
-				printf("Downloaded page %d\n", page->num);
-		} else {
-			printf("%s ", page->name);
-			if(page->num != -1) printf("%d", page->num);
-			printf("\n");
-			fflush(stdout);
-		}
-		free(page);
-	}
-	} else {
-		/* download pages from stdin */
-		/* TODO: rewrite using cookies as above */
-		Page *page;
-		while(fgets(buf, BUFSIZ, stdin)) {
-			sscanf(buf, "%15s", pg);
-			for(retry = 0; retry < RETRYNUM; retry++) {
-				get("books.google.com", "/", NULL, cookie, &tmp);
-				if((page = getpagedetail(bookid, pg, cookie)) && page->url[0]) {
-					snprintf(n, STRMAX, "%04d.png", page->num);
-					if(gettofile("books.google.com", page->url, cookie, NULL, n))
-						continue;
-					printf("Downloaded page %d\n", page->num);
-					free(page);
-					break;
-				}
-				if(page) free(page);
-			}
-			if(retry == RETRYNUM)
-				fprintf(stderr, "%s failed\n", pg);
-		}
+	for(a=0; a<i; a++) printf("%s %d\n", page[a]->name, page[a]->num);
 	}
 
 	return EXIT_SUCCESS;
-- 
cgit v1.2.3
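The new getpagelist scans the javascript blob google embeds in the book's front-cover page instead of making one click3 request per page. The technique is plain byte-walking with strncmp/sscanf anchors over the "_OC_Run({"page":[...]" array. A self-contained illustration of the same idea, runnable as-is; the input string is invented, the real blob comes back from /books?id=...&printsec=frontcover, and the %1023[ width anticipates the bounds-tightening a later patch in this series applies:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[] = "_OC_Run({\"page\":[{\"pid\":\"PP1\",\"order\":1},"
		             "{\"pid\":\"PA3\",\"order\":5}]";
		char name[1024];
		int num = -1;
		char *s;

		if((s = strstr(buf, "_OC_Run({\"page\":[")) == NULL)
			return 1;
		s += strlen("_OC_Run({\"page\":[");

		for(; *s; s++) {
			if(*s == ']')	/* end of the page array */
				break;
			if(!strncmp(s, "\"pid\"", 5)) {
				/* s+6 skips "pid": and lands on the value */
				sscanf(s+6, "\"%1023[^\"]\",", name);
				/* scan the rest of this object for "order" */
				for(; *s && *s != '}'; s++)
					if(!strncmp(s, "\"order\"", 7))
						sscanf(s+8, "%d,", &num);
				printf("%s %d\n", name, num);
			}
		}
		return 0;
	}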
From 496e3d7171044a381b91a9a077e7ecc53cd19f97 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Thu, 18 Aug 2011 23:37:25 +0100
Subject: Fix memory allocation bugs

---
 getgbook.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index a9c0165..29f7a19 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -140,8 +140,8 @@ int main(int argc, char *argv[])
 
 	Page **page;
-	page = malloc(sizeof(Page) * 1000);
-	for(i=0; i<1000; i++) page[i] = malloc(sizeof(*page));
+	page = malloc(sizeof(*page) * 1000);
+	for(i=0; i<1000; i++) page[i] = malloc(sizeof(**page));
 	if(!(i = getpagelist(bookid, page))) {
 		fprintf(stderr, "Could not find pages for %s\n", bookid);
 		return 1;
-- 
cgit v1.2.3

From 12c8ed14117a68358f4b680c1fe1f08d61970fec Mon Sep 17 00:00:00 2001
From: Nick White
Date: Fri, 19 Aug 2011 00:50:30 +0100
Subject: Simplify a teeny bit

---
 getgbook.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index 29f7a19..d685279 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -48,9 +48,8 @@ int getpagelist(char *bookid, Page **pages)
 			for(;*s; s++) {
 				if(*s == '}')
 					break;
-				if(!strncmp(s, "\"order\"", 7)) {
+				if(!strncmp(s, "\"order\"", 7))
 					sscanf(s+8, "%d,", &(p->num));
-				}
 			}
 			p=pages[++i];
 		}
-- 
cgit v1.2.3
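The allocation fix above is worth a gloss, because the two sizeof expressions read alike. sizeof(*page) is the size of a Page pointer, right for each slot of the pointer array; sizeof(**page) is the size of a whole Page, right for each element. The buggy version handed each element only pointer-sized storage and then wrote a multi-kilobyte struct through it, a heap overflow. A compact, runnable restatement, with a stand-in Page type:

	#include <stdlib.h>

	typedef struct { int num; char name[1024]; } Page;	/* stand-in */

	int main(void)
	{
		Page **page;
		int i;

		/* array of 1000 Page pointers, then one Page per slot */
		page = malloc(sizeof(*page) * 1000);
		for(i = 0; i < 1000; i++)
			page[i] = malloc(sizeof(**page));

		for(i = 0; i < 1000; i++)
			free(page[i]);
		free(page);
		return 0;
	}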
From b686e031da6622b329d43c361b196eda46ea5154 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 16:53:55 +0100
Subject: Get all pages works reasonably

---
 getgbook.c | 124 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 65 insertions(+), 59 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index d685279..5f1e381 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -5,24 +5,22 @@
 #include "util.h"
 
 #define usage "getgbook " VERSION " - a google books downloader\n" \
-              "usage: getgbook [-] bookid\n" \
-              "  -	download pages from stdin\n" \
+              "usage: getgbook [-c|-n] bookid\n" \
+              "  -c	download pages from codes in stdin (TODO)\n" \
+              "  -n	download pages from numbers in stdin (TODO)\n" \
               "  otherwise, all available pages will be downloaded\n"
 
 #define URLMAX 1024
 #define STRMAX 1024
-#define PGCODELEN 3
-#define RETRYNUM 5
 #define COOKIENUM 5
 
 typedef struct {
 	int num;
 	char url[URLMAX];
 	char name[STRMAX];
+	char cookie[COOKIEMAX];
 } Page;
 
-char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
-
 int getpagelist(char *bookid, Page **pages)
 {
 	char url[URLMAX];
@@ -41,6 +39,7 @@ int getpagelist(char *bookid, Page **pages)
 	s+=strlen("_OC_Run({\"page\":[");
 
 	for(i=0, p=pages[0];*s; s++) {
+		p->url[0] = '\0';
 		if(*s == ']')
 			break;
 		if(!strncmp(s, "\"pid\"", 5)) {
@@ -58,56 +57,52 @@ int getpagelist(char *bookid, Page **pages)
 	return i;
 }
 
-Page *getpagedetail(char *bookid, char *pg, char *cookie)
-{
-	char url[URLMAX], m[STRMAX];
+int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
+	char url[URLMAX], code[STRMAX];
 	char *c, *d, *p, *buf = NULL;
-	Page *page;
+	int i;
 
-	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pg);
+	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
 
 	if(!get("books.google.com", url, cookie, NULL, &buf))
-		return NULL;
-
-	snprintf(m, STRMAX, "\"pid\":\"%s\"", pg);
-	if(!(c = strstr(buf,m)))
-		return NULL;
-
-	page = malloc(sizeof(*page));
-	strncpy(page->name, pg, STRMAX);
-	page->url[0] = '\0';
-	page->num = -1;
-
-	if(!strncmp(c+strlen(m)+1, "\"src\"", 5)) {
-		for(p=page->url, d=c+strlen(m)+8; *d && *d != '"'; d++, p++) {
-			if(!strncmp(d, "\\u0026", 6)) {
-				*p = '&';
-				d+=5;
-			} else
-				*p = *d;
-		}
-		strncpy(p, "&q=subject:a", 12);
-	} else
-		d=c;
+		return 1;
 
-	for(; *d; d++) {
-		if(*d == '}') {
-			break;
-		}
-		if(!strncmp(d, "\"order\"", 7)) {
-			sscanf(d+8, "%d,", &(page->num));
+	c = buf;
+	while(*c && (c = strstr(c, "\"pid\":"))) {
+		if(!sscanf(c, "\"pid\":\"%[^\"]\"", code))
+			break;
+		for(; *c; c++) {
+			if(*c == '}') {
+				break;
+			}
+			if(!strncmp(c, "\"src\"", 5)) {
+				for(i=0; i<totalpages; i++)
+					if(!strncmp(pages[i]->name, code, STRMAX))
+						break;
+				for(p=pages[i]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+					if(!strncmp(d, "\\u0026", 6)) {
+						*p = '&';
+						d+=5;
+					} else
+						*p = *d;
+				}
+				strncpy(p, "&q=subject:a", 13);
+				strncpy(pages[i]->cookie, cookie, COOKIEMAX);
+				break;
+			}
 		}
 	}
 
 	free(buf);
-	return page;
+	return 0;
 }
 
 int main(int argc, char *argv[])
 {
 	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
-	int i, a;
+	char pgpath[STRMAX];
+	int a, i, j, totalpages;
+	FILE *f;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
 		fputs(usage, stdout);
@@ -123,30 +118,41 @@ int main(int argc, char *argv[])
 	bookid = argv[argc-1];
 
 	if(argc == 2) {
-		/* download all pages */
-		/* - fill page struct with names & nums
-		 * - loop through each struct
-		 * - if there's not a file matching num, try downloading, if dl failure, try with a different cookie */
-		/* - cookie management:
-		   - use up to 5 cookies. (number might change)
-		     complexity comes with a page which is not available; that shouldn't cause us to use up all the cookies
-		     so:
-		   - save 5 cookies immediately
-		   - use first until it fails
-		   - then use next. if it succeeds, drop previous. if not, try next, etc. if all failed, don't drop any, and continue to next page, and +1 to retry
-		   - maybe: when retry is 5, quit as it looks like we won't get anything more from any cookies
-		*/
-
 		Page **page;
 		page = malloc(sizeof(*page) * 1000);
 		for(i=0; i<1000; i++) page[i] = malloc(sizeof(**page));
-		if(!(i = getpagelist(bookid, page))) {
+		if(!(totalpages = getpagelist(bookid, page))) {
 			fprintf(stderr, "Could not find pages for %s\n", bookid);
 			return 1;
 		}
-		for(a=0; a<i; a++) printf("%s %d\n", page[a]->name, page[a]->num);
+		for(i=0; i<totalpages; i++) {
+			snprintf(pgpath, STRMAX, "%04d.png", page[i]->num);
+			if((f = fopen(pgpath, "r")) != NULL) {
+				fclose(f);
+				continue;
+			}
+			if(page[i]->url[0] == '\0') {
+				for(j=0; j<COOKIENUM; j++) {
+					getpageurls(bookid, page, totalpages, page[i]->name, cookies[j]);
+					if(page[i]->url[0] != '\0') {
+						/* invalidate old cookies if one succeeded */
+						for(a=0; a<j; a++)
+							cookies[a][0] = '\0';
+						break;
+					}
+				}
+			}
+			if(page[i]->url[0] == '\0')
+				fprintf(stderr, "%s not found\n", page[i]->name);
+			else {
+				if(gettofile("books.google.com", page[i]->url, page[i]->cookie, NULL, pgpath))
+					fprintf(stderr, "%s failed\n", page[i]->name);
+				else
+					printf("%d downloaded\n", page[i]->num);
+			}
+		}
 	}
 
 	return EXIT_SUCCESS;
-- 
cgit v1.2.3
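getpageurls copies each image url out of the click3 response while decoding the \u0026 escapes google uses for '&'. The decoding loop in isolation, runnable as-is with an invented sample url (in the C source, "\\u0026" is the six literal bytes \u0026):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char src[] = "/books/content?id=X&pg=PA3\\u0026img=1\\u0026zoom=3";
		char url[1024], *p, *d;

		/* copy until the closing quote, collapsing each literal
		 * \u0026 sequence to a single '&' */
		for(p = url, d = src; *d && *d != '"'; d++, p++) {
			if(!strncmp(d, "\\u0026", 6)) {
				*p = '&';
				d += 5;	/* the loop's d++ steps over the final '6' */
			} else
				*p = *d;
		}
		*p = '\0';
		puts(url);	/* /books/content?id=X&pg=PA3&img=1&zoom=3 */
		return 0;
	}

Note also the companion fix in this patch: the "&q=subject:a" suffix is now copied with strncpy(p, ..., 13) rather than 12, so the terminating NUL is included.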
From be77fe85042dfcc4a943c4c979ba7b990d6a124f Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 17:00:00 +0100
Subject: Tighten sscanf usage, add TODOs

---
 TODO       | 24 ++++++++----------------
 getgbook.c | 10 ++++++----
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/TODO b/TODO
index 8a1deb7..4703148 100644
--- a/TODO
+++ b/TODO
@@ -8,6 +8,10 @@ use "" rather than "\0" in headermax
 
 # other todos
 
+use wide string functions when dealing with stuff returned over http; it's known utf8
+
+bug in get(): if the \r\n\r\n after http headers is cut off between recv buffers
+
 use HTTP/1.1 with "Connection: close" header
 
 try supporting 3xx in get, if it can be done in a few lines
@@ -25,23 +29,11 @@ have websummary.sh print the date of release, e.g.
 
 ## getgbook
 
+mkdir of bookid and save pages in there
+
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything. If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-So, if no more than 5 useable cookies can be gotten, and many more than this cause an ip block, a strategy could be to not bother getting more than 5 cookies, and bail once the 5th starts failing. of course, this doesn't address getting more pages, and moreover it doesn't address knowing which pages are available.
-
-all pages available (includes page code & order (even when not available from main click3 part) (& title sometimes, & height), though not url): curl 'http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover' | sed -e '/OC_Run\(/!d' -e 's/.*_OC_Run\({"page"://g' -e 's/}].*//g'
-
-TODO, THEN:
-at start (if in -p or -a mode), fill a Page struct (don't hold url in struct any more)
-in -a, go through Page struct, if file exists, skip, otherwise get the url for the page (don't bother about re-getting order etc). this means that getgfailed and getgmissing can go away
-in -p, just go through Page struct and print each entry
-when 5 cookies have been exhausted, quit, saying no more cookies available for now (and recommending a time period to retry)
-have -a be default, and stdin be -
-
-so, usage should be
-getgbook [-] bookid
-if - is given, read page codes from stdin
-otherwise, just download everything (skipping already
-downloaded pages)
+NOTE!!: the method of getting all pages from book page does miss some; they aren't all listed
+* these pages can often be requested, though

diff --git a/getgbook.c b/getgbook.c
index 5f1e381..62faf46 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -23,7 +23,7 @@ typedef struct {
 
 int getpagelist(char *bookid, Page **pages)
 {
-	char url[URLMAX];
+	char url[URLMAX], m[STRMAX];
 	char *buf;
 	char *s;
 	int i;
@@ -43,7 +43,8 @@ int getpagelist(char *bookid, Page **pages)
 		if(*s == ']')
 			break;
 		if(!strncmp(s, "\"pid\"", 5)) {
-			sscanf(s+6, "\"%[^\"]\",", p->name);
+			snprintf(m, STRMAX, "\"%%%d[^\"]\"", STRMAX-1);
+			sscanf(s+6, m, p->name);
 			for(;*s; s++) {
 				if(*s == '}')
 					break;
@@ -58,7 +59,7 @@ int getpagelist(char *bookid, Page **pages)
 }
 
 int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
-	char url[URLMAX], code[STRMAX];
+	char url[URLMAX], code[STRMAX], m[STRMAX];
 	char *c, *d, *p, *buf = NULL;
 	int i;
 
@@ -69,7 +70,8 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 
 	c = buf;
 	while(*c && (c = strstr(c, "\"pid\":"))) {
-		if(!sscanf(c, "\"pid\":\"%[^\"]\"", code))
+		snprintf(m, STRMAX, "\"pid\":\"%%%d[^\"]\"", STRMAX-1);
+		if(!sscanf(c, m, code))
 			break;
 		for(; *c; c++) {
 			if(*c == '}') {
-- 
cgit v1.2.3

From df8c5735b2d71374385baf288e13d6e88a17840a Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 17:11:43 +0100
Subject: More sscanf tightening

---
 util.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/util.c b/util.c
index 6e341a5..4d2b04c 100644
--- a/util.c
+++ b/util.c
@@ -41,6 +41,7 @@ int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf)
 	int fd, i, p;
 	char h[HEADERMAX] = "\0";
 	char c[COOKIEMAX] = "";
+	char m[256];
 	FILE *srv;
 
 	if((fd = dial(host, "80")) == -1) return 0;
@@ -52,11 +53,13 @@ int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf)
 	        " (not mozilla)\r\nHost: %s%s\r\n\r\n", path, host, c);
 	fflush(srv);
 
+	snprintf(m, 256, "Set-Cookie: %%%ds;", COOKIEMAX-1);
+
 	while(h[0] != '\r') {
 		if(!fgets(h, HEADERMAX, srv))
 			return 0;
 		if(sscanf(h, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200)
 			return 0;
-		if(savecookie != NULL && sscanf(h, "Set-Cookie: %s;", c))
+		if(savecookie != NULL && sscanf(h, m, c))
 			strncat(savecookie, c, COOKIEMAX);
 	}
-- 
cgit v1.2.3
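Both patches above close the same hole: a %s or %[...] conversion without a field width lets sscanf write past the destination buffer. The width has to be a literal inside the format string, so the format itself is built with snprintf first. The pattern in isolation, runnable as-is with invented buffer sizes:

	#include <stdio.h>

	int main(void)
	{
		char m[32], c[64];

		/* build "Set-Cookie: %63s;" so sscanf can never write more
		 * than 63 characters plus the NUL into c[64] */
		snprintf(m, sizeof m, "Set-Cookie: %%%ds;", (int)sizeof c - 1);

		if(sscanf("Set-Cookie: ID=abc123; path=/", m, c))
			puts(c);	/* prints "ID=abc123;" - %s stops at
					   whitespace, not at the ';' */
		return 0;
	}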
From 0fedff7492d97609cdfc5a02a883bdfd693f4dbb Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 17:11:54 +0100
Subject: Set max pages explicitly

---
 getgbook.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/getgbook.c b/getgbook.c
index 62faf46..e60316f 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -12,6 +12,7 @@
 
 #define URLMAX 1024
 #define STRMAX 1024
+#define MAXPAGES 9999
 #define COOKIENUM 5
 
 typedef struct {
@@ -121,8 +122,8 @@ int main(int argc, char *argv[])
 
 	if(argc == 2) {
 		Page **page;
-		page = malloc(sizeof(*page) * 1000);
-		for(i=0; i<1000; i++) page[i] = malloc(sizeof(**page));
+		page = malloc(sizeof(*page) * MAXPAGES);
+		for(i=0; i<MAXPAGES; i++) page[i] = malloc(sizeof(**page));
-- 
cgit v1.2.3

From: Nick White
Date: Sun, 21 Aug 2011 17:22:35 +0100
Subject: Fix reporting of no pages available

---
 TODO       | 9 +++++++--
 getgbook.c | 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/TODO b/TODO
index 4703148..4eb35e4 100644
--- a/TODO
+++ b/TODO
@@ -31,9 +31,14 @@ have websummary.sh print the date of release, e.g.
 
 mkdir of bookid and save pages in there
 
+add cmdline arguments for stdin parsing
+
+merge pageinfo branch
+
+### notes
+
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything. If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-NOTE!!: the method of getting all pages from book page does miss some; they aren't all listed
-* these pages can often be requested, though
+The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though.

diff --git a/getgbook.c b/getgbook.c
index e60316f..3fbdf47 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -33,10 +33,10 @@ int getpagelist(char *bookid, Page **pages)
 	snprintf(url, URLMAX, "/books?id=%s&printsec=frontcover", bookid);
 
 	if(!get("books.google.com", url, NULL, NULL, &buf))
-		return -1;
+		return 0;
 
 	if((s = strstr(buf, "_OC_Run({\"page\":[")) == NULL)
-		return -1;
+		return 0;
 	s+=strlen("_OC_Run({\"page\":[");
 
 	for(i=0, p=pages[0];*s; s++) {
@@ -125,7 +125,7 @@ int main(int argc, char *argv[])
 		page = malloc(sizeof(*page) * MAXPAGES);
 		for(i=0; i<MAXPAGES; i++) page[i] = malloc(sizeof(**page));
-- 
cgit v1.2.3

From: Nick White
Date: Sun, 21 Aug 2011 17:58:28 +0100
Subject: Fix usage printing

---
 getgbook.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/getgbook.c b/getgbook.c
index 3fbdf47..8073194 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -107,7 +107,9 @@ int main(int argc, char *argv[])
 	int a, i, j, totalpages;
 	FILE *f;
 
-	if(argc < 2 || argc > 3 || (argc == 3 && argv[1][0]!='-')) {
+	if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
+	   || (argv[1][1] != 'c' && argv[1][1] != 'n')))
+	   || (argc >= 2 && argv[1][0] == '-' && argv[1][1] == 'h')) {
 		fputs(usage, stdout);
 		return 1;
 	}
-- 
cgit v1.2.3
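The return-value change in "Fix reporting of no pages available" is subtle enough to spell out: the caller tests failure with !, and !(-1) is 0 in C, so a -1 failure sentinel from a count-returning function sails straight past the check and the error message never prints. A runnable illustration, with hypothetical stand-ins for the pre- and post-patch behaviour:

	#include <stdio.h>

	static int getpagelist_old(void) { return -1; }	/* failure, old style */
	static int getpagelist_new(void) { return 0; }	/* failure, new style */

	int main(void)
	{
		int n;

		if(!(n = getpagelist_old()))
			puts("old: no pages found");	/* never runs: !(-1) is 0 */
		if(!(n = getpagelist_new()))
			puts("new: no pages found");	/* runs as intended */
		return 0;
	}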
From 6b059ae1888b0cf8d38c7fe9b4f5c10ec28ab7b6 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 21 Aug 2011 21:14:24 +0100
Subject: Restructure getgbook code

---
 Makefile       |   2 +-
 TODO           |   6 +--
 getgbook.c     | 132 ++++++++++++++++++++++++++++++++++++++++---------------
 getgfailed.sh  |  13 ------
 getgmissing.sh |  17 --------
 5 files changed, 99 insertions(+), 71 deletions(-)
 delete mode 100755 getgfailed.sh
 delete mode 100755 getgmissing.sh

diff --git a/Makefile b/Makefile
index 4ec498d..d7e947c 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ NAME = getxbook
 SRC = getgbook.c
 LIB = util.o
 
-SCRIPTS = getgmissing.sh getgfailed.sh makebookpdf.sh
+SCRIPTS = makebookpdf.sh
 
 DOC = README COPYING LEGAL
 BIN = $(SRC:.c=)

diff --git a/TODO b/TODO
index 4eb35e4..6b08e9f 100644
--- a/TODO
+++ b/TODO
@@ -31,14 +31,10 @@ have websummary.sh print the date of release, e.g.
 
 mkdir of bookid and save pages in there
 
-add cmdline arguments for stdin parsing
-
-merge pageinfo branch
-
 ### notes
 
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything. If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though.
+The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though at present getgbook can't: if a page isn't in its initial structure it won't save the url, even if it's presented.

diff --git a/getgbook.c b/getgbook.c
index 8073194..d1d6e4a 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -22,10 +22,15 @@ typedef struct {
 	char cookie[COOKIEMAX];
 } Page;
 
-int getpagelist(char *bookid, Page **pages)
+Page **pages;
+int totalpages;
+char cookies[COOKIENUM][COOKIEMAX];
+char *bookid;
+
+int getpagelist()
 {
 	char url[URLMAX], m[STRMAX];
-	char *buf;
+	char *buf = NULL;
 	char *s;
 	int i;
 	Page *p;
@@ -56,13 +61,14 @@ int getpagelist(char *bookid, Page **pages)
 		}
 	}
 
+	free(buf);
 	return i;
 }
 
-int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
+int getpageurls(char *pagecode, char *cookie) {
 	char url[URLMAX], code[STRMAX], m[STRMAX];
 	char *c, *d, *p, *buf = NULL;
-	int i;
+	int i, j;
 
 	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
 
@@ -78,11 +84,17 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 			if(*c == '}') {
 				break;
 			}
+			j = -1;
 			if(!strncmp(c, "\"src\"", 5)) {
-				for(i=0; i<totalpages; i++)
-					if(!strncmp(pages[i]->name, code, STRMAX))
+				for(i=0; i<totalpages; i++) {
+					if(!strncmp(pages[i]->name, code, STRMAX)) {
+						j = i;
 						break;
-				for(p=pages[i]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+					}
+				}
+				if(j == -1)	/* TODO: it would be good to add new page on the end */
+					break;	/* of structure rather than throw it away. */
+				for(p=pages[j]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
 					if(!strncmp(d, "\\u0026", 6)) {
 						*p = '&';
 						d+=5;
@@ -90,7 +102,7 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 						*p = *d;
 				}
 				strncpy(p, "&q=subject:a", 13);
-				strncpy(pages[i]->cookie, cookie, COOKIEMAX);
+				strncpy(pages[j]->cookie, cookie, COOKIEMAX);
 				break;
 			}
 		}
@@ -100,11 +112,50 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 
 	free(buf);
 	return 0;
 }
 
+int getpage(Page *page)
+{
+	char path[STRMAX];
+	snprintf(path, STRMAX, "%04d.png", page->num);
+
+	if(page->url[0] == '\0') {
+		fprintf(stderr, "%s not found\n", page->name);
+		return 1;
+	}
+
+	if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) {
+		fprintf(stderr, "%s failed\n", page->name);
+		return 1;
+	}
+
+	printf("%d downloaded\n", page->num);
+	return 0;
+}
+
+void searchpage(Page *page) {
+	int i, j;
+
+	if(page->url[0] != '\0')
+		return;
+
+	for(i=0; i<COOKIENUM; i++) {
+		getpageurls(page->name, cookies[i]);
+		if(page->url[0] != '\0') {
+			/* invalidate old cookies if one succeeded */
+			for(j=0; j<i; j++)
+				cookies[j][0] = '\0';
+			break;
+		}
+	}
+}
+
 int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
+	char *tmp;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
@@ -122,43 +173,54 @@ int main(int argc, char *argv[])
 
 	bookid = argv[argc-1];
 
+	pages = malloc(sizeof(*pages) * MAXPAGES);
+	for(i=0; i<MAXPAGES; i++) pages[i] = malloc(sizeof(**pages));
 	if(argc == 2) {
 		for(i=0; i<totalpages; i++) {
-			snprintf(pgpath, STRMAX, "%04d.png", page[i]->num);
 			snprintf(pgpath, STRMAX, "%04d.png", pages[i]->num);
 			if((f = fopen(pgpath, "r")) != NULL) {
 				fclose(f);
 				continue;
 			}
-			if(page[i]->url[0] == '\0') {
-				for(j=0; j<COOKIENUM; j++) {
-					getpageurls(bookid, page, totalpages, page[i]->name, cookies[j]);
-					if(page[i]->url[0] != '\0') {
-						/* invalidate old cookies if one succeeded */
-						for(a=0; a<j; a++)
-							cookies[a][0] = '\0';
-						break;
-					}
-				}
-			}
-			if(page[i]->url[0] == '\0')
-				fprintf(stderr, "%s not found\n", page[i]->name);
-			else {
-				if(gettofile("books.google.com", page[i]->url, page[i]->cookie, NULL, pgpath))
-					fprintf(stderr, "%s failed\n", page[i]->name);
-				else
-					printf("%d downloaded\n", page[i]->num);
-			}
+			searchpage(pages[i]);
+			getpage(pages[i]);
+		}
+	} else {
+		/* download pages from stdin */
+		char in[STRMAX];
+		int n;
+		while(fgets(in, STRMAX, stdin)) {
+			i = -1;
+			if(argv[1][1] == 'c') {
+				for(a=0; a<totalpages; a++) {
+					if(strncmp(pages[a]->name, in, STRMAX) == 0) {
+						i = a;
+						break;
+					}
+				}
+			} else if(argv[1][1] == 'n') {
+				sscanf(in, "%d", &n);
+				for(a=0; a<totalpages; a++) {
+					if(pages[a]->num == n) {
+						i = a;
+						break;
+					}
+				}
+			}
+			if(i == -1) {
+				fprintf(stderr, "%s not found\n", in);
+				continue;
+			}
+			searchpage(pages[i]);
+			getpage(pages[i]);
 		}
 	}
 
+	for(i=0; i<totalpages; i++)
+		free(pages[i]);
+
 	return EXIT_SUCCESS;

diff --git a/getgfailed.sh b/getgfailed.sh
deleted file mode 100755
--- a/getgfailed.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# Download pages from a fail log (getgbook -a bookid 2> faillog)
-
-test $# -ne 2 && echo "usage: $0 bookid faillog" && exit
-
-sort < $2 | sort | shuf | head -n 5 | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	echo $code | getgbook $1
-done

diff --git a/getgmissing.sh b/getgmissing.sh
deleted file mode 100755
index e8198d8..0000000
--- a/getgmissing.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# This gets any pages listed as available that have not been
-# downloaded. Note that at present this is not too useful, as
-# an IP block will be imposed after the first x pages each run,
-# just for checking availaility.
-
-test $# -ne 1 && echo "usage: $0 bookid" && exit
-
-getgbook -p $1 2>/dev/null | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	num=`echo $i|awk '{print $2}'`
-	test -n "$num" && num=`printf '%04d' $num` || num=$code
-	test -f $num.png || echo $code | getgbook $1
-done
-- 
cgit v1.2.3
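The one structural TODO this series leaves open is "mkdir of bookid and save pages in there". With the globals and helpers the final patch introduces, it slots in naturally right after the argument check; the sketch below is an assumption, not code from this series, and enterbookdir is a hypothetical helper name:

	#include <errno.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/types.h>
	#include <unistd.h>

	/* hypothetical helper: create the per-book directory if it
	 * doesn't already exist, then chdir into it so the %04d.png
	 * pages land inside it */
	int enterbookdir(char *bookid)
	{
		if(mkdir(bookid, 0777) == -1 && errno != EEXIST) {
			perror(bookid);
			return 1;
		}
		return chdir(bookid) == -1;
	}

Called once before the download loop, this would also let repeated runs of getgbook resume a partial book cleanly, since the existing-file check already skips pages that are present on disk.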