From 6b059ae1888b0cf8d38c7fe9b4f5c10ec28ab7b6 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.me.uk>
Date: Sun, 21 Aug 2011 21:14:24 +0100
Subject: Restructure getgbook code

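Move the page table, cookie pool and book id into globals so that
the new searchpage() and getpage() helpers can share them, and use
those helpers both for the download-all mode and for the new -c and
-n modes, which read page codes or page numbers from stdin. These
modes supersede the getgmissing.sh and getgfailed.sh scripts, which
are removed. For example (PA5 standing in for a real page code):

    echo PA5 | getgbook -c <bookid>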
---
 Makefile       |   2 +-
 TODO           |   6 +--
 getgbook.c     | 132 ++++++++++++++++++++++++++++++++++++++++++---------------
 getgfailed.sh  |  13 ------
 getgmissing.sh |  17 --------
 5 files changed, 99 insertions(+), 71 deletions(-)
 delete mode 100755 getgfailed.sh
 delete mode 100755 getgmissing.sh

diff --git a/Makefile b/Makefile
index 4ec498d..d7e947c 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ NAME = getxbook
 
 SRC = getgbook.c
 LIB = util.o
-SCRIPTS = getgmissing.sh getgfailed.sh makebookpdf.sh
+SCRIPTS = makebookpdf.sh
 DOC = README COPYING LEGAL
 
 BIN = $(SRC:.c=)
diff --git a/TODO b/TODO
index 4eb35e4..6b08e9f 100644
--- a/TODO
+++ b/TODO
@@ -31,14 +31,10 @@ have websummary.sh print the date of release, e.g.
 
 mkdir of bookid and save pages in there
 
-add cmdline arguments for stdin parsing
-
-merge pageinfo branch
-
 ### notes
 
 Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything.
 
 If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
 
-The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though.
+The method of getting all pages from the book webpage does miss some; they aren't all listed. These pages can often still be requested, though at present getgbook can't, as it won't save the url of a page that isn't in its initial structure, even if one is presented.
diff --git a/getgbook.c b/getgbook.c
index 8073194..d1d6e4a 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -22,10 +22,15 @@ typedef struct {
 	char cookie[COOKIEMAX];
 } Page;
 
-int getpagelist(char *bookid, Page **pages)
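+/* state shared by the helpers below: the page table and its length,
+ * the cookie pool, and the id of the book to fetch */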
+Page **pages;
+int totalpages;
+char cookies[COOKIENUM][COOKIEMAX];
+char *bookid;
+
+int getpagelist()
 {
 	char url[URLMAX], m[STRMAX];
-	char *buf;
+	char *buf = NULL;
 	char *s;
 	int i;
 	Page *p;
@@ -56,13 +61,14 @@ int getpagelist(char *bookid, Page **pages)
 		}
 	}
 
+	free(buf);
 	return i;
 }
 
-int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
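+/* fetch the json page info for pagecode using the given cookie, and
+ * fill in the url (and cookie) of any page whose src it reveals */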
+int getpageurls(char *pagecode, char *cookie) {
 	char url[URLMAX], code[STRMAX], m[STRMAX];
 	char *c, *d, *p, *buf = NULL;
-	int i;
+	int i, j;
 
 	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
 
@@ -78,11 +84,17 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 			if(*c == '}') {
 				break;
 			}
+			j = -1;
 			if(!strncmp(c, "\"src\"", 5)) {
-				for(i=0; i<totalpages; i++)
-					if(!strncmp(pages[i]->name, code, STRMAX))
+				for(i=0; i<totalpages; i++) {
+					if(!strncmp(pages[i]->name, code, STRMAX)) {
+						j = i;
 						break;
-				for(p=pages[i]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+					}
+				}
+				/* TODO: it would be better to add the new page at the
+				 *       end of the structure than to throw it away. */
+				if(j == -1)
+					break;
+				for(p=pages[j]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
 					if(!strncmp(d, "\\u0026", 6)) {
 						*p = '&';
 						d+=5;
@@ -90,7 +102,7 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 						*p = *d;
 				}
 				strncpy(p, "&q=subject:a", 13);
-				strncpy(pages[i]->cookie, cookie, COOKIEMAX);
+				strncpy(pages[j]->cookie, cookie, COOKIEMAX);
 				break;
 			}
 		}
@@ -100,11 +112,50 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
 	return 0;
 }
 
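+/* save page->url as a png named after its page number; returns 0 on
+ * success, 1 if the page has no url or the download fails */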
+int getpage(Page *page)
+{
+	char path[STRMAX];
+	snprintf(path, STRMAX, "%04d.png", page->num);
+
+	if(page->url[0] == '\0') {
+		fprintf(stderr, "%s not found\n", page->name);
+		return 1;
+	}
+
+	if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) {
+		fprintf(stderr, "%s failed\n", page->name);
+		return 1;
+	}
+
+	printf("%d downloaded\n", page->num);
+	return 0;
+}
+
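+/* try each live cookie until one yields a url for the page; when one
+ * succeeds, the cookies that failed before it are marked dead */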
+void searchpage(Page *page) {
+	int i, j;
+
+	if(page->url[0] != '\0')
+		return;
+
+	for(i=0; i<COOKIENUM; i++) {
+		if(cookies[i][0] == '\0') /* dead cookie */
+			continue;
+		getpageurls(page->name, cookies[i]);
+		if(page->url[0] != '\0') {
+			/* invalidate old cookies if one succeeded */
+			for(j=0; j<i; j++)
+				cookies[j][0] = '\0';
+			break;
+		}
+	}
+}
+
 int main(int argc, char *argv[])
 {
-	char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
-	char pgpath[STRMAX];
-	int a, i, j, totalpages;
+	char *tmp;
+	char buf[BUFSIZ], pgpath[STRMAX];
+	char in[16];
+	int a, i, n;
 	FILE *f;
 
 	if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
@@ -122,43 +173,54 @@ int main(int argc, char *argv[])
 
 	bookid = argv[argc-1];
 
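+	/* allocate the page table up front and fetch the page list */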
+	pages = malloc(sizeof(*pages) * MAXPAGES);
+	for(i=0; i<MAXPAGES; i++) pages[i] = malloc(sizeof(**pages));
+	if(!(totalpages = getpagelist())) {
+		fprintf(stderr, "Could not find any pages for %s\n", bookid);
+		return 1;
+	}
+
 	if(argc == 2) {
-		Page **page;
-		page = malloc(sizeof(*page) * MAXPAGES);
-		for(i=0; i<MAXPAGES; i++) page[i] = malloc(sizeof(**page));
-		if(!(totalpages = getpagelist(bookid, page))) {
-			fprintf(stderr, "Could not find any pages for %s\n", bookid);
-			return 1;
-		}
 		for(i=0; i<totalpages; i++) {
-			snprintf(pgpath, STRMAX, "%04d.png", page[i]->num);
+			snprintf(pgpath, STRMAX, "%04d.png", pages[i]->num);
 			if((f = fopen(pgpath, "r")) != NULL) {
 				fclose(f);
 				continue;
 			}
-			if(page[i]->url[0] == '\0') {
-				for(j=0; j<COOKIENUM; j++) {
-					if(cookies[j][0] == '\0') /* dead cookie */
-						continue;
-					getpageurls(bookid, page, totalpages, page[i]->name, cookies[j]);
-					if(page[i]->url[0] != '\0') {
-						/* invalidate old cookies if one succeeded */
-						for(a=0; a<j; a++)
-							cookies[a][0] = '\0';
+			searchpage(pages[i]);
+			getpage(pages[i]);
+		}
+	} else if(argv[1][0] == '-') {
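+		/* read one page code (-c) or page number (-n) per line of stdin */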
+		while(fgets(buf, BUFSIZ, stdin)) {
+			sscanf(buf, "%15s", in);
+			i = -1;
+			if(argv[1][1] == 'c') {
+				for(a=0; a<totalpages; a++) {
+					if(strncmp(pages[a]->name, in, STRMAX) == 0) {
+						i = a;
+						break;
+					}
+				}
+			} else if(argv[1][1] == 'n') {
+				sscanf(in, "%d", &n);
+				for(a=0; a<totalpages; a++) {
+					if(pages[a]->num == n) {
+						i = a;
 						break;
 					}
 				}
 			}
-			if(page[i]->url[0] == '\0')
-				fprintf(stderr, "%s not found\n", page[i]->name);
-			else {
-				if(gettofile("books.google.com", page[i]->url, page[i]->cookie, NULL, pgpath))
-					fprintf(stderr, "%s failed\n", page[i]->name);
-				else
-					printf("%d downloaded\n", page[i]->num);
+			if(i == -1) {
+				fprintf(stderr, "%s not found\n", in);
+				continue;
 			}
+			searchpage(pages[i]);
+			getpage(pages[i]);
 		}
 	}
 
+	for(i=0; i<MAXPAGES; i++) free(pages[i]);
+	free(pages);
+
 	return EXIT_SUCCESS;
 }
diff --git a/getgfailed.sh b/getgfailed.sh
deleted file mode 100755
index 9ecd9e3..0000000
--- a/getgfailed.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# Tries to download each page listed in a fail log (from a
-# previous run of getgbook -a bookid > faillog)
-
-test $# -ne 2 && echo "usage: $0 bookid faillog" && exit
-
-sort < $2 | sort | shuf | head -n 5 | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	echo $code | getgbook $1
-done
diff --git a/getgmissing.sh b/getgmissing.sh
deleted file mode 100755
index e8198d8..0000000
--- a/getgmissing.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# This gets any pages listed as available that have not been
-# downloaded. Note that at present this is not too useful, as
-# an IP block will be imposed after the first x pages each run,
-# just for checking availability.
-
-test $# -ne 1 && echo "usage: $0 bookid" && exit
-
-getgbook -p $1 2>/dev/null | while read i
-do
-	code=`echo $i|awk '{print $1}'`
-	num=`echo $i|awk '{print $2}'`
-	test -n "$num" && num=`printf '%04d' $num` || num=$code
-	test -f $num.png || echo $code | getgbook $1
-done
-- 
cgit v1.2.3