author     Nick White <git@njw.me.uk>    2011-08-21 21:14:24 +0100
committer  Nick White <git@njw.me.uk>    2011-08-21 21:14:24 +0100
commit     6b059ae1888b0cf8d38c7fe9b4f5c10ec28ab7b6
tree       05e45a7b7f53277b6877f4c029e3d13ac45d281a
parent     fc43d1cacbb62fd854960901688e1b9b9752e7cd
Restructure getgbook code
-rw-r--r--   Makefile        |   2
-rw-r--r--   TODO            |   6
-rw-r--r--   getgbook.c      | 132
-rwxr-xr-x   getgfailed.sh   |  13
-rwxr-xr-x   getgmissing.sh  |  17
5 files changed, 99 insertions, 71 deletions
diff --git a/Makefile b/Makefile
index 4ec498d..d7e947c 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ NAME = getxbook
SRC = getgbook.c
LIB = util.o
-SCRIPTS = getgmissing.sh getgfailed.sh makebookpdf.sh
+SCRIPTS = makebookpdf.sh
DOC = README COPYING LEGAL
BIN = $(SRC:.c=)
diff --git a/TODO b/TODO
index 4eb35e4..6b08e9f 100644
--- a/TODO
+++ b/TODO
@@ -31,14 +31,10 @@ have websummary.sh print the date of release, e.g.
mkdir of bookid and save pages in there
-add cmdline arguments for stdin parsing
-
-merge pageinfo branch
-
### notes
Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything.
If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
-The method of getting all pages from book webpage does miss some; they aren't all listed. These pages can often be requested, though.
+The method of getting all pages from the book webpage does miss some; they aren't all listed. These pages can often still be requested, but at present getgbook can't do so: if a page isn't in its initial structure, its url won't be saved even when it is presented.
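The rate observations above suggest pacing long runs rather than firing requests off as fast as possible. A rough sketch, not part of this commit (pagelist, the bookid and the 10 second delay are illustrative assumptions), feeding page codes into the -c stdin mode that getgbook gains further down in this commit:

    #!/bin/sh
    # hypothetical pacing wrapper: fetch one page at a time and sleep
    # between fetches, so that even if every fetch cost a fresh cookie
    # a run would stay below the ~100 requests per 15 minutes noted above
    awk '{print $1}' pagelist | while read code
    do
            echo "$code" | getgbook -c bookid
            sleep 10
    done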
diff --git a/getgbook.c b/getgbook.c
index 8073194..d1d6e4a 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -22,10 +22,15 @@ typedef struct {
char cookie[COOKIEMAX];
} Page;
-int getpagelist(char *bookid, Page **pages)
+Page **pages;
+int totalpages;
+char cookies[COOKIENUM][COOKIEMAX];
+char *bookid;
+
+int getpagelist()
{
char url[URLMAX], m[STRMAX];
- char *buf;
+ char *buf = NULL;
char *s;
int i;
Page *p;
@@ -56,13 +61,14 @@ int getpagelist(char *bookid, Page **pages)
}
}
+ free(buf);
return i;
}
-int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
+int getpageurls(char *pagecode, char *cookie) {
char url[URLMAX], code[STRMAX], m[STRMAX];
char *c, *d, *p, *buf = NULL;
- int i;
+ int i, j;
snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pagecode);
@@ -78,11 +84,17 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
if(*c == '}') {
break;
}
+ j = -1;
if(!strncmp(c, "\"src\"", 5)) {
- for(i=0; i<totalpages; i++)
- if(!strncmp(pages[i]->name, code, STRMAX))
+ for(i=0; i<totalpages; i++) {
+ if(!strncmp(pages[i]->name, code, STRMAX)) {
+ j = i;
break;
- for(p=pages[i]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
+ }
+ }
+ if(j == -1) /* TODO: it would be good to add new page on the end */
+ break; /* of structure rather than throw it away. */
+ for(p=pages[j]->url, d=c+strlen("\"src\":")+1; *d && *d != '"'; d++, p++) {
if(!strncmp(d, "\\u0026", 6)) {
*p = '&';
d+=5;
@@ -90,7 +102,7 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
*p = *d;
}
strncpy(p, "&q=subject:a", 13);
- strncpy(pages[i]->cookie, cookie, COOKIEMAX);
+ strncpy(pages[j]->cookie, cookie, COOKIEMAX);
break;
}
}
@@ -100,11 +112,50 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
return 0;
}
+int getpage(Page *page)
+{
+ char path[STRMAX];
+ snprintf(path, STRMAX, "%04d.png", page->num);
+
+ if(page->url[0] == '\0') {
+ fprintf(stderr, "%s not found\n", page->name);
+ return 1;
+ }
+
+ if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) {
+ fprintf(stderr, "%s failed\n", page->name);
+ return 1;
+ }
+
+ printf("%d downloaded\n", page->num);
+ return 0;
+}
+
+void searchpage(Page *page) {
+ int i, j;
+
+ if(page->url[0] != '\0')
+ return;
+
+ for(i=0; i<COOKIENUM; i++) {
+ if(cookies[i][0] == '\0') /* dead cookie */
+ continue;
+ getpageurls(page->name, cookies[i]);
+ if(page->url[0] != '\0') {
+ /* invalidate old cookies if one succeeded */
+ for(j=0; j<i; j++)
+ cookies[j][0] = '\0';
+ break;
+ }
+ }
+}
+
int main(int argc, char *argv[])
{
- char *bookid, *tmp, cookies[COOKIENUM][COOKIEMAX];
- char pgpath[STRMAX];
- int a, i, j, totalpages;
+ char *tmp;
+ char buf[BUFSIZ], pgpath[STRMAX];
+ char in[16];
+ int a, i, n;
FILE *f;
if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-'
@@ -122,43 +173,54 @@ int main(int argc, char *argv[])
bookid = argv[argc-1];
+ pages = malloc(sizeof(*pages) * MAXPAGES);
+ for(i=0; i<MAXPAGES; i++) pages[i] = malloc(sizeof(**pages));
+ if(!(totalpages = getpagelist(bookid, pages))) {
+ fprintf(stderr, "Could not find any pages for %s\n", bookid);
+ return 1;
+ }
+
if(argc == 2) {
- Page **page;
- page = malloc(sizeof(*page) * MAXPAGES);
- for(i=0; i<MAXPAGES; i++) page[i] = malloc(sizeof(**page));
- if(!(totalpages = getpagelist(bookid, page))) {
- fprintf(stderr, "Could not find any pages for %s\n", bookid);
- return 1;
- }
for(i=0; i<totalpages; i++) {
- snprintf(pgpath, STRMAX, "%04d.png", page[i]->num);
+ snprintf(pgpath, STRMAX, "%04d.png", pages[i]->num);
if((f = fopen(pgpath, "r")) != NULL) {
fclose(f);
continue;
}
- if(page[i]->url[0] == '\0') {
- for(j=0; j<COOKIENUM; j++) {
- if(cookies[j][0] == '\0') /* dead cookie */
- continue;
- getpageurls(bookid, page, totalpages, page[i]->name, cookies[j]);
- if(page[i]->url[0] != '\0') {
- /* invalidate old cookies if one succeeded */
- for(a=0; a<j; a++)
- cookies[a][0] = '\0';
+ searchpage(pages[i]);
+ getpage(pages[i]);
+ }
+ } else if(argv[1][0] == '-') {
+ while(fgets(buf, BUFSIZ, stdin)) {
+ sscanf(buf, "%15s", in);
+ i = -1;
+ if(argv[1][1] == 'c') {
+ for(a=0; a<totalpages; a++) {
+ if(strncmp(pages[a]->name, in, STRMAX) == 0) {
+ i = a;
+ break;
+ }
+ }
+ } else if(argv[1][1] == 'n') {
+ sscanf(in, "%d", &n);
+ for(a=0; a<totalpages; a++) {
+ if(pages[a]->num == n) {
+ i = a;
break;
}
}
}
- if(page[i]->url[0] == '\0')
- fprintf(stderr, "%s not found\n", page[i]->name);
- else {
- if(gettofile("books.google.com", page[i]->url, page[i]->cookie, NULL, pgpath))
- fprintf(stderr, "%s failed\n", page[i]->name);
- else
- printf("%d downloaded\n", page[i]->num);
+ if(i == -1) {
+ fprintf(stderr, "%s not found\n", in);
+ continue;
}
+ searchpage(pages[i]);
+ getpage(pages[i]);
}
}
+ for(i=0; i<MAXPAGES; i++) free(pages[i]);
+ free(pages);
+
return EXIT_SUCCESS;
}
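The restructured main() makes getgbook usable as a filter: with no flag it fetches every listed page not already on disk, and with -c or -n it reads page codes or page numbers from stdin and fetches only those. A rough usage sketch, assuming the flags behave as the diff above suggests (bookid and the page code PA3 are placeholders):

    echo PA3 | getgbook -c bookid     # fetch one page by its page code
    seq 1 20 | getgbook -n bookid     # fetch pages 1 to 20 by page number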
diff --git a/getgfailed.sh b/getgfailed.sh
deleted file mode 100755
index 9ecd9e3..0000000
--- a/getgfailed.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# Tries to download each page listed in a fail log (from a
-# previous run of getgbook -a bookid > faillog)
-
-test $# -ne 2 && echo "usage: $0 bookid faillog" && exit
-
-sort < $2 | sort | shuf | head -n 5 | while read i
-do
- code=`echo $i|awk '{print $1}'`
- echo $code | getgbook $1
-done
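With the -c mode in place the whole script collapses to a pipeline, which is presumably why it is dropped here. An untested sketch, assuming faillog still holds lines whose first field is a page code, as the old script expected:

    # retry five randomly chosen failed pages
    shuf faillog | head -n 5 | awk '{print $1}' | getgbook -c bookid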
diff --git a/getgmissing.sh b/getgmissing.sh
deleted file mode 100755
index e8198d8..0000000
--- a/getgmissing.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-# See COPYING file for copyright and license details.
-#
-# This gets any pages listed as available that have not been
-# downloaded. Note that at present this is not too useful, as
-# an IP block will be imposed after the first x pages each run,
-# just for checking availability.
-
-test $# -ne 1 && echo "usage: $0 bookid" && exit
-
-getgbook -p $1 2>/dev/null | while read i
-do
- code=`echo $i|awk '{print $1}'`
- num=`echo $i|awk '{print $2}'`
- test -n "$num" && num=`printf '%04d' $num` || num=$code
- test -f $num.png || echo $code | getgbook $1
-done
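getgmissing.sh's job also seems to be covered by the restructure: getgbook now skips any NNNN.png already on disk before fetching, so a plain run per book does the same work without a separate availability pass (bookid is a placeholder):

    getgbook bookid    # fetches only the listed pages not yet saved as NNNN.png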