summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick White <git@njw.me.uk> 2011-08-21 17:00:00 +0100
committer: Nick White <git@njw.me.uk> 2011-08-21 17:00:00 +0100
commit: be77fe85042dfcc4a943c4c979ba7b990d6a124f (patch)
tree: ed709eb7d57e9e694d4cc53a04abb97a96d8d5a5
parent: b686e031da6622b329d43c361b196eda46ea5154 (diff)
Tighten sscanf usage, add TODOs
-rw-r--r-- TODO 24
-rw-r--r-- getgbook.c 10
2 files changed, 14 insertions, 20 deletions
diff --git a/TODO b/TODO
index 8a1deb7..4703148 100644
--- a/TODO
+++ b/TODO
@@ -8,6 +8,10 @@ use "" rather than "\0" in headermax
# other todos
+use wide string functions when dealing with stuff returned over http; it's known to be utf8
+
+bug in get(): occurs when the \r\n\r\n after the http headers is split across recv buffers
+
use HTTP/1.1 with "Connection: close" header
try supporting 3xx in get, if it can be done in a few lines
@@ -25,23 +29,11 @@ have websummary.sh print the date of release, e.g.
## getgbook
+mkdir of bookid and save pages in there
+
Google will give you up to 5 cookies which get useful pages in immediate succession. It will stop serving new pages to the ip, even with a fresh cookie. So the cookie is certainly not everything.
If one does something too naughty, all requests from the ip to books.google.com are blocked with a 403 'automated requests' error for 24 hours. What causes this ip block is less clear. It certainly isn't after just trying lots of pages with 5 cookies. It seems to be after requesting 100 new cookies in a certain time period - 100 in 5 minutes seemed to do it, as did 100 in ~15 minutes.
-So, if no more than 5 useable cookies can be gotten, and many more than this cause an ip block, a strategy could be to not bother getting more than 5 cookies, and bail once the 5th starts failing. of course, this doesn't address getting more pages, and moreover it doesn't address knowing which pages are available.
-
-all pages available (includes page code & order (even when not available from main click3 part) (& title sometimes, & height), though not url): curl 'http://books.google.com/books?id=h3DSQ0L10o8C&printsec=frontcover' | sed -e '/OC_Run\(/!d' -e 's/.*_OC_Run\({"page"://g' -e 's/}].*//g'
-
-TODO, THEN:
- at start (if in -p or -a mode), fill a Page struct (don't hold url in struct any more)
- in -a, go through Page struct, if file exists, skip, otherwise get the url for the page (don't bother about re-getting order etc). this means that getgfailed and getgmissing can go away
- in -p, just go through Page struct and print each entry
- when 5 cookies have been exhausted, quit, saying no more cookies available for now (and recommending a time period to retry)
- have -a be default, and stdin be -
-
- so, usage should be
- getgbook [-] bookid
- if - is given, read page codes from stdin
- otherwise, just download everything (skipping already
- downloaded pages)
+NOTE!!: the method of getting all pages from the book page does miss some; they aren't all listed
+* these pages can often be requested, though
diff --git a/getgbook.c b/getgbook.c
index 5f1e381..62faf46 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -23,7 +23,7 @@ typedef struct {
int getpagelist(char *bookid, Page **pages)
{
- char url[URLMAX];
+ char url[URLMAX], m[STRMAX];
char *buf;
char *s;
int i;
@@ -43,7 +43,8 @@ int getpagelist(char *bookid, Page **pages)
if(*s == ']')
break;
if(!strncmp(s, "\"pid\"", 5)) {
- sscanf(s+6, "\"%[^\"]\",", p->name);
+ snprintf(m, STRMAX, "\"%%%d[^\"]\"", STRMAX-1);
+ sscanf(s+6, m, p->name);
for(;*s; s++) {
if(*s == '}')
break;
@@ -58,7 +59,7 @@ int getpagelist(char *bookid, Page **pages)
}
int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char *cookie) {
- char url[URLMAX], code[STRMAX];
+ char url[URLMAX], code[STRMAX], m[STRMAX];
char *c, *d, *p, *buf = NULL;
int i;
@@ -69,7 +70,8 @@ int getpageurls(char *bookid, Page **pages, int totalpages, char *pagecode, char
c = buf;
while(*c && (c = strstr(c, "\"pid\":"))) {
- if(!sscanf(c, "\"pid\":\"%[^\"]\"", code))
+ snprintf(m, STRMAX, "\"pid\":\"%%%d[^\"]\"", STRMAX-1);
+ if(!sscanf(c, m, code))
break;
for(; *c; c++) {
if(*c == '}') {