diff options
author | Nick White <hg@njw.me.uk> | 2011-08-07 12:46:52 +0100 |
---|---|---|
committer | Nick White <hg@njw.me.uk> | 2011-08-07 12:46:52 +0100 |
commit | 62563596f477238d480fe4a701544413b6c722f5 (patch) | |
tree | 91379e6c654e9e9ff47793a892aa990e737e85dc | |
parent | 3d08e78700331588f6d43db725cc361f841c012d (diff) |
Abide by google's robots.txt, and lay out legal issues
-rw-r--r-- | LEGAL | 27 | ||||
-rw-r--r-- | getgbook.c | 4 |
2 files changed, 29 insertions, 2 deletions
@@ -0,0 +1,27 @@ +# Getgbook + +## TOS + +Google's terms of service forbid using anything but a browser +to access their sites. This is absurd and ruinous. +See section 5.3 of http://www.google.com/accounts/TOS. + +Thankfully, however, for Google Books one is only bound to it +"for digital content you purchase through the Google Books +service," which does not affect this program. +See http://www.google.com/googlebooks/tos.html + +## robots.txt + +Their robots.txt allows certain book pages, but disallows +others. + +We use two types of URL: +http://books.google.com/books?id=<bookid>&pg=<pgcode>&jscmd=click3 +http://books.google.com/books?id=<bookid>&pg=<pgcode>&img=1&zoom=3&hl=en&<sig> + +robots.txt disallows /books?*jscmd=* and /books?*pg=*. However, +Google consider Allow statements to overrule disallow statements +if they are longer. And they happen to allow /books?*q=subject:*. +So, we append that to both url types (it has no effect on them), +and we are obeying robots.txt @@ -29,7 +29,7 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie) char *c, *d, *p, *buf = NULL; Page *page; - snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3", bookid, pg); + snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pg); if(!get("books.google.com", url, cookie, NULL, &buf)) return NULL; @@ -51,7 +51,7 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie) } else *p = *d; } - *p = '\0'; + strncpy(p, "&q=subject:a", 12); } else d=c; |