From 62563596f477238d480fe4a701544413b6c722f5 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 7 Aug 2011 12:46:52 +0100
Subject: Abide by Google's robots.txt, and lay out legal issues

---
 LEGAL      | 27 +++++++++++++++++++++++++++
 getgbook.c |  4 ++--
 2 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 LEGAL

diff --git a/LEGAL b/LEGAL
new file mode 100644
index 0000000..ec1a2c8
--- /dev/null
+++ b/LEGAL
@@ -0,0 +1,27 @@
+# Getgbook
+
+## TOS
+
+Google's terms of service forbid using anything but a browser
+to access their sites. This is absurd and ruinous.
+See section 5.3 of http://www.google.com/accounts/TOS.
+
+Thankfully, however, for Google Books one is bound by it only
+"for digital content you purchase through the Google Books
+service," which does not affect this program.
+See http://www.google.com/googlebooks/tos.html.
+
+## robots.txt
+
+Their robots.txt allows certain book pages, but disallows
+others.
+
+We use two types of URL:
+http://books.google.com/books?id=&pg=&jscmd=click3
+http://books.google.com/books?id=&pg=&img=1&zoom=3&hl=en&
+
+robots.txt disallows /books?*jscmd=* and /books?*pg=*. However,
+Google considers Allow statements to overrule Disallow statements
+if they are longer, and they happen to allow /books?*q=subject:*.
+So, we append that to both URL types (it has no effect on them),
+and we thereby obey robots.txt.
diff --git a/getgbook.c b/getgbook.c
index b4a23af..5f0a0ae 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -29,7 +29,7 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie)
 	char *c, *d, *p, *buf = NULL;
 	Page *page;
 
-	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3", bookid, pg);
+	snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3&q=subject:a", bookid, pg);
 
 	if(!get("books.google.com", url, cookie, NULL, &buf))
 		return NULL;
@@ -51,7 +51,7 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie)
 		} else
 			*p = *d;
 	}
-	*p = '\0';
+	strncpy(p, "&q=subject:a", 13);
 	} else
 		d=c;
--
cgit v1.2.3
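
The LEGAL file's claim that a longer Allow overrules a shorter Disallow can be checked mechanically. Below is a minimal C sketch, not part of the patch or of getgbook itself, of that longest-rule-wins precedence; the match() and allowed() helpers and the hard-coded rule table are illustrative assumptions drawn from the robots.txt rules quoted above.

#include <stdio.h>
#include <string.h>

/* Prefix match with '*' wildcards, as robots.txt patterns work:
 * an exhausted pattern matches, since rules are prefixes by default. */
static int match(const char *pat, const char *s)
{
	if(*pat == '\0')
		return 1;
	if(*pat == '*') {
		for(; *s; s++)
			if(match(pat+1, s))
				return 1;
		return match(pat+1, s);
	}
	return *s == *pat && match(pat+1, s+1);
}

static const struct { int allow; const char *pat; } rules[] = {
	{ 0, "/books?*jscmd=*" },     /* Disallow, 15 chars */
	{ 0, "/books?*pg=*" },        /* Disallow, 12 chars */
	{ 1, "/books?*q=subject:*" }, /* Allow,    19 chars */
};

/* The longest matching rule decides; no matching rule means allowed. */
static int allowed(const char *path)
{
	int i, len, best = -1, verdict = 1;
	for(i = 0; i < (int)(sizeof rules / sizeof *rules); i++) {
		len = (int)strlen(rules[i].pat);
		if(len > best && match(rules[i].pat, path)) {
			best = len;
			verdict = rules[i].allow;
		}
	}
	return verdict;
}

int main(void)
{
	/* Without the suffix the Disallows win; with it, the 19-char
	 * Allow outweighs the 15- and 12-char Disallows. */
	printf("%d\n", allowed("/books?id=x&pg=PA1&jscmd=click3"));             /* 0 */
	printf("%d\n", allowed("/books?id=x&pg=PA1&jscmd=click3&q=subject:a")); /* 1 */
	return 0;
}

Running this prints 0 for the bare click3 URL and 1 once &q=subject:a is appended, which is exactly why the patch adds that suffix to both request URLs.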