From b2a763ad9b2c256ec989194afb77aba3cf498a09 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 20 Sep 2011 00:42:32 +0100 Subject: Mostly working getabook --- getabook.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 getabook.c (limited to 'getabook.c') diff --git a/getabook.c b/getabook.c new file mode 100644 index 0000000..11297ed --- /dev/null +++ b/getabook.c @@ -0,0 +1,192 @@ +/* See COPYING file for copyright and license details. */ +#include +#include +#include +#include "util.h" + +#define usage "getabook " VERSION " - an amazon look inside the book downloader\n" \ + "usage: getabook [-c|-n] asin\n" \ + " -n download pages from numbers in stdin\n" \ + " otherwise, all available pages will be downloaded\n" + +#define URLMAX 1024 +#define STRMAX 1024 +#define MAXPAGES 9999 + +typedef struct { + int num; + char url[URLMAX]; +} Page; + +Page **pages; +int numpages; +char *bookid; + +int getpagelist() +{ + char url[URLMAX], b[STRMAX]; + char *buf = NULL; + char *s, *c; + int i; + Page *p; + + snprintf(url, URLMAX, "/gp/search-inside/service-data?method=getBookData&asin=%s", bookid); + + if(!get("www.amazon.com", url, NULL, NULL, &buf)) + return 0; + + if((s = strstr(buf, "\"litbPages\":[")) == NULL) + return 0; + s+=strlen("\"litbPages\":["); + + for(i=0, p=pages[0];*s && inum)); + if(s[0] == ']') + break; + p->url[0] = '\0'; + } + free(buf); + return i; +} + +int getpageurls(int pagenum) { + char url[URLMAX], m[STRMAX]; + char *c, *s, *buf = NULL; + size_t l; + int i; + + snprintf(url, URLMAX, "/gp/search-inside/service-data?method=goToPage&asin=%s&page=%d", bookid, pagenum); + + if(!(l = get("www.amazon.com", url, NULL, NULL, &buf))) + return 1; + + s = strstr(buf, "\"jumboImageUrls\":{") + strlen("\"jumboImageUrls\":{"); + + for(i=0; *s && inum); + + while(strncmp(c, m, strlen(m)) != 0) { + while(*c && *c != '}' && *c != ',') + c++; + if(*c == '}') + break; + c++; + } + if(*c == '}') + continue; + + c += strlen(m); + if(!sscanf(c, "\"%[^\"]\"", pages[i]->url)) + continue; + } + + free(buf); + return 0; +} + +/* +int getpage(Page *page) +{ + char path[STRMAX]; + snprintf(path, STRMAX, "%04d.png", page->num); + + if(page->url[0] == '\0') { + fprintf(stderr, "%s not found\n", page->name); + return 1; + } + + if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) { + fprintf(stderr, "%s failed\n", page->name); + return 1; + } + + printf("%d downloaded\n", page->num); + fflush(stdout); + return 0; +} +*/ + +int main(int argc, char *argv[]) +{ + /*char *tmp; + char buf[BUFSIZ], pgpath[STRMAX]; + char in[16]; + int a, i, n; + FILE *f;*/ + + int i; + + if(argc < 2 || argc > 3 || + (argc == 3 && (argv[1][0]!='-' || argv[1][1] != 'n')) + || (argc >= 2 && argv[1][0] == '-' && argv[1][1] == 'h')) { + fputs(usage, stdout); + return 1; + } + + bookid = argv[argc-1]; + + pages = malloc(sizeof(*pages) * MAXPAGES); + if(!(numpages = getpagelist(bookid, pages))) { + fprintf(stderr, "Could not find any pages for %s\n", bookid); + return 1; + } + + for(i=0; iurl[0] == '\0') + getpageurls(pages[i]->num); + printf("page %d is %s\n", pages[i]->num, pages[i]->url); + } + + return 0; + +/* + if(argc == 2) { + for(i=0; inum); + if((f = fopen(pgpath, "r")) != NULL) { + fclose(f); + continue; + } + searchpage(pages[i]); + getpage(pages[i]); + } + } else if(argv[1][0] == '-') { + while(fgets(buf, BUFSIZ, stdin)) { + sscanf(buf, "%15s", in); + i = -1; + if(argv[1][1] == 'c') { + for(a=0; aname, in, STRMAX) == 0) { + i = a; + break; + } + } + } else if(argv[1][1] == 'n') { + sscanf(in, "%d", &n); + for(a=0; anum == n) { + i = a; + break; + } + } + } + if(i == -1) { + fprintf(stderr, "%s not found\n", in); + continue; + } + searchpage(pages[i]); + getpage(pages[i]); + } + } +*/ + + for(i=0; i Date: Tue, 20 Sep 2011 00:44:04 +0100 Subject: Removed unneeded var --- getabook.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'getabook.c') diff --git a/getabook.c b/getabook.c index 11297ed..dd7eec7 100644 --- a/getabook.c +++ b/getabook.c @@ -55,12 +55,11 @@ int getpagelist() int getpageurls(int pagenum) { char url[URLMAX], m[STRMAX]; char *c, *s, *buf = NULL; - size_t l; int i; snprintf(url, URLMAX, "/gp/search-inside/service-data?method=goToPage&asin=%s&page=%d", bookid, pagenum); - if(!(l = get("www.amazon.com", url, NULL, NULL, &buf))) + if(!get("www.amazon.com", url, NULL, NULL, &buf)) return 1; s = strstr(buf, "\"jumboImageUrls\":{") + strlen("\"jumboImageUrls\":{"); -- cgit v1.2.3 From aaca1c3ef0fa07a9a8178d3001a0c681e374d448 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 20 Sep 2011 21:00:05 +0100 Subject: Fix up getabook; basically working now --- getabook.c | 58 ++++++++++++++++++++-------------------------------------- 1 file changed, 20 insertions(+), 38 deletions(-) (limited to 'getabook.c') diff --git a/getabook.c b/getabook.c index dd7eec7..7f205d9 100644 --- a/getabook.c +++ b/getabook.c @@ -62,7 +62,11 @@ int getpageurls(int pagenum) { if(!get("www.amazon.com", url, NULL, NULL, &buf)) return 1; - s = strstr(buf, "\"jumboImageUrls\":{") + strlen("\"jumboImageUrls\":{"); + if(!(s = strstr(buf, "\"jumboImageUrls\":{"))) { + free(buf); + return 1; + } + s += strlen("\"jumboImageUrls\":{"); for(i=0; *s && iurl)) + if(!sscanf(c, "\"//sitb-images.amazon.com%[^\"]\"", pages[i]->url)) continue; } @@ -88,19 +92,18 @@ int getpageurls(int pagenum) { return 0; } -/* int getpage(Page *page) { char path[STRMAX]; snprintf(path, STRMAX, "%04d.png", page->num); if(page->url[0] == '\0') { - fprintf(stderr, "%s not found\n", page->name); + fprintf(stderr, "%d not found\n", page->num); return 1; } - if(gettofile("books.google.com", page->url, page->cookie, NULL, path)) { - fprintf(stderr, "%s failed\n", page->name); + if(gettofile("sitb-images.amazon.com", page->url, NULL, NULL, path)) { + fprintf(stderr, "%d failed\n", page->num); return 1; } @@ -108,17 +111,13 @@ int getpage(Page *page) fflush(stdout); return 0; } -*/ int main(int argc, char *argv[]) { - /*char *tmp; char buf[BUFSIZ], pgpath[STRMAX]; char in[16]; int a, i, n; - FILE *f;*/ - - int i; + FILE *f; if(argc < 2 || argc > 3 || (argc == 3 && (argv[1][0]!='-' || argv[1][1] != 'n')) @@ -135,15 +134,6 @@ int main(int argc, char *argv[]) return 1; } - for(i=0; iurl[0] == '\0') - getpageurls(pages[i]->num); - printf("page %d is %s\n", pages[i]->num, pages[i]->url); - } - - return 0; - -/* if(argc == 2) { for(i=0; inum); @@ -151,38 +141,30 @@ int main(int argc, char *argv[]) fclose(f); continue; } - searchpage(pages[i]); + if(pages[i]->url[0] == '\0') + getpageurls(pages[i]->num); getpage(pages[i]); } - } else if(argv[1][0] == '-') { + } else if(argv[1][0] == '-' && argv[1][1] == 'n') { while(fgets(buf, BUFSIZ, stdin)) { sscanf(buf, "%15s", in); i = -1; - if(argv[1][1] == 'c') { - for(a=0; aname, in, STRMAX) == 0) { - i = a; - break; - } - } - } else if(argv[1][1] == 'n') { - sscanf(in, "%d", &n); - for(a=0; anum == n) { - i = a; - break; - } + sscanf(in, "%d", &n); + for(a=0; anum == n) { + i = a; + break; } } if(i == -1) { fprintf(stderr, "%s not found\n", in); continue; } - searchpage(pages[i]); + if(pages[i]->url[0] == '\0') + getpageurls(pages[i]->num); getpage(pages[i]); } } -*/ for(i=0; i Date: Tue, 20 Sep 2011 21:20:44 +0100 Subject: Get jumbo image urls from initial request too --- getabook.c | 80 ++++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 34 deletions(-) (limited to 'getabook.c') diff --git a/getabook.c b/getabook.c index 7f205d9..9f10868 100644 --- a/getabook.c +++ b/getabook.c @@ -22,6 +22,41 @@ Page **pages; int numpages; char *bookid; +int fillurls(char *buf) { + char m[STRMAX]; + char *c, *s; + int i; + + if(!(s = strstr(buf, "\"jumboImageUrls\":{"))) { + free(buf); + return 1; + } + s += strlen("\"jumboImageUrls\":{"); + + for(i=0; *s && inum); + + while(strncmp(c, m, strlen(m)) != 0) { + while(*c && *c != '}' && *c != ',') + c++; + if(*c == '}') + break; + c++; + } + if(*c == '}') + continue; + + c += strlen(m); + if(!sscanf(c, "\"//sitb-images.amazon.com%[^\"]\"", pages[i]->url)) + continue; + } + + free(buf); + return 0; +} + int getpagelist() { char url[URLMAX], b[STRMAX]; @@ -33,10 +68,10 @@ int getpagelist() snprintf(url, URLMAX, "/gp/search-inside/service-data?method=getBookData&asin=%s", bookid); if(!get("www.amazon.com", url, NULL, NULL, &buf)) - return 0; + return 1; if((s = strstr(buf, "\"litbPages\":[")) == NULL) - return 0; + return 1; s+=strlen("\"litbPages\":["); for(i=0, p=pages[0];*s && iurl[0] = '\0'; } - free(buf); - return i; + numpages = i; + + fillurls(buf); + + return 0; } int getpageurls(int pagenum) { - char url[URLMAX], m[STRMAX]; - char *c, *s, *buf = NULL; - int i; + char url[URLMAX]; + char *buf = NULL; snprintf(url, URLMAX, "/gp/search-inside/service-data?method=goToPage&asin=%s&page=%d", bookid, pagenum); if(!get("www.amazon.com", url, NULL, NULL, &buf)) return 1; - if(!(s = strstr(buf, "\"jumboImageUrls\":{"))) { - free(buf); - return 1; - } - s += strlen("\"jumboImageUrls\":{"); + fillurls(buf); - for(i=0; *s && inum); - - while(strncmp(c, m, strlen(m)) != 0) { - while(*c && *c != '}' && *c != ',') - c++; - if(*c == '}') - break; - c++; - } - if(*c == '}') - continue; - - c += strlen(m); - if(!sscanf(c, "\"//sitb-images.amazon.com%[^\"]\"", pages[i]->url)) - continue; - } - - free(buf); return 0; } @@ -129,7 +141,7 @@ int main(int argc, char *argv[]) bookid = argv[argc-1]; pages = malloc(sizeof(*pages) * MAXPAGES); - if(!(numpages = getpagelist(bookid, pages))) { + if(getpagelist(bookid, pages)) { fprintf(stderr, "Could not find any pages for %s\n", bookid); return 1; } -- cgit v1.2.3