author     Nick White <hg@njw.me.uk>  2011-08-01 18:26:56 +0100
committer  Nick White <hg@njw.me.uk>  2011-08-01 18:26:56 +0100
commit     063a0de3c10d38741c939297bb19d8284757e00c (patch)
tree       ff7b00c71dfc1bdd08b4fd1c143aaef5a0e12b9f
parent     accfeb090db432165ebc7e80ddce5ab673631af4 (diff)
Lots of tightening and cleanups
-rw-r--r--  Makefile        2
-rw-r--r--  TODO           28
-rw-r--r--  config.mk      10
-rw-r--r--  getgbook.c     49
-rw-r--r--  getgmissing.sh 12
-rw-r--r--  util.c         21
-rw-r--r--  util.h          3
7 files changed, 69 insertions, 56 deletions
diff --git a/Makefile b/Makefile
index 8cb19fd..df6b1a4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-# See COPYING file for copyright, license and warranty details.
+# See COPYING file for copyright and license details.
include config.mk
NAME = getxbook
diff --git a/TODO b/TODO
index 43cb56a..a1acd78 100644
--- a/TODO
+++ b/TODO
@@ -1,12 +1,6 @@
-Note: looks like google allows around 3 page requests per cookie session, and about 40 per ip per [some time period]. If I knew the time period, and once stdin retry is working, could make a script that gets all it can, gets a list of failures, waits, then tries failures, etc. Note these would also have to stop at some point; some pages just aren't available
+list all binaries in readme and what they do
-make sure i'm checking all lib calls that could fail
-
-make sure all arrays are used within bounds
-
-strace to check paths taken are sensible
-
-use defined constants rather than e.g. 1024
+# other utils
getgbooktxt (different program as it gets from html pages, which getgbook doesn't any more)
@@ -14,9 +8,7 @@ getabook
getbnbook
-openlibrary.org?
-
-# once it is basically working #
+# other todos
try supporting 3xx in get, if it can be done in a few lines
by getting Location line, freeing buf, and returning a new
@@ -24,14 +16,16 @@ try supporting 3xx in get, if it can be done in a few lines
add https support to get
-to be fast and efficient it's best to crank through all the json 1st, filling in an array of page structs as we go
- this requires slightly fuller json support
- could consider making a json reading module, ala confoo, to make ad-hoc memory structures from json
-
-write helper scripts like trymissing
-
write some little tests
+## getgbook
+
have file extension be determined by file type, rather than assuming png
think about whether default functionality should be dl all, rather than -a
+
+to be fast and efficient it's best to crank through all the json 1st, filling in an array of page structs as we go
+ this requires slightly fuller json support
+ could consider making a json reading module, ala confoo, to make ad-hoc memory structures from json
+
+Note: looks like google allows around 3 page requests per cookie session, and about 40 per ip per [some time period]. If I knew the time period, could make a script that gets all it can, gets a list of failures, waits, then tries failures, etc. Note these would also have to stop at some point; some pages just aren't available
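(A note on the 3xx item kept above: a minimal sketch of how it might look against the get() in util.c further down, following the TODO's own plan of getting the Location line and returning a new get. The 300-399 check, the Location parsing, and the lack of a redirect-depth cap are assumptions, not code from this commit.)

	/* sketch: inside get()'s header loop, in place of the p != 200
	 * bail-out. On a 3xx status, scan the remaining headers for
	 * Location, then hand off to a fresh get() on the new path.
	 * Redirects that change host are not handled here. */
	if(sscanf(h, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p >= 300 && p < 400) {
		char loc[HEADERMAX];
		while(fgets(h, HEADERMAX, srv) && h[0] != '\r')
			if(sscanf(h, "Location: http://%*[^/]%s", loc) == 1 ||
			   sscanf(h, "Location: %s", loc) == 1) {
				fclose(srv);
				return get(host, loc, sendcookie, savecookie, buf);
			}
		fclose(srv);
		return 0;
	}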
diff --git a/config.mk b/config.mk
index 321a1ec..500e53d 100644
--- a/config.mk
+++ b/config.mk
@@ -1,4 +1,4 @@
-# See COPYING file for copyright, license and warranty details.
+# See COPYING file for copyright and license details.
VERSION = prealpha
# paths
@@ -8,11 +8,11 @@ CFLAGS = -ansi -pedantic -Wall -Wextra -Werror -g -D_POSIX_C_SOURCE=200112L \
-DVERSION=\"$(VERSION)\"
# musl static
-CC = musl-gcc
-LDFLAGS = -static #-s
+#CC = musl-gcc
+#LDFLAGS = -static #-s
# glibc dynamic
-#CC = cc
-#LDFLAGS =
+CC = cc
+LDFLAGS =
LD = $(CC)
diff --git a/getgbook.c b/getgbook.c
index f947a82..1f98fc1 100644
--- a/getgbook.c
+++ b/getgbook.c
@@ -1,4 +1,4 @@
-/* See COPYING file for copyright, license and warranty details. */
+/* See COPYING file for copyright and license details. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -11,19 +11,22 @@
" otherwise, all pages in stdin will be downloaded\n"
#define URLMAX 1024
+#define STRMAX 1024
+#define PGCODELEN 3
+#define RETRYNUM 5
typedef struct {
int num;
char url[URLMAX];
- char name[80];
+ char name[STRMAX];
} Page;
-char pagecodes[][3] = { "PP", "PR", "PA", "PT", "\0" };
+char pagecodes[][PGCODELEN] = { "PP", "PR", "PA", "PT", "\0" };
Page *getpagedetail(char *bookid, char *pg, char *cookie)
{
- char url[URLMAX];
- char *buf, *c, *d, m[80], *p;
+ char url[URLMAX], m[STRMAX];
+ char *c, *d, *p, *buf = NULL;
Page *page;
snprintf(url, URLMAX, "/books?id=%s&pg=%s&jscmd=click3", bookid, pg);
@@ -31,12 +34,12 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie)
if(!get("books.google.com", url, cookie, NULL, &buf))
return NULL;
- snprintf(m, 80, "\"pid\":\"%s\"", pg);
+ snprintf(m, STRMAX, "\"pid\":\"%s\"", pg);
if(!(c = strstr(buf,m)))
return NULL;
page = malloc(sizeof(Page));
- strncpy(page->name, pg, 80);
+ strncpy(page->name, pg, STRMAX);
page->url[0] = '\0';
page->num = -1;
@@ -68,7 +71,8 @@ Page *getpagedetail(char *bookid, char *pg, char *cookie)
int main(int argc, char *argv[])
{
- char *bookid, *tmp, pg[16], buf[1024], n[80], code[3], cookie[COOKIEMAX];
+ char *bookid, *tmp, *code;
+ char pg[STRMAX], buf[BUFSIZ], n[STRMAX], cookie[COOKIEMAX] = "";
int i, c, retry;
Page *page;
@@ -81,13 +85,13 @@ int main(int argc, char *argv[])
bookid = argv[argc-1];
if(argv[1][0] == '-') {
- strncpy(code, pagecodes[0], 3);
+ code = pagecodes[0];
c = i = retry = 0;
while(++i) {
- snprintf(pg, 15, "%s%d", code, i);
+ snprintf(pg, STRMAX, "%s%d", code, i);
if(!(page = getpagedetail(bookid, pg, cookie))) {
/* no more pages with that code */
- strncpy(code, pagecodes[++c], 3);
+ code = pagecodes[++c];
if(code[0] == '\0') break;
i=0;
continue;
@@ -95,7 +99,7 @@ int main(int argc, char *argv[])
if(!page->url[0]) {
free(page);
/* try with fresh cookie */
- if(retry < 5) {
+ if(retry < RETRYNUM) {
get("books.google.com", "/", NULL, cookie, &tmp);
free(tmp);
retry++;
@@ -109,11 +113,13 @@ int main(int argc, char *argv[])
retry=0;
if(argv[1][1] == 'a') {
if(page->num != -1)
- snprintf(n, 80, "%05d.png", page->num);
+ snprintf(n, STRMAX, "%04d.png", page->num);
+ else
+ snprintf(n, STRMAX, "%s.png", page->name);
+ if(gettofile("books.google.com", page->url, cookie, NULL, n))
+ fprintf(stderr, "%s failed\n", pg);
else
- snprintf(n, 80, "%s.png", page->name);
- gettofile("books.google.com", page->url, cookie, NULL, n);
- printf("Downloaded page %d\n", page->num);
+ printf("Downloaded page %d\n", page->num);
} else {
printf("%s ", page->name);
if(page->num != -1) printf("%d", page->num);
@@ -122,20 +128,21 @@ int main(int argc, char *argv[])
free(page);
}
} else {
- while(fgets(buf, 1024, stdin)) {
+ while(fgets(buf, BUFSIZ, stdin)) {
sscanf(buf, "%15s", pg);
- for(retry = 0; retry < 5; retry++) {
+ for(retry = 0; retry < RETRYNUM; retry++) {
get("books.google.com", "/", NULL, cookie, &tmp);
if((page = getpagedetail(bookid, pg, cookie)) && page->url[0]) {
- snprintf(n, 80, "%05d.png", page->num);
- gettofile("books.google.com", page->url, cookie, NULL, n);
+ snprintf(n, STRMAX, "%04d.png", page->num);
+ if(gettofile("books.google.com", page->url, cookie, NULL, n))
+ continue;
printf("Downloaded page %d\n", page->num);
free(page);
break;
}
if(page) free(page);
}
- if(retry == 5)
+ if(retry == RETRYNUM)
fprintf(stderr, "%s failed\n", pg);
}
}
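(The TODO keeps the idea of cranking through all the json first, filling an array of page structs. A minimal sketch against the Page struct from getgbook.c, reusing the same ad-hoc strstr/sscanf scanning getpagedetail already does; the helper name getallpages and the caller-supplied array are hypothetical, and the 1023 width assumes STRMAX stays 1024.)

	/* sketch: scan one click3 response for every "pid" entry and
	 * fill pages[], returning how many were found. url and num are
	 * left empty/-1, as getpagedetail does before parsing further. */
	int getallpages(char *buf, Page *pages, int max)
	{
		char *c = buf;
		int n;
		for(n = 0; n < max && (c = strstr(c, "\"pid\":\"")); n++) {
			c += strlen("\"pid\":\"");
			if(sscanf(c, "%1023[^\"]", pages[n].name) != 1)
				break;
			pages[n].url[0] = '\0';
			pages[n].num = -1;
		}
		return n;
	}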
diff --git a/getgmissing.sh b/getgmissing.sh
new file mode 100644
index 0000000..d936425
--- /dev/null
+++ b/getgmissing.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# See COPYING file for copyright and license details.
+
+test $# -ne 1 && echo "usage: $0 bookid" && exit
+
+getgbook -p $1 2>/dev/null | while read i
+do
+ code=`echo $i|awk '{print $1}'`
+ num=`echo $i|awk '{print $2}'`
+ test -n "$num" && num=`printf '%04d' $num` || num=$code
+ test -f $num.png || echo $code | getgbook $1
+done
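(The restored note at the end of TODO sketches a driver that gets all it can, waits, then retries the failures. A hypothetical wrapper around the new script; the three-pass cap and the hour of sleep are guesses, since the TODO itself says the time period is unknown.)

	#!/bin/sh
	# hypothetical driver for the retry idea in TODO: run
	# getgmissing.sh, wait out the per-ip limit, and repeat.
	# Pages that never turn up just aren't available, so stop
	# after a few passes rather than looping forever.
	test $# -ne 1 && echo "usage: $0 bookid" && exit
	for pass in 1 2 3
	do
		sh getgmissing.sh $1
		sleep 3600	# unknown time period; an hour is a guess
	done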
diff --git a/util.c b/util.c
index 21bc598..719c8ff 100644
--- a/util.c
+++ b/util.c
@@ -1,4 +1,4 @@
-/* See COPYING file for copyright, license and warranty details. */
+/* See COPYING file for copyright and license details. */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
@@ -39,31 +39,30 @@ int dial(char *host, char *port) {
int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf) {
size_t l, res;
int fd, i, p;
- char h[1024] = "\0";
- char c[1024] = "";
+ char h[HEADERMAX] = "\0";
+ char c[COOKIEMAX] = "";
FILE *srv;
- if(savecookie) savecookie[0] = 0; /* TEMP TO PLEASE GCC */
if((fd = dial(host, "80")) == -1) return 0;
srv = fdopen(fd, "r+");
if(sendcookie)
- snprintf(c, COOKIEMAX-1, "\r\nCookie: %s", sendcookie);
- fprintf(srv, "GET %s HTTP/1.0\r\nUser-Agent: getgbook-"VERSION \
+ snprintf(c, COOKIEMAX, "\r\nCookie: %s", sendcookie);
+ fprintf(srv, "GET %s HTTP/1.0\r\nUser-Agent: getxbook-"VERSION \
" (not mozilla)\r\nHost: %s%s\r\n\r\n", path, host, c);
fflush(srv);
while(h[0] != '\r') {
- fgets(h, 1024, srv);
+ fgets(h, HEADERMAX, srv);
if(sscanf(h, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200)
return 0;
if(savecookie != NULL && sscanf(h, "Set-Cookie: %s;", c))
- strncat(savecookie, c, COOKIEMAX-1);
+ strncat(savecookie, c, COOKIEMAX);
}
- *buf = malloc(sizeof(char *) * 4096);
- for(l=0; (res = fread(*buf+l, 1, 4096, srv)) > 0; l+=res)
- *buf = realloc(*buf, sizeof(char *) * (l+4096));
+ *buf = malloc(sizeof(char *) * BUFSIZ);
+ for(l=0; (res = fread(*buf+l, 1, BUFSIZ, srv)) > 0; l+=res)
+ *buf = realloc(*buf, sizeof(char *) * (l+BUFSIZ));
fclose(srv);
return l;
diff --git a/util.h b/util.h
index 82c0a29..11677af 100644
--- a/util.h
+++ b/util.h
@@ -1,5 +1,6 @@
-/* See COPYING file for copyright, license and warranty details. */
+/* See COPYING file for copyright and license details. */
#define COOKIEMAX 1024
+#define HEADERMAX 1024
int dial(char *host, char *port);
int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf);
int gettofile(char *host, char *url, char *sendcookie, char *savecookie, char *savepath);