Fix potential bug in HTTP code

author: Nick White <git@njw.me.uk> 2012-07-10 20:57:07 +0100
committer: Nick White <git@njw.me.uk> 2012-07-10 20:57:07 +0100
commit: 4e6b01857db64898a4619415543eefca771a527e (patch)
tree: 776ceb6e824099b41bbc43e985c1ec98e55ce672
parent: e7a7e29a59be8bd43f09938c623b093e162ddf92 (diff)
2 files changed, 69 insertions, 54 deletions
diff --git a/TODO b/TODO
index 87bc878..d4b9297 100644
--- a/TODO
+++ b/TODO
@@ -1,20 +1,14 @@
 # other todos
 
-bug in get() & post(): if the \r\n\r\n after http headers is cut off between recv buffers. solution is to get all, then strstr(\n\r\n\r) to find end of header, and memcopy the rest out (so that original memory can be freed)
+format and package man pages in win packages
 
 in getabook, the web client tries downloading sequentially the first few pages, regardless of whether they're in the available page list. this actually works (some or all of these pages will return), so we should implement something similar too. exactly how it knows when to stop looking is not clear, at least with the one i tried, it just tried all of the first 25 pages.
 
 in getgbook, check that downloaded page doesn't match 'page not available' image; if so delete (as may be redownloadable later, perhaps even then with different cookies)
 in getbnbook, check that downloaded page doesn't match 'page not available' swf; if so delete (as may be redownloadable later, perhaps even then with different cookies)
 
-in getgbook, grab the link data (presumably as json somewhere), and add this to pdf
-
-1.0 format and package man pages in win and osx packages
+submit 'pad' file to websites http://padsites.asp-software.org/
 
 write some little tests
 
-1.0 submit 'pad' file to websites http://padsites.asp-software.org/
-
-add function to download html text to getabook (just a html request to get kindle version)
-
 add scribd functionality - example is http://www.scribd.com/doc/20448287/Etidorhpa-John-Uri-Lloyd producing urls like http://htmlimg3.scribdassets.com/1qva8jpekgdk0wl/images/1-bfa8361a96.jpg
diff --git a/util.c b/util.c
index 41bbc3c..d20e4e3 100644
--- a/util.c
+++ b/util.c
@@ -49,14 +49,17 @@ int dial(char *host, char *port) {
 	return srv;
 }
 
-int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf) {
+int get(char *host, char *path, char *sendcookie, char *savecookie, char **body) {
 	size_t l, res;
 	int fd, i, p;
 	char h[BUFSIZ] = "";
 	char c[COOKIEMAX] = "";
-	char t[BUFSIZ];
-	char *t2;
 	char m[256];
+	char *headpos;
+	size_t headsize;
+	char headline[BUFSIZ] = "";
+	char *buf;
+	char *cur, *pos;
 
 	if((fd = dial(host, "80")) == -1) return 0;
 
@@ -66,40 +69,42 @@ int get(char *host, char *path, char *sendcookie, char *savecookie, char **buf)
 	                    " (not mozilla)\r\nHost: %s%s\r\n\r\n", path, host, c);
 	if(!send(fd, h, strlen(h), 0)) return 0;
 
-	*buf = NULL;
+	/* download everything into buf */
 	l = 0;
-	h[0] = 0;
-	snprintf(m, 256, "Set-Cookie: %%%ds;", COOKIEMAX-1);
+	buf = malloc(sizeof(char *) * BUFSIZ);
+	for(; buf != NULL && (res = recv(fd, buf+l, BUFSIZ, 0)) > 0; l+=res)
+		buf = realloc(buf, sizeof(char *) * (l+BUFSIZ));
 
-	while((res = recv(fd, t, BUFSIZ, 0)) > 0) {
-		strncat(h, t, BUFSIZ - strlen(h) - 1);
-		if((t2 = strstr(t, "\r\n\r\n")) != NULL && (t2 - t) < (signed)res) {
-			/* end of header, save rest to buffer */
-			t2+=4;
-			l = res - (t2 - t);
-			*buf = malloc(sizeof(char *) * l);
-			memcpy(*buf, t2, l);
-			break;
-		}
-	}
+	/* strstr to find end of header */
+	if((headpos = strstr(buf, "\r\n\r\n")) == NULL)
+		return 0;
+	headpos += 4;
+	headsize = headpos - buf;
 
-	if(sscanf(h, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200) {
-		if(p == 403)
-			fprintf(stderr, "403 forbidden: your IP address may be temporarily blocked\n");
+	/* memcopy from there into a large enough buf */
+	if((*body = malloc(sizeof(char *) * (l - headsize))) == NULL)
 		return 0;
-	}
-	t2 = h;
-	if(savecookie != NULL) {
-		while((t2 = strstr(t2, "Set-Cookie: ")) && sscanf(t2, m, c)) {
+	memcpy(*body, headpos, sizeof(char *) * (l - headsize));
+
+	/* parse header as needed */
+	snprintf(m, 256, "Set-Cookie: %%%ds;", COOKIEMAX-1);
+	cur = buf;
+	while((pos = strstr(cur, "\r\n")) != NULL && cur < (headpos - 4)) {
+		strncpy(headline, cur, pos - cur);
+		headline[pos - cur] = '\0';
+		cur = pos + 2;
+
+		if(sscanf(headline, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200) {
+			if(p == 403)
+				fprintf(stderr, "403 forbidden: your IP address may be temporarily blocked\n");
+			return 0;
+		}
+
+		if(savecookie != NULL && sscanf(headline, m, c)) {
 			strncat(savecookie, c, COOKIEMAX - strlen(savecookie) - 1);
-			t2++;
 		}
 	}
 
-	*buf = realloc(*buf, sizeof(char *) * (l+BUFSIZ));
-	for(; buf != NULL && (res = recv(fd, *buf+l, BUFSIZ, 0)) > 0; l+=res)
-		*buf = realloc(*buf, sizeof(char *) * (l+BUFSIZ));
-
 	return l;
 }
 
@@ -129,12 +134,16 @@ int gettofile(char *host, char *url, char *sendcookie, char *savecookie, char *s
 	return 0;
 }
 
-int post(char *host, char *path, char *data, char **buf) {
+/* TODO: merge this with get(); almost all code is the same */
+int post(char *host, char *path, char *data, char **body) {
 	size_t l, res;
 	int fd, i, p;
 	char h[BUFSIZ] = "";
-	char t[BUFSIZ];
-	char *t2;
+	char *headpos;
+	size_t headsize;
+	char headline[BUFSIZ] = "";
+	char *buf;
+	char *cur, *pos;
 
 	if((fd = dial(host, "80")) == -1) return 0;
 
@@ -145,25 +154,37 @@ int post(char *host, char *path, char *data, char **buf) {
 	                    path, (int)strlen(data), host, data);
 	if(!send(fd, h, strlen(h), 0)) return 0;
 
-	*buf = NULL;
+	/* download everything into buf */
 	l = 0;
-	while((res = recv(fd, t, BUFSIZ, 0)) > 0) {
-		if(sscanf(t, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200)
+	buf = malloc(sizeof(char *) * BUFSIZ);
+	for(; buf != NULL && (res = recv(fd, buf+l, BUFSIZ, 0)) > 0; l+=res)
+		buf = realloc(buf, sizeof(char *) * (l+BUFSIZ));
+
+	/* strstr to find end of header */
+	if((headpos = strstr(buf, "\r\n\r\n")) == NULL)
+		return 0;
+	headpos += 4;
+	headsize = headpos - buf;
+
+	/* memcopy from there into a large enough buf */
+	if((*body = malloc(sizeof(char *) * (l - headsize))) == NULL)
+		return 0;
+	memcpy(*body, headpos, sizeof(char *) * (l - headsize));
+
+	/* parse header as needed */
+	cur = buf;
+	while((pos = strstr(cur, "\r\n")) != NULL && cur < (headpos - 4)) {
+		strncpy(headline, cur, pos - cur);
+		headline[pos - cur] = '\0';
+		cur = pos + 2;
+
+		if(sscanf(headline, "HTTP/%d.%d %d", &i, &i, &p) == 3 && p != 200) {
+			if(p == 403)
+				fprintf(stderr, "403 forbidden: your IP address may be temporarily blocked\n");
 			return 0;
-		t2 = t;
-		if((t2 = strstr(t, "\r\n\r\n")) != NULL && (t2 - t) < (signed)res) {
-			t2+=4;
-			l = res - (t2 - t);
-			*buf = malloc(sizeof(char *) * l);
-			memcpy(*buf, t2, l);
-			break;
 		}
 	}
 
-	*buf = realloc(*buf, sizeof(char *) * (l+BUFSIZ));
-	for(; (res = recv(fd, *buf+l, BUFSIZ, 0)) > 0; l+=res)
-		*buf = realloc(*buf, sizeof(char *) * (l+BUFSIZ));
-
 	return l;
 }
author	Nick White <git@njw.me.uk>	2012-07-10 20:57:07 +0100
committer	Nick White <git@njw.me.uk>	2012-07-10 20:57:07 +0100
commit	4e6b01857db64898a4619415543eefca771a527e (patch)
tree	776ceb6e824099b41bbc43e985c1ec98e55ce672
parent	e7a7e29a59be8bd43f09938c623b093e162ddf92 (diff)