filter: implement basic analysis of urls

Analyse the host part of the URL and add some simple tests. Not yet usable as a squid filter.
author: Timo Teräs <timo.teras@iki.fi> 2010-08-11 14:28:06 +0300
committer: Timo Teräs <timo.teras@iki.fi> 2010-08-11 14:28:06 +0300
commit: e0a013397a51963039c43877be3afe954e519be0 (patch)
tree: 77cbd4db435ec62d679920596ffbd07166fdc902
parent: cf7e91d59880424ff6c643a848938619b7968ad8 (diff)
download: squark-e0a013397a51963039c43877be3afe954e519be0.tar.bz2
squark-e0a013397a51963039c43877be3afe954e519be0.tar.xz
5 files changed, 250 insertions, 8 deletions
diff --git a/Makefile b/Makefile
index d4b84a6..499eb67 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-TARGETS=squark-auth squarkdb.so
+TARGETS=squark-auth squark-filter squarkdb.so
 
 NETSNMP_CFLAGS:=$(shell net-snmp-config --cflags)
 NETSNMP_LIBS:=$(shell net-snmp-config --libs)
@@ -15,6 +15,9 @@ all: $(TARGETS)
 squark-auth: squark-auth.o blob.o
 	$(CC) -o $@ $^ $(NETSNMP_LIBS)
 
+squark-filter: squark-filter.o squarkdb.o blob.o
+	$(CC) -o $@ $^ $(CMPH_LIBS)
+
 squarkdb.so: lua-squarkdb.o squarkdb.o blob.o
 	$(CC) -shared -o $@ $^ $(LUA_LIBS) $(CMPH_LIBS)
 
diff --git a/blob.c b/blob.c
index 377ec62..a417a0b 100644
--- a/blob.c
+++ b/blob.c
@@ -180,3 +180,23 @@ blob_t blob_pull_cspn(blob_t *b, const blob_t reject)
 	*b = BLOB_NULL;
 	return t;
 }
+
+blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep)
+{
+	blob_t t = *b;
+	blob_t r;
+
+	if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
+		return BLOB_NULL;
+	while (t.ptr > limits.ptr && t.ptr[-1] == sep)
+		t.ptr--, t.len++;
+
+	r.ptr = t.ptr;
+	r.len = 0;
+	while (t.ptr > limits.ptr && t.ptr[-1] != sep) {
+		t.ptr--, t.len++;
+		r.ptr--, r.len++;
+	}
+	*b = t;
+	return r;
+}
diff --git a/blob.h b/blob.h
index 883c053..767e661 100644
--- a/blob.h
+++ b/blob.h
@@ -31,7 +31,7 @@ extern const blob_t BLOB_NULL;
 
 static inline int blob_is_null(blob_t b)
 {
-	return b.ptr == NULL;
+	return b.len == 0;
 }
 
 char *blob_cstr_dup(blob_t b);
@@ -47,4 +47,6 @@ int blob_pull_matching(blob_t *b, blob_t e);
 unsigned int blob_pull_uint(blob_t *b, int radix);
 blob_t blob_pull_cspn(blob_t *b, const blob_t cspn);
 
+blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep);
+
 #endif
diff --git a/squark-filter.c b/squark-filter.c
new file mode 100644
index 0000000..8973d04
--- /dev/null
+++ b/squark-filter.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+
+#include <cmph.h>
+
+#include "squarkdb.h"
+#include "blob.h"
+
+struct url_info {
+	blob_t protocol;
+	blob_t username;
+	blob_t password;
+	blob_t host;
+	blob_t path;
+	blob_t query;
+	blob_t fragment;
+	int port;
+};
+
+/* URI is generalized as:
+ * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]]
+ * Character literals used as separators are:
+ *  : / @ ? & ; #
+ * URI escaping additionally says to treat %XX as a hex-encoded value.
+ */
+
+static int url_parse(blob_t uri, struct url_info *nfo)
+{
+	blob_t before_colon;
+	blob_t word;
+
+	memset(nfo, 0, sizeof(*nfo));
+
+	/* parse protocol, username/password and domain name/port */
+	do {
+		word = blob_pull_cspn(&uri, BLOB_STR(":@/?"));
+		switch (uri.len ? uri.ptr[0] : '/') {
+		case ':':
+			blob_pull_skip(&uri, 1);
+			if (blob_is_null(nfo->protocol) &&
+			    blob_pull_matching(&uri, BLOB_STR("//")))
+				nfo->protocol = word;
+			else
+				before_colon = word;
+			break;
+		case '@':
+			blob_pull_skip(&uri, 1);
+			if (!blob_is_null(nfo->username) ||
+			    !blob_is_null(nfo->password))
+				goto error;
+			if (!blob_is_null(before_colon)) {
+				nfo->username = before_colon;
+				nfo->password = word;
+			} else
+				nfo->username = word;
+			before_colon = BLOB_NULL;
+			break;
+		case '/':
+		case '?':
+			if (!blob_is_null(before_colon)) {
+				nfo->host = before_colon;
+				nfo->port = blob_pull_uint(&word, 10);
+			} else
+				nfo->host = word;
+			break;
+		}
+	} while (blob_is_null(nfo->host) && !blob_is_null(uri));
+
+	/* rest of the components */
+	nfo->path = blob_pull_cspn(&uri, BLOB_STR("?&;#"));
+	nfo->query = blob_pull_cspn(&uri, BLOB_STR("#"));
+	nfo->fragment = uri;
+
+	/* fill in defaults if needed */
+	if (blob_is_null(nfo->protocol)) {
+		if (nfo->port == 443)
+			nfo->protocol = BLOB_STR("https");
+		else
+			nfo->protocol = BLOB_STR("http");
+		if (nfo->port == 0)
+			nfo->port = 80;
+	} else if (nfo->port == 0) {
+		if (blob_cmp(nfo->protocol, BLOB_STR("https")) == 0)
+			nfo->port = 443;
+		else
+			nfo->port = 80;
+	}
+	if (blob_is_null(nfo->path))
+		nfo->path = BLOB_STR("/");
+
+	return 1;
+error:
+	return 0;
+}
+
+static void url_print(struct url_info *nfo)
+{
+#define print_field(nfo, x) if (!blob_is_null(nfo->x)) printf(" %s{%.*s}", #x, nfo->x.len, nfo->x.ptr)
+	print_field(nfo, protocol);
+	print_field(nfo, username);
+	print_field(nfo, password);
+	print_field(nfo, host);
+	printf(" port{%d}", nfo->port);
+	print_field(nfo, path);
+	print_field(nfo, query);
+	print_field(nfo, fragment);
+#undef print_field
+}
+
+static blob_t url_classify(struct url_info *url, struct sqdb *db)
+{
+	blob_t key, got, tld;
+	void *cmph;
+	struct sqdb_index_entry *indx;
+	uint32_t *categories;
+	char *strings;
+	cmph_uint32 i = -1, previ;
+
+	cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
+	indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
+	strings = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL);
+
+	/* search for most qualified domain match; do first lookup
+	 * with two domain components */
+	key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0);
+	tld = blob_expand_head(&key, url->host, '.');
+
+	do {
+		/* add one more domain component */
+		got = blob_expand_head(&key, url->host, '.');
+		if (blob_is_null(got))
+			break;
+
+		previ = i;
+		i = cmph_search_packed(cmph, key.ptr, key.len);
+		if (blob_cmp(got, BLOB_STR(&strings[indx[i].component])) != 0) {
+			/* the subdomain no longer matched, use the
+			 * parent's classification */
+			i = previ;
+			goto parent_dns_match;
+		}
+		if (!blob_is_null(tld)) {
+			if (blob_cmp(tld, BLOB_STR(&strings[indx[indx[i].parent].component])) != 0) {
+				/* top level domain did not match */
+				i = -1;
+				goto parent_dns_match;
+			}
+			tld = BLOB_NULL;
+		}
+	} while (indx[i].has_subdomains);
+
+	if (key.ptr != url->host.ptr) {
+		/* the full DNS name did not match, so we skip the
+		 * path name search */
+		goto parent_dns_match;
+	}
+
+	/* and then search for path matches */
+
+
+parent_dns_match:
+	if (i == -1)
+		return BLOB_STR("unknown");
+
+	categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL);
+	printf("%d\n", indx[i].category);
+	return BLOB_STR(&strings[categories[indx[i].category]]);
+}
+
+int main(int argc, char **argv)
+{
+	const char * const uri[] = {
+		"http://sex.com",
+		"http://facebook.com:1234/",
+		"https://slashdot.org/path/to/me",
+		"http://user:pass@paistortuga.com/~mocosoft",
+		"user@weather.whenu.speedera.net",
+		"zedo1.speedera.net",
+		"foo.com/stuff?query;bar#frag",
+		"foo.com?query;bar#frag",
+	};
+	struct sqdb db;
+	struct url_info nfo;
+	blob_t cat;
+	int i;
+
+	sqdb_open(&db, "squark.db");
+	for (i = 0; i < ARRAY_SIZE(uri); i++) {
+		if (url_parse(BLOB_STR(uri[i]), &nfo)) {
+			cat = url_classify(&nfo, &db);
+			printf("%s - %.*s -", uri[i], cat.len, cat.ptr);
+			url_print(&nfo);
+			printf("\n");
+		} else {
+			printf("%s - BAD_URL\n", uri[i]);
+		}
+	}
+	sqdb_close(&db);
+}
diff --git a/squarkdb.c b/squarkdb.c
index 543cbb1..e05f514 100644
--- a/squarkdb.c
+++ b/squarkdb.c
@@ -17,10 +17,11 @@ const char *sqdb_section_names[SQDB_SECTION_MAX] = {
 	[SQDB_SECTION_KEYWORD_MPH]	= "keyword_mph",
 };
 
-static int sqdb_allocate(struct sqdb *db, size_t s)
+static int sqdb_allocate(struct sqdb *db, size_t s, int wr)
 {
 	size_t old_size, new_size;
 	void *base;
+	int prot = PROT_READ;
 
 	old_size = db->file_length;
 	new_size = ALIGN(db->file_length + s, PAGE_SIZE);
@@ -30,12 +31,13 @@ static int sqdb_allocate(struct sqdb *db, size_t s)
 		return old_size;
 	}
 
-	if (ftruncate(db->fd, new_size) < 0)
+	if (wr && ftruncate(db->fd, new_size) < 0)
 		return -1;
 
 	if (db->mmap_base == NULL) {
-		base = mmap(NULL, new_size, PROT_READ|PROT_WRITE,
-			    MAP_SHARED, db->fd, 0);
+		if (wr)
+			prot |= PROT_WRITE;
+		base = mmap(NULL, new_size, prot, MAP_SHARED, db->fd, 0);
 	} else {
 		base = mremap(db->mmap_base, ALIGN(old_size, PAGE_SIZE),
 			      new_size, MREMAP_MAYMOVE);
@@ -49,6 +51,23 @@ static int sqdb_allocate(struct sqdb *db, size_t s)
 	return old_size;
 }
 
+int sqdb_open(struct sqdb *db, const char *fn)
+{
+	struct stat st;
+
+	db->fd = open(fn, O_RDONLY);
+	if (db->fd < 0)
+		return -1;
+
+	fstat(db->fd, &st);
+
+	db->file_length = 0;
+	db->mmap_base = NULL;
+	sqdb_allocate(db, st.st_size, 0);
+
+	return 0;
+}
+
 int sqdb_create(struct sqdb *db, const char *fn)
 {
 	struct sqdb_header *hdr;
@@ -61,7 +80,7 @@ int sqdb_create(struct sqdb *db, const char *fn)
 	db->file_length = 0;
 	db->mmap_base = NULL;
 
-	rc = sqdb_allocate(db, sizeof(struct sqdb_header));
+	rc = sqdb_allocate(db, sizeof(struct sqdb_header), 1);
 	if (rc < 0) {
 		close(db->fd);
 		return rc;
@@ -94,7 +113,7 @@ void *sqdb_section_create(struct sqdb *db, int id, uint32_t size)
 	if (hdr->section[id].offset || hdr->section[id].length)
 		return NULL;
 
-	pos = sqdb_allocate(db, size);
+	pos = sqdb_allocate(db, size, 1);
 	if (pos < 0)
 		return NULL;
author	Timo Teräs <timo.teras@iki.fi>	2010-08-11 14:28:06 +0300
committer	Timo Teräs <timo.teras@iki.fi>	2010-08-11 14:28:06 +0300
commit	e0a013397a51963039c43877be3afe954e519be0 (patch)
tree	77cbd4db435ec62d679920596ffbd07166fdc902
parent	cf7e91d59880424ff6c643a848938619b7968ad8 (diff)
download	squark-e0a013397a51963039c43877be3afe954e519be0.tar.bz2 squark-e0a013397a51963039c43877be3afe954e519be0.tar.xz