1 files changed, 198 insertions, 0 deletions
diff --git a/squark-filter.c b/squark-filter.c
new file mode 100644
index 0000000..8973d04
--- /dev/null
+++ b/squark-filter.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+
+#include <cmph.h>
+
+#include "squarkdb.h"
+#include "blob.h"
+
+struct url_info {	/* components of one URI, as filled in by url_parse() */
+	blob_t protocol;	/* scheme found before "://"; defaulted to "http" or "https" when absent */
+	blob_t username;	/* user part found before '@' */
+	blob_t password;	/* part between ':' and '@', set only when both are present */
+	blob_t host;		/* domain name */
+	blob_t path;		/* pulled after the host, up to the first of ? & ; # -- "/" when null */
+	blob_t query;		/* pulled after the path, up to '#' */
+	blob_t fragment;	/* whatever is left of the URI after the query */
+	int port;		/* port given after "host:", else defaulted to 80 or 443 */
+};
+
+/* Split uri into its components and store them in *nfo. URI is generalized as:
+ * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]]
+ * Character literals used as separators are:  : / @ ? & ; #
+ * Also URI escaping says to treat %XX as encoded hex value (nothing here decodes it).
+ * Returns 1 on success, 0 if an '@' follows user info that was already set.
+ */
+
+static int url_parse(blob_t uri, struct url_info *nfo)
+{
+	blob_t before_colon = BLOB_NULL;	/* must start out null: it is tested at '@', '/' and '?' */
+	blob_t word;
+
+	memset(nfo, 0, sizeof(*nfo));
+
+	/* parse protocol, username/password and domain name/port */
+	do {
+		word = blob_pull_cspn(&uri, BLOB_STR(":@/?"));
+		switch (uri.len ? uri.ptr[0] : '/') {	/* end of input is handled like '/' */
+		case ':':
+			blob_pull_skip(&uri, 1);
+			if (blob_is_null(nfo->protocol) &&
+			    blob_pull_matching(&uri, BLOB_STR("//")))
+				nfo->protocol = word;	/* "word://" */
+			else
+				before_colon = word;	/* user or host; the next separator decides */
+			break;
+		case '@':
+			blob_pull_skip(&uri, 1);
+			if (!blob_is_null(nfo->username) ||
+			    !blob_is_null(nfo->password))
+				goto error;	/* user info was already seen */
+			if (!blob_is_null(before_colon)) {
+				nfo->username = before_colon;
+				nfo->password = word;
+			} else
+				nfo->username = word;
+			before_colon = BLOB_NULL;
+			break;
+		case '/':
+		case '?':
+			if (!blob_is_null(before_colon)) {
+				nfo->host = before_colon;	/* "host:port" */
+				nfo->port = blob_pull_uint(&word, 10);
+			} else
+				nfo->host = word;
+			break;
+		}
+	} while (blob_is_null(nfo->host) && !blob_is_null(uri));
+
+	/* rest of the components */
+	nfo->path = blob_pull_cspn(&uri, BLOB_STR("?&;#"));
+	nfo->query = blob_pull_cspn(&uri, BLOB_STR("#"));
+	nfo->fragment = uri;
+
+	/* fill in defaults if needed */
+	if (blob_is_null(nfo->protocol)) {
+		if (nfo->port == 443)	/* no protocol given: port 443 is taken to mean https */
+			nfo->protocol = BLOB_STR("https");
+		else
+			nfo->protocol = BLOB_STR("http");
+		if (nfo->port == 0)
+			nfo->port = 80;
+	} else if (nfo->port == 0) {
+		if (blob_cmp(nfo->protocol, BLOB_STR("https")) == 0)
+			nfo->port = 443;
+		else
+			nfo->port = 80;	/* also used for any protocol other than https */
+	}
+	if (blob_is_null(nfo->path))
+		nfo->path = BLOB_STR("/");
+
+	return 1;
+error:
+	return 0;
+}
+/* Print the port and each non-null component of *nfo to stdout as " name{value}" (no newline). */
+static void url_print(struct url_info *nfo)
+{
+#define print_field(nfo, x) do { if (!blob_is_null((nfo)->x)) printf(" %s{%.*s}", #x, (int) (nfo)->x.len, (nfo)->x.ptr); } while (0)
+	print_field(nfo, protocol);
+	print_field(nfo, username);
+	print_field(nfo, password);
+	print_field(nfo, host);
+	printf(" port{%d}", nfo->port);	/* the port is printed unconditionally */
+	print_field(nfo, path);
+	print_field(nfo, query);
+	print_field(nfo, fragment);
+#undef print_field
+}
+/* Classify url->host against the domain index stored in db. Returns the matched
+ * entry's category name, or "unknown" when no index entry matches the host. */
+static blob_t url_classify(struct url_info *url, struct sqdb *db)
+{
+	struct sqdb_index_entry *entries;
+	uint32_t *cat_offsets;
+	char *strtab;
+	void *mph;
+	blob_t suffix, label, tld;
+	cmph_uint32 hit = -1, parent_hit;	/* -1 marks "no match yet" */
+
+	mph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
+	entries = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
+	strtab = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL);
+
+	/* search for the most qualified domain match; the key starts out empty at the
+	 * end of the host and is expanded once here, so lookups begin at two components */
+	suffix = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0);
+	tld = blob_expand_head(&suffix, url->host, '.');
+
+	for (;;) {
+		/* extend the key by one more domain component */
+		label = blob_expand_head(&suffix, url->host, '.');
+		if (blob_is_null(label))
+			break;
+
+		parent_hit = hit;
+		hit = cmph_search_packed(mph, suffix.ptr, suffix.len);
+		if (blob_cmp(label, BLOB_STR(&strtab[entries[hit].component]))) {
+			/* this subdomain is not in the index, so the
+			 * classification of its parent applies */
+			hit = parent_hit;
+			goto parent_dns_match;
+		}
+		if (!blob_is_null(tld)) {
+			/* done on the first lookup only; a wrong top level domain means no match */
+			if (blob_cmp(tld, BLOB_STR(&strtab[entries[entries[hit].parent].component])))
+				return BLOB_STR("unknown");
+			tld = BLOB_NULL;
+		}
+		if (!entries[hit].has_subdomains)
+			break;
+	}
+
+	if (suffix.ptr != url->host.ptr) {
+		/* only part of the host name matched, so the path name
+		 * search is skipped */
+		goto parent_dns_match;
+	}
+
+	/* and then search for path matches (nothing is done here yet) */
+
+parent_dns_match:
+	if (hit == -1)
+		return BLOB_STR("unknown");
+
+	cat_offsets = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL);
+	printf("%d\n", entries[hit].category);	/* NOTE(review): looks like debug output -- confirm */
+	return BLOB_STR(&strtab[cat_offsets[entries[hit].category]]);
+}
+/* Test driver: parses and classifies a fixed list of sample URIs against "squark.db". */
+int main(int argc, char **argv)
+{
+	const char * const uri[] = {
+		"http://sex.com",
+		"http://facebook.com:1234/",
+		"https://slashdot.org/path/to/me",
+		"http://user:pass@paistortuga.com/~mocosoft",
+		"user@weather.whenu.speedera.net",
+		"zedo1.speedera.net",
+		"foo.com/stuff?query;bar#frag",
+		"foo.com?query;bar#frag",
+	};
+	struct sqdb db;
+	struct url_info nfo;
+	size_t i;	/* unsigned, to match the element count it is compared against */
+
+	sqdb_open(&db, "squark.db");	/* NOTE(review): result is not checked -- confirm its failure convention */
+	for (i = 0; i < ARRAY_SIZE(uri); i++) {
+		if (url_parse(BLOB_STR(uri[i]), &nfo)) {
+			blob_t cat = url_classify(&nfo, &db);
+			printf("%s - %.*s -", uri[i], (int) cat.len, cat.ptr);	/* %.*s takes an int precision */
+			url_print(&nfo);
+			printf("\n");
+		} else {
+			printf("%s - BAD_URL\n", uri[i]);
+		}
+	}
+	sqdb_close(&db);
+	return 0;	/* explicit: falling off the end of main is only defined since C99 */
+}