summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTimo Teräs <timo.teras@iki.fi>2010-08-11 14:28:06 +0300
committerTimo Teräs <timo.teras@iki.fi>2010-08-11 14:28:06 +0300
commite0a013397a51963039c43877be3afe954e519be0 (patch)
tree77cbd4db435ec62d679920596ffbd07166fdc902
parentcf7e91d59880424ff6c643a848938619b7968ad8 (diff)
downloadsquark-e0a013397a51963039c43877be3afe954e519be0.tar.bz2
squark-e0a013397a51963039c43877be3afe954e519be0.tar.xz
filter: implement basic analysis of urls
Analyse the host part of the URL and add some simple tests. Not usable as a squid filter yet.
-rw-r--r--Makefile5
-rw-r--r--blob.c20
-rw-r--r--blob.h4
-rw-r--r--squark-filter.c198
-rw-r--r--squarkdb.c31
5 files changed, 250 insertions, 8 deletions
diff --git a/Makefile b/Makefile
index d4b84a6..499eb67 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-TARGETS=squark-auth squarkdb.so
+TARGETS=squark-auth squark-filter squarkdb.so
NETSNMP_CFLAGS:=$(shell net-snmp-config --cflags)
NETSNMP_LIBS:=$(shell net-snmp-config --libs)
@@ -15,6 +15,9 @@ all: $(TARGETS)
squark-auth: squark-auth.o blob.o
$(CC) -o $@ $^ $(NETSNMP_LIBS)
+squark-filter: squark-filter.o squarkdb.o blob.o
+ $(CC) -o $@ $^ $(CMPH_LIBS)
+
squarkdb.so: lua-squarkdb.o squarkdb.o blob.o
$(CC) -shared -o $@ $^ $(LUA_LIBS) $(CMPH_LIBS)
diff --git a/blob.c b/blob.c
index 377ec62..a417a0b 100644
--- a/blob.c
+++ b/blob.c
@@ -180,3 +180,23 @@ blob_t blob_pull_cspn(blob_t *b, const blob_t reject)
*b = BLOB_NULL;
return t;
}
+
+/*
+ * Extend *b backwards (towards the start of limits) by one component.
+ *
+ * Any run of sep bytes directly in front of *b is absorbed first, then
+ * every byte up to (but not including) the previous sep or the start of
+ * limits.  On return *b covers the absorbed separators, the new
+ * component and its old contents; the return value covers only the new
+ * component.
+ *
+ * Returns BLOB_NULL, leaving *b untouched, when *b already starts at or
+ * before the start of limits or reaches past the end of limits.
+ */
+blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep)
+{
+	blob_t cur = *b;
+	blob_t comp;
+
+	if (cur.ptr <= limits.ptr ||
+	    cur.ptr + cur.len > limits.ptr + limits.len)
+		return BLOB_NULL;
+
+	/* swallow the separator(s) directly in front of the blob */
+	while (cur.ptr > limits.ptr && cur.ptr[-1] == sep) {
+		cur.ptr--;
+		cur.len++;
+	}
+
+	/* then walk back over the component itself */
+	comp.ptr = cur.ptr;
+	comp.len = 0;
+	while (cur.ptr > limits.ptr && cur.ptr[-1] != sep) {
+		cur.ptr--;
+		cur.len++;
+		comp.ptr--;
+		comp.len++;
+	}
+
+	*b = cur;
+	return comp;
+}
diff --git a/blob.h b/blob.h
index 883c053..767e661 100644
--- a/blob.h
+++ b/blob.h
@@ -31,7 +31,7 @@ extern const blob_t BLOB_NULL;
static inline int blob_is_null(blob_t b)
{
- return b.ptr == NULL;
+ return b.len == 0;
}
char *blob_cstr_dup(blob_t b);
@@ -47,4 +47,6 @@ int blob_pull_matching(blob_t *b, blob_t e);
unsigned int blob_pull_uint(blob_t *b, int radix);
blob_t blob_pull_cspn(blob_t *b, const blob_t cspn);
+blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep);
+
#endif
diff --git a/squark-filter.c b/squark-filter.c
new file mode 100644
index 0000000..8973d04
--- /dev/null
+++ b/squark-filter.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+
+#include <cmph.h>
+
+#include "squarkdb.h"
+#include "blob.h"
+
+/* Components of a URI as split up by url_parse(). */
+struct url_info {
+ blob_t protocol; /* text before "://"; defaults to "http", or "https" for port 443 */
+ blob_t username; /* text before '@' (before the ':' when a password is given) */
+ blob_t password; /* text between ':' and '@', if any */
+ blob_t host; /* domain name, without the port */
+ blob_t path; /* text after the host up to the first of "?&;#"; defaults to "/" */
+ blob_t query; /* text after the path up to the first '#' */
+ blob_t fragment; /* whatever is left after the query */
+ int port; /* decimal port after the host; defaults to 443 for "https", else 80 */
+};
+
+/* URI is generalized as:
+ * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]]
+ * Character literals used as separators are:
+ * : / @ ? & ; #
+ * Also URI escaping says to treat %XX as encoded hex value.
+ */
+
+/*
+ * Split uri into its components and store them in *nfo.
+ *
+ * *nfo is cleared first.  Components that are absent are filled in
+ * with defaults: the protocol becomes "https" when the port is 443 and
+ * "http" otherwise, the port becomes 443 for "https" and 80 otherwise,
+ * and the path becomes "/".  Defaulted blobs refer to string literals.
+ *
+ * Returns 1 on success and 0 when the URI is rejected: more than one
+ * '@' user-info part, or no host name at all.
+ */
+static int url_parse(blob_t uri, struct url_info *nfo)
+{
+	/* Must start out empty: it is tested by the '@' and '/' cases
+	 * below, possibly before any ':' has been seen. */
+	blob_t before_colon = BLOB_NULL;
+	blob_t word;
+
+	memset(nfo, 0, sizeof(*nfo));
+
+	/* parse protocol, username/password and domain name/port */
+	do {
+		word = blob_pull_cspn(&uri, BLOB_STR(":@/?"));
+		/* running out of input ends the host just like '/' does */
+		switch (uri.len ? uri.ptr[0] : '/') {
+		case ':':
+			blob_pull_skip(&uri, 1);
+			if (blob_is_null(nfo->protocol) &&
+			    blob_pull_matching(&uri, BLOB_STR("//")))
+				nfo->protocol = word;
+			else
+				before_colon = word;
+			break;
+		case '@':
+			blob_pull_skip(&uri, 1);
+			if (!blob_is_null(nfo->username) ||
+			    !blob_is_null(nfo->password))
+				goto error;
+			if (!blob_is_null(before_colon)) {
+				nfo->username = before_colon;
+				nfo->password = word;
+			} else
+				nfo->username = word;
+			before_colon = BLOB_NULL;
+			break;
+		case '/':
+		case '?':
+			if (!blob_is_null(before_colon)) {
+				nfo->host = before_colon;
+				nfo->port = blob_pull_uint(&word, 10);
+			} else
+				nfo->host = word;
+			/* The separator is not consumed here, so an empty
+			 * host with input remaining would make this loop
+			 * spin forever.  A URI without a host is of no use
+			 * to the classifier anyway, so reject it. */
+			if (blob_is_null(nfo->host))
+				goto error;
+			break;
+		default:
+			/* not reachable as long as the switch covers the
+			 * whole reject set given to blob_pull_cspn() */
+			goto error;
+		}
+	} while (blob_is_null(nfo->host) && !blob_is_null(uri));
+
+	/* rest of the components */
+	nfo->path = blob_pull_cspn(&uri, BLOB_STR("?&;#"));
+	nfo->query = blob_pull_cspn(&uri, BLOB_STR("#"));
+	nfo->fragment = uri;
+
+	/* fill in defaults if needed */
+	if (blob_is_null(nfo->protocol)) {
+		if (nfo->port == 443)
+			nfo->protocol = BLOB_STR("https");
+		else
+			nfo->protocol = BLOB_STR("http");
+		if (nfo->port == 0)
+			nfo->port = 80;
+	} else if (nfo->port == 0) {
+		if (blob_cmp(nfo->protocol, BLOB_STR("https")) == 0)
+			nfo->port = 443;
+		else
+			nfo->port = 80;
+	}
+	if (blob_is_null(nfo->path))
+		nfo->path = BLOB_STR("/");
+
+	return 1;
+error:
+	return 0;
+}
+
+/*
+ * Print the components of nfo to stdout as " name{value}" pairs.
+ * Empty blob fields are skipped; the port is always printed.  No
+ * trailing newline is written.
+ */
+static void url_print(struct url_info *nfo)
+{
+/* Wrapped in do/while(0) so the macro behaves as one statement.  The
+ * length is cast because the '*' precision of printf() must be an int. */
+#define print_field(nfo, x) \
+	do { \
+		if (!blob_is_null((nfo)->x)) \
+			printf(" %s{%.*s}", #x, (int)(nfo)->x.len, (nfo)->x.ptr); \
+	} while (0)
+	print_field(nfo, protocol);
+	print_field(nfo, username);
+	print_field(nfo, password);
+	print_field(nfo, host);
+	printf(" port{%d}", nfo->port);
+	print_field(nfo, path);
+	print_field(nfo, query);
+	print_field(nfo, fragment);
+#undef print_field
+}
+
+/*
+ * Look up the category of url->host in the database db.
+ *
+ * The host is matched from its rightmost components towards the left,
+ * one '.'-separated component per lookup.  Returns the category name
+ * taken from the strings section, or "unknown" when no index entry
+ * matched.  Only the host is examined; path matching is not
+ * implemented yet.
+ *
+ * NOTE(review): the section pointers are used without NULL checks, and
+ * the category index is printed to stdout (looks like debug output).
+ */
+static blob_t url_classify(struct url_info *url, struct sqdb *db)
+{
+ blob_t key, got, tld;
+ void *cmph;
+ struct sqdb_index_entry *indx;
+ uint32_t *categories;
+ char *strings;
+ cmph_uint32 i = -1, previ; /* i == -1 means "no entry matched (yet)" */
+
+ cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
+ indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
+ strings = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL);
+
+ /* search for most qualified domain match; do first lookup
+ * with two domain components: key starts out empty at the end of
+ * the host and is first grown by the top level domain here */
+ key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0);
+ tld = blob_expand_head(&key, url->host, '.');
+
+ do {
+ /* add one more domain component */
+ got = blob_expand_head(&key, url->host, '.');
+ if (blob_is_null(got))
+ break;
+
+ /* remember the previous hit so we can fall back to it */
+ previ = i;
+ i = cmph_search_packed(cmph, key.ptr, key.len);
+ /* the entry is compared against the key's component to confirm
+ * the hit; NOTE(review): presumably the hash can return an
+ * index for keys that are not in the database -- confirm */
+ if (blob_cmp(got, BLOB_STR(&strings[indx[i].component])) != 0) {
+ /* the subdomain no longer matched, use the
+ * parent's classification */
+ i = previ;
+ goto parent_dns_match;
+ }
+ if (!blob_is_null(tld)) {
+ /* first lookup only: also verify the top level domain
+ * through the entry's parent */
+ if (blob_cmp(tld, BLOB_STR(&strings[indx[indx[i].parent].component])) != 0) {
+ /* top level domain did not match */
+ i = -1;
+ goto parent_dns_match;
+ }
+ tld = BLOB_NULL;
+ }
+ } while (indx[i].has_subdomains);
+
+ if (key.ptr != url->host.ptr) {
+ /* the full dns name did not match, so we skip the
+ * path name search */
+ goto parent_dns_match;
+ }
+
+ /* and then search for path matches (not implemented yet) */
+
+
+parent_dns_match:
+ if (i == -1)
+ return BLOB_STR("unknown");
+
+ categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL);
+ printf("%d\n", indx[i].category);
+ return BLOB_STR(&strings[categories[indx[i].category]]);
+}
+
+/*
+ * Test driver: parse each of a fixed list of sample URIs, classify it
+ * against "squark.db" in the current directory and print the result.
+ * Returns 0 on success, 1 if the database could not be opened.
+ */
+int main(int argc, char **argv)
+{
+	const char * const uri[] = {
+		"http://sex.com",
+		"http://facebook.com:1234/",
+		"https://slashdot.org/path/to/me",
+		"http://user:pass@paistortuga.com/~mocosoft",
+		"user@weather.whenu.speedera.net",
+		"zedo1.speedera.net",
+		"foo.com/stuff?query;bar#frag",
+		"foo.com?query;bar#frag",
+	};
+	struct sqdb db;
+	struct url_info nfo;
+	blob_t cat;
+	size_t i;
+
+	/* db is uninitialised stack memory until sqdb_open() succeeds;
+	 * classifying against it would read garbage */
+	if (sqdb_open(&db, "squark.db") < 0) {
+		fprintf(stderr, "squark.db: unable to open database\n");
+		return 1;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(uri); i++) {
+		if (url_parse(BLOB_STR(uri[i]), &nfo)) {
+			cat = url_classify(&nfo, &db);
+			/* the '*' precision of printf() must be an int */
+			printf("%s - %.*s -", uri[i], (int)cat.len, cat.ptr);
+			url_print(&nfo);
+			printf("\n");
+		} else {
+			printf("%s - BAD_URL\n", uri[i]);
+		}
+	}
+	sqdb_close(&db);
+
+	return 0;
+}
diff --git a/squarkdb.c b/squarkdb.c
index 543cbb1..e05f514 100644
--- a/squarkdb.c
+++ b/squarkdb.c
@@ -17,10 +17,11 @@ const char *sqdb_section_names[SQDB_SECTION_MAX] = {
[SQDB_SECTION_KEYWORD_MPH] = "keyword_mph",
};
-static int sqdb_allocate(struct sqdb *db, size_t s)
+static int sqdb_allocate(struct sqdb *db, size_t s, int wr)
{
size_t old_size, new_size;
void *base;
+ int prot = PROT_READ;
old_size = db->file_length;
new_size = ALIGN(db->file_length + s, PAGE_SIZE);
@@ -30,12 +31,13 @@ static int sqdb_allocate(struct sqdb *db, size_t s)
return old_size;
}
- if (ftruncate(db->fd, new_size) < 0)
+ if (wr && ftruncate(db->fd, new_size) < 0)
return -1;
if (db->mmap_base == NULL) {
- base = mmap(NULL, new_size, PROT_READ|PROT_WRITE,
- MAP_SHARED, db->fd, 0);
+ if (wr)
+ prot |= PROT_WRITE;
+ base = mmap(NULL, new_size, prot, MAP_SHARED, db->fd, 0);
} else {
base = mremap(db->mmap_base, ALIGN(old_size, PAGE_SIZE),
new_size, MREMAP_MAYMOVE);
@@ -49,6 +51,23 @@ static int sqdb_allocate(struct sqdb *db, size_t s)
return old_size;
}
+/*
+ * Open the existing database file fn read-only and map it into memory.
+ *
+ * Returns 0 on success.  Returns -1 if the file cannot be opened, its
+ * size cannot be determined or it cannot be mapped; in that case the
+ * descriptor is closed again and db must not be used.
+ */
+int sqdb_open(struct sqdb *db, const char *fn)
+{
+	struct stat st;
+
+	db->fd = open(fn, O_RDONLY);
+	if (db->fd < 0)
+		return -1;
+
+	db->file_length = 0;
+	db->mmap_base = NULL;
+
+	/* wr == 0: map what is already there, never resize the file.
+	 * A negative return from sqdb_allocate() signals failure, as
+	 * sqdb_create() also assumes. */
+	if (fstat(db->fd, &st) < 0 ||
+	    sqdb_allocate(db, st.st_size, 0) < 0) {
+		close(db->fd);
+		db->fd = -1;
+		return -1;
+	}
+
+	return 0;
+}
+
int sqdb_create(struct sqdb *db, const char *fn)
{
struct sqdb_header *hdr;
@@ -61,7 +80,7 @@ int sqdb_create(struct sqdb *db, const char *fn)
db->file_length = 0;
db->mmap_base = NULL;
- rc = sqdb_allocate(db, sizeof(struct sqdb_header));
+ rc = sqdb_allocate(db, sizeof(struct sqdb_header), 1);
if (rc < 0) {
close(db->fd);
return rc;
@@ -94,7 +113,7 @@ void *sqdb_section_create(struct sqdb *db, int id, uint32_t size)
if (hdr->section[id].offset || hdr->section[id].length)
return NULL;
- pos = sqdb_allocate(db, size);
+ pos = sqdb_allocate(db, size, 1);
if (pos < 0)
return NULL;