diff options
author | Timo Teräs <timo.teras@iki.fi> | 2010-08-11 14:28:06 +0300 |
---|---|---|
committer | Timo Teräs <timo.teras@iki.fi> | 2010-08-11 14:28:06 +0300 |
commit | e0a013397a51963039c43877be3afe954e519be0 (patch) | |
tree | 77cbd4db435ec62d679920596ffbd07166fdc902 | |
parent | cf7e91d59880424ff6c643a848938619b7968ad8 (diff) | |
download | squark-e0a013397a51963039c43877be3afe954e519be0.tar.bz2 squark-e0a013397a51963039c43877be3afe954e519be0.tar.xz |
filter: implement basic analysis of urls
Analyse the host part of the URL and add some simple tests. Not yet usable
as a squid filter.
-rw-r--r-- | Makefile | 5 | ||||
-rw-r--r-- | blob.c | 20 | ||||
-rw-r--r-- | blob.h | 4 | ||||
-rw-r--r-- | squark-filter.c | 198 | ||||
-rw-r--r-- | squarkdb.c | 31 |
5 files changed, 250 insertions, 8 deletions
@@ -1,4 +1,4 @@ -TARGETS=squark-auth squarkdb.so +TARGETS=squark-auth squark-filter squarkdb.so NETSNMP_CFLAGS:=$(shell net-snmp-config --cflags) NETSNMP_LIBS:=$(shell net-snmp-config --libs) @@ -15,6 +15,9 @@ all: $(TARGETS) squark-auth: squark-auth.o blob.o $(CC) -o $@ $^ $(NETSNMP_LIBS) +squark-filter: squark-filter.o squarkdb.o blob.o + $(CC) -o $@ $^ $(CMPH_LIBS) + squarkdb.so: lua-squarkdb.o squarkdb.o blob.o $(CC) -shared -o $@ $^ $(LUA_LIBS) $(CMPH_LIBS) @@ -180,3 +180,23 @@ blob_t blob_pull_cspn(blob_t *b, const blob_t reject) *b = BLOB_NULL; return t; } + +blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep) +{ + blob_t t = *b; + blob_t r; + + if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len) + return BLOB_NULL; + while (t.ptr > limits.ptr && t.ptr[-1] == sep) + t.ptr--, t.len++; + + r.ptr = t.ptr; + r.len = 0; + while (t.ptr > limits.ptr && t.ptr[-1] != sep) { + t.ptr--, t.len++; + r.ptr--, r.len++; + } + *b = t; + return r; +} @@ -31,7 +31,7 @@ extern const blob_t BLOB_NULL; static inline int blob_is_null(blob_t b) { - return b.ptr == NULL; + return b.len == 0; } char *blob_cstr_dup(blob_t b); @@ -47,4 +47,6 @@ int blob_pull_matching(blob_t *b, blob_t e); unsigned int blob_pull_uint(blob_t *b, int radix); blob_t blob_pull_cspn(blob_t *b, const blob_t cspn); +blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep); + #endif diff --git a/squark-filter.c b/squark-filter.c new file mode 100644 index 0000000..8973d04 --- /dev/null +++ b/squark-filter.c @@ -0,0 +1,198 @@ +#include <stdio.h> + +#include <cmph.h> + +#include "squarkdb.h" +#include "blob.h" + +struct url_info { + blob_t protocol; + blob_t username; + blob_t password; + blob_t host; + blob_t path; + blob_t query; + blob_t fragment; + int port; +}; + +/* URI is generalized as: + * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]] + * Character literals used as separators are: + * : / @ ? 
& ; # + * Also URI escaping says to treat %XX as encoded hex value. + */ + +static int url_parse(blob_t uri, struct url_info *nfo) +{ + blob_t before_colon; + blob_t word; + + memset(nfo, 0, sizeof(*nfo)); + + /* parse protocol, username/password and domain name/port */ + do { + word = blob_pull_cspn(&uri, BLOB_STR(":@/?")); + switch (uri.len ? uri.ptr[0] : '/') { + case ':': + blob_pull_skip(&uri, 1); + if (blob_is_null(nfo->protocol) && + blob_pull_matching(&uri, BLOB_STR("//"))) + nfo->protocol = word; + else + before_colon = word; + break; + case '@': + blob_pull_skip(&uri, 1); + if (!blob_is_null(nfo->username) || + !blob_is_null(nfo->password)) + goto error; + if (!blob_is_null(before_colon)) { + nfo->username = before_colon; + nfo->password = word; + } else + nfo->username = word; + before_colon = BLOB_NULL; + break; + case '/': + case '?': + if (!blob_is_null(before_colon)) { + nfo->host = before_colon; + nfo->port = blob_pull_uint(&word, 10); + } else + nfo->host = word; + break; + } + } while (blob_is_null(nfo->host) && !blob_is_null(uri)); + + /* rest of the components */ + nfo->path = blob_pull_cspn(&uri, BLOB_STR("?&;#")); + nfo->query = blob_pull_cspn(&uri, BLOB_STR("#")); + nfo->fragment = uri; + + /* fill in defaults if needed */ + if (blob_is_null(nfo->protocol)) { + if (nfo->port == 443) + nfo->protocol = BLOB_STR("https"); + else + nfo->protocol = BLOB_STR("http"); + if (nfo->port == 0) + nfo->port = 80; + } else if (nfo->port == 0) { + if (blob_cmp(nfo->protocol, BLOB_STR("https")) == 0) + nfo->port = 443; + else + nfo->port = 80; + } + if (blob_is_null(nfo->path)) + nfo->path = BLOB_STR("/"); + + return 1; +error: + return 0; +} + +static void url_print(struct url_info *nfo) +{ +#define print_field(nfo, x) if (!blob_is_null(nfo->x)) printf(" %s{%.*s}", #x, nfo->x.len, nfo->x.ptr) + print_field(nfo, protocol); + print_field(nfo, username); + print_field(nfo, password); + print_field(nfo, host); + printf(" port{%d}", nfo->port); + 
print_field(nfo, path); + print_field(nfo, query); + print_field(nfo, fragment); +#undef print_field +} + +static blob_t url_classify(struct url_info *url, struct sqdb *db) +{ + blob_t key, got, tld; + void *cmph; + struct sqdb_index_entry *indx; + uint32_t *categories; + char *strings; + cmph_uint32 i = -1, previ; + + cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); + indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL); + strings = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL); + + /* search for most qualified domain match; do first lookup + * with two domain components */ + key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0); + tld = blob_expand_head(&key, url->host, '.'); + + do { + /* add one more domain component */ + got = blob_expand_head(&key, url->host, '.'); + if (blob_is_null(got)) + break; + + previ = i; + i = cmph_search_packed(cmph, key.ptr, key.len); + if (blob_cmp(got, BLOB_STR(&strings[indx[i].component])) != 0) { + /* the subdomain did no longer match, use + * parents classification */ + i = previ; + goto parent_dns_match; + } + if (!blob_is_null(tld)) { + if (blob_cmp(tld, BLOB_STR(&strings[indx[indx[i].parent].component])) != 0) { + /* top level domain did not match */ + i = -1; + goto parent_dns_match; + } + tld = BLOB_NULL; + } + } while (indx[i].has_subdomains); + + if (key.ptr != url->host.ptr) { + /* the full of dns part did not match, so we skip the + * path name search */ + goto parent_dns_match; + } + + /* and then search for path matches */ + + +parent_dns_match: + if (i == -1) + return BLOB_STR("unknown"); + + categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL); + printf("%d\n", indx[i].category); + return BLOB_STR(&strings[categories[indx[i].category]]); +} + +int main(int argc, char **argv) +{ + const char * const uri[] = { + "http://sex.com", + "http://facebook.com:1234/", + "https://slashdot.org/path/to/me", + "http://user:pass@paistortuga.com/~mocosoft", + "user@weather.whenu.speedera.net", + 
"zedo1.speedera.net", + "foo.com/stuff?query;bar#frag", + "foo.com?query;bar#frag", + }; + struct sqdb db; + struct url_info nfo; + blob_t cat; + int i; + + sqdb_open(&db, "squark.db"); + for (i = 0; i < ARRAY_SIZE(uri); i++) { + if (url_parse(BLOB_STR(uri[i]), &nfo)) { + cat = url_classify(&nfo, &db); + printf("%s - %.*s -", uri[i], cat.len, cat.ptr); + url_print(&nfo); + printf("\n"); + } else { + printf("%s - BAD_URL\n", uri[i]); + } + } + sqdb_close(&db); +} @@ -17,10 +17,11 @@ const char *sqdb_section_names[SQDB_SECTION_MAX] = { [SQDB_SECTION_KEYWORD_MPH] = "keyword_mph", }; -static int sqdb_allocate(struct sqdb *db, size_t s) +static int sqdb_allocate(struct sqdb *db, size_t s, int wr) { size_t old_size, new_size; void *base; + int prot = PROT_READ; old_size = db->file_length; new_size = ALIGN(db->file_length + s, PAGE_SIZE); @@ -30,12 +31,13 @@ static int sqdb_allocate(struct sqdb *db, size_t s) return old_size; } - if (ftruncate(db->fd, new_size) < 0) + if (wr && ftruncate(db->fd, new_size) < 0) return -1; if (db->mmap_base == NULL) { - base = mmap(NULL, new_size, PROT_READ|PROT_WRITE, - MAP_SHARED, db->fd, 0); + if (wr) + prot |= PROT_WRITE; + base = mmap(NULL, new_size, prot, MAP_SHARED, db->fd, 0); } else { base = mremap(db->mmap_base, ALIGN(old_size, PAGE_SIZE), new_size, MREMAP_MAYMOVE); @@ -49,6 +51,23 @@ static int sqdb_allocate(struct sqdb *db, size_t s) return old_size; } +int sqdb_open(struct sqdb *db, const char *fn) +{ + struct stat st; + + db->fd = open(fn, O_RDONLY); + if (db->fd < 0) + return -1; + + fstat(db->fd, &st); + + db->file_length = 0; + db->mmap_base = NULL; + sqdb_allocate(db, st.st_size, 0); + + return 0; +} + int sqdb_create(struct sqdb *db, const char *fn) { struct sqdb_header *hdr; @@ -61,7 +80,7 @@ int sqdb_create(struct sqdb *db, const char *fn) db->file_length = 0; db->mmap_base = NULL; - rc = sqdb_allocate(db, sizeof(struct sqdb_header)); + rc = sqdb_allocate(db, sizeof(struct sqdb_header), 1); if (rc < 0) { close(db->fd); 
return rc; @@ -94,7 +113,7 @@ void *sqdb_section_create(struct sqdb *db, int id, uint32_t size) if (hdr->section[id].offset || hdr->section[id].length) return NULL; - pos = sqdb_allocate(db, size); + pos = sqdb_allocate(db, size, 1); if (pos < 0) return NULL; |