diff options
-rw-r--r-- | blob.c | 22 | ||||
-rw-r--r-- | blob.h | 1 | ||||
-rw-r--r-- | squark-filter.c | 61 |
3 files changed, 44 insertions, 40 deletions
@@ -1,3 +1,6 @@ +#include <ctype.h> +#include <string.h> + #include "blob.h" /* RFC 3986 section 2.3 Unreserved Characters (January 2005) */ @@ -162,6 +165,20 @@ void blob_push(blob_t *b, blob_t d) } } +void blob_push_lower(blob_t *b, blob_t d) +{ + int i; + + if (b->len < d.len) { + *b = BLOB_NULL; + return; + } + for (i = 0; i < d.len; i++) + b->ptr[i] = tolower(d.ptr[i]); + b->ptr += d.len; + b->len -= d.len; +} + void blob_push_byte(blob_t *b, unsigned char byte) { if (b->len) { @@ -219,9 +236,6 @@ void blob_push_urldecode(blob_t *to, blob_t url) do { blob_pull_matching(&url, BLOB_STR("/")); b = blob_pull_cspn(&url, BLOB_STR("/")); - if (blob_is_null(url) && blob_is_null(b)) - break; - if (blob_is_null(b) || blob_cmp(b, BLOB_STR(".")) == 0) { /* skip '.' or two consecutive / */ } else if (blob_cmp(b, BLOB_STR("..")) == 0) { @@ -232,7 +246,7 @@ void blob_push_urldecode(blob_t *to, blob_t url) blob_push_byte(to, '/'); blob_push(to, b); } - } while (1); + } while (!blob_is_null(url)); } void blob_push_urlencode(blob_t *to, blob_t url) @@ -42,6 +42,7 @@ unsigned long blob_inet_addr(blob_t buf); blob_t blob_pushed(blob_t buffer, blob_t left); void blob_push(blob_t *b, blob_t d); +void blob_push_lower(blob_t *b, blob_t d); void blob_push_byte(blob_t *b, unsigned char byte); void blob_push_uint(blob_t *to, unsigned int value, int radix); void blob_push_hexdump(blob_t *to, blob_t binary); diff --git a/squark-filter.c b/squark-filter.c index 8fab0bf..9bc6bb2 100644 --- a/squark-filter.c +++ b/squark-filter.c @@ -31,6 +31,7 @@ struct url_info { blob_t username; blob_t password; blob_t host; + blob_t significant_host; blob_t path; blob_t query; blob_t fragment; @@ -66,7 +67,6 @@ void blob_pull_url_dns_part(blob_t *b, struct url_dns_part_data *udp) case '5': case '6': case '7': case '8': case '9': break; default: - t.ptr[i] = tolower(c); numeric = 0; break; } @@ -155,7 +155,17 @@ static int url_parse(blob_t uri, struct url_info *nfo) if (blob_is_null(nfo->path)) nfo->path = BLOB_STR("/"); + /* significant host name */ + nfo->significant_host = nfo->host; + if (nfo->num_dots > 1) { + blob_t b = nfo->significant_host; + if (blob_pull_matching(&b, BLOB_STR("www")) && + (blob_pull_uint(&b, 10), 1) && + blob_pull_matching(&b, BLOB_STR("."))) + nfo->significant_host = b; + } return 1; + error: return 0; } @@ -178,8 +188,8 @@ static void url_print(struct url_info *nfo) static int url_classify(struct url_info *url, struct sqdb *db) { - unsigned char buffer[1024]; - blob_t b, key, got, tld, mkey; + unsigned char buffer[512]; + blob_t key, got, tld, keybuf, keylimits; void *cmph; struct sqdb_index_entry *indx; cmph_uint32 i = SQDB_PARENT_ROOT, previ = SQDB_PARENT_ROOT; @@ -188,10 +198,13 @@ static int url_classify(struct url_info *url, struct sqdb *db) cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL); + keybuf = BLOB_BUF(buffer); + blob_push_lower(&keybuf, url->significant_host); + key = keylimits = blob_pushed(BLOB_BUF(buffer), keybuf); + /* search for most qualified domain match; do first lookup * with two domain components */ if (url->is_ipv4) { - key = url->host; i = cmph_search_packed(cmph, key.ptr, key.len); if (indx[i].parent != SQDB_PARENT_IPV4 || @@ -200,12 +213,12 @@ static int url_classify(struct url_info *url, struct sqdb *db) goto parent_dns_match; } } else { - key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0); - tld = blob_expand_head(&key, url->host, '.'); + key = BLOB_PTR_LEN(key.ptr + key.len, 0); + tld = blob_expand_head(&key, keylimits, '.'); do { /* add one more domain component */ - got = blob_expand_head(&key, url->host, '.'); + got = blob_expand_head(&key, keylimits, '.'); if (blob_is_null(got)) break; @@ -232,45 +245,22 @@ static int url_classify(struct url_info *url, struct sqdb *db) i = previ; goto parent_dns_match; } - mkey = key; dots_done++; } while (indx[i].has_subdomains); } /* No paths to match for */ - if (i == SQDB_PARENT_ROOT || !indx[i].has_paths) + if (i == SQDB_PARENT_ROOT || !indx[i].has_paths || key.ptr != keylimits.ptr) goto parent_dns_match; - if (key.ptr != url->host.ptr) { - blob_t tmpkey = key; - - /* Not exact dns match, but there's paths. Check if we - * have only one more dns entry and it's of form www1 or - * such. If so, this should be treated as exact match. */ - if (dots_done != url->num_dots) - goto parent_dns_match; - - got = blob_expand_head(&tmpkey, url->host, '.'); - if (blob_is_null(got) || - !blob_pull_matching(&got, BLOB_STR("www"))) - goto parent_dns_match; - - blob_pull_uint(&got, 10); - if (got.len != 0) - goto parent_dns_match; - } - /* and then search for path matches -- construct hashing * string of url decoded path */ - b = BLOB_BUF(buffer); - blob_push(&b, key); - key = blob_pushed(BLOB_BUF(buffer), b); - blob_push_urldecode(&b, url->path); - b = blob_pushed(BLOB_BUF(buffer), b); + blob_push_urldecode(&keybuf, url->path); + key = keylimits = blob_pushed(BLOB_BUF(buffer), keybuf); while (indx[i].has_paths) { /* add one more path component */ - got = blob_expand_tail(&key, b, '/'); + got = blob_expand_tail(&key, keylimits, '/'); if (blob_is_null(got)) break; previ = i; @@ -282,7 +272,6 @@ static int url_classify(struct url_info *url, struct sqdb *db) i = previ; goto parent_dns_match; } - mkey = key; } parent_dns_match: @@ -432,7 +421,7 @@ int main(int argc, char **argv) struct sqdb db; int opt; - sqdb_open(&db, "squark.db"); + sqdb_open(&db, "/var/lib/squark/squark.db"); while ((opt = getopt(argc, argv, "r:b:")) != -1) { switch (opt) { |