From 7daf2874969fb6773d480e9776cd8418eeb6353f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Fri, 13 Aug 2010 13:40:41 +0300 Subject: filter: fix db building issues and implement path component matching Fixes the has_subdomains/has_paths hints to be correct. Stripping a leading www label from a domain now checks that it won't remove a second-level domain name. And the filter code now looks up path components from the db. --- blob.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- blob.h | 3 +++ sqdb-build.lua | 17 +++++++++------ squark-filter.c | 39 ++++++++++++++++++++++++++++------ 4 files changed, 110 insertions(+), 15 deletions(-) diff --git a/blob.c b/blob.c index a417a0b..0c28877 100644 --- a/blob.c +++ b/blob.c @@ -111,6 +111,29 @@ void blob_push_hexdump(blob_t *to, blob_t binary) to->len -= binary.len * 2; } +void blob_push_urldecode(blob_t *to, blob_t url) +{ + blob_t b, orig = *to; + + do { + blob_pull_matching(&url, BLOB_STR("/")); + b = blob_pull_cspn(&url, BLOB_STR("/")); + if (blob_is_null(url) && blob_is_null(b)) + break; + + if (blob_is_null(b) || blob_cmp(b, BLOB_STR(".")) == 0) { + /* skip '.' 
or two consecutive / */ + } else if (blob_cmp(b, BLOB_STR("..")) == 0) { + /* go up one path component */ + blob_shrink_tail(to, blob_pushed(orig, b), '/'); + } else { + /* copy decoded; FIXME decode percent encoding */ + blob_push(to, BLOB_STR("/")); + blob_push(to, b); + } + } while (1); +} + blob_t blob_pull(blob_t *b, int len) { blob_t r; @@ -177,6 +200,7 @@ blob_t blob_pull_cspn(blob_t *b, const blob_t reject) return BLOB_PTR_LEN(t.ptr, i); } } + *b = BLOB_NULL; return t; } @@ -186,7 +210,7 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep) blob_t t = *b; blob_t r; - if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len) + if (t.ptr < limits.ptr || t.ptr+t.len > limits.ptr+limits.len) return BLOB_NULL; while (t.ptr > limits.ptr && t.ptr[-1] == sep) t.ptr--, t.len++; @@ -200,3 +224,43 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep) *b = t; return r; } + +blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep) +{ + blob_t t = *b; + blob_t r; + + if (t.ptr < limits.ptr || t.ptr+t.len > limits.ptr+limits.len) + return BLOB_NULL; + while (t.ptr + t.len < limits.ptr + limits.len && t.ptr[t.len] == sep) + t.len++; + + r.ptr = t.ptr + t.len; + r.len = 0; + while (t.ptr + t.len < limits.ptr + limits.len && t.ptr[t.len] != sep) { + t.len++; + r.len++; + } + *b = t; + return r; +} + +blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep) +{ + blob_t t = *b; + blob_t r; + + if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len) + return BLOB_NULL; + while (t.len && t.ptr[t.len-1] == sep) + t.len--; + + r.ptr = t.ptr; + r.len = 0; + while (t.len && t.ptr[t.len-1] != sep) { + t.len--; + r.ptr--, r.len++; + } + *b = t; + return r; +} diff --git a/blob.h b/blob.h index 767e661..3d065ed 100644 --- a/blob.h +++ b/blob.h @@ -41,6 +41,7 @@ blob_t blob_pushed(blob_t buffer, blob_t left); void blob_push(blob_t *b, blob_t d); void blob_push_uint(blob_t *to, unsigned int value, int radix); void 
blob_push_hexdump(blob_t *to, blob_t binary); +void blob_push_urldecode(blob_t *to, blob_t url); blob_t blob_pull(blob_t *b, int len); void blob_pull_skip(blob_t *b, int len); int blob_pull_matching(blob_t *b, blob_t e); @@ -48,5 +49,7 @@ unsigned int blob_pull_uint(blob_t *b, int radix); blob_t blob_pull_cspn(blob_t *b, const blob_t cspn); blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep); +blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep); +blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep); #endif diff --git a/sqdb-build.lua b/sqdb-build.lua index fce1e7b..2b301fc 100755 --- a/sqdb-build.lua +++ b/sqdb-build.lua @@ -117,7 +117,7 @@ local function read_urls(filename, category, locked) url = url:gsub("#.*", "") url = url:gsub(" *^", "") url = url:lower() - url = url:gsub("^(www%d*[.])", "") + url = url:gsub("^(www%d*[.])([^.]*[.])", "%2") domain, path = url:match("([^/]*)/?(.*)") domain = domain:gsub(":.*", "") domain = domain:gsub("[.]$", "") -- trailing dot @@ -187,7 +187,7 @@ local function enum_tree(cb, category, dns, data) fdns = cdns end cat = cdata.category or category - cb(fdns, dns, cdns, cat, data.children, data.paths) + cb(fdns, dns, cdns, cat, cdata.children, cdata.paths) enum_tree(cb, cat, fdns, cdata) end end @@ -233,15 +233,15 @@ local function prune_paths(paths, category) return num_paths end -local function prune_tree(d, category) +local function prune_tree(d, pcategory) local num_childs = 0 local num_paths = 0 local cat - cat = d.category or category + cat = d.category or pcategory if d.children ~= nil then for n, child in pairs(d.children) do - if prune_tree(child, cat, count) then + if prune_tree(child, cat, n) then d.children[n] = nil else num_childs = num_childs + 1 @@ -254,9 +254,12 @@ local function prune_tree(d, category) end --print(name, d.category, category, d.num_paths, num_childs) if d.paths ~= nil then - num_paths = prune_paths(d.paths, d.category) + num_paths = 
prune_paths(d.paths, cat) + if num_paths == 0 then + d.paths = nil + end end - if cat == category and num_paths == 0 and num_childs == 0 then + if d.category == pcategory and num_paths == 0 and num_childs == 0 then --num_pruned_leafs = num_pruned_leafs + 1 return true end diff --git a/squark-filter.c b/squark-filter.c index e47cbf5..f3a4aed 100644 --- a/squark-filter.c +++ b/squark-filter.c @@ -108,7 +108,8 @@ static void url_print(struct url_info *nfo) static blob_t url_classify(struct url_info *url, struct sqdb *db) { - blob_t key, got, tld; + unsigned char buffer[1024]; + blob_t b, key, got, tld, mkey; void *cmph; struct sqdb_index_entry *indx; uint32_t *categories; @@ -144,23 +145,45 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db) } tld = BLOB_NULL; } + mkey = key; } while (indx[i].has_subdomains); - if (key.ptr != url->host.ptr) { - /* the full of dns part did not match, so we skip the - * path name search */ + if (key.ptr != url->host.ptr || !indx[i].has_paths) { + /* the full dns part did not match, or there's no more + * specific paths in db -- skip the path name search */ goto parent_dns_match; } - /* and then search for path matches */ - + /* and then search for path matches -- construct hashing + * string of url decoded path */ + b = BLOB_BUF(buffer); + blob_push(&b, key); + key = blob_pushed(BLOB_BUF(buffer), b); + blob_push_urldecode(&b, url->path); + b = blob_pushed(BLOB_BUF(buffer), b); + + while (indx[i].has_paths) { + /* add one more path component */ + got = blob_expand_tail(&key, b, '/'); + if (blob_is_null(got)) + break; + previ = i; + i = cmph_search_packed(cmph, key.ptr, key.len); + tld = sqdb_get_string_literal(db, indx[i].component); + if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) { + /* the subdomain did no longer match, use + * parents classification */ + i = previ; + goto parent_dns_match; + } + mkey = key; + } parent_dns_match: if (i == -1) return BLOB_STR("unknown"); categories = 
sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL); - printf("%d\n", indx[i].category); return sqdb_get_string_literal(db, categories[indx[i].category]); } @@ -171,10 +194,12 @@ int main(int argc, char **argv) "http://facebook.com:1234/", "https://slashdot.org/path/to/me", "http://user:pass@paistortuga.com/~mocosoft", + "http://user:pass@paistortuga.com", "user@weather.whenu.speedera.net", "zedo1.speedera.net", "foo.com/stuff?query;bar#frag", "foo.com?query;bar#frag", + "aapracingandsports.com.au/racing/", }; struct sqdb db; struct url_info nfo; -- cgit v1.2.3