diff options
author | Timo Teräs <timo.teras@iki.fi> | 2010-08-13 13:40:41 +0300 |
---|---|---|
committer | Timo Teräs <timo.teras@iki.fi> | 2010-08-13 13:40:41 +0300 |
commit | 7daf2874969fb6773d480e9776cd8418eeb6353f (patch) | |
tree | 32a9ca18ded660b18b4234c3311e09238d71c128 | |
parent | 8bc76c78a69360efc7a07a3c4e92f393cca22543 (diff) | |
download | squark-7daf2874969fb6773d480e9776cd8418eeb6353f.tar.bz2 squark-7daf2874969fb6773d480e9776cd8418eeb6353f.tar.xz |
filter: fix db building issues and implement path component matching
Fixes the has_subdomains/has_paths hints so that they are correct. Stripping
a leading www<number> domain label now checks that it will not remove a
second-level domain name.
The filter code now also looks up path components from the db.
-rw-r--r-- | blob.c | 66 | ||||
-rw-r--r-- | blob.h | 3 | ||||
-rwxr-xr-x | sqdb-build.lua | 17 | ||||
-rw-r--r-- | squark-filter.c | 39 |
4 files changed, 110 insertions, 15 deletions
@@ -111,6 +111,29 @@ void blob_push_hexdump(blob_t *to, blob_t binary) to->len -= binary.len * 2; } +void blob_push_urldecode(blob_t *to, blob_t url) +{ + blob_t b, orig = *to; + + do { + blob_pull_matching(&url, BLOB_STR("/")); + b = blob_pull_cspn(&url, BLOB_STR("/")); + if (blob_is_null(url) && blob_is_null(b)) + break; + + if (blob_is_null(b) || blob_cmp(b, BLOB_STR(".")) == 0) { + /* skip '.' or two consecutive / */ + } else if (blob_cmp(b, BLOB_STR("..")) == 0) { + /* go up one path component */ + blob_shrink_tail(to, blob_pushed(orig, b), '/'); + } else { + /* copy decoded; FIXME decode percent encoding */ + blob_push(to, BLOB_STR("/")); + blob_push(to, b); + } + } while (1); +} + blob_t blob_pull(blob_t *b, int len) { blob_t r; @@ -177,6 +200,7 @@ blob_t blob_pull_cspn(blob_t *b, const blob_t reject) return BLOB_PTR_LEN(t.ptr, i); } } + *b = BLOB_NULL; return t; } @@ -186,7 +210,7 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep) blob_t t = *b; blob_t r; - if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len) + if (t.ptr < limits.ptr || t.ptr+t.len > limits.ptr+limits.len) return BLOB_NULL; while (t.ptr > limits.ptr && t.ptr[-1] == sep) t.ptr--, t.len++; @@ -200,3 +224,43 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep) *b = t; return r; } + +blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep) +{ + blob_t t = *b; + blob_t r; + + if (t.ptr < limits.ptr || t.ptr+t.len > limits.ptr+limits.len) + return BLOB_NULL; + while (t.ptr + t.len < limits.ptr + limits.len && t.ptr[t.len] == sep) + t.len++; + + r.ptr = t.ptr + t.len; + r.len = 0; + while (t.ptr + t.len < limits.ptr + limits.len && t.ptr[t.len] != sep) { + t.len++; + r.len++; + } + *b = t; + return r; +} + +blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep) +{ + blob_t t = *b; + blob_t r; + + if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len) + return BLOB_NULL; + while (t.len && t.ptr[t.len-1] == sep) + 
t.len--; + + r.ptr = t.ptr; + r.len = 0; + while (t.len && t.ptr[t.len-1] != sep) { + t.len--; + r.ptr--, r.len++; + } + *b = t; + return r; +} @@ -41,6 +41,7 @@ blob_t blob_pushed(blob_t buffer, blob_t left); void blob_push(blob_t *b, blob_t d); void blob_push_uint(blob_t *to, unsigned int value, int radix); void blob_push_hexdump(blob_t *to, blob_t binary); +void blob_push_urldecode(blob_t *to, blob_t url); blob_t blob_pull(blob_t *b, int len); void blob_pull_skip(blob_t *b, int len); int blob_pull_matching(blob_t *b, blob_t e); @@ -48,5 +49,7 @@ unsigned int blob_pull_uint(blob_t *b, int radix); blob_t blob_pull_cspn(blob_t *b, const blob_t cspn); blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep); +blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep); +blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep); #endif diff --git a/sqdb-build.lua b/sqdb-build.lua index fce1e7b..2b301fc 100755 --- a/sqdb-build.lua +++ b/sqdb-build.lua @@ -117,7 +117,7 @@ local function read_urls(filename, category, locked) url = url:gsub("#.*", "") url = url:gsub(" *^", "") url = url:lower() - url = url:gsub("^(www%d*[.])", "") + url = url:gsub("^(www%d*[.])([^.]*[.])", "%2") domain, path = url:match("([^/]*)/?(.*)") domain = domain:gsub(":.*", "") domain = domain:gsub("[.]$", "") -- trailing dot @@ -187,7 +187,7 @@ local function enum_tree(cb, category, dns, data) fdns = cdns end cat = cdata.category or category - cb(fdns, dns, cdns, cat, data.children, data.paths) + cb(fdns, dns, cdns, cat, cdata.children, cdata.paths) enum_tree(cb, cat, fdns, cdata) end end @@ -233,15 +233,15 @@ local function prune_paths(paths, category) return num_paths end -local function prune_tree(d, category) +local function prune_tree(d, pcategory) local num_childs = 0 local num_paths = 0 local cat - cat = d.category or category + cat = d.category or pcategory if d.children ~= nil then for n, child in pairs(d.children) do - if prune_tree(child, cat, count) 
then + if prune_tree(child, cat, n) then d.children[n] = nil else num_childs = num_childs + 1 @@ -254,9 +254,12 @@ local function prune_tree(d, category) end --print(name, d.category, category, d.num_paths, num_childs) if d.paths ~= nil then - num_paths = prune_paths(d.paths, d.category) + num_paths = prune_paths(d.paths, cat) + if num_paths == 0 then + d.paths = nil + end end - if cat == category and num_paths == 0 and num_childs == 0 then + if d.category == pcategory and num_paths == 0 and num_childs == 0 then --num_pruned_leafs = num_pruned_leafs + 1 return true end diff --git a/squark-filter.c b/squark-filter.c index e47cbf5..f3a4aed 100644 --- a/squark-filter.c +++ b/squark-filter.c @@ -108,7 +108,8 @@ static void url_print(struct url_info *nfo) static blob_t url_classify(struct url_info *url, struct sqdb *db) { - blob_t key, got, tld; + unsigned char buffer[1024]; + blob_t b, key, got, tld, mkey; void *cmph; struct sqdb_index_entry *indx; uint32_t *categories; @@ -144,23 +145,45 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db) } tld = BLOB_NULL; } + mkey = key; } while (indx[i].has_subdomains); - if (key.ptr != url->host.ptr) { - /* the full of dns part did not match, so we skip the - * path name search */ + if (key.ptr != url->host.ptr || !indx[i].has_paths) { + /* the full dns part did not match, or there's no more + * specific paths in db -- skip the path name search */ goto parent_dns_match; } - /* and then search for path matches */ - + /* and then search for path matches -- construct hashing + * string of url decoded path */ + b = BLOB_BUF(buffer); + blob_push(&b, key); + key = blob_pushed(BLOB_BUF(buffer), b); + blob_push_urldecode(&b, url->path); + b = blob_pushed(BLOB_BUF(buffer), b); + + while (indx[i].has_paths) { + /* add one more path component */ + got = blob_expand_tail(&key, b, '/'); + if (blob_is_null(got)) + break; + previ = i; + i = cmph_search_packed(cmph, key.ptr, key.len); + tld = sqdb_get_string_literal(db, 
indx[i].component); + if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) { + /* the subdomain did no longer match, use + * parents classification */ + i = previ; + goto parent_dns_match; + } + mkey = key; + } parent_dns_match: if (i == -1) return BLOB_STR("unknown"); categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL); - printf("%d\n", indx[i].category); return sqdb_get_string_literal(db, categories[indx[i].category]); } @@ -171,10 +194,12 @@ int main(int argc, char **argv) "http://facebook.com:1234/", "https://slashdot.org/path/to/me", "http://user:pass@paistortuga.com/~mocosoft", + "http://user:pass@paistortuga.com", "user@weather.whenu.speedera.net", "zedo1.speedera.net", "foo.com/stuff?query;bar#frag", "foo.com?query;bar#frag", + "aapracingandsports.com.au/racing/", }; struct sqdb db; struct url_info nfo; |