From 7daf2874969fb6773d480e9776cd8418eeb6353f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Fri, 13 Aug 2010 13:40:41 +0300 Subject: filter: fix db building issues and implement path component matching Fixes the has_subdomains/has_paths hints to be correct. Matching www as the first domain entry now checks that it won't remove second-level domain names. And the filter code now looks up path components from the db. --- squark-filter.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'squark-filter.c') diff --git a/squark-filter.c b/squark-filter.c index e47cbf5..f3a4aed 100644 --- a/squark-filter.c +++ b/squark-filter.c @@ -108,7 +108,8 @@ static void url_print(struct url_info *nfo) static blob_t url_classify(struct url_info *url, struct sqdb *db) { - blob_t key, got, tld; + unsigned char buffer[1024]; + blob_t b, key, got, tld, mkey; void *cmph; struct sqdb_index_entry *indx; uint32_t *categories; @@ -144,23 +145,45 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db) } tld = BLOB_NULL; } + mkey = key; } while (indx[i].has_subdomains); - if (key.ptr != url->host.ptr) { - /* the full of dns part did not match, so we skip the - * path name search */ + if (key.ptr != url->host.ptr || !indx[i].has_paths) { + /* the full dns part did not match, or there's no more + * specific paths in db -- skip the path name search */ goto parent_dns_match; } - /* and then search for path matches */ - + /* and then search for path matches -- construct hashing + * string of url decoded path */ + b = BLOB_BUF(buffer); + blob_push(&b, key); + key = blob_pushed(BLOB_BUF(buffer), b); + blob_push_urldecode(&b, url->path); + b = blob_pushed(BLOB_BUF(buffer), b); + + while (indx[i].has_paths) { + /* add one more path component */ + got = blob_expand_tail(&key, b, '/'); + if (blob_is_null(got)) + break; + previ = i; + i = cmph_search_packed(cmph, key.ptr, key.len); + tld = sqdb_get_string_literal(db, 
indx[i].component); + if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) { + /* the subdomain did no longer match, use + * parents classification */ + i = previ; + goto parent_dns_match; + } + mkey = key; + } parent_dns_match: if (i == -1) return BLOB_STR("unknown"); categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL); - printf("%d\n", indx[i].category); return sqdb_get_string_literal(db, categories[indx[i].category]); } @@ -171,10 +194,12 @@ int main(int argc, char **argv) "http://facebook.com:1234/", "https://slashdot.org/path/to/me", "http://user:pass@paistortuga.com/~mocosoft", + "http://user:pass@paistortuga.com", "user@weather.whenu.speedera.net", "zedo1.speedera.net", "foo.com/stuff?query;bar#frag", "foo.com?query;bar#frag", + "aapracingandsports.com.au/racing/", }; struct sqdb db; struct url_info nfo; -- cgit v1.2.3