author     Timo Teräs <timo.teras@iki.fi>  2010-08-13 13:40:41 +0300
committer  Timo Teräs <timo.teras@iki.fi>  2010-08-13 13:40:41 +0300
commit     7daf2874969fb6773d480e9776cd8418eeb6353f (patch)
tree       32a9ca18ded660b18b4234c3311e09238d71c128
parent     8bc76c78a69360efc7a07a3c4e92f393cca22543 (diff)
download   squark-7daf2874969fb6773d480e9776cd8418eeb6353f.tar.bz2
           squark-7daf2874969fb6773d480e9776cd8418eeb6353f.tar.xz
filter: fix db building issues and implement path component matching
Fixes the has-subdomains/has-paths hints to be correct. Matching of www<number> as the first domain entry now checks that it won't remove second level domain names. And the filter code now looks up path components from the db.
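The gist of the change to url_classify() is easiest to see in isolation: the host key and the URL-decoded path are pushed into a scratch buffer, and the lookup key is then grown one '/'-separated path component at a time so the database can be probed for progressively longer prefixes. The sketch below is not part of the commit; the helper name walk_path_keys and the printf probe are made up for illustration, and the actual db probing (cmph_search_packed() and the indx[i].has_paths check) is left out. It assumes blob.h from this tree.

/* Illustration only: mirrors the key-growing loop added to
 * url_classify() in the diff below. */
#include <stdio.h>
#include "blob.h"

void walk_path_keys(blob_t host, blob_t path)
{
	unsigned char buffer[1024];
	blob_t b, key, got;

	/* scratch buffer holds "host" followed by the decoded path */
	b = BLOB_BUF(buffer);
	blob_push(&b, host);
	key = blob_pushed(BLOB_BUF(buffer), b);
	blob_push_urldecode(&b, path);
	b = blob_pushed(BLOB_BUF(buffer), b);

	/* extend the key by one path component per iteration:
	 * "host", "host/racing", "host/racing/results", ... */
	while (1) {
		got = blob_expand_tail(&key, b, '/');
		if (blob_is_null(got))
			break;
		/* the real code hashes 'key' here and compares the
		 * matched component against the db index entry */
		printf("probe: %.*s\n", (int)key.len, (const char *)key.ptr);
	}
}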
-rw-r--r--   blob.c           66
-rw-r--r--   blob.h            3
-rwxr-xr-x   sqdb-build.lua   17
-rw-r--r--   squark-filter.c  39
4 files changed, 110 insertions, 15 deletions
diff --git a/blob.c b/blob.c
index a417a0b..0c28877 100644
--- a/blob.c
+++ b/blob.c
@@ -111,6 +111,29 @@ void blob_push_hexdump(blob_t *to, blob_t binary)
 	to->len -= binary.len * 2;
 }
 
+void blob_push_urldecode(blob_t *to, blob_t url)
+{
+	blob_t b, orig = *to;
+
+	do {
+		blob_pull_matching(&url, BLOB_STR("/"));
+		b = blob_pull_cspn(&url, BLOB_STR("/"));
+		if (blob_is_null(url) && blob_is_null(b))
+			break;
+
+		if (blob_is_null(b) || blob_cmp(b, BLOB_STR(".")) == 0) {
+			/* skip '.' or two consecutive / */
+		} else if (blob_cmp(b, BLOB_STR("..")) == 0) {
+			/* go up one path component */
+			blob_shrink_tail(to, blob_pushed(orig, b), '/');
+		} else {
+			/* copy decoded; FIXME decode percent encoding */
+			blob_push(to, BLOB_STR("/"));
+			blob_push(to, b);
+		}
+	} while (1);
+}
+
 blob_t blob_pull(blob_t *b, int len)
 {
 	blob_t r;
@@ -177,6 +200,7 @@ blob_t blob_pull_cspn(blob_t *b, const blob_t reject)
 			return BLOB_PTR_LEN(t.ptr, i);
 		}
 	}
+
 	*b = BLOB_NULL;
 	return t;
 }
@@ -186,7 +210,7 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep)
 	blob_t t = *b;
 	blob_t r;
 
-	if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
+	if (t.ptr < limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
 		return BLOB_NULL;
 	while (t.ptr > limits.ptr && t.ptr[-1] == sep)
 		t.ptr--, t.len++;
@@ -200,3 +224,43 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep)
 	*b = t;
 	return r;
 }
+
+blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep)
+{
+	blob_t t = *b;
+	blob_t r;
+
+	if (t.ptr < limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
+		return BLOB_NULL;
+	while (t.ptr + t.len < limits.ptr + limits.len && t.ptr[t.len] == sep)
+		t.len++;
+
+	r.ptr = t.ptr + t.len;
+	r.len = 0;
+	while (t.ptr + t.len < limits.ptr + limits.len && t.ptr[t.len] != sep) {
+		t.len++;
+		r.len++;
+	}
+	*b = t;
+	return r;
+}
+
+blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep)
+{
+	blob_t t = *b;
+	blob_t r;
+
+	if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
+		return BLOB_NULL;
+	while (t.len && t.ptr[t.len-1] == sep)
+		t.len--;
+
+	r.ptr = t.ptr;
+	r.len = 0;
+	while (t.len && t.ptr[t.len-1] != sep) {
+		t.len--;
+		r.ptr--, r.len++;
+	}
+	*b = t;
+	return r;
+}
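
For reference, a small usage sketch of the new blob helpers (not part of the commit, assumes blob.h/blob.c from this tree). blob_push_urldecode() pushes a normalized copy of a path: '.' components and the empty components produced by doubled slashes are skipped, '..' is meant to drop the previously pushed component via blob_shrink_tail(), and percent decoding is still a FIXME. blob_expand_tail() then walks the pushed path forward one '/'-separated component at a time, which is what squark-filter.c uses to build progressively longer lookup keys.

/* Illustration only: normalize a path with blob_push_urldecode()
 * and peel one component back off with blob_expand_tail(). */
#include <stdio.h>
#include "blob.h"

int main(void)
{
	unsigned char buffer[256];
	blob_t b, out, key, comp;

	b = BLOB_BUF(buffer);
	blob_push_urldecode(&b, BLOB_STR("racing/./results//2010"));
	out = blob_pushed(BLOB_BUF(buffer), b);
	/* '.' and the empty component from '//' are dropped, so out
	 * should now read "/racing/results/2010" */
	printf("decoded: %.*s\n", (int)out.len, (const char *)out.ptr);

	/* peel off the first component of the decoded path */
	key = out;
	key.len = 0;
	comp = blob_expand_tail(&key, out, '/');
	printf("first component: %.*s\n", (int)comp.len, (const char *)comp.ptr);
	/* key now covers "/racing"; repeated calls extend it to
	 * "/racing/results" and then "/racing/results/2010" */
	return 0;
}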
diff --git a/blob.h b/blob.h
index 767e661..3d065ed 100644
--- a/blob.h
+++ b/blob.h
@@ -41,6 +41,7 @@ blob_t blob_pushed(blob_t buffer, blob_t left);
 void blob_push(blob_t *b, blob_t d);
 void blob_push_uint(blob_t *to, unsigned int value, int radix);
 void blob_push_hexdump(blob_t *to, blob_t binary);
+void blob_push_urldecode(blob_t *to, blob_t url);
 blob_t blob_pull(blob_t *b, int len);
 void blob_pull_skip(blob_t *b, int len);
 int blob_pull_matching(blob_t *b, blob_t e);
@@ -48,5 +49,7 @@ unsigned int blob_pull_uint(blob_t *b, int radix);
 blob_t blob_pull_cspn(blob_t *b, const blob_t cspn);
 
 blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep);
+blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep);
+blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep);
 
 #endif
diff --git a/sqdb-build.lua b/sqdb-build.lua
index fce1e7b..2b301fc 100755
--- a/sqdb-build.lua
+++ b/sqdb-build.lua
@@ -117,7 +117,7 @@ local function read_urls(filename, category, locked)
 		url = url:gsub("#.*", "")
 		url = url:gsub(" *^", "")
 		url = url:lower()
-		url = url:gsub("^(www%d*[.])", "")
+		url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")
 		domain, path = url:match("([^/]*)/?(.*)")
 		domain = domain:gsub(":.*", "")
 		domain = domain:gsub("[.]$", "") -- trailing dot
@@ -187,7 +187,7 @@ local function enum_tree(cb, category, dns, data)
 			fdns = cdns
 		end
 		cat = cdata.category or category
-		cb(fdns, dns, cdns, cat, data.children, data.paths)
+		cb(fdns, dns, cdns, cat, cdata.children, cdata.paths)
 		enum_tree(cb, cat, fdns, cdata)
 	end
 end
@@ -233,15 +233,15 @@ local function prune_paths(paths, category)
 	return num_paths
 end
 
-local function prune_tree(d, category)
+local function prune_tree(d, pcategory)
 	local num_childs = 0
 	local num_paths = 0
 	local cat
 
-	cat = d.category or category
+	cat = d.category or pcategory
 	if d.children ~= nil then
 		for n, child in pairs(d.children) do
-			if prune_tree(child, cat, count) then
+			if prune_tree(child, cat, n) then
 				d.children[n] = nil
 			else
 				num_childs = num_childs + 1
@@ -254,9 +254,12 @@ local function prune_tree(d, category)
 	end
 	--print(name, d.category, category, d.num_paths, num_childs)
 	if d.paths ~= nil then
-		num_paths = prune_paths(d.paths, d.category)
+		num_paths = prune_paths(d.paths, cat)
+		if num_paths == 0 then
+			d.paths = nil
+		end
 	end
-	if cat == category and num_paths == 0 and num_childs == 0 then
+	if d.category == pcategory and num_paths == 0 and num_childs == 0 then
 		--num_pruned_leafs = num_pruned_leafs + 1
 		return true
 	end
diff --git a/squark-filter.c b/squark-filter.c
index e47cbf5..f3a4aed 100644
--- a/squark-filter.c
+++ b/squark-filter.c
@@ -108,7 +108,8 @@ static void url_print(struct url_info *nfo)
 
 static blob_t url_classify(struct url_info *url, struct sqdb *db)
 {
-	blob_t key, got, tld;
+	unsigned char buffer[1024];
+	blob_t b, key, got, tld, mkey;
 	void *cmph;
 	struct sqdb_index_entry *indx;
 	uint32_t *categories;
@@ -144,23 +145,45 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db)
 			}
 			tld = BLOB_NULL;
 		}
+		mkey = key;
 	} while (indx[i].has_subdomains);
 
-	if (key.ptr != url->host.ptr) {
-		/* the full of dns part did not match, so we skip the
-		 * path name search */
+	if (key.ptr != url->host.ptr || !indx[i].has_paths) {
+		/* the full dns part did not match, or there's no more
+		 * specific paths in db -- skip the path name search */
 		goto parent_dns_match;
 	}
 
-	/* and then search for path matches */
-
+	/* and then search for path matches -- construct hashing
+	 * string of url decoded path */
+	b = BLOB_BUF(buffer);
+	blob_push(&b, key);
+	key = blob_pushed(BLOB_BUF(buffer), b);
+	blob_push_urldecode(&b, url->path);
+	b = blob_pushed(BLOB_BUF(buffer), b);
+
+	while (indx[i].has_paths) {
+		/* add one more path component */
+		got = blob_expand_tail(&key, b, '/');
+		if (blob_is_null(got))
+			break;
+		previ = i;
+		i = cmph_search_packed(cmph, key.ptr, key.len);
+		tld = sqdb_get_string_literal(db, indx[i].component);
+		if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) {
+			/* the subdomain did no longer match, use
+			 * parents classification */
+			i = previ;
+			goto parent_dns_match;
+		}
+		mkey = key;
+	}
 
 parent_dns_match:
 	if (i == -1)
 		return BLOB_STR("unknown");
 
 	categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL);
-	printf("%d\n", indx[i].category);
 	return sqdb_get_string_literal(db, categories[indx[i].category]);
 }
 
@@ -171,10 +194,12 @@ int main(int argc, char **argv)
 		"http://facebook.com:1234/",
 		"https://slashdot.org/path/to/me",
 		"http://user:pass@paistortuga.com/~mocosoft",
+		"http://user:pass@paistortuga.com",
 		"user@weather.whenu.speedera.net",
 		"zedo1.speedera.net",
 		"foo.com/stuff?query;bar#frag",
 		"foo.com?query;bar#frag",
+		"aapracingandsports.com.au/racing/",
 	};
 	struct sqdb db;
 	struct url_info nfo;