From 7daf2874969fb6773d480e9776cd8418eeb6353f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timo=20Ter=C3=A4s?= <timo.teras@iki.fi>
Date: Fri, 13 Aug 2010 13:40:41 +0300
Subject: filter: fix db building issues and implement path component matching

Fix the "has subdomains" / "has paths" hints so that they are correct.
Stripping a leading www<number> label now checks that doing so will not
remove a second-level domain name.

The filter code now also looks up path components from the db.
---
 blob.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 blob.h          |  3 +++
 sqdb-build.lua  | 17 +++++++++------
 squark-filter.c | 39 ++++++++++++++++++++++++++++------
 4 files changed, 110 insertions(+), 15 deletions(-)
diff --git a/blob.c b/blob.c
index a417a0b..0c28877 100644
--- a/blob.c
+++ b/blob.c
@@ -111,6 +111,29 @@ void blob_push_hexdump(blob_t *to, blob_t binary)
 	to->len -= binary.len * 2;
 }
 
+void blob_push_urldecode(blob_t *to, blob_t url)
+{
+	blob_t b, orig = *to;
+	/* Push url's '/'-separated components onto *to, each behind a single '/'. */
+	do {
+		blob_pull_matching(&url, BLOB_STR("/"));
+		b = blob_pull_cspn(&url, BLOB_STR("/"));
+		if (blob_is_null(url) && blob_is_null(b))
+			break;
+
+		if (blob_is_null(b) || blob_cmp(b, BLOB_STR(".")) == 0) {
+			/* skip: "." or a null component (e.g. two consecutive '/') */
+		} else if (blob_cmp(b, BLOB_STR("..")) == 0) {
+			/* go up one path component; NOTE(review): limits is built from b (a slice of url) -- verify */
+			blob_shrink_tail(to, blob_pushed(orig, b), '/');
+		} else {
+			/* copy the component unchanged; FIXME decode percent encoding */
+			blob_push(to, BLOB_STR("/"));
+			blob_push(to, b);
+		}
+	} while (1);
+}
+
 blob_t blob_pull(blob_t *b, int len)
 {
 	blob_t r;
@@ -177,6 +200,7 @@ blob_t blob_pull_cspn(blob_t *b, const blob_t reject)
 			return BLOB_PTR_LEN(t.ptr, i);
 		}
 	}
+
 	*b = BLOB_NULL;
 	return t;
 }
@@ -186,7 +210,7 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep)
 	blob_t t = *b;
 	blob_t r;
 
-	if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
+	if (t.ptr < limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
 		return BLOB_NULL;
 	while (t.ptr > limits.ptr && t.ptr[-1] == sep)
 		t.ptr--, t.len++;
@@ -200,3 +224,43 @@ blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep)
 	*b = t;
 	return r;
 }
+
+/* Extend *b rightwards within limits over a run of sep plus the component
+ * after it; returns that component, or BLOB_NULL if *b is outside limits. */
+blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep)
+{
+	blob_t ext = *b, comp;
+	if (ext.ptr < limits.ptr || ext.ptr+ext.len > limits.ptr+limits.len)
+		return BLOB_NULL;
+	/* swallow the separator run that follows the current tail */
+	for (; ext.ptr + ext.len < limits.ptr + limits.len; ext.len++)
+		if (ext.ptr[ext.len] != sep)
+			break;
+	/* then take bytes up to the next separator or the end of limits */
+	comp.ptr = ext.ptr + ext.len;
+	for (comp.len = 0; ext.ptr + ext.len < limits.ptr + limits.len; ext.len++, comp.len++)
+		if (ext.ptr[ext.len] == sep)
+			break;
+	*b = ext;
+	return comp;
+}
+
+/* Drop trailing sep bytes and then the last component from *b; returns that component. */
+blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep)
+{
+	blob_t t = *b, r;
+
+	if (t.ptr <= limits.ptr || t.ptr+t.len > limits.ptr+limits.len)
+		return BLOB_NULL;
+	while (t.len && t.ptr[t.len-1] == sep)
+		t.len--;
+	/* r grows backwards from the trimmed end, so anchor it there (not at t.ptr) */
+	r.ptr = t.ptr + t.len;
+	r.len = 0;
+	while (t.len && t.ptr[t.len-1] != sep) {
+		t.len--;
+		r.ptr--, r.len++;
+	}
+	*b = t;
+	return r;
+}
diff --git a/blob.h b/blob.h
index 767e661..3d065ed 100644
--- a/blob.h
+++ b/blob.h
@@ -41,6 +41,7 @@ blob_t blob_pushed(blob_t buffer, blob_t left);
 void blob_push(blob_t *b, blob_t d);
 void blob_push_uint(blob_t *to, unsigned int value, int radix);
 void blob_push_hexdump(blob_t *to, blob_t binary);
+void blob_push_urldecode(blob_t *to, blob_t url);
 blob_t blob_pull(blob_t *b, int len);
 void blob_pull_skip(blob_t *b, int len);
 int blob_pull_matching(blob_t *b, blob_t e);
@@ -48,5 +49,7 @@ unsigned int blob_pull_uint(blob_t *b, int radix);
 blob_t blob_pull_cspn(blob_t *b, const blob_t cspn);
 
 blob_t blob_expand_head(blob_t *b, blob_t limits, unsigned char sep);
+blob_t blob_expand_tail(blob_t *b, blob_t limits, unsigned char sep);
+blob_t blob_shrink_tail(blob_t *b, blob_t limits, unsigned char sep);
 
 #endif
diff --git a/sqdb-build.lua b/sqdb-build.lua
index fce1e7b..2b301fc 100755
--- a/sqdb-build.lua
+++ b/sqdb-build.lua
@@ -117,7 +117,7 @@ local function read_urls(filename, category, locked)
 		url = url:gsub("#.*", "")
 		url = url:gsub(" *^", "")
 		url = url:lower()
-		url = url:gsub("^(www%d*[.])", "")
+		url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")
 		domain, path = url:match("([^/]*)/?(.*)")
 		domain = domain:gsub(":.*", "")
 		domain = domain:gsub("[.]$", "")	-- trailing dot
@@ -187,7 +187,7 @@ local function enum_tree(cb, category, dns, data)
 				fdns = cdns
 			end
 			cat = cdata.category or category
-			cb(fdns, dns, cdns, cat, data.children, data.paths)
+			cb(fdns, dns, cdns, cat, cdata.children, cdata.paths)
 			enum_tree(cb, cat, fdns, cdata)
 		end
 	end
@@ -233,15 +233,15 @@ local function prune_paths(paths, category)
 	return num_paths
 end
 
-local function prune_tree(d, category)
+local function prune_tree(d, pcategory)
 	local num_childs = 0
 	local num_paths = 0
 	local cat
 
-	cat = d.category or category
+	cat = d.category or pcategory
 	if d.children ~= nil then
 		for n, child in pairs(d.children) do
-			if prune_tree(child, cat, count) then
+			if prune_tree(child, cat, n) then
 				d.children[n] = nil
 			else
 				num_childs = num_childs + 1
@@ -254,9 +254,12 @@ local function prune_tree(d, category)
 	end
 	--print(name, d.category, category, d.num_paths, num_childs)
 	if d.paths ~= nil then
-		num_paths = prune_paths(d.paths, d.category)
+		num_paths = prune_paths(d.paths, cat)
+		if num_paths == 0 then
+			d.paths = nil
+		end
 	end
-	if cat == category and num_paths == 0 and num_childs == 0 then
+	if d.category == pcategory and num_paths == 0 and num_childs == 0 then
 		--num_pruned_leafs = num_pruned_leafs + 1
 		return true
 	end
diff --git a/squark-filter.c b/squark-filter.c
index e47cbf5..f3a4aed 100644
--- a/squark-filter.c
+++ b/squark-filter.c
@@ -108,7 +108,8 @@ static void url_print(struct url_info *nfo)
 
 static blob_t url_classify(struct url_info *url, struct sqdb *db)
 {
-	blob_t key, got, tld;
+	unsigned char buffer[1024];
+	blob_t b, key, got, tld, mkey;
 	void *cmph;
 	struct sqdb_index_entry *indx;
 	uint32_t *categories;
@@ -144,23 +145,45 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db)
 			}
 			tld = BLOB_NULL;
 		}
+		mkey = key;
 	} while (indx[i].has_subdomains);
 
-	if (key.ptr != url->host.ptr) {
-		/* the full of dns part did not match, so we skip the
-		 * path name search */
+	if (key.ptr != url->host.ptr || !indx[i].has_paths) {
+		/* the full dns part did not match, or there's no more
+		 * specific paths in db -- skip the path name search */
 		goto parent_dns_match;
 	}
 
-	/* and then search for path matches */
-
+	/* and then search for path matches -- construct hashing
+	 * string of url decoded path */
+	b = BLOB_BUF(buffer);
+	blob_push(&b, key);
+	key = blob_pushed(BLOB_BUF(buffer), b);
+	blob_push_urldecode(&b, url->path);
+	b = blob_pushed(BLOB_BUF(buffer), b);
+
+	while (indx[i].has_paths) {
+		/* add one more path component */
+		got = blob_expand_tail(&key, b, '/');
+		if (blob_is_null(got))
+			break;
+		previ = i;
+		i = cmph_search_packed(cmph, key.ptr, key.len);
+		tld = sqdb_get_string_literal(db, indx[i].component);
+		if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) {
+			/* the path component no longer matched, use
+			 * the parent's classification */
+			i = previ;
+			goto parent_dns_match;
+		}
+		mkey = key;
+	}
 
 parent_dns_match:
 	if (i == -1)
 		return BLOB_STR("unknown");
 
 	categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL);
-	printf("%d\n", indx[i].category);
 	return sqdb_get_string_literal(db, categories[indx[i].category]);
 }
 
@@ -171,10 +194,12 @@ int main(int argc, char **argv)
 		"http://facebook.com:1234/",
 		"https://slashdot.org/path/to/me",
 		"http://user:pass@paistortuga.com/~mocosoft",
+		"http://user:pass@paistortuga.com",
 		"user@weather.whenu.speedera.net",
 		"zedo1.speedera.net",
 		"foo.com/stuff?query;bar#frag",
 		"foo.com?query;bar#frag",
+		"aapracingandsports.com.au/racing/",
 	};
 	struct sqdb db;
 	struct url_info nfo;
-- 
cgit v1.2.3