filter: improve dns part matching

Lowercase the DNS part of the URL. Also skip "www123" and similar host-name prefixes when determining whether path components should be matched.
author: Timo Teräs <timo.teras@iki.fi> 2010-08-18 18:48:50 +0300
committer: Timo Teräs <timo.teras@iki.fi> 2010-08-18 18:48:50 +0300
commit: 74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af (patch)
tree: 2b6fd78b4d38acba93ed9659011a49aa6dffa333
parent: 54581ca7fe116ab20d6fa7f15013efe1777e1380 (diff)
download: squark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.bz2
squark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.xz
1 files changed, 105 insertions, 21 deletions
diff --git a/squark-filter.c b/squark-filter.c
index c0d66d9..097f420 100644
--- a/squark-filter.c
+++ b/squark-filter.c
@@ -1,4 +1,17 @@
+/* squark-filter.c - Squid User Authentication and Rating Kit
+ *   An external redirector for Squid which analyzes the URL according
+ *   to a database and can redirect to a block page.
+ *
+ * Copyright (C) 2010 Timo Teräs <timo.teras@iki.fi>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation. See http://www.gnu.org/ for details.
+ */
+
 #include <stdio.h>
+#include <ctype.h>
 #include <string.h>
 #include <unistd.h>
 
@@ -22,8 +35,49 @@ struct url_info {
 	blob_t query;
 	blob_t fragment;
 	int port;
+	int is_ipv4;
+	int num_dots;
 };
 
+struct url_dns_part_data {	/* filled in by blob_pull_url_dns_part() */
+	blob_t word;	/* the part that was pulled, not including the separator */
+	int num_dots;	/* number of '.' characters found in word */
+	int numeric;	/* 1 if word held only digits and dots (also for an empty word), else 0 */
+};
+/* Split the next URL part off *b at ':', '@', '/' or '?', lower-casing it in place; results go to *udp. */
+void blob_pull_url_dns_part(blob_t *b, struct url_dns_part_data *udp)
+{
+	blob_t t = *b;	/* working copy; t.ptr still refers to the same bytes as b->ptr */
+	int c, i, dots = 0, numeric = 1;	/* numeric stays 1 until a byte that is neither digit nor dot is seen */
+
+	for (i = 0; i < t.len; i++) {
+		c = (unsigned char) t.ptr[i];	/* unsigned char value, as the <ctype.h> functions require */
+		switch (c) {
+		case '.':
+			dots++;	/* dots are counted but do not clear numeric */
+			break;
+		case ':': case '@': case '/': case '?':	/* separator: the part ends here */
+			*b = BLOB_PTR_LEN(t.ptr + i, t.len - i);	/* remainder still begins with the separator */
+			udp->word = BLOB_PTR_LEN(t.ptr, i);	/* everything before the separator */
+			udp->num_dots = dots;
+			udp->numeric = numeric;
+			return;
+		case '0': case '1': case '2': case '3': case '4':
+		case '5': case '6': case '7': case '8': case '9':
+			break;	/* digits are left untouched and keep numeric set */
+		default:
+			t.ptr[i] = tolower(c);	/* writes through to the bytes *b referred to */
+			numeric = 0;
+			break;
+		}
+	}
+
+	*b = BLOB_NULL;	/* no separator found: the whole input was consumed */
+	udp->word = t;
+	udp->num_dots = dots;
+	udp->numeric = numeric;
+}
+
 /* URI is generalized as:
  * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]]
  * Character literals used as separators are:
@@ -33,42 +87,48 @@ struct url_info {
 
 static int url_parse(blob_t uri, struct url_info *nfo)
 {
-	blob_t before_colon;
-	blob_t word;
+	struct url_dns_part_data prev, cur;
 
+	memset(&prev, 0, sizeof(prev));
 	memset(nfo, 0, sizeof(*nfo));
 
 	/* parse protocol, username/password and domain name/port */
 	do {
-		word = blob_pull_cspn(&uri, BLOB_STR(":@/?"));
+		blob_pull_url_dns_part(&uri, &cur);
+
 		switch (uri.len ? uri.ptr[0] : '/') {
 		case ':':
 			blob_pull_skip(&uri, 1);
 			if (blob_is_null(nfo->protocol) &&
 			    blob_pull_matching(&uri, BLOB_STR("//")))
-				nfo->protocol = word;
+				nfo->protocol = cur.word;
 			else
-				before_colon = word;
+				prev = cur;
 			break;
 		case '@':
 			blob_pull_skip(&uri, 1);
 			if (!blob_is_null(nfo->username) ||
 			    !blob_is_null(nfo->password))
 				goto error;
-			if (!blob_is_null(before_colon)) {
-				nfo->username = before_colon;
-				nfo->password = word;
+			if (!blob_is_null(prev.word)) {
+				nfo->username = prev.word;
+				nfo->password = cur.word;
 			} else
-				nfo->username = word;
-			before_colon = BLOB_NULL;
+				nfo->username = cur.word;
+			memset(&prev, 0, sizeof(prev));
 			break;
 		case '/':
 		case '?':
-			if (!blob_is_null(before_colon)) {
-				nfo->host = before_colon;
-				nfo->port = blob_pull_uint(&word, 10);
-			} else
-				nfo->host = word;
+			if (!blob_is_null(prev.word)) {
+				nfo->host = prev.word;
+				nfo->num_dots = prev.num_dots;
+				nfo->is_ipv4 = prev.numeric && prev.num_dots == 4;
+				nfo->port = blob_pull_uint(&cur.word, 10);
+			} else {
+				nfo->host = cur.word;
+				nfo->num_dots = cur.num_dots;
+				nfo->is_ipv4 = cur.numeric && cur.num_dots == 4;
+			}
 			break;
 		}
 	} while (blob_is_null(nfo->host) && !blob_is_null(uri));
@@ -112,6 +172,8 @@ static void url_print(struct url_info *nfo)
 	print_field(nfo, query);
 	print_field(nfo, fragment);
 #undef print_field
+	printf("\n");
+	fflush(stdout);
 }
 
 static int url_classify(struct url_info *url, struct sqdb *db)
@@ -121,6 +183,7 @@ static int url_classify(struct url_info *url, struct sqdb *db)
 	void *cmph;
 	struct sqdb_index_entry *indx;
 	cmph_uint32 i = -1, previ;
+	int dots_done = 1;
 
 	cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
 	indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
@@ -153,12 +216,30 @@ static int url_classify(struct url_info *url, struct sqdb *db)
 			tld = BLOB_NULL;
 		}
 		mkey = key;
+		dots_done++;
 	} while (indx[i].has_subdomains);
 
-	if (key.ptr != url->host.ptr || !indx[i].has_paths) {
-		/* the full dns part did not match, or there's no more
-		 * specific paths in db -- skip the path name search */
+	/* No paths to match for */
+	if (!indx[i].has_paths)
 		goto parent_dns_match;
+
+	if (key.ptr != url->host.ptr) {
+		blob_t tmpkey = key;
+
+		/* Not exact dns match, but there's paths. Check if we
+		 * have only one more dns entry and it's of form www1 or
+		 * such. If so, this should be treated as exact match. */
+		if (dots_done != url->num_dots)
+			goto parent_dns_match;
+
+		got = blob_expand_head(&tmpkey, url->host, '.');
+		if (blob_is_null(got) ||
+		    !blob_pull_matching(&got, BLOB_STR("www")))
+			goto parent_dns_match;
+
+		blob_pull_uint(&got, 10);
+		if (got.len != 0)
+			goto parent_dns_match;
 	}
 
 	/* and then search for path matches -- construct hashing
@@ -290,11 +371,14 @@ static void read_input(struct sqdb *db)
 		/* urlgroup */
 		/* myaddr=xxx myport=xxx etc */
 
-		if (!blob_is_null(username)) {
+		if (!blob_is_null(url)) {
+			if (blob_is_null(username))
+				username = BLOB_STR("-");
 			/* valid request, handle it */
-			if (url_parse(url, &nfo))
+			if (url_parse(url, &nfo)) {
+				url_print(&nfo);
 				category = url_classify(&nfo, db);
-			else
+			} else
 				category = 0;
 
 			if ((1ULL << category) & banned_categories)
author	Timo Teräs <timo.teras@iki.fi>	2010-08-18 18:48:50 +0300
committer	Timo Teräs <timo.teras@iki.fi>	2010-08-18 18:48:50 +0300
commit	74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af (patch)
tree	2b6fd78b4d38acba93ed9659011a49aa6dffa333
parent	54581ca7fe116ab20d6fa7f15013efe1777e1380 (diff)
download	squark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.bz2 squark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.xz