summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTimo Teräs <timo.teras@iki.fi>2010-08-18 18:48:50 +0300
committerTimo Teräs <timo.teras@iki.fi>2010-08-18 18:48:50 +0300
commit74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af (patch)
tree2b6fd78b4d38acba93ed9659011a49aa6dffa333
parent54581ca7fe116ab20d6fa7f15013efe1777e1380 (diff)
downloadsquark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.bz2
squark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.xz
filter: improve dns part matching
Lowercase the DNS part of the URL. Also skip "www123" and similar entries when determining whether path components should be matched.
-rw-r--r--squark-filter.c126
1 files changed, 105 insertions, 21 deletions
diff --git a/squark-filter.c b/squark-filter.c
index c0d66d9..097f420 100644
--- a/squark-filter.c
+++ b/squark-filter.c
@@ -1,4 +1,17 @@
+/* squark-filter.c - Squid User Authentication and Rating Kit
+ * An external redirector for Squid which analyzes the URL according
+ * to a database and can redirect to a block page.
+ *
+ * Copyright (C) 2010 Timo Teräs <timo.teras@iki.fi>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation. See http://www.gnu.org/ for details.
+ */
+
#include <stdio.h>
+#include <ctype.h>
#include <string.h>
#include <unistd.h>
@@ -22,8 +35,49 @@ struct url_info {
blob_t query;
blob_t fragment;
int port;
+ int is_ipv4;
+ int num_dots;
};
+struct url_dns_part_data { /* output of blob_pull_url_dns_part(): one scanned URL word plus its statistics */
+ blob_t word; /* bytes scanned before the separator, or the whole input when no separator was found */
+ int num_dots; /* number of '.' characters counted while scanning word */
+ int numeric; /* 1 if word held only digits and dots (also when empty), otherwise 0 */
+};
+/* Pull the leading word of a URL off *b and describe it in *udp.
+ *
+ * Scans *b byte by byte up to the first ':', '@', '/' or '?'.
+ *  - Separator found: udp->word is the text before it, and *b is set to
+ *    the remainder beginning at the separator itself, which is not
+ *    consumed (assuming BLOB_PTR_LEN(ptr, len) builds a blob from a
+ *    pointer and a length -- confirm against the blob header).
+ *  - No separator: udp->word is the whole input and *b becomes BLOB_NULL.
+ * In both cases udp->num_dots is the number of '.' seen and udp->numeric
+ * is 1 only if every scanned byte was a digit or a dot.
+ *
+ * Side effect: every scanned byte that is not a dot, a digit or a
+ * separator is overwritten in place with tolower() of itself, so the
+ * buffer behind *b is modified by this call.
+ */
+void blob_pull_url_dns_part(blob_t *b, struct url_dns_part_data *udp)
+{
+ blob_t t = *b;
+ int c, i, dots = 0, numeric = 1;
+ /* classify each byte; the first separator ends the word and returns */
+ for (i = 0; i < t.len; i++) {
+ c = (unsigned char) t.ptr[i]; /* unsigned char value keeps the tolower() argument in its valid range */
+ switch (c) {
+ case '.':
+ dots++;
+ break;
+ case ':': case '@': case '/': case '?':
+ *b = BLOB_PTR_LEN(t.ptr + i, t.len - i); /* remainder starts at the separator, which stays in *b */
+ udp->word = BLOB_PTR_LEN(t.ptr, i);
+ udp->num_dots = dots;
+ udp->numeric = numeric;
+ return;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ break; /* digits leave the numeric flag untouched */
+ default:
+ t.ptr[i] = tolower(c); /* writes through t.ptr, i.e. into the caller's buffer */
+ numeric = 0;
+ break;
+ }
+ }
+ /* no separator anywhere: the entire input is the word and nothing remains */
+ *b = BLOB_NULL;
+ udp->word = t;
+ udp->num_dots = dots;
+ udp->numeric = numeric;
+}
+
/* URI is generalized as:
* [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]]
* Character literals used as separators are:
@@ -33,42 +87,48 @@ struct url_info {
static int url_parse(blob_t uri, struct url_info *nfo)
{
- blob_t before_colon;
- blob_t word;
+ struct url_dns_part_data prev, cur;
+ memset(&prev, 0, sizeof(prev));
memset(nfo, 0, sizeof(*nfo));
/* parse protocol, username/password and domain name/port */
do {
- word = blob_pull_cspn(&uri, BLOB_STR(":@/?"));
+ blob_pull_url_dns_part(&uri, &cur);
+
switch (uri.len ? uri.ptr[0] : '/') {
case ':':
blob_pull_skip(&uri, 1);
if (blob_is_null(nfo->protocol) &&
blob_pull_matching(&uri, BLOB_STR("//")))
- nfo->protocol = word;
+ nfo->protocol = cur.word;
else
- before_colon = word;
+ prev = cur;
break;
case '@':
blob_pull_skip(&uri, 1);
if (!blob_is_null(nfo->username) ||
!blob_is_null(nfo->password))
goto error;
- if (!blob_is_null(before_colon)) {
- nfo->username = before_colon;
- nfo->password = word;
+ if (!blob_is_null(prev.word)) {
+ nfo->username = prev.word;
+ nfo->password = cur.word;
} else
- nfo->username = word;
- before_colon = BLOB_NULL;
+ nfo->username = cur.word;
+ memset(&prev, 0, sizeof(prev));
break;
case '/':
case '?':
- if (!blob_is_null(before_colon)) {
- nfo->host = before_colon;
- nfo->port = blob_pull_uint(&word, 10);
- } else
- nfo->host = word;
+ if (!blob_is_null(prev.word)) {
+ nfo->host = prev.word;
+ nfo->num_dots = prev.num_dots;
+ nfo->is_ipv4 = prev.numeric && prev.num_dots == 4;
+ nfo->port = blob_pull_uint(&cur.word, 10);
+ } else {
+ nfo->host = cur.word;
+ nfo->num_dots = cur.num_dots;
+ nfo->is_ipv4 = cur.numeric && cur.num_dots == 4;
+ }
break;
}
} while (blob_is_null(nfo->host) && !blob_is_null(uri));
@@ -112,6 +172,8 @@ static void url_print(struct url_info *nfo)
print_field(nfo, query);
print_field(nfo, fragment);
#undef print_field
+ printf("\n");
+ fflush(stdout);
}
static int url_classify(struct url_info *url, struct sqdb *db)
@@ -121,6 +183,7 @@ static int url_classify(struct url_info *url, struct sqdb *db)
void *cmph;
struct sqdb_index_entry *indx;
cmph_uint32 i = -1, previ;
+ int dots_done = 1;
cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
@@ -153,12 +216,30 @@ static int url_classify(struct url_info *url, struct sqdb *db)
tld = BLOB_NULL;
}
mkey = key;
+ dots_done++;
} while (indx[i].has_subdomains);
- if (key.ptr != url->host.ptr || !indx[i].has_paths) {
- /* the full dns part did not match, or there's no more
- * specific paths in db -- skip the path name search */
+ /* No paths to match for */
+ if (!indx[i].has_paths)
goto parent_dns_match;
+
+ if (key.ptr != url->host.ptr) {
+ blob_t tmpkey = key;
+
+ /* Not exact dns match, but there's paths. Check if we
+ * have only one more dns entry and it's of form www1 or
+ * such. If so, this should be treated as exact match. */
+ if (dots_done != url->num_dots)
+ goto parent_dns_match;
+
+ got = blob_expand_head(&tmpkey, url->host, '.');
+ if (blob_is_null(got) ||
+ !blob_pull_matching(&got, BLOB_STR("www")))
+ goto parent_dns_match;
+
+ blob_pull_uint(&got, 10);
+ if (got.len != 0)
+ goto parent_dns_match;
}
/* and then search for path matches -- construct hashing
@@ -290,11 +371,14 @@ static void read_input(struct sqdb *db)
/* urlgroup */
/* myaddr=xxx myport=xxx etc */
- if (!blob_is_null(username)) {
+ if (!blob_is_null(url)) {
+ if (blob_is_null(username))
+ username = BLOB_STR("-");
/* valid request, handle it */
- if (url_parse(url, &nfo))
+ if (url_parse(url, &nfo)) {
+ url_print(&nfo);
category = url_classify(&nfo, db);
- else
+ } else
category = 0;
if ((1ULL << category) & banned_categories)