diff options
author | Timo Teräs <timo.teras@iki.fi> | 2010-08-18 18:48:50 +0300 |
---|---|---|
committer | Timo Teräs <timo.teras@iki.fi> | 2010-08-18 18:48:50 +0300 |
commit | 74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af (patch) | |
tree | 2b6fd78b4d38acba93ed9659011a49aa6dffa333 | |
parent | 54581ca7fe116ab20d6fa7f15013efe1777e1380 (diff) | |
download | squark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.bz2 squark-74d0daba0a9ffeacc83b3e7361c30e4b0b24c5af.tar.xz |
filter: improve dns part matching
Lowercase the DNS part of the URL. Also skip "www123" and similar
entries when determining whether path components should be matched.
-rw-r--r-- | squark-filter.c | 126 |
1 file changed, 105 insertions, 21 deletions
diff --git a/squark-filter.c b/squark-filter.c index c0d66d9..097f420 100644 --- a/squark-filter.c +++ b/squark-filter.c @@ -1,4 +1,17 @@ +/* squark-filter.c - Squid User Authentication and Rating Kit + * An external redirector for Squid which analyzes the URL according + * to a database and can redirect to a block page. + * + * Copyright (C) 2010 Timo Teräs <timo.teras@iki.fi> + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. See http://www.gnu.org/ for details. + */ + #include <stdio.h> +#include <ctype.h> #include <string.h> #include <unistd.h> @@ -22,8 +35,49 @@ struct url_info { blob_t query; blob_t fragment; int port; + int is_ipv4; + int num_dots; }; +struct url_dns_part_data { + blob_t word; + int num_dots; + int numeric; +}; + +void blob_pull_url_dns_part(blob_t *b, struct url_dns_part_data *udp) +{ + blob_t t = *b; + int c, i, dots = 0, numeric = 1; + + for (i = 0; i < t.len; i++) { + c = (unsigned char) t.ptr[i]; + switch (c) { + case '.': + dots++; + break; + case ':': case '@': case '/': case '?': + *b = BLOB_PTR_LEN(t.ptr + i, t.len - i); + udp->word = BLOB_PTR_LEN(t.ptr, i); + udp->num_dots = dots; + udp->numeric = numeric; + return; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + break; + default: + t.ptr[i] = tolower(c); + numeric = 0; + break; + } + } + + *b = BLOB_NULL; + udp->word = t; + udp->num_dots = dots; + udp->numeric = numeric; +} + /* URI is generalized as: * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]] * Character literals used as separators are: @@ -33,42 +87,48 @@ struct url_info { static int url_parse(blob_t uri, struct url_info *nfo) { - blob_t before_colon; - blob_t word; + struct url_dns_part_data prev, cur; + memset(&prev, 0, sizeof(prev)); memset(nfo, 0, 
sizeof(*nfo)); /* parse protocol, username/password and domain name/port */ do { - word = blob_pull_cspn(&uri, BLOB_STR(":@/?")); + blob_pull_url_dns_part(&uri, &cur); + switch (uri.len ? uri.ptr[0] : '/') { case ':': blob_pull_skip(&uri, 1); if (blob_is_null(nfo->protocol) && blob_pull_matching(&uri, BLOB_STR("//"))) - nfo->protocol = word; + nfo->protocol = cur.word; else - before_colon = word; + prev = cur; break; case '@': blob_pull_skip(&uri, 1); if (!blob_is_null(nfo->username) || !blob_is_null(nfo->password)) goto error; - if (!blob_is_null(before_colon)) { - nfo->username = before_colon; - nfo->password = word; + if (!blob_is_null(prev.word)) { + nfo->username = prev.word; + nfo->password = cur.word; } else - nfo->username = word; - before_colon = BLOB_NULL; + nfo->username = cur.word; + memset(&prev, 0, sizeof(prev)); break; case '/': case '?': - if (!blob_is_null(before_colon)) { - nfo->host = before_colon; - nfo->port = blob_pull_uint(&word, 10); - } else - nfo->host = word; + if (!blob_is_null(prev.word)) { + nfo->host = prev.word; + nfo->num_dots = prev.num_dots; + nfo->is_ipv4 = prev.numeric && prev.num_dots == 4; + nfo->port = blob_pull_uint(&cur.word, 10); + } else { + nfo->host = cur.word; + nfo->num_dots = cur.num_dots; + nfo->is_ipv4 = cur.numeric && cur.num_dots == 4; + } break; } } while (blob_is_null(nfo->host) && !blob_is_null(uri)); @@ -112,6 +172,8 @@ static void url_print(struct url_info *nfo) print_field(nfo, query); print_field(nfo, fragment); #undef print_field + printf("\n"); + fflush(stdout); } static int url_classify(struct url_info *url, struct sqdb *db) @@ -121,6 +183,7 @@ static int url_classify(struct url_info *url, struct sqdb *db) void *cmph; struct sqdb_index_entry *indx; cmph_uint32 i = -1, previ; + int dots_done = 1; cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL); @@ -153,12 +216,30 @@ static int url_classify(struct url_info *url, struct sqdb *db) tld = 
BLOB_NULL; } mkey = key; + dots_done++; } while (indx[i].has_subdomains); - if (key.ptr != url->host.ptr || !indx[i].has_paths) { - /* the full dns part did not match, or there's no more - * specific paths in db -- skip the path name search */ + /* No paths to match for */ + if (!indx[i].has_paths) goto parent_dns_match; + + if (key.ptr != url->host.ptr) { + blob_t tmpkey = key; + + /* Not exact dns match, but there's paths. Check if we + * have only one more dns entry and it's of form www1 or + * such. If so, this should be treated as exact match. */ + if (dots_done != url->num_dots) + goto parent_dns_match; + + got = blob_expand_head(&tmpkey, url->host, '.'); + if (blob_is_null(got) || + !blob_pull_matching(&got, BLOB_STR("www"))) + goto parent_dns_match; + + blob_pull_uint(&got, 10); + if (got.len != 0) + goto parent_dns_match; } /* and then search for path matches -- construct hashing @@ -290,11 +371,14 @@ static void read_input(struct sqdb *db) /* urlgroup */ /* myaddr=xxx myport=xxx etc */ - if (!blob_is_null(username)) { + if (!blob_is_null(url)) { + if (blob_is_null(username)) + username = BLOB_STR("-"); /* valid request, handle it */ - if (url_parse(url, &nfo)) + if (url_parse(url, &nfo)) { + url_print(&nfo); category = url_classify(&nfo, db); - else + } else category = 0; if ((1ULL << category) & banned_categories) |