summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- blob.c 22
-rw-r--r-- blob.h 1
-rw-r--r-- squark-filter.c 61
3 files changed, 44 insertions, 40 deletions
diff --git a/blob.c b/blob.c
index 8f630c2..f2a80f6 100644
--- a/blob.c
+++ b/blob.c
@@ -1,3 +1,6 @@
+#include <ctype.h>
+#include <string.h>
+
#include "blob.h"
/* RFC 3986 section 2.3 Unreserved Characters (January 2005) */
@@ -162,6 +165,20 @@ void blob_push(blob_t *b, blob_t d)
}
}
+/*
+ * Append a lower-cased copy of d at the write cursor b.
+ *
+ * b describes the space still available: on success the bytes of d are
+ * written through b->ptr (each passed through tolower()), b->ptr is
+ * advanced by d.len and b->len is reduced by d.len.  If d does not fit
+ * (b->len < d.len) nothing is written and *b is set to BLOB_NULL.
+ */
+void blob_push_lower(blob_t *b, blob_t d)
+{
+	int i;
+
+	if (b->len < d.len) {
+		*b = BLOB_NULL;
+		return;
+	}
+	/* <ctype.h> functions require a value representable as unsigned
+	 * char (or EOF); a negative plain char would be undefined
+	 * behavior, so convert explicitly before the call. */
+	for (i = 0; i < d.len; i++)
+		b->ptr[i] = tolower((unsigned char) d.ptr[i]);
+	b->ptr += d.len;
+	b->len -= d.len;
+}
+
void blob_push_byte(blob_t *b, unsigned char byte)
{
if (b->len) {
@@ -219,9 +236,6 @@ void blob_push_urldecode(blob_t *to, blob_t url)
do {
blob_pull_matching(&url, BLOB_STR("/"));
b = blob_pull_cspn(&url, BLOB_STR("/"));
- if (blob_is_null(url) && blob_is_null(b))
- break;
-
if (blob_is_null(b) || blob_cmp(b, BLOB_STR(".")) == 0) {
/* skip '.' or two consecutive / */
} else if (blob_cmp(b, BLOB_STR("..")) == 0) {
@@ -232,7 +246,7 @@ void blob_push_urldecode(blob_t *to, blob_t url)
blob_push_byte(to, '/');
blob_push(to, b);
}
- } while (1);
+ } while (!blob_is_null(url));
}
void blob_push_urlencode(blob_t *to, blob_t url)
diff --git a/blob.h b/blob.h
index f5c57eb..0c10ca6 100644
--- a/blob.h
+++ b/blob.h
@@ -42,6 +42,7 @@ unsigned long blob_inet_addr(blob_t buf);
blob_t blob_pushed(blob_t buffer, blob_t left);
void blob_push(blob_t *b, blob_t d);
+void blob_push_lower(blob_t *b, blob_t d);
void blob_push_byte(blob_t *b, unsigned char byte);
void blob_push_uint(blob_t *to, unsigned int value, int radix);
void blob_push_hexdump(blob_t *to, blob_t binary);
diff --git a/squark-filter.c b/squark-filter.c
index 8fab0bf..9bc6bb2 100644
--- a/squark-filter.c
+++ b/squark-filter.c
@@ -31,6 +31,7 @@ struct url_info {
blob_t username;
blob_t password;
blob_t host;
+ blob_t significant_host;
blob_t path;
blob_t query;
blob_t fragment;
@@ -66,7 +67,6 @@ void blob_pull_url_dns_part(blob_t *b, struct url_dns_part_data *udp)
case '5': case '6': case '7': case '8': case '9':
break;
default:
- t.ptr[i] = tolower(c);
numeric = 0;
break;
}
@@ -155,7 +155,17 @@ static int url_parse(blob_t uri, struct url_info *nfo)
if (blob_is_null(nfo->path))
nfo->path = BLOB_STR("/");
+ /* significant host name */
+ nfo->significant_host = nfo->host;
+ if (nfo->num_dots > 1) {
+ blob_t b = nfo->significant_host;
+ if (blob_pull_matching(&b, BLOB_STR("www")) &&
+ (blob_pull_uint(&b, 10), 1) &&
+ blob_pull_matching(&b, BLOB_STR(".")))
+ nfo->significant_host = b;
+ }
return 1;
+
error:
return 0;
}
@@ -178,8 +188,8 @@ static void url_print(struct url_info *nfo)
static int url_classify(struct url_info *url, struct sqdb *db)
{
- unsigned char buffer[1024];
- blob_t b, key, got, tld, mkey;
+ unsigned char buffer[512];
+ blob_t key, got, tld, keybuf, keylimits;
void *cmph;
struct sqdb_index_entry *indx;
cmph_uint32 i = SQDB_PARENT_ROOT, previ = SQDB_PARENT_ROOT;
@@ -188,10 +198,13 @@ static int url_classify(struct url_info *url, struct sqdb *db)
cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
+ keybuf = BLOB_BUF(buffer);
+ blob_push_lower(&keybuf, url->significant_host);
+ key = keylimits = blob_pushed(BLOB_BUF(buffer), keybuf);
+
/* search for most qualified domain match; do first lookup
* with two domain components */
if (url->is_ipv4) {
- key = url->host;
i = cmph_search_packed(cmph, key.ptr, key.len);
if (indx[i].parent != SQDB_PARENT_IPV4 ||
@@ -200,12 +213,12 @@ static int url_classify(struct url_info *url, struct sqdb *db)
goto parent_dns_match;
}
} else {
- key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0);
- tld = blob_expand_head(&key, url->host, '.');
+ key = BLOB_PTR_LEN(key.ptr + key.len, 0);
+ tld = blob_expand_head(&key, keylimits, '.');
do {
/* add one more domain component */
- got = blob_expand_head(&key, url->host, '.');
+ got = blob_expand_head(&key, keylimits, '.');
if (blob_is_null(got))
break;
@@ -232,45 +245,22 @@ static int url_classify(struct url_info *url, struct sqdb *db)
i = previ;
goto parent_dns_match;
}
- mkey = key;
dots_done++;
} while (indx[i].has_subdomains);
}
/* No paths to match for */
- if (i == SQDB_PARENT_ROOT || !indx[i].has_paths)
+ if (i == SQDB_PARENT_ROOT || !indx[i].has_paths || key.ptr != keylimits.ptr)
goto parent_dns_match;
- if (key.ptr != url->host.ptr) {
- blob_t tmpkey = key;
-
- /* Not exact dns match, but there's paths. Check if we
- * have only one more dns entry and it's of form www1 or
- * such. If so, this should be treated as exact match. */
- if (dots_done != url->num_dots)
- goto parent_dns_match;
-
- got = blob_expand_head(&tmpkey, url->host, '.');
- if (blob_is_null(got) ||
- !blob_pull_matching(&got, BLOB_STR("www")))
- goto parent_dns_match;
-
- blob_pull_uint(&got, 10);
- if (got.len != 0)
- goto parent_dns_match;
- }
-
/* and then search for path matches -- construct hashing
* string of url decoded path */
- b = BLOB_BUF(buffer);
- blob_push(&b, key);
- key = blob_pushed(BLOB_BUF(buffer), b);
- blob_push_urldecode(&b, url->path);
- b = blob_pushed(BLOB_BUF(buffer), b);
+ blob_push_urldecode(&keybuf, url->path);
+ key = keylimits = blob_pushed(BLOB_BUF(buffer), keybuf);
while (indx[i].has_paths) {
/* add one more path component */
- got = blob_expand_tail(&key, b, '/');
+ got = blob_expand_tail(&key, keylimits, '/');
if (blob_is_null(got))
break;
previ = i;
@@ -282,7 +272,6 @@ static int url_classify(struct url_info *url, struct sqdb *db)
i = previ;
goto parent_dns_match;
}
- mkey = key;
}
parent_dns_match:
@@ -432,7 +421,7 @@ int main(int argc, char **argv)
struct sqdb db;
int opt;
- sqdb_open(&db, "squark.db");
+ sqdb_open(&db, "/var/lib/squark/squark.db");
while ((opt = getopt(argc, argv, "r:b:")) != -1) {
switch (opt) {