summaryrefslogtreecommitdiffstats
path: root/squark-filter.c
diff options
context:
space:
mode:
Diffstat (limited to 'squark-filter.c')
-rw-r--r--squark-filter.c198
1 files changed, 198 insertions, 0 deletions
diff --git a/squark-filter.c b/squark-filter.c
new file mode 100644
index 0000000..8973d04
--- /dev/null
+++ b/squark-filter.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+
+#include <cmph.h>
+
+#include "squarkdb.h"
+#include "blob.h"
+/*
+ * Components of a URI as filled in by url_parse().
+ */
+struct url_info {
+ blob_t protocol; /* word before "://"; defaults to "http" or "https" */
+ blob_t username; /* user part before '@', if any */
+ blob_t password; /* part between ':' and '@', if any */
+ blob_t host; /* domain name */
+ blob_t path; /* defaults to "/" when absent */
+ blob_t query; /* text between the path and the fragment, if any */
+ blob_t fragment; /* remainder of the URI after the query */
+ int port; /* explicit port, else 80, or 443 for https */
+};
+
+/* URI is generalized as:
+ * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]]
+ * Character literals used as separators are:
+ * : / @ ? & ; #
+ * URI escaping additionally says to treat %XX as a hex-encoded value.
+ */
+
+/*
+ * Split uri into its components and store them in nfo.
+ *
+ * nfo is zeroed first, so components that are absent stay null blobs
+ * (and port stays 0) until the defaults are filled in at the end:
+ * protocol "http" (or "https" when port 443 was given without a
+ * protocol), port 80 (443 for "https") and path "/".
+ *
+ * Returns 1 on success, or 0 when an '@' is met after user info has
+ * already been parsed.
+ */
+static int url_parse(blob_t uri, struct url_info *nfo)
+{
+	blob_t before_colon;
+	blob_t word;
+
+	memset(nfo, 0, sizeof(*nfo));
+
+	/* Must start out null: the '@' and '/' cases test it even when no
+	 * bare ':' has been seen (e.g. "http://host" or plain "host"), and
+	 * reading it uninitialized would be undefined behaviour. */
+	before_colon = BLOB_NULL;
+
+	/* parse protocol, username/password and domain name/port */
+	do {
+		word = blob_pull_cspn(&uri, BLOB_STR(":@/?"));
+		/* running out of input terminates the host like a '/' does */
+		switch (uri.len ? uri.ptr[0] : '/') {
+		case ':':
+			blob_pull_skip(&uri, 1);
+			/* "word://" is the protocol, but only the first time;
+			 * any other ':' separates user:password or host:port,
+			 * which is decided by the separator that follows */
+			if (blob_is_null(nfo->protocol) &&
+			    blob_pull_matching(&uri, BLOB_STR("//")))
+				nfo->protocol = word;
+			else
+				before_colon = word;
+			break;
+		case '@':
+			blob_pull_skip(&uri, 1);
+			/* user info may appear only once */
+			if (!blob_is_null(nfo->username) ||
+			    !blob_is_null(nfo->password))
+				goto error;
+			if (!blob_is_null(before_colon)) {
+				nfo->username = before_colon;
+				nfo->password = word;
+			} else
+				nfo->username = word;
+			before_colon = BLOB_NULL;
+			break;
+		case '/':
+		case '?':
+			/* the separator is left in uri for the path/query
+			 * parsing below */
+			if (!blob_is_null(before_colon)) {
+				nfo->host = before_colon;
+				nfo->port = blob_pull_uint(&word, 10);
+			} else
+				nfo->host = word;
+			break;
+		default:
+			break;
+		}
+	} while (blob_is_null(nfo->host) && !blob_is_null(uri));
+
+	/* rest of the components */
+	nfo->path = blob_pull_cspn(&uri, BLOB_STR("?&;#"));
+	nfo->query = blob_pull_cspn(&uri, BLOB_STR("#"));
+	nfo->fragment = uri;
+
+	/* fill in defaults if needed */
+	if (blob_is_null(nfo->protocol)) {
+		if (nfo->port == 443)
+			nfo->protocol = BLOB_STR("https");
+		else
+			nfo->protocol = BLOB_STR("http");
+		if (nfo->port == 0)
+			nfo->port = 80;
+	} else if (nfo->port == 0) {
+		if (blob_cmp(nfo->protocol, BLOB_STR("https")) == 0)
+			nfo->port = 443;
+		else
+			nfo->port = 80;
+	}
+	if (blob_is_null(nfo->path))
+		nfo->path = BLOB_STR("/");
+
+	return 1;
+error:
+	return 0;
+}
+
+/*
+ * Print the components of nfo to stdout as " name{value}" pairs.
+ * Null blob fields are skipped; the port is always printed.
+ * No trailing newline is written.
+ */
+static void url_print(struct url_info *nfo)
+{
+	/* "%.*s" takes its precision as an int, so the blob length is cast
+	 * explicitly; do/while keeps the macro safe as a single statement. */
+#define print_field(nfo, x) \
+	do { \
+		if (!blob_is_null((nfo)->x)) \
+			printf(" %s{%.*s}", #x, (int) (nfo)->x.len, (nfo)->x.ptr); \
+	} while (0)
+	print_field(nfo, protocol);
+	print_field(nfo, username);
+	print_field(nfo, password);
+	print_field(nfo, host);
+	printf(" port{%d}", nfo->port);
+	print_field(nfo, path);
+	print_field(nfo, query);
+	print_field(nfo, fragment);
+#undef print_field
+}
+/*
+ * Classify the host of a parsed URL against the squark database.
+ *
+ * The host name is looked up one '.'-separated component at a time and
+ * the classification of the most qualified matching domain is used.
+ * Path based matching is not implemented yet.
+ *
+ * Side effect: prints the matched category index to stdout.
+ *
+ * Returns the category name taken from the strings section, or the
+ * literal "unknown" when no index entry matched.
+ *
+ * NOTE(review): the section pointers returned by sqdb_section_get() are
+ * used unchecked -- confirm they cannot be NULL for an opened database.
+ */
+static blob_t url_classify(struct url_info *url, struct sqdb *db)
+{
+ blob_t key, got, tld;
+ void *cmph;
+ struct sqdb_index_entry *indx;
+ uint32_t *categories;
+ char *strings;
+ cmph_uint32 i = -1, previ; /* i == -1 means "no match" */
+
+ cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
+ indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
+ strings = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL);
+
+ /* search for most qualified domain match; do first lookup
+ * with two domain components. key starts as an empty blob at the
+ * end of the host; blob_expand_head() presumably grows it towards
+ * the start of the host by one '.'-separated component and returns
+ * that component (TODO confirm against blob.h) */
+ key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0);
+ tld = blob_expand_head(&key, url->host, '.');
+
+ do {
+ /* add one more domain component */
+ got = blob_expand_head(&key, url->host, '.');
+ if (blob_is_null(got))
+ break;
+
+ previ = i; /* remember the last accepted entry */
+ i = cmph_search_packed(cmph, key.ptr, key.len); /* result is checked by the string compare below */
+ if (blob_cmp(got, BLOB_STR(&strings[indx[i].component])) != 0) {
+ /* the subdomain no longer matches, use the
+ * parent's classification */
+ i = previ;
+ goto parent_dns_match;
+ }
+ if (!blob_is_null(tld)) {
+ if (blob_cmp(tld, BLOB_STR(&strings[indx[indx[i].parent].component])) != 0) {
+ /* top level domain did not match */
+ i = -1;
+ goto parent_dns_match;
+ }
+ tld = BLOB_NULL; /* only checked on the first lookup */
+ }
+ } while (indx[i].has_subdomains);
+
+ if (key.ptr != url->host.ptr) {
+ /* the full DNS name did not match, so we skip the
+ * path name search */
+ goto parent_dns_match;
+ }
+
+ /* and then search for path matches (not implemented yet) */
+
+
+parent_dns_match:
+ if (i == -1)
+ return BLOB_STR("unknown");
+
+ categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL);
+ printf("%d\n", indx[i].category); /* presumably debug output */
+ return BLOB_STR(&strings[categories[indx[i].category]]);
+}
+
+/*
+ * Test driver: parses and classifies a fixed list of sample URIs
+ * against "squark.db", printing one result line per URI.
+ */
+int main(int argc, char **argv)
+{
+	const char * const samples[] = {
+		"http://sex.com",
+		"http://facebook.com:1234/",
+		"https://slashdot.org/path/to/me",
+		"http://user:pass@paistortuga.com/~mocosoft",
+		"user@weather.whenu.speedera.net",
+		"zedo1.speedera.net",
+		"foo.com/stuff?query;bar#frag",
+		"foo.com?query;bar#frag",
+	};
+	struct sqdb db;
+	struct url_info parsed;
+	blob_t category;
+	int n;
+
+	sqdb_open(&db, "squark.db");
+	for (n = 0; n < ARRAY_SIZE(samples); n++) {
+		/* unparsable input is reported and skipped */
+		if (!url_parse(BLOB_STR(samples[n]), &parsed)) {
+			printf("%s - BAD_URL\n", samples[n]);
+			continue;
+		}
+
+		category = url_classify(&parsed, &db);
+		printf("%s - %.*s -", samples[n], category.len, category.ptr);
+		url_print(&parsed);
+		printf("\n");
+	}
+	sqdb_close(&db);
+}