#include #include #include "squarkdb.h" #include "blob.h" struct url_info { blob_t protocol; blob_t username; blob_t password; blob_t host; blob_t path; blob_t query; blob_t fragment; int port; }; /* URI is generalized as: * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]] * Character literals used as separators are: * : / @ ? & ; # * Also URI escaping says to treat %XX as encoded hex value. */ static int url_parse(blob_t uri, struct url_info *nfo) { blob_t before_colon; blob_t word; memset(nfo, 0, sizeof(*nfo)); /* parse protocol, username/password and domain name/port */ do { word = blob_pull_cspn(&uri, BLOB_STR(":@/?")); switch (uri.len ? uri.ptr[0] : '/') { case ':': blob_pull_skip(&uri, 1); if (blob_is_null(nfo->protocol) && blob_pull_matching(&uri, BLOB_STR("//"))) nfo->protocol = word; else before_colon = word; break; case '@': blob_pull_skip(&uri, 1); if (!blob_is_null(nfo->username) || !blob_is_null(nfo->password)) goto error; if (!blob_is_null(before_colon)) { nfo->username = before_colon; nfo->password = word; } else nfo->username = word; before_colon = BLOB_NULL; break; case '/': case '?': if (!blob_is_null(before_colon)) { nfo->host = before_colon; nfo->port = blob_pull_uint(&word, 10); } else nfo->host = word; break; } } while (blob_is_null(nfo->host) && !blob_is_null(uri)); /* rest of the components */ nfo->path = blob_pull_cspn(&uri, BLOB_STR("?&;#")); nfo->query = blob_pull_cspn(&uri, BLOB_STR("#")); nfo->fragment = uri; /* fill in defaults if needed */ if (blob_is_null(nfo->protocol)) { if (nfo->port == 443) nfo->protocol = BLOB_STR("https"); else nfo->protocol = BLOB_STR("http"); if (nfo->port == 0) nfo->port = 80; } else if (nfo->port == 0) { if (blob_cmp(nfo->protocol, BLOB_STR("https")) == 0) nfo->port = 443; else nfo->port = 80; } if (blob_is_null(nfo->path)) nfo->path = BLOB_STR("/"); return 1; error: return 0; } static void url_print(struct url_info *nfo) { #define print_field(nfo, x) if (!blob_is_null(nfo->x)) printf(" %s{%.*s}", #x, nfo->x.len, nfo->x.ptr) print_field(nfo, protocol); print_field(nfo, username); print_field(nfo, password); print_field(nfo, host); printf(" port{%d}", nfo->port); print_field(nfo, path); print_field(nfo, query); print_field(nfo, fragment); #undef print_field } static blob_t url_classify(struct url_info *url, struct sqdb *db) { blob_t key, got, tld; void *cmph; struct sqdb_index_entry *indx; uint32_t *categories; char *strings; cmph_uint32 i = -1, previ; cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL); strings = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL); /* search for most qualified domain match; do first lookup * with two domain components */ key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0); tld = blob_expand_head(&key, url->host, '.'); do { /* add one more domain component */ got = blob_expand_head(&key, url->host, '.'); if (blob_is_null(got)) break; previ = i; i = cmph_search_packed(cmph, key.ptr, key.len); if (blob_cmp(got, BLOB_STR(&strings[indx[i].component])) != 0) { /* the subdomain did no longer match, use * parents classification */ i = previ; goto parent_dns_match; } if (!blob_is_null(tld)) { if (blob_cmp(tld, BLOB_STR(&strings[indx[indx[i].parent].component])) != 0) { /* top level domain did not match */ i = -1; goto parent_dns_match; } tld = BLOB_NULL; } } while (indx[i].has_subdomains); if (key.ptr != url->host.ptr) { /* the full of dns part did not match, so we skip the * path name search */ goto parent_dns_match; } /* and then search for path matches */ parent_dns_match: if (i == -1) return BLOB_STR("unknown"); categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL); printf("%d\n", indx[i].category); return BLOB_STR(&strings[categories[indx[i].category]]); } int main(int argc, char **argv) { const char * const uri[] = { "http://sex.com", "http://facebook.com:1234/", "https://slashdot.org/path/to/me", "http://user:pass@paistortuga.com/~mocosoft", "user@weather.whenu.speedera.net", "zedo1.speedera.net", "foo.com/stuff?query;bar#frag", "foo.com?query;bar#frag", }; struct sqdb db; struct url_info nfo; blob_t cat; int i; sqdb_open(&db, "squark.db"); for (i = 0; i < ARRAY_SIZE(uri); i++) { if (url_parse(BLOB_STR(uri[i]), &nfo)) { cat = url_classify(&nfo, &db); printf("%s - %.*s -", uri[i], cat.len, cat.ptr); url_print(&nfo); printf("\n"); } else { printf("%s - BAD_URL\n", uri[i]); } } sqdb_close(&db); }