#include #include #include #include #include "squarkdb.h" #include "blob.h" static int running = 1; static uint64_t banned_categories = 0; static const blob_t space = BLOB_STR_INIT(" "); static const blob_t lf = BLOB_STR_INIT("\n"); static blob_t redirect_page; struct url_info { blob_t protocol; blob_t username; blob_t password; blob_t host; blob_t path; blob_t query; blob_t fragment; int port; }; /* URI is generalized as: * [proto://][user[:password]@]domain.name[:port][/[path/to][?p=a&q=b;r=c][#fragment]] * Character literals used as separators are: * : / @ ? & ; # * Also URI escaping says to treat %XX as encoded hex value. */ static int url_parse(blob_t uri, struct url_info *nfo) { blob_t before_colon; blob_t word; memset(nfo, 0, sizeof(*nfo)); /* parse protocol, username/password and domain name/port */ do { word = blob_pull_cspn(&uri, BLOB_STR(":@/?")); switch (uri.len ? uri.ptr[0] : '/') { case ':': blob_pull_skip(&uri, 1); if (blob_is_null(nfo->protocol) && blob_pull_matching(&uri, BLOB_STR("//"))) nfo->protocol = word; else before_colon = word; break; case '@': blob_pull_skip(&uri, 1); if (!blob_is_null(nfo->username) || !blob_is_null(nfo->password)) goto error; if (!blob_is_null(before_colon)) { nfo->username = before_colon; nfo->password = word; } else nfo->username = word; before_colon = BLOB_NULL; break; case '/': case '?': if (!blob_is_null(before_colon)) { nfo->host = before_colon; nfo->port = blob_pull_uint(&word, 10); } else nfo->host = word; break; } } while (blob_is_null(nfo->host) && !blob_is_null(uri)); /* rest of the components */ nfo->path = blob_pull_cspn(&uri, BLOB_STR("?&;#")); nfo->query = blob_pull_cspn(&uri, BLOB_STR("#")); nfo->fragment = uri; /* fill in defaults if needed */ if (blob_is_null(nfo->protocol)) { if (nfo->port == 443) nfo->protocol = BLOB_STR("https"); else nfo->protocol = BLOB_STR("http"); if (nfo->port == 0) nfo->port = 80; } else if (nfo->port == 0) { if (blob_cmp(nfo->protocol, BLOB_STR("https")) == 0) nfo->port = 443; else nfo->port = 80; } if (blob_is_null(nfo->path)) nfo->path = BLOB_STR("/"); return 1; error: return 0; } static void url_print(struct url_info *nfo) { #define print_field(nfo, x) if (!blob_is_null(nfo->x)) printf(" %s{%.*s}", #x, nfo->x.len, nfo->x.ptr) print_field(nfo, protocol); print_field(nfo, username); print_field(nfo, password); print_field(nfo, host); printf(" port{%d}", nfo->port); print_field(nfo, path); print_field(nfo, query); print_field(nfo, fragment); #undef print_field } static int url_classify(struct url_info *url, struct sqdb *db) { unsigned char buffer[1024]; blob_t b, key, got, tld, mkey; void *cmph; struct sqdb_index_entry *indx; cmph_uint32 i = -1, previ; cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL); /* search for most qualified domain match; do first lookup * with two domain components */ key = BLOB_PTR_LEN(url->host.ptr + url->host.len, 0); tld = blob_expand_head(&key, url->host, '.'); do { /* add one more domain component */ got = blob_expand_head(&key, url->host, '.'); if (blob_is_null(got)) break; previ = i; i = cmph_search_packed(cmph, key.ptr, key.len); if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) { /* the subdomain did no longer match, use * parents classification */ i = previ; goto parent_dns_match; } if (!blob_is_null(tld)) { if (blob_cmp(tld, sqdb_get_string_literal(db, indx[indx[i].parent].component)) != 0) { /* top level domain did not match */ i = -1; goto parent_dns_match; } tld = BLOB_NULL; } mkey = key; } while (indx[i].has_subdomains); if (key.ptr != url->host.ptr || !indx[i].has_paths) { /* the full dns part did not match, or there's no more * specific paths in db -- skip the path name search */ goto parent_dns_match; } /* and then search for path matches -- construct hashing * string of url decoded path */ b = BLOB_BUF(buffer); blob_push(&b, key); key = blob_pushed(BLOB_BUF(buffer), b); blob_push_urldecode(&b, url->path); b = blob_pushed(BLOB_BUF(buffer), b); while (indx[i].has_paths) { /* add one more path component */ got = blob_expand_tail(&key, b, '/'); if (blob_is_null(got)) break; previ = i; i = cmph_search_packed(cmph, key.ptr, key.len); tld = sqdb_get_string_literal(db, indx[i].component); if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) { /* the subdomain did no longer match, use * parents classification */ i = previ; goto parent_dns_match; } mkey = key; } parent_dns_match: if (i == -1) return 0; /* no category */ return indx[i].category; } static blob_t get_category_name(struct sqdb *db, int id) { uint32_t *c, clen; c = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, &clen); if (c == NULL || id < 0 || id * sizeof(uint32_t) >= clen) return BLOB_NULL; return sqdb_get_string_literal(db, c[id]); } static int find_category_id(struct sqdb *db, blob_t cat) { uint32_t size, *ptr; int i; ptr = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, &size); if (ptr == NULL) return -1; size /= sizeof(uint32_t); for (i = 0; i < size; i++) if (blob_cmp(cat, sqdb_get_string_literal(db, ptr[i])) == 0) return i; return -1; } static void send_ok(blob_t tag) { static char buffer[64]; blob_t b = BLOB_BUF(buffer); blob_push(&b, tag); blob_push(&b, lf); b = blob_pushed(BLOB_BUF(buffer), b); write(STDOUT_FILENO, b.ptr, b.len); } static void send_redirect(struct sqdb *db, blob_t tag, blob_t url, int categ, blob_t username) { static char buffer[8*1024]; blob_t b = BLOB_BUF(buffer); blob_push(&b, tag); blob_push(&b, BLOB_STR(" 302:")); blob_push(&b, redirect_page); blob_push(&b, BLOB_STR("?REASON=")); blob_push(&b, get_category_name(db, categ)); blob_push(&b, BLOB_STR("&USER=")); blob_push(&b, username); blob_push(&b, lf); b = blob_pushed(BLOB_BUF(buffer), b); write(STDOUT_FILENO, b.ptr, b.len); } static void read_input(struct sqdb *db) { static char buffer[8 * 1024]; static blob_t left; blob_t b, line, id, url, username; struct url_info nfo; int r, category; if (blob_is_null(left)) left = BLOB_BUF(buffer); r = read(STDIN_FILENO, left.ptr, left.len); if (r < 0) return; if (r == 0) { running = 0; return; } left.ptr += r; left.len -= r; b = blob_pushed(BLOB_BUF(buffer), left); do { line = blob_pull_cspn(&b, lf); if (!blob_pull_matching(&b, lf)) return; id = blob_pull_cspn(&line, space); blob_pull_spn(&line, space); url = blob_pull_cspn(&line, space); blob_pull_spn(&line, space); blob_pull_cspn(&line, space); /* client addr / fqdn */ blob_pull_spn(&line, space); username = blob_pull_cspn(&line, space); /* http method */ /* urlgroup */ /* myaddr=xxx myport=xxx etc */ if (!blob_is_null(username)) { /* valid request, handle it */ if (url_parse(url, &nfo)) category = url_classify(&nfo, db); else category = 0; if ((1ULL << category) & banned_categories) send_redirect(db, id, url, category, username); else send_ok(id); } if (b.len) { memcpy(buffer, b.ptr, b.len); b.ptr = buffer; } left = BLOB_PTR_LEN(buffer + b.len, sizeof(buffer) - b.len); } while (b.len); } static void ban_category(struct sqdb *db, blob_t c) { int category; category = find_category_id(db, c); if (category >= 0) banned_categories |= 1ULL << category; else fprintf(stderr, "WARNING: unknown category '%.*s'\n", c.len, c.ptr); } int main(int argc, char **argv) { struct sqdb db; int opt; sqdb_open(&db, "squark.db"); while ((opt = getopt(argc, argv, "r:b:")) != -1) { switch (opt) { case 'r': redirect_page = BLOB_STRLEN(optarg); break; case 'b': ban_category(&db, BLOB_STRLEN(optarg)); break; } } while (running) read_input(&db); sqdb_close(&db); }