From 2e58fc0a7a69ecbe4a48b296bcf6313825fcfa7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Wed, 18 Aug 2010 23:03:58 +0300 Subject: db, filter: fix db generation of ipv4 style addresses Properly embed the IPv4 address in the database now. Teach the filter to understand the two new reserved component IDs. --- sqdb-build.lua | 34 ++++++++++++++++++++++++++-------- squark-filter.c | 27 +++++++++++++++++---------- squarkdb.h | 3 +++ 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/sqdb-build.lua b/sqdb-build.lua index 2b301fc..cd039e2 100755 --- a/sqdb-build.lua +++ b/sqdb-build.lua @@ -193,6 +193,14 @@ local function enum_tree(cb, category, dns, data) end end +function iptonumber(str) + local num = 0 + for elem in str:gmatch("%d+") do + num = num * 256 + assert(tonumber(elem)) + end + return num +end + local function enum_all(cb) local ipaddr, data, category @@ -204,8 +212,7 @@ local function enum_all(cb) if data.paths ~= nil then enum_paths(cb, data.category, ipaddr, data.paths) end - -- fixme, calculate ip as 32-bit value - cb(ipaddr, nil, 0, data.category, nil, data.paths) + cb(ipaddr, nil, iptonumber(ipaddr), data.category, nil, data.paths) end end @@ -307,11 +314,22 @@ db:write_section("categories", all_categories_by_id) db:create_index(num_entries) enum_all( function(uri, parent_uri, component, category, childs, paths) - db:assign_index(db:hash(uri), - category and category.id or 0, - childs and true or false, - paths and true or false, - all_strings[component] or 0, - parent_uri and db:hash(parent_uri) or 0) + if parent_uri == nil and type(component) == "number" then + -- Embedded IPv4 address + db:assign_index(db:hash(uri), + category and category.id or 0, + childs and true or false, + paths and true or false, + component, + -2) + else + -- Regular entry + db:assign_index(db:hash(uri), + category and category.id or 0, + childs and true or false, + paths and true or false, + all_strings[component] or 0, + parent_uri and 
db:hash(parent_uri) or -1) + end end ) diff --git a/squark-filter.c b/squark-filter.c index 097f420..fac9241 100644 --- a/squark-filter.c +++ b/squark-filter.c @@ -182,7 +182,7 @@ static int url_classify(struct url_info *url, struct sqdb *db) blob_t b, key, got, tld, mkey; void *cmph; struct sqdb_index_entry *indx; - cmph_uint32 i = -1, previ; + cmph_uint32 i = SQDB_PARENT_ROOT, previ; int dots_done = 1; cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); @@ -201,26 +201,33 @@ static int url_classify(struct url_info *url, struct sqdb *db) previ = i; i = cmph_search_packed(cmph, key.ptr, key.len); - if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) { - /* the subdomain did no longer match, use - * parents classification */ - i = previ; - goto parent_dns_match; - } if (!blob_is_null(tld)) { - if (blob_cmp(tld, sqdb_get_string_literal(db, indx[indx[i].parent].component)) != 0) { + int p = indx[i].parent; + + if (p == SQDB_PARENT_ROOT || + p == SQDB_PARENT_IPV4 || + indx[p].parent != SQDB_PARENT_ROOT || + blob_cmp(tld, sqdb_get_string_literal(db, indx[p].component)) != 0) { /* top level domain did not match */ i = -1; goto parent_dns_match; } tld = BLOB_NULL; + previ = p; + } + if (indx[i].parent != previ || + blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) { + /* the subdomain did no longer match, use + * parents classification */ + i = previ; + goto parent_dns_match; } mkey = key; dots_done++; } while (indx[i].has_subdomains); /* No paths to match for */ - if (!indx[i].has_paths) + if (i == SQDB_PARENT_ROOT || !indx[i].has_paths) goto parent_dns_match; if (key.ptr != url->host.ptr) { @@ -268,7 +275,7 @@ static int url_classify(struct url_info *url, struct sqdb *db) } parent_dns_match: - if (i == -1) + if (i == SQDB_PARENT_ROOT) return 0; /* no category */ return indx[i].category; diff --git a/squarkdb.h b/squarkdb.h index 743325e..68c1a2a 100644 --- a/squarkdb.h +++ b/squarkdb.h @@ -34,6 +34,9 @@ struct sqdb_header { 
struct sqdb_section section[SQDB_SECTION_MAX]; }; +#define SQDB_PARENT_ROOT 0xffffff +#define SQDB_PARENT_IPV4 0xfffffe + struct sqdb_index_entry { uint32_t has_subdomains : 1; uint32_t has_paths : 1; -- cgit v1.2.3