diff options
author | Timo Teräs <timo.teras@iki.fi> | 2010-08-09 15:07:26 +0300 |
---|---|---|
committer | Timo Teräs <timo.teras@iki.fi> | 2010-08-09 15:07:26 +0300 |
commit | b5a5dd614101000f653e6ecb96ab34ae3f44353f (patch) | |
tree | 5b9cb2a8b9d56eefc2e044d8845bb5fbedb6ba49 | |
parent | 02e7cfc6b4603be8ff3b69abbfad50193aaee845 (diff) | |
download | squark-b5a5dd614101000f653e6ecb96ab34ae3f44353f.tar.bz2 squark-b5a5dd614101000f653e6ecb96ab34ae3f44353f.tar.xz |
squarkdb: cmph based url database for squark filtering
Implement basics of squarkdb which will be used by squark-filter
to categorize URIs. Implementation is based on libcmph and uses
file format suitable to be mmap:ed from squark-filter.
Lua code is used to create the squark database from standard
domain / url blacklists.
-rw-r--r-- | Makefile | 21 | ||||
-rw-r--r-- | lua-squarkdb.c | 301 | ||||
-rwxr-xr-x | sqdb-build.lua | 302 | ||||
-rw-r--r-- | squarkdb.c | 121 | ||||
-rw-r--r-- | squarkdb.h | 50 |
5 files changed, 789 insertions, 6 deletions
@@ -1,13 +1,22 @@ +TARGETS=squark-auth squarkdb.so + +NETSNMP_CFLAGS:=$(shell net-snmp-config --cflags) +NETSNMP_LIBS:=$(shell net-snmp-config --libs) +LUA_CFLAGS:=$(shell pkg-config --cflags lua5.1) +LUA_LIBS:=$(shell pkg-config --libs lua5.1) +CMPH_CFLAGS:=$(shell pkg-config --cflags cmph) +CMPH_LIBS:=$(shell pkg-config --libs cmph) + CC=gcc -OBJS1=squark-auth.o -TARGETS=squark-auth -CFLAGS=-g -I. $(shell net-snmp-config --cflags) -std=gnu99 -Wall -BUILDLIBS=$(shell net-snmp-config --libs) +CFLAGS=-g -I. $(NETSNMP_CFLAGS) $(LUA_CFLAGS) $(CMPH_CFLAGS) -std=gnu99 -Wall all: $(TARGETS) -squark-auth: $(OBJS1) - $(CC) -g -o $@ $(OBJS1) $(BUILDLIBS) -nopie +squark-auth: squark-auth.o + $(CC) -o $@ $< $(NETSNMP_LIBS) + +squarkdb.so: lua-squarkdb.o squarkdb.o + $(CC) -shared -o $@ $^ $(LUA_LIBS) $(CMPH_LIBS) clean: rm $(OBJS1) $(TARGETS) diff --git a/lua-squarkdb.c b/lua-squarkdb.c new file mode 100644 index 0000000..80a0d32 --- /dev/null +++ b/lua-squarkdb.c @@ -0,0 +1,301 @@ +#include <string.h> + +#include <lua.h> +#include <lualib.h> +#include <lauxlib.h> + +#include <cmph.h> + +#include "squarkdb.h" + +#define SQUARKDB_META "squarkdb" + +struct sqdb *Lsqdb_checkarg(lua_State *L, int index) +{ + struct sqdb *db; + + luaL_checktype(L, index, LUA_TUSERDATA); + db = (struct sqdb *) luaL_checkudata(L, index, SQUARKDB_META); + if (db == NULL) + luaL_typerror(L, index, SQUARKDB_META); + return db; +} + +static int Lsqdb_new(lua_State *L) +{ + struct sqdb *db; + const char *fn; + + fn = luaL_checklstring(L, 1, NULL); + + db = (struct sqdb *) lua_newuserdata(L, sizeof(struct sqdb)); + luaL_getmetatable(L, SQUARKDB_META); + lua_setmetatable(L, -2); + + if (sqdb_create(db, fn) < 0) + luaL_error(L, "Failed to create SquarkDB file '%s'", fn); + + return 1; +} + +static int Lsqdb_destroy(lua_State *L) +{ + struct sqdb *db; + + db = Lsqdb_checkarg(L, 1); + sqdb_close(db); + + return 1; +} + + +struct ioa_data { + lua_State *main; + lua_State *thread; +}; + +static void ioa_rewind(void *data) +{ + struct ioa_data *ioa = (struct ioa_data *) data; + + /* pop previous thread */ + lua_pop(ioa->main, 1); + + /* create a new lua thread */ + ioa->thread = lua_newthread(ioa->main); + lua_pushvalue(ioa->main, -2); /* copy function to top */ + lua_xmove(ioa->main, ioa->thread, 1); /* move function from L to NL */ +} + +static cmph_uint32 ioa_count_keys(void *data) +{ + struct ioa_data *ioa = (struct ioa_data *) data; + lua_State *NL; + cmph_uint32 cnt = 0; + + NL = lua_newthread(ioa->main); + lua_pushvalue(ioa->main, -2); /* copy function to top */ + lua_xmove(ioa->main, NL, 1); /* move function from L to NL */ + + do { + cnt++; + lua_settop(NL, 1); + } while (lua_resume(NL, 0) == LUA_YIELD); + ioa_rewind(data); + + return cnt - 1; +} + +static int ioa_read(void *data, char **key, cmph_uint32 *len) +{ + struct ioa_data *ioa = (struct ioa_data *) data; + lua_State *L = ioa->thread; + size_t l; + + /* get next key from lua thread */ + lua_settop(L, 1); + if (lua_resume(L, 0) != LUA_YIELD || + !lua_isstring(L, 1)) { + *key = NULL; + *len = 0; + return -1; + } + + *key = (char *) lua_tolstring(L, 1, &l); + *len = l; + + return l; +} + +static void ioa_dispose(void *data, char *key, cmph_uint32 len) +{ + /* LUA takes care of garbage collection */ +} + +static int Lsqdb_hash(lua_State *L) +{ + struct sqdb *db; + void *ptr; + cmph_uint32 hash; + const char *key; + size_t keylen; + + db = Lsqdb_checkarg(L, 1); + key = luaL_checklstring(L, 2, &keylen); + + ptr = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); + hash = cmph_search_packed(ptr, key, keylen); + + lua_pushinteger(L, hash); + + return 1; +} + +static int Lsqdb_generate_hash(lua_State *L) +{ + struct sqdb *db; + struct ioa_data ioa; + cmph_config_t *cfg; + cmph_t *cmph; + cmph_io_adapter_t io; + + char *ptr; + cmph_uint32 packed; + + db = Lsqdb_checkarg(L, 1); + luaL_argcheck(L, lua_isfunction(L, 2) && !lua_iscfunction(L, 2), + 2, "Lua function expected"); + + ioa.main = L; + io.data = &ioa; + io.nkeys = ioa_count_keys(io.data); + io.read = ioa_read; + io.dispose = ioa_dispose; + io.rewind = ioa_rewind; + + cfg = cmph_config_new(&io); + if (cfg == NULL) + luaL_error(L, "Failed to create CMPH config"); + + cmph_config_set_algo(cfg, CMPH_CHD); + cmph = cmph_new(cfg); + cmph_config_destroy(cfg); + + if (cmph == NULL) + luaL_error(L, "Failed to create minimal perfect hash"); + + packed = cmph_packed_size(cmph); + ptr = sqdb_section_create(db, SQDB_SECTION_INDEX_MPH, packed); + if (ptr == NULL) { + cmph_destroy(cmph); + luaL_error(L, "Unable to allocation MPH section from SquarkDB"); + } + + cmph_pack(cmph, ptr); + cmph_destroy(cmph); + + lua_pushinteger(L, io.nkeys); + lua_pushinteger(L, packed); + + return 2; +} + +static int Lsqdb_create_index(lua_State *L) +{ + struct sqdb *db; + lua_Integer num_entries; + void *ptr; + + db = Lsqdb_checkarg(L, 1); + num_entries = luaL_checkinteger(L, 2); + + ptr = sqdb_section_create(db, SQDB_SECTION_INDEX, sizeof(struct sqdb_index_entry) * num_entries); + if (ptr == NULL) + luaL_error(L, "Failed to create INDEX section"); + + return 0; +} + +static int Lsqdb_assign_index(lua_State *L) +{ + struct sqdb *db; + size_t size; + lua_Integer idx; + struct sqdb_index_entry *ptr; + + db = Lsqdb_checkarg(L, 1); + idx = luaL_checkinteger(L, 2); + + ptr = sqdb_section_get(db, SQDB_SECTION_INDEX, &size); + if (size <= 0 || idx * sizeof(struct sqdb_index_entry) >= size) + luaL_error(L, "Bad index assignment (idx=%d, section size=%d)", idx, size); + + ptr += idx; + if (ptr->component != 0) + luaL_error(L, "Index entry %d has been already assigned", idx); + + ptr->category = luaL_checkinteger(L, 3); + ptr->has_subdomains = lua_toboolean(L, 4); + ptr->has_paths = lua_toboolean(L, 5); + ptr->component = luaL_checkinteger(L, 6); + ptr->parent = luaL_checkinteger(L, 7); + + return 0; +} + +static int Lsqdb_map_strings(lua_State *L) +{ + struct sqdb *db; + const char *str; + char *ptr; + size_t len, total, pos; + + db = Lsqdb_checkarg(L, 1); + luaL_checktype(L, 2, LUA_TTABLE); + + /* go through the table and count total amount of data */ + total = 0; + lua_pushnil(L); + while (lua_next(L, 2) != 0) { + str = luaL_checklstring(L, -2, &len); + total += len + 1; + lua_pop(L, 1); + } + + /* create string literal section */ + ptr = sqdb_section_create(db, SQDB_SECTION_STRINGS, total); + if (ptr == NULL) + luaL_error(L, "Failed to create string literal section (%d bytes)", total); + + /* populate string literals and return their indices */ + pos = 0; + lua_pushnil(L); + while (lua_next(L, 2) != 0) { + str = lua_tolstring(L, -2, &len); + memcpy(&ptr[pos], str, len + 1); + lua_pop(L, 1); + + /* table[key] = pos */ + lua_pushvalue(L, -1); + lua_pushinteger(L, pos); + lua_rawset(L, 2); + + pos += len + 1; + } + + return 0; +} + +static const luaL_reg sqdb_meta_methods[] = { + { "__gc", Lsqdb_destroy }, + { NULL, NULL } +}; + +static const luaL_reg squarkdb_methods[] = { + { "new", Lsqdb_new }, + { "hash", Lsqdb_hash }, + { "generate_hash", Lsqdb_generate_hash }, + { "create_index", Lsqdb_create_index }, + { "assign_index", Lsqdb_assign_index }, + { "map_strings", Lsqdb_map_strings }, + { NULL, NULL } +}; + +LUALIB_API int luaopen_squarkdb(lua_State *L) +{ + /* Register squarkdb library */ + luaI_openlib(L, "squarkdb", squarkdb_methods, 0); + + /* And metatable for it */ + luaL_newmetatable(L, SQUARKDB_META); + luaI_openlib(L, NULL, sqdb_meta_methods, 0); + lua_pushliteral(L, "__index"); + lua_pushvalue(L, -3); + lua_rawset(L, -3); + lua_pushliteral(L, "__metatable"); + lua_pushvalue(L, -3); + lua_rawset(L, -3); + lua_pop(L, 1); + + return 1; +} diff --git a/sqdb-build.lua b/sqdb-build.lua new file mode 100755 index 0000000..6cde349 --- /dev/null +++ b/sqdb-build.lua @@ -0,0 +1,302 @@ +#!/usr/bin/lua + +require("squarkdb") + +local all_strings = {} +local all_domains = {} +local all_ips = {} + +local all_categories = {} +local num_categories = 0 + +local strfind = string.find +local strsub = string.sub +local tinsert = table.insert + +local function strsplit(delimiter, text) + local list = {} + local pos = 1 + --if strfind("", delimiter, 1) then -- this would result in endless loops + -- error("delimiter matches empty string!") + --end + while 1 do + local first, last = strfind(text, delimiter, pos) + if first then -- found? + tinsert(list, strsub(text, pos, first-1)) + pos = last+1 + else + tinsert(list, strsub(text, pos)) + break + end + end + return list +end + +local function account_string(s) + all_strings[s] = true +end + +local function get_category(category_text) + local cat + + cat = all_categories[category_text] + if cat ~= nil then return cat end + + num_categories = num_categories + 1 + cat = { desc=category_text, id=num_categories } + all_categories[category_text] = cat + + account_string(category_text) + + return cat +end + +local function get_domain(domain, locked) + local parts, entry, idx, p, child + + parts = strsplit("[.]", domain) + entry = all_domains + for idx=#parts,1,-1 do + p = parts[idx] + if entry.children == nil then + entry.children = {} + end + child = entry.children[p] + if child == nil then + child = {} + entry.children[p] = child + end + if child.locked and not locked then + return nil + end + entry = child + end + return child +end + +local function get_path(domain_entry, path, locked) + local entry, p, n, component + + entry = domain_entry + for n,component in pairs(strsplit("/", path)) do + if entry.paths == nil then + entry.paths = {} + end + p = entry.paths[component] + if p == nil then + p = {} + entry.paths[component] = p + end + if p.locked and not locked then + return nil + end + entry = p + end + return p +end + +local function is_ip_addr(s) + return s:match("^%d+\.%d+\.%d+\.%d+$") +end + +local function read_urls(filename, category, locked) + local fd, url, domain, path, d + + fd = io.open(filename) + if fd == nil then + print("WARNING: File " .. filename .. " does not exist") + return + end + print("Reading " .. filename) + for url in fd:lines() do + url = url:gsub("#.*", "") + url = url:gsub(" *^", "") + url = url:lower() + url = url:gsub("^(www%d*[.])", "") + domain, path = url:match("([^/]*)/?(.*)") + domain = domain:gsub(":.*", "") + domain = domain:gsub("[.]$", "") -- trailing dot + if domain == "" then + d = nil + elseif not is_ip_addr(domain) then + d = get_domain(domain, locked) + else + d = all_ips[domain] + if d == nil then + d = {} + all_ips[domain] = d + end + end + if d == nil then + --if url ~= "" then + -- print(url .. " ignored due to locked record") + --end + elseif path ~= "" then + if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then + path = path:gsub("^/", "") + path = path:gsub("/$", "") + p = get_path(d, path, locked) + if p ~= nil then + p.category = category + if locked then + p.locked = true + end + end + end + else + if d.category == nil then + d.category = category + if locked then + d.locked = true + end + end + end + end + fd:close() +end + +local function enum_paths(cb, category, path, data) + local fpath, cpath, cdata, cat + + for cpath, cdata in pairs(data) do + fpath = path .. "/" .. cpath + cat = cdata.category or category + cb(fpath, path, cpath, cat, false, cdata.paths) + if cdata.paths then + enum_paths(cb, cat, fpath, cdata.paths) + end + end +end + +local function enum_tree(cb, category, dns, data) + local cdns, cdata, fdns + local cat = data.category or category + + if data.paths ~= nil then + enum_paths(cb, cat, dns, data.paths) + end + if data.children ~= nil then + for cdns, cdata in pairs(data.children) do + if dns ~= nil then + fdns = cdns .. "." .. dns + else + fdns = cdns + end + cb(fdns, dns, cdns, cat, data.children, data.paths) + enum_tree(cb, cat, fdns, cdata) + end + end +end + +local function enum_all(cb) + local ipaddr, data, category + + -- enumerate all domains + enum_tree(cb, nil, nil, all_domains) + + -- all IP addresses + for ipaddr, data in pairs(all_ips) do + if data.paths ~= nil then + enum_paths(cb, data.category, ipaddr, data.paths) + end + -- fixme, calculate ip as 32-bit value + cb(ipaddr, nil, 0, data.category, nil, data.paths) + end +end + +local function prune_paths(paths, category) + local path, pdata, cat + local num_paths = 0 + + for path, pdata in pairs(paths) do + local sub_paths = 0 + + cat = pdata.category or category + if pdata.paths ~= nil then + sub_paths = prune_paths(pdata.paths, cat) + if sub_paths == 0 then + pdata.paths = nil + end + end + if cat == category and sub_paths == 0 then + paths[path] = nil + else + num_paths = num_paths + 1 + account_string(path) + end + end + return num_paths +end + +local function prune_tree(d, category) + local num_childs = 0 + local num_paths = 0 + local cat + + cat = d.category or category + if d.children ~= nil then + for n, child in pairs(d.children) do + if prune_tree(child, cat, count) then + d.children[n] = nil + else + num_childs = num_childs + 1 + account_string(n) + end + end + if num_childs == 0 then + d.children = nil + end + end + --print(name, d.category, category, d.num_paths, num_childs) + if d.paths ~= nil then + num_paths = prune_paths(d.paths, d.category) + end + if cat == category and num_paths == 0 and num_childs == 0 then + --num_pruned_leafs = num_pruned_leafs + 1 + return true + end + return false +end + +local function load_lists(conffile, part) + local line, fields, cat + + for line in io.lines(conffile) do + line = line:gsub("#(.*)", "") + fields = strsplit("[\t ]", line) + if fields[1] == "STOP" then + break + end + if fields[3] then + read_urls("lists/" .. fields[2] .. "list/" .. fields[3] .. "/" .. part, + get_category(fields[1]), + fields[4] == "LOCK") + end + end +end + +-- start by reading in all classification data +load_lists("lists.conf", "domains") +prune_tree(all_domains, nil) +load_lists("lists.conf", "urls") +prune_tree(all_domains, nil) + +-- generate database +local db = squarkdb.new("squark.db") +num_entries = db:generate_hash(function() enum_all(coroutine.yield) end) + +-- write string literals +db:map_strings(all_strings) + +-- create master index +db:create_index(num_entries) +enum_all( + function(uri, parent_uri, component, category, childs, paths) + db:assign_index(db:hash(uri), + category and category.id or 0, + childs and true or false, + paths and true or false, + all_strings[component] or 0, + parent_uri and db:hash(parent_uri) or 0) + end +) diff --git a/squarkdb.c b/squarkdb.c new file mode 100644 index 0000000..a62ea61 --- /dev/null +++ b/squarkdb.c @@ -0,0 +1,121 @@ +#include <fcntl.h> +#include <unistd.h> +#include <string.h> +#include <sys/mman.h> + +#include "squarkdb.h" + +#define PAGE_SIZE 4096 +#define ALIGN(s,a) (((s) + a - 1) & ~(a - 1)) + +const char *sqdb_section_names[SQDB_SECTION_MAX] = { + [SQDB_SECTION_STRINGS] = "strings", + [SQDB_SECTION_CATEGORIES] = "categories", + [SQDB_SECTION_INDEX] = "index", + [SQDB_SECTION_INDEX_MPH] = "index_mph", + [SQDB_SECTION_KEYWORD] = "keyword", + [SQDB_SECTION_KEYWORD_MPH] = "keyword_mph", +}; + +static int sqdb_allocate(struct sqdb *db, size_t s) +{ + size_t old_size, new_size; + void *base; + + old_size = db->file_length; + new_size = ALIGN(db->file_length + s, PAGE_SIZE); + + if (new_size == ALIGN(db->file_length, PAGE_SIZE)) { + db->file_length += s; + return old_size; + } + + if (ftruncate(db->fd, new_size) < 0) + return -1; + + if (db->mmap_base == NULL) { + base = mmap(NULL, new_size, PROT_READ|PROT_WRITE, + MAP_SHARED, db->fd, 0); + } else { + base = mremap(db->mmap_base, ALIGN(old_size, PAGE_SIZE), + new_size, MREMAP_MAYMOVE); + } + if (base == MAP_FAILED) + return -1; + + db->mmap_base = base; + db->file_length += ALIGN(s, 16); + + return old_size; +} + +int sqdb_create(struct sqdb *db, const char *fn) +{ + struct sqdb_header *hdr; + int rc; + + db->fd = open(fn, O_CREAT | O_TRUNC | O_RDWR, 0666); + if (db->fd < 0) + return -1; + + db->file_length = 0; + db->mmap_base = NULL; + + rc = sqdb_allocate(db, sizeof(struct sqdb_header)); + if (rc < 0) { + close(db->fd); + return rc; + } + + hdr = db->mmap_base; + strcpy(hdr->description, "Squark Filtering Database"); + hdr->version = 1; + hdr->magic = 0xdbdbdbdb; + hdr->num_sections = SQDB_SECTION_MAX; + + return 0; +} + +int sqdb_open(struct sqdb *db, const char *fn); + +void sqdb_close(struct sqdb *db) +{ + if (db->mmap_base) + munmap(db->mmap_base, ALIGN(db->file_length, PAGE_SIZE)); + close(db->fd); +} + +void *sqdb_section_create(struct sqdb *db, int id, u_int32_t size) +{ + struct sqdb_header *hdr; + size_t pos; + + hdr = db->mmap_base; + if (hdr->section[id].offset || hdr->section[id].length) + return NULL; + + pos = sqdb_allocate(db, size); + if (pos < 0) + return NULL; + + /* sqdb_allocate can remap mmap_base */ + hdr = db->mmap_base; + hdr->section[id].offset = pos; + hdr->section[id].length = size; + + return db->mmap_base + pos; +} + +void *sqdb_section_get(struct sqdb *db, int id, u_int32_t *size) +{ + struct sqdb_header *hdr = db->mmap_base; + + if (hdr->section[id].offset == 0) + return NULL; + + if (size) + *size = hdr->section[id].length; + + return db->mmap_base + hdr->section[id].offset; +} + diff --git a/squarkdb.h b/squarkdb.h new file mode 100644 index 0000000..2f96155 --- /dev/null +++ b/squarkdb.h @@ -0,0 +1,50 @@ +#ifndef SQUARKDB_H +#define SQUARKDB_H + +#include <stdint.h> + +#define SQDB_SECTION_STRINGS 0 +#define SQDB_SECTION_CATEGORIES 1 +#define SQDB_SECTION_INDEX 2 +#define SQDB_SECTION_INDEX_MPH 3 +#define SQDB_SECTION_KEYWORD 4 +#define SQDB_SECTION_KEYWORD_MPH 5 +#define SQDB_SECTION_MAX 8 + +struct sqdb { + int fd; + void * mmap_base; + size_t file_length; +}; + +struct sqdb_section { + u_int32_t offset; + u_int32_t length; +}; + +struct sqdb_header { + char description[116]; + u_int32_t num_sections; + u_int32_t version; + u_int32_t magic; + struct sqdb_section section[SQDB_SECTION_MAX]; +}; + +struct sqdb_index_entry { + u_int32_t has_subdomains : 1; + u_int32_t has_paths : 1; + u_int32_t category : 6; + u_int32_t parent : 24; + u_int32_t component; +}; + +const char *sqdb_section_names[SQDB_SECTION_MAX]; + +int sqdb_create(struct sqdb *db, const char *fn); +int sqdb_open(struct sqdb *db, const char *fn); +void sqdb_close(struct sqdb *db); + +void *sqdb_section_create(struct sqdb *db, int id, u_int32_t size); +void *sqdb_section_get(struct sqdb *db, int id, u_int32_t *size); + +#endif |