path: root/sqdb-build.lua
diff options
author Timo Teräs <> 2010-08-09 15:07:26 +0300
committer Timo Teräs <> 2010-08-09 15:07:26 +0300
commit b5a5dd614101000f653e6ecb96ab34ae3f44353f (patch)
tree 5b9cb2a8b9d56eefc2e044d8845bb5fbedb6ba49 /sqdb-build.lua
parent 02e7cfc6b4603be8ff3b69abbfad50193aaee845 (diff)
squarkdb: cmph based url database for squark filtering
Implement the basics of squarkdb, which will be used by squark-filter to categorize URIs. The implementation is based on libcmph and uses a file format suitable for being mmap()ed by squark-filter. Lua code is used to create the squark database from standard domain / URL blacklists.
Diffstat (limited to 'sqdb-build.lua')
1 files changed, 302 insertions, 0 deletions
diff --git a/sqdb-build.lua b/sqdb-build.lua
new file mode 100755
index 0000000..6cde349
--- /dev/null
+++ b/sqdb-build.lua
@@ -0,0 +1,302 @@
-- Set of every string that has to be stored in the database; the keys are
-- the strings themselves (filled in by account_string).
local all_strings = {}
-- Root of the domain tree. A node may carry `children` (keyed by DNS label),
-- `paths` (keyed by path component), `category` and `locked`.
local all_domains = {}
-- Records for hosts given as dotted-quad addresses, keyed by the address text.
local all_ips = {}
-- Category records ({ desc = ..., id = ... }) keyed by category name.
local all_categories = {}
-- Number of category records created so far; also the last id handed out.
local num_categories = 0

-- Local aliases for the library functions used by strsplit.
local strfind = string.find
local strsub = string.sub
local tinsert = table.insert
--- Split `text` at every match of the Lua pattern `delimiter`.
-- The pieces between matches are returned in order; the result always holds
-- at least one element (the whole text when the delimiter never matches).
-- @param delimiter Lua pattern that separates the pieces
-- @param text string to split
-- @return array of substrings
-- Raises an error when the delimiter matches an empty string, because the
-- scan position could then never advance.
local function strsplit(delimiter, text)
  local list = {}
  local pos = 1
  while true do
    local first, last = strfind(text, delimiter, pos)
    if not first then
      -- no further delimiter: the remainder is the final piece
      tinsert(list, strsub(text, pos))
      return list
    end
    if last < first then
      -- an empty match leaves pos unchanged and would loop forever
      error("delimiter matches empty string!", 2)
    end
    tinsert(list, strsub(text, pos, first - 1))
    pos = last + 1
  end
end
--- Record `s` in the all_strings set so that it is known to be needed later.
-- @param s string to remember
local function account_string(s)
  all_strings[s] = true
end
--- Return the category record for `category_text`, creating it on first use.
-- A new record gets the next sequential id (starting at 1) and its name is
-- registered with account_string.
-- @param category_text category name
-- @return table { desc = category_text, id = number }; the same table is
--         returned for repeated calls with the same name
local function get_category(category_text)
  local cat = all_categories[category_text]
  if cat == nil then
    num_categories = num_categories + 1
    cat = { desc = category_text, id = num_categories }
    all_categories[category_text] = cat
    account_string(category_text)
  end
  return cat
end
--- Find or create the node for `domain` in the all_domains tree.
-- The name is split on "." and walked from the last label to the first, so
-- "a.example.com" lives at com -> example -> a. Missing nodes are created on
-- the way down.
-- @param domain host name (callers pass a non-empty string)
-- @param locked true when the caller is allowed to pass through locked nodes
-- @return the node for the full name, or nil when a node on the way is
--         locked and `locked` is not set
local function get_domain(domain, locked)
  local parts = strsplit("[.]", domain)
  local entry = all_domains
  local child
  for idx = #parts, 1, -1 do
    local label = parts[idx]
    if entry.children == nil then
      entry.children = {}
    end
    child = entry.children[label]
    if child == nil then
      child = {}
      entry.children[label] = child
    end
    if child.locked and not locked then
      return nil
    end
    entry = child
  end
  return child
end
--- Find or create the node for `path` below `domain_entry`.
-- The path is split on "/" and each component becomes one level in the
-- nested `paths` tables. Components are visited with ipairs because their
-- order matters and pairs does not promise any order.
-- @param domain_entry host node that owns the path
-- @param path path text without leading or trailing "/"
-- @param locked true when the caller is allowed to pass through locked nodes
-- @return the node for the full path, or nil when a node on the way is
--         locked and `locked` is not set
local function get_path(domain_entry, path, locked)
  local entry = domain_entry
  local node
  for _, component in ipairs(strsplit("/", path)) do
    if entry.paths == nil then
      entry.paths = {}
    end
    node = entry.paths[component]
    if node == nil then
      node = {}
      entry.paths[component] = node
    end
    if node.locked and not locked then
      return nil
    end
    entry = node
  end
  return node
end
--- Tell whether `s` is written as four decimal groups separated by dots.
-- Only the shape is checked; the groups are not range-checked.
-- The dots are escaped with "%." (the Lua pattern escape): "\." is not a
-- valid string escape in Lua 5.2+, and in 5.1 it collapses to a bare "."
-- that matches any character.
-- @param s string to test
-- @return `s` itself when it matches, otherwise nil
local function is_ip_addr(s)
  return s:match("^%d+%.%d+%.%d+%.%d+$")
end
--- Load one blacklist file and merge its entries into all_domains / all_ips.
-- Every line holds a host name or dotted-quad address, optionally followed
-- by "/path". Before use a line is normalised: text from "#" on is dropped,
-- trailing whitespace is removed, case is folded, a leading "www<digits>."
-- label is removed, and a ":port" suffix and a trailing dot are cut from
-- the host part.
--   * host only: the host gets `category` unless it already has one
--   * host/path: the path node gets `category`, provided the host's own
--     category differs, the path is shorter than 100 bytes and contains
--     none of "?", ";" or "&"
-- Entries that end up below a locked node are skipped unless `locked` is set.
-- @param filename list file to read; a missing file only prints a warning
-- @param category category record to assign
-- @param locked when true, nodes assigned here are marked as locked
local function read_urls(filename, category, locked)
  local fd = io.open(filename)
  if fd == nil then
    print("WARNING: File " .. filename .. " does not exist")
    return
  end
  print("Reading " .. filename)
  for line in fd:lines() do
    -- gsub also returns a match count; a single assignment target keeps
    -- only the string
    local url = line:gsub("#.*", "")
    url = url:gsub("%s+$", "") -- trailing blanks (and a stray CR)
    url = url:lower()
    url = url:gsub("^(www%d*[.])", "")

    local domain, path = url:match("([^/]*)/?(.*)")
    domain = domain:gsub(":.*", "")
    domain = domain:gsub("[.]$", "") -- trailing dot

    local d
    if domain == "" then
      d = nil
    elseif is_ip_addr(domain) then
      d = all_ips[domain]
      if d == nil then
        d = {}
        all_ips[domain] = d
      end
    else
      d = get_domain(domain, locked)
    end

    if d == nil then
      -- empty line, or the host sits below a locked record: ignore it
    elseif path ~= "" then
      if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then
        path = path:gsub("^/", "")
        path = path:gsub("/$", "")
        local p = get_path(d, path, locked)
        if p ~= nil then
          p.category = category
          if locked then
            p.locked = true
          end
        end
      end
    elseif d.category == nil then
      d.category = category
      if locked then
        d.locked = true
      end
    end
  end
  fd:close()
end
--- Call `cb` for every node in a `paths` table, parents before descendants.
-- The callback receives (full_uri, parent_uri, component, category, false,
-- sub_paths): full_uri is the parent joined with the component by "/",
-- category is the node's own category or else the inherited one, and
-- sub_paths is the node's own `paths` table (nil when it has none).
-- @param cb callback to invoke
-- @param category category inherited from the owner of `data`
-- @param path URI of the owner of `data`
-- @param data table of path nodes keyed by component
local function enum_paths(cb, category, path, data)
  for cpath, cdata in pairs(data) do
    local fpath = path .. "/" .. cpath
    local cat = cdata.category or category
    cb(fpath, path, cpath, cat, false, cdata.paths)
    if cdata.paths then
      enum_paths(cb, cat, fpath, cdata.paths)
    end
  end
end
--- Call `cb` for every entry below a domain node: first the paths owned by
-- `data` itself, then each child domain followed by that child's subtree.
-- For a child the callback receives (full_name, parent_name, label,
-- category, children, paths), where category, children and paths describe
-- the child itself; its category falls back to the one inherited from
-- `data`. The same convention is used by enum_paths.
-- @param cb callback to invoke
-- @param category category inherited from the parent of `data`
-- @param dns full name of `data`, or nil for the root
-- @param data domain node to walk
local function enum_tree(cb, category, dns, data)
  local cat = data.category or category
  if data.paths ~= nil then
    enum_paths(cb, cat, dns, data.paths)
  end
  if data.children ~= nil then
    for cdns, cdata in pairs(data.children) do
      local fdns
      if dns ~= nil then
        fdns = cdns .. "." .. dns
      else
        fdns = cdns
      end
      -- report the child's own attributes, not those of its parent
      cb(fdns, dns, cdns, cdata.category or cat, cdata.children, cdata.paths)
      enum_tree(cb, cat, fdns, cdata)
    end
  end
end
--- Call `cb` for every database entry: the whole domain tree first, then,
-- for each IP address record, its paths followed by the address itself.
-- An address is reported as cb(address, nil, 0, category, nil, paths).
-- @param cb callback taking (uri, parent_uri, component, category, children, paths)
local function enum_all(cb)
  -- enumerate all domains
  enum_tree(cb, nil, nil, all_domains)
  -- all IP addresses
  for ipaddr, data in pairs(all_ips) do
    if data.paths ~= nil then
      enum_paths(cb, data.category, ipaddr, data.paths)
    end
    -- fixme, calculate ip as 32-bit value
    cb(ipaddr, nil, 0, data.category, nil, data.paths)
  end
end
--- Remove path nodes that add nothing over the inherited category.
-- A node is dropped when its effective category (own, else inherited)
-- equals `category` and no sub-path below it survives. Surviving components
-- are registered with account_string, and a `paths` table that ends up
-- empty is removed from its owner.
-- @param paths table of path nodes keyed by component (modified in place)
-- @param category category inherited from the owner of `paths`
-- @return number of entries left in `paths`
local function prune_paths(paths, category)
  local num_paths = 0
  -- clearing existing keys while traversing with pairs is permitted
  for path, pdata in pairs(paths) do
    local cat = pdata.category or category
    local sub_paths = 0
    if pdata.paths ~= nil then
      sub_paths = prune_paths(pdata.paths, cat)
      if sub_paths == 0 then
        pdata.paths = nil
      end
    end
    if cat == category and sub_paths == 0 then
      paths[path] = nil
    else
      num_paths = num_paths + 1
      account_string(path)
    end
  end
  return num_paths
end
--- Remove domain nodes that add nothing over the inherited category.
-- Children are pruned first, then the node's own paths. Both are compared
-- against the node's effective category (own, else inherited), in the same
-- way prune_paths treats nested paths. Labels of surviving children are
-- registered with account_string. `children` and `paths` tables that end
-- up empty are removed, so a later "has children / has paths" test on the
-- node is not fooled by an empty table.
-- @param d domain node (modified in place)
-- @param category category inherited from the parent of `d`
-- @return true when `d` itself is redundant and may be removed by the caller
local function prune_tree(d, category)
  local cat = d.category or category
  local num_childs = 0
  local num_paths = 0
  if d.children ~= nil then
    -- clearing existing keys while traversing with pairs is permitted
    for name, child in pairs(d.children) do
      if prune_tree(child, cat) then
        d.children[name] = nil
      else
        num_childs = num_childs + 1
        account_string(name)
      end
    end
    if num_childs == 0 then
      d.children = nil
    end
  end
  if d.paths ~= nil then
    num_paths = prune_paths(d.paths, cat)
    if num_paths == 0 then
      d.paths = nil
    end
  end
  return cat == category and num_paths == 0 and num_childs == 0
end
--- Read the list configuration and load one file of every list it names.
-- Each configuration line is split on runs of tabs/spaces after dropping
-- text from "#" on. The fields are: category name, list source, list name
-- and an optional "LOCK" flag. Lines with fewer than three fields are
-- skipped, and a line whose first field is "STOP" ends processing.
-- The file loaded is "lists/<source>list/<name>/<part>".
-- @param conffile path of the configuration file
-- @param part file name inside each list directory
local function load_lists(conffile, part)
  for raw in io.lines(conffile) do
    -- gsub also returns a match count; a single target keeps only the string
    local line = raw:gsub("#(.*)", "")
    -- "+" so that columns aligned with several blanks yield no empty fields
    local fields = strsplit("[\t ]+", line)
    if fields[1] == "STOP" then
      break
    end
    if fields[3] then
      read_urls("lists/" .. fields[2] .. "list/" .. fields[3] .. "/" .. part,
                get_category(fields[1]),
                fields[4] == "LOCK")
    end
  end
end
+-- start by reading in all classification data
+load_lists("lists.conf", "domains") -- first pass: the "domains" file of every list named in lists.conf
+prune_tree(all_domains, nil) -- drop nodes that only repeat an inherited category
+load_lists("lists.conf", "urls") -- second pass: the "urls" file of the same lists
+prune_tree(all_domains, nil) -- prune again now that path entries have been added
+-- generate database
+local db ="squark.db") -- NOTE(review): the call that creates db is missing in this copy (unbalanced ")"); restore it from the repository
+num_entries = db:generate_hash(function() enum_all(coroutine.yield) end) -- NOTE(review): num_entries is assigned without `local`, so it becomes a global; every enumerated entry is handed to coroutine.yield
+-- write string literals (NOTE(review): no statements follow this heading in this copy -- presumably lost, confirm against the repository)
+-- create master index (NOTE(review): the call that receives the function below is missing in this copy)
+ function(uri, parent_uri, component, category, childs, paths)
+ db:assign_index(db:hash(uri),
+ category and or 0, -- NOTE(review): the operand between `and` and `or` is missing in this copy
+ childs and true or false,
+ paths and true or false,
+ all_strings[component] or 0,
+ parent_uri and db:hash(parent_uri) or 0)
+ end