summaryrefslogtreecommitdiffstats
path: root/src/sqdb-build.lua
diff options
context:
space:
mode:
Diffstat (limited to 'src/sqdb-build.lua')
-rwxr-xr-xsrc/sqdb-build.lua335
1 files changed, 335 insertions, 0 deletions
diff --git a/src/sqdb-build.lua b/src/sqdb-build.lua
new file mode 100755
index 0000000..cd039e2
--- /dev/null
+++ b/src/sqdb-build.lua
@@ -0,0 +1,335 @@
+#!/usr/bin/lua
+
+require("squarkdb")
+
+-- Set of every string the database needs: account_string() stores each one
+-- as a key here, and the table is handed to db:map_strings() at the end.
+local all_strings = {}
+-- Root node of the domain tree built by get_domain(). Nodes may carry
+-- `children` (keyed by DNS label), `paths`, `category` and `locked`.
+local all_domains = {}
+-- Entries for hosts given as dotted-quad addresses, keyed by the address text.
+local all_ips = {}
+
+-- Category records ({desc=..., id=...}) keyed by category name.
+local all_categories = {}
+-- The same records as an array; slot number is the category id plus one.
+local all_categories_by_id = {}
+-- Number of categories created so far, which is also the next id to assign.
+local num_categories = 0
+
+-- Local aliases for the library functions used by strsplit().
+local strfind = string.find
+local strsub = string.sub
+local tinsert = table.insert
+
+--- Split `text` at every match of the Lua pattern `delimiter`.
+--
+-- Always returns at least one field: text without any delimiter yields a
+-- single-element list, and adjacent delimiters yield empty-string fields.
+--
+-- @param delimiter Lua pattern (not a plain string) that separates fields
+-- @param text      string to split
+-- @return array of field strings, in order of appearance
+-- Raises an error if the delimiter matches an empty string, because such a
+-- match never advances the scan position and would loop forever.
+local function strsplit(delimiter, text)
+	local list = {}
+	local pos = 1
+	while true do
+		local first, last = strfind(text, delimiter, pos)
+		if not first then
+			-- no further delimiter: the remainder is the final field
+			tinsert(list, strsub(text, pos))
+			break
+		end
+		if last < first then
+			-- empty match: `pos` would stay put and the list would grow forever
+			error("delimiter matches empty string!", 2)
+		end
+		tinsert(list, strsub(text, pos, first - 1))
+		pos = last + 1
+	end
+	return list
+end
+
+-- Record `s` as a string that has to be stored in the database.
+-- Only the key is meaningful at this point; `true` is a placeholder value.
+local function account_string(s)
+ all_strings[s] = true
+end
+
+--- Return the category record for `category_text`, creating it on first use.
+--
+-- A new record is `{ desc = <name>, id = <n> }` where ids count up from zero.
+-- The record is also stored in all_categories_by_id at slot id + 1, and the
+-- name is registered with account_string().
+--
+-- @param category_text category name
+-- @return the (possibly freshly created) category record
+local function get_category(category_text)
+	local existing = all_categories[category_text]
+	if existing ~= nil then
+		return existing
+	end
+
+	-- ids are zero-based, while the by-id array is one-based
+	local id = num_categories
+	local record = { desc = category_text, id = id }
+	num_categories = id + 1
+	all_categories[category_text] = record
+	all_categories_by_id[num_categories] = record
+
+	account_string(category_text)
+	return record
+end
+
+--- Find or create the tree node for `domain` below all_domains.
+--
+-- The name is split on dots and walked from the last label to the first
+-- (top-level domain first), creating missing nodes along the way.
+--
+-- @param domain dotted host name
+-- @param locked whether the caller may pass through locked nodes
+-- @return the node for the full name, or nil if a locked node was met on the
+--         way and `locked` is not set
+local function get_domain(domain, locked)
+	local labels = strsplit("[.]", domain)
+	local node = all_domains
+	local found
+	local i = #labels
+
+	while i >= 1 do
+		local label = labels[i]
+		local kids = node.children
+		if kids == nil then
+			kids = {}
+			node.children = kids
+		end
+
+		found = kids[label]
+		if found == nil then
+			found = {}
+			kids[label] = found
+		elseif found.locked and not locked then
+			-- a locked record covers this name; unlocked lists must not touch it
+			return nil
+		end
+
+		node = found
+		i = i - 1
+	end
+	return found
+end
+
+--- Find or create the node for `path` below `domain_entry`.
+--
+-- The path is split on "/" and each component becomes one level in the
+-- nested `paths` tables, so the components must be visited in order.
+--
+-- @param domain_entry node (domain, IP or path entry) to descend from
+-- @param path         slash-separated path
+-- @param locked       whether the caller may pass through locked nodes
+-- @return the node for the last component, or nil if a locked node was met
+--         on the way and `locked` is not set
+local function get_path(domain_entry, path, locked)
+	local entry = domain_entry
+	local node
+
+	-- ipairs, not pairs: the nesting depends on the component order, and
+	-- pairs gives no ordering guarantee
+	for _, component in ipairs(strsplit("/", path)) do
+		if entry.paths == nil then
+			entry.paths = {}
+		end
+		node = entry.paths[component]
+		if node == nil then
+			node = {}
+			entry.paths[component] = node
+		end
+		if node.locked and not locked then
+			return nil
+		end
+		entry = node
+	end
+	return node
+end
+
+--- Test whether `s` looks like a dotted-quad IPv4 address.
+--
+-- Only the shape is checked (four groups of digits separated by dots); the
+-- octet values are not range-checked.
+--
+-- @param s host string
+-- @return the matched string (truthy) when `s` has that shape, otherwise nil
+local function is_ip_addr(s)
+	-- "%." is a literal dot in a Lua pattern; a backslash does not escape here
+	return s:match("^%d+%.%d+%.%d+%.%d+$")
+end
+
+--- Read one list file and merge its entries into the domain and IP trees.
+--
+-- Each line is taken as one URL and normalised before use:
+--   * everything from "#" on and any trailing whitespace is removed,
+--   * the text is lower-cased,
+--   * a leading "www<digits>." label is dropped when another label follows,
+--   * the host part loses a ":port" suffix and a trailing dot.
+-- Hosts shaped like an IPv4 address go to all_ips, all others to the domain
+-- tree via get_domain().
+--
+-- A host-only line sets the host's category unless it already has one. A line
+-- with a path sets the category on the path node, but only when the host is
+-- not already in that category, the path is shorter than 100 characters and
+-- contains none of "?", ";" or "&".
+--
+-- @param filename path of the list file; a missing file only prints a warning
+-- @param category category record to assign to the entries
+-- @param locked   when true, entries set here are marked locked; get_domain()
+--                 and get_path() then refuse to hand out nodes at or below
+--                 them to callers that do not pass `locked`
+local function read_urls(filename, category, locked)
+	local fd = io.open(filename)
+	if fd == nil then
+		print("WARNING: File " .. filename .. " does not exist")
+		return
+	end
+	print("Reading " .. filename)
+
+	for line in fd:lines() do
+		local domain, path, d
+
+		-- gsub returns two values; the assignments keep only the string
+		local url = line:gsub("#.*", "")
+		-- strip trailing blanks (and a CR from DOS line ends) so they do not
+		-- end up inside the last domain label or path component
+		url = url:gsub("%s+$", "")
+		url = url:lower()
+		url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")
+		domain, path = url:match("([^/]*)/?(.*)")
+		domain = domain:gsub(":.*", "")
+		domain = domain:gsub("[.]$", "") -- trailing dot
+
+		if domain == "" then
+			d = nil
+		elseif not is_ip_addr(domain) then
+			-- nil here means the name is covered by a locked record
+			d = get_domain(domain, locked)
+		else
+			d = all_ips[domain]
+			if d == nil then
+				d = {}
+				all_ips[domain] = d
+			end
+		end
+
+		if d == nil then
+			-- empty line, or entry ignored due to a locked record
+		elseif path ~= "" then
+			if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then
+				path = path:gsub("^/", "")
+				path = path:gsub("/$", "")
+				local p = get_path(d, path, locked)
+				if p ~= nil then
+					p.category = category
+					if locked then
+						p.locked = true
+					end
+				end
+			end
+		elseif d.category == nil then
+			-- the first list to name a host decides its category
+			d.category = category
+			if locked then
+				d.locked = true
+			end
+		end
+	end
+	fd:close()
+end
+
+--- Invoke `cb` for every path node below `data`, depth first.
+--
+-- For each entry the callback receives: the full URI (`path` .. "/" .. name),
+-- the parent URI, the component name, the effective category (the node's own
+-- category or else the inherited one), `false`, and the node's sub-path table.
+--
+-- @param cb       callback taking the six values above
+-- @param category category inherited from the parent
+-- @param path     URI of the parent
+-- @param data     table of path nodes keyed by component name
+local function enum_paths(cb, category, path, data)
+	for name, node in pairs(data) do
+		local full = path .. "/" .. name
+		local effective = node.category
+		if not effective then
+			effective = category
+		end
+
+		cb(full, path, name, effective, false, node.paths)
+
+		local nested = node.paths
+		if nested then
+			enum_paths(cb, effective, full, nested)
+		end
+	end
+end
+
+--- Invoke `cb` for every domain node below `data`, and for their paths.
+--
+-- The paths of `data` itself are enumerated first (via enum_paths), then each
+-- child: the callback receives the full name (label prepended to `dns`, or
+-- just the label at the root), the parent name, the label, the effective
+-- category, the child's `children` table and its `paths` table; after that
+-- the child's subtree is walked.
+--
+-- @param cb       callback taking the six values above
+-- @param category category inherited from the parent
+-- @param dns      full name of `data`, or nil for the root
+-- @param data     domain tree node
+local function enum_tree(cb, category, dns, data)
+	local own_paths = data.paths
+	if own_paths ~= nil then
+		enum_paths(cb, category, dns, own_paths)
+	end
+
+	local children = data.children
+	if children == nil then
+		return
+	end
+
+	for label, node in pairs(children) do
+		local fqdn = label
+		if dns ~= nil then
+			fqdn = label .. "." .. dns
+		end
+		local effective = node.category or category
+		cb(fqdn, dns, label, effective, node.children, node.paths)
+		enum_tree(cb, effective, fqdn, node)
+	end
+end
+
+--- Fold the digit groups of `str` into one number, base 256.
+--
+-- "1.2.3.4" becomes ((1 * 256 + 2) * 256 + 3) * 256 + 4. The groups are not
+-- range-checked and any non-digit characters merely separate them.
+--
+-- @param str address text
+-- @return the accumulated number (0 when `str` contains no digits)
+function iptonumber(str)
+	local value = 0
+	for octet in str:gmatch("%d+") do
+		local n = assert(tonumber(octet))
+		value = value * 256 + n
+	end
+	return value
+end
+
+--- Invoke `cb` for every entry collected so far: domains first, then IPs.
+--
+-- Domains are walked with enum_tree() starting at all_domains. For each IP
+-- entry its paths (if any) are enumerated first, then the callback gets the
+-- address text, a nil parent, the address as a number (iptonumber), the
+-- entry's category, nil for children and the entry's `paths` table.
+--
+-- @param cb callback with the same six-value signature used by enum_tree
+local function enum_all(cb)
+	-- the whole domain tree
+	enum_tree(cb, nil, nil, all_domains)
+
+	-- hosts given as IP addresses
+	for addr, rec in pairs(all_ips) do
+		local sub = rec.paths
+		if sub ~= nil then
+			enum_paths(cb, rec.category, addr, sub)
+		end
+		cb(addr, nil, iptonumber(addr), rec.category, nil, rec.paths)
+	end
+end
+
+--- Remove path nodes that add no information and count the ones kept.
+--
+-- A node is dropped when its effective category equals the inherited one and
+-- no sub-path below it survives. Emptied `paths` tables are set to nil. The
+-- name of every surviving node is registered with account_string().
+--
+-- @param paths    table of path nodes keyed by component name (modified)
+-- @param category category inherited from the parent
+-- @return number of entries left in `paths`
+local function prune_paths(paths, category)
+	local kept = 0
+
+	for name, node in pairs(paths) do
+		local effective = node.category or category
+		local nested = 0
+
+		if node.paths ~= nil then
+			nested = prune_paths(node.paths, effective)
+			if nested == 0 then
+				node.paths = nil
+			end
+		end
+
+		if nested ~= 0 or effective ~= category then
+			kept = kept + 1
+			account_string(name)
+		else
+			-- clearing an existing key during traversal is permitted
+			paths[name] = nil
+		end
+	end
+	return kept
+end
+
+--- Prune redundant nodes below domain node `d`.
+--
+-- Children are pruned recursively and removed when they report themselves
+-- redundant; the labels of surviving children are registered with
+-- account_string(). Paths are pruned with prune_paths(). Emptied `children`
+-- and `paths` tables are set to nil.
+--
+-- @param d         domain tree node (modified)
+-- @param pcategory effective category of the parent
+-- @return true when `d` itself is redundant, i.e. its own category equals
+--         `pcategory` and it has no surviving children or paths
+local function prune_tree(d, pcategory)
+	local effective = d.category or pcategory
+	local live_children = 0
+	local live_paths = 0
+
+	local children = d.children
+	if children ~= nil then
+		for label, child in pairs(children) do
+			if prune_tree(child, effective) then
+				children[label] = nil
+			else
+				live_children = live_children + 1
+				account_string(label)
+			end
+		end
+		if live_children == 0 then
+			d.children = nil
+		end
+	end
+
+	if d.paths ~= nil then
+		live_paths = prune_paths(d.paths, effective)
+		if live_paths == 0 then
+			d.paths = nil
+		end
+	end
+
+	return d.category == pcategory and live_paths == 0 and live_children == 0
+end
+
+--- Read the list configuration and load every list it names.
+--
+-- Each line is stripped of "#" comments and split into whitespace-separated
+-- fields: <category> <source> <list> [LOCK]. Runs of blanks or tabs count as
+-- one separator, and leading or trailing whitespace is ignored, so columns
+-- may be aligned freely. Lines with fewer than three fields are skipped. A
+-- line whose first field is "STOP" ends processing.
+--
+-- For each entry, the file "lists/<source>list/<list>/<part>" is read with
+-- read_urls() under the named category; "LOCK" as fourth field marks the
+-- entries as locked.
+--
+-- @param conffile path of the configuration file (io.lines raises an error
+--                 if it cannot be opened)
+-- @param part     file name inside each list directory
+local function load_lists(conffile, part)
+	for line in io.lines(conffile) do
+		local text = line:gsub("#(.*)", "")
+		local fields = {}
+		-- tokenise on whitespace runs; splitting on single characters would
+		-- yield empty fields and shift the columns
+		for word in text:gmatch("%S+") do
+			fields[#fields + 1] = word
+		end
+
+		if fields[1] == "STOP" then
+			break
+		end
+		if fields[3] then
+			read_urls("lists/" .. fields[2] .. "list/" .. fields[3] .. "/" .. part,
+				get_category(fields[1]),
+				fields[4] == "LOCK")
+		end
+	end
+end
+
+-- start by reading in all classification data
+-- "unknown" is registered first, so it receives category id 0 (the value
+-- used below as the fallback for entries without a category).
+get_category("unknown")
+-- Two passes over the same configuration: first the "domains" file of every
+-- list, then the "urls" file, pruning the domain tree after each pass.
+-- NOTE(review): only all_domains is pruned; all_ips is never passed to
+-- prune_tree/prune_paths here.
+load_lists("lists.conf", "domains")
+prune_tree(all_domains, nil)
+load_lists("lists.conf", "urls")
+prune_tree(all_domains, nil)
+
+-- generate database
+local db = squarkdb.new("squark.db")
+-- The generator passes coroutine.yield as the enumeration callback, so it
+-- presumably runs inside a coroutine driven by generate_hash - confirm
+-- against squarkdb.
+-- NOTE(review): num_entries is assigned without `local` and is therefore a
+-- global variable.
+num_entries = db:generate_hash(function() enum_all(coroutine.yield) end)
+
+-- write string literals
+-- NOTE(review): the lookups all_strings[...] below suggest map_strings
+-- replaces the placeholder values with per-string references - confirm.
+db:map_strings(all_strings)
+
+-- map category names and write the category section out
+-- Each record in the array is replaced in place by the all_strings value of
+-- its name.
+for id, cdata in ipairs(all_categories_by_id) do
+ all_categories_by_id[id] = all_strings[cdata.desc]
+end
+db:write_section("categories", all_categories_by_id)
+
+-- create master index
+db:create_index(num_entries)
+-- Second full enumeration: one assign_index call per entry.
+enum_all(
+ function(uri, parent_uri, component, category, childs, paths)
+ if parent_uri == nil and type(component) == "number" then
+ -- Embedded IPv4 address
+ -- (enum_all passes the numeric address as `component` for IP entries;
+ -- the final argument is -2 instead of a parent hash)
+ db:assign_index(db:hash(uri),
+ category and category.id or 0,
+ childs and true or false,
+ paths and true or false,
+ component,
+ -2)
+ else
+ -- Regular entry
+ -- (component is looked up in all_strings, 0 if absent; the parent is
+ -- given by its hash, or -1 when there is none)
+ db:assign_index(db:hash(uri),
+ category and category.id or 0,
+ childs and true or false,
+ paths and true or false,
+ all_strings[component] or 0,
+ parent_uri and db:hash(parent_uri) or -1)
+ end
+ end
+)