summary refs log tree commit diff stats
path: root/sqdb-build.lua
diff options
context:
space:
mode:
Diffstat (limited to 'sqdb-build.lua')
-rwxr-xr-x sqdb-build.lua 335
1 files changed, 0 insertions, 335 deletions
diff --git a/sqdb-build.lua b/sqdb-build.lua
deleted file mode 100755
index cd039e2..0000000
--- a/sqdb-build.lua
+++ /dev/null
@@ -1,335 +0,0 @@
-#!/usr/bin/lua
-
-require("squarkdb")
-
--- Set of every string that has to be stored in the database: keys are the
--- strings, values are `true` (see account_string). Handed to db:map_strings().
-local all_strings = {}
--- Root node of the domain tree. get_domain() hangs nodes off `children`,
--- keyed by DNS label and walked from the rightmost label to the leftmost.
-local all_domains = {}
--- Records for hosts written as dotted IPv4 literals, keyed by the address text.
-local all_ips = {}
-
--- Category records ({ desc=..., id=... }) keyed by their description text.
-local all_categories = {}
--- The same records as a sequence: the record with id N is stored at index N+1.
-local all_categories_by_id = {}
--- Number of categories created so far; also the id given to the next one.
-local num_categories = 0
-
--- Library functions cached in locals for use by strsplit().
-local strfind = string.find
-local strsub = string.sub
-local tinsert = table.insert
-
--- Split `text` at every match of the Lua pattern `delimiter`.
---
--- @param delimiter  Lua pattern (not a plain string) that separates fields.
--- @param text       string to split.
--- @return sequence of the fields between matches. It always holds at least
---         one element; adjacent delimiters produce empty-string fields.
--- Raises an error if `delimiter` matches the empty string, because such a
--- match never advances the scan position and the loop could not terminate.
-local function strsplit(delimiter, text)
-  local list = {}
-  local pos = 1
-  while true do
-    local first, last = strfind(text, delimiter, pos)
-    if not first then
-      -- no further delimiter: whatever is left is the final field
-      tinsert(list, strsub(text, pos))
-      break
-    end
-    if last < first then
-      -- an empty match leaves `pos` where it is, which would loop forever
-      error("delimiter matches empty string!")
-    end
-    tinsert(list, strsub(text, pos, first - 1))
-    pos = last + 1
-  end
-  return list
-end
-
--- Record `s` as a string the database needs, by adding it as a key of the
--- all_strings set. Calling it again with the same string changes nothing.
-local function account_string(s)
- all_strings[s] = true
-end
-
--- Look up the category record for `category_text`, creating it on first use.
---
--- A new record is { desc = category_text, id = N } where N counts up from
--- zero in creation order. The record is registered in all_categories (by
--- text) and in all_categories_by_id (at index N+1, since that table is
--- 1-based), and the text is added to the string set.
--- @return the existing or newly created category record.
-local function get_category(category_text)
-  local record = all_categories[category_text]
-  if record == nil then
-    -- ids are zero-based ...
-    record = { id = num_categories, desc = category_text }
-    num_categories = num_categories + 1
-    -- ... while the by-id sequence is one-based
-    all_categories_by_id[num_categories] = record
-    all_categories[category_text] = record
-    account_string(category_text)
-  end
-  return record
-end
-
--- Find or create the tree node for `domain` below all_domains.
---
--- The name is split on dots and followed from the rightmost label to the
--- leftmost, creating `children` tables and nodes that do not exist yet.
--- @param domain  host name, e.g. "example.com".
--- @param locked  true when the caller may pass through locked nodes.
--- @return the node of the leftmost label, or nil as soon as a node on the
---         way is marked `locked` while `locked` is false/nil.
-local function get_domain(domain, locked)
-  local labels = strsplit("[.]", domain)
-  local node = all_domains
-  local leaf
-
-  for i = #labels, 1, -1 do
-    local label = labels[i]
-    if node.children == nil then
-      node.children = {}
-    end
-    leaf = node.children[label]
-    if leaf == nil then
-      leaf = {}
-      node.children[label] = leaf
-    end
-    if not locked and leaf.locked then
-      return nil
-    end
-    node = leaf
-  end
-  return leaf
-end
-
--- Find or create the node for `path` below `domain_entry`.
---
--- The path is split on "/" and followed component by component through the
--- nested `paths` tables, creating tables and nodes that do not exist yet.
--- @param domain_entry  node (domain or IP record) the path belongs to.
--- @param path          path text without leading or trailing slash.
--- @param locked        true when the caller may pass through locked nodes.
--- @return the node of the last component, or nil as soon as a node on the
---         way is marked `locked` while `locked` is false/nil.
-local function get_path(domain_entry, path, locked)
-  local entry = domain_entry
-  local p
-
-  -- ipairs, not pairs: components must be visited first to last, and pairs()
-  -- gives no ordering guarantee, not even for integer keys.
-  for _, component in ipairs(strsplit("/", path)) do
-    if entry.paths == nil then
-      entry.paths = {}
-    end
-    p = entry.paths[component]
-    if p == nil then
-      p = {}
-      entry.paths[component] = p
-    end
-    if p.locked and not locked then
-      return nil
-    end
-    entry = p
-  end
-  return p
-end
-
--- Test whether `s` looks like a dotted-quad IPv4 literal: four runs of
--- decimal digits separated by literal dots (octet ranges are not checked).
---
--- The dots are escaped with `%.`, the Lua pattern escape. A backslash is
--- not one: "\." is just "." in Lua 5.1 (which matches any character) and an
--- invalid escape sequence in Lua 5.2 and later.
--- @return the matched string (truthy) on success, nil otherwise.
-local function is_ip_addr(s)
-  return s:match("^%d+%.%d+%.%d+%.%d+$")
-end
-
--- Read one list file and file each of its entries under `category`.
---
--- Each line is cleaned up ("#" comments and trailing whitespace removed,
--- lower-cased, a leading "www"/"wwwN" label dropped when another dotted
--- label follows) and split into a host part and a path part. Hosts that
--- look like IPv4 literals are kept in all_ips, all others in the domain
--- tree. A line without a path sets the host's category if it has none yet.
--- A line with a path adds a path node, unless the host already has this
--- category, the path is 100+ characters long, or it contains "?", ";" or "&".
---
--- @param filename  file to read; a missing file only prints a warning.
--- @param category  category record to assign.
--- @param locked    when true, the nodes written are marked `locked`.
-local function read_urls(filename, category, locked)
-  local fd = io.open(filename)
-  if fd == nil then
-    print("WARNING: File " .. filename .. " does not exist")
-    return
-  end
-  print("Reading " .. filename)
-
-  for line in fd:lines() do
-    local url = line:gsub("#.*", "")
-    -- Strip trailing whitespace. The former pattern " *^" could not do this:
-    -- "^" is only an anchor at the start of a pattern, elsewhere it is a
-    -- literal caret.
-    url = url:gsub("%s+$", "")
-    url = url:lower()
-    url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")
-
-    local domain, path = url:match("([^/]*)/?(.*)")
-    domain = domain:gsub(":.*", "")  -- port number
-    domain = domain:gsub("[.]$", "") -- trailing dot
-
-    local d
-    if domain == "" then
-      d = nil
-    elseif not is_ip_addr(domain) then
-      -- nil here means a locked record is in the way; the line is skipped
-      d = get_domain(domain, locked)
-    else
-      d = all_ips[domain]
-      if d == nil then
-        d = {}
-        all_ips[domain] = d
-      end
-    end
-
-    if d ~= nil then
-      if path ~= "" then
-        if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then
-          path = path:gsub("^/", "")
-          path = path:gsub("/$", "")
-          -- `p` is declared local here; it used to leak as a global
-          local p = get_path(d, path, locked)
-          if p ~= nil then
-            p.category = category
-            if locked then
-              p.locked = true
-            end
-          end
-        end
-      elseif d.category == nil then
-        d.category = category
-        if locked then
-          d.locked = true
-        end
-      end
-    end
-  end
-  fd:close()
-end
-
--- Report every node of a `paths` table, depth first, to `cb`.
---
--- For each entry `cb` is called as
---   cb(full_path, parent_path, component, category, false, sub_paths)
--- where full_path is parent_path .. "/" .. component and category is the
--- node's own category or, failing that, the one inherited from above.
--- @param cb        callback receiving the six values above.
--- @param category  category inherited from the parent.
--- @param path      full path (or host name) of the parent.
--- @param data      table mapping component name to path node.
-local function enum_paths(cb, category, path, data)
-  for name, node in pairs(data) do
-    local full = path .. "/" .. name
-    local effective = node.category or category
-    cb(full, path, name, effective, false, node.paths)
-    if node.paths then
-      enum_paths(cb, effective, full, node.paths)
-    end
-  end
-end
-
--- Report a domain node's paths and all of its descendants to `cb`.
---
--- The node's own `paths` are reported first (through enum_paths). Then `cb`
--- is called for each child as
---   cb(full_name, parent_name, label, category, children, paths)
--- followed by the child's own subtree. full_name is label .. "." ..
--- parent_name, or just the label when the parent name is nil.
--- @param cb        callback receiving the six values above.
--- @param category  category inherited from the parent.
--- @param dns       full name of `data`, nil for the root.
--- @param data      domain node to enumerate.
-local function enum_tree(cb, category, dns, data)
-  if data.paths ~= nil then
-    enum_paths(cb, category, dns, data.paths)
-  end
-  if data.children == nil then
-    return
-  end
-
-  for label, node in pairs(data.children) do
-    local full = label
-    if dns ~= nil then
-      full = label .. "." .. dns
-    end
-    local effective = node.category or category
-    cb(full, dns, label, effective, node.children, node.paths)
-    enum_tree(cb, effective, full, node)
-  end
-end
-
--- Fold the decimal groups of `str` into one number, base 256, most
--- significant group first ("1.2.3.4" gives 16909060).
--- @param str  text containing runs of decimal digits.
--- @return the combined number; 0 when `str` contains no digits.
-function iptonumber(str)
-  local total = 0
-  str:gsub("%d+", function(octet)
-    total = total * 256 + assert(tonumber(octet))
-  end)
-  return total
-end
-
--- Report every database entry to `cb`: first the whole domain tree, then
--- the IP address records.
---
--- For an IP record its paths are reported first, then the record itself as
---   cb(address_text, nil, address_number, category, nil, paths)
--- so a nil parent together with a numeric third argument marks an IP entry.
--- @param cb  callback, called with the same six-argument convention as in
---            enum_tree and enum_paths.
-local function enum_all(cb)
-  -- domains
-  enum_tree(cb, nil, nil, all_domains)
-
-  -- IP addresses
-  for addr, rec in pairs(all_ips) do
-    if rec.paths ~= nil then
-      enum_paths(cb, rec.category, addr, rec.paths)
-    end
-    cb(addr, nil, iptonumber(addr), rec.category, nil, rec.paths)
-  end
-end
-
--- Drop path nodes that add nothing over what they inherit.
---
--- A node is removed when its effective category (its own, else the
--- inherited one) equals `category` and no sub-path below it survives.
--- Emptied `paths` tables are set to nil. The name of every surviving node is
--- added to the string set.
--- @param paths     table mapping component name to path node; modified.
--- @param category  category inherited from the parent.
--- @return number of entries left in `paths`.
-local function prune_paths(paths, category)
-  local kept = 0
-
-  for name, node in pairs(paths) do
-    local effective = node.category or category
-    local nested = 0
-
-    if node.paths ~= nil then
-      nested = prune_paths(node.paths, effective)
-      if nested == 0 then
-        node.paths = nil
-      end
-    end
-
-    if nested ~= 0 or effective ~= category then
-      kept = kept + 1
-      account_string(name)
-    else
-      -- clearing an existing key is permitted while traversing with pairs()
-      paths[name] = nil
-    end
-  end
-  return kept
-end
-
--- Prune the subtree below domain node `d` and say whether `d` itself can go.
---
--- Children that report themselves prunable are removed, and the labels of
--- the remaining ones are added to the string set. The node's paths are
--- pruned with prune_paths. Emptied `children` / `paths` tables are set to nil.
--- @param d          domain node; modified in place.
--- @param pcategory  effective category of the parent.
--- @return true when `d` has no children and no paths left and its own
---         `category` field equals `pcategory`; false otherwise.
--- NOTE(review): the final test compares d.category, not the effective
--- category, so a leaf without a category under a categorised parent is
--- kept. Confirm whether that is intended.
-local function prune_tree(d, pcategory)
-  local effective = d.category or pcategory
-  local kept_children = 0
-  local kept_paths = 0
-
-  if d.children ~= nil then
-    for label, node in pairs(d.children) do
-      if prune_tree(node, effective, label) then
-        d.children[label] = nil
-      else
-        kept_children = kept_children + 1
-        account_string(label)
-      end
-    end
-    if kept_children == 0 then
-      d.children = nil
-    end
-  end
-
-  if d.paths ~= nil then
-    kept_paths = prune_paths(d.paths, effective)
-    if kept_paths == 0 then
-      d.paths = nil
-    end
-  end
-
-  return d.category == pcategory and kept_paths == 0 and kept_children == 0
-end
-
--- Read the list configuration and load the `part` file of every list in it.
---
--- Each line has "#" comments removed and is split on single tabs/spaces
--- into: category name, list prefix, list name and an optional "LOCK" flag.
--- Lines with fewer than three fields are skipped; a line whose first field
--- is "STOP" ends processing. The file that gets read is
---   lists/<prefix>list/<name>/<part>
--- @param conffile  path of the configuration file.
--- @param part      file name inside each list directory.
-local function load_lists(conffile, part)
-  for raw in io.lines(conffile) do
-    local stripped = raw:gsub("#(.*)", "")
-    local fields = strsplit("[\t ]", stripped)
-    if fields[1] == "STOP" then
-      break
-    end
-    if fields[3] then
-      local filename = "lists/" .. fields[2] .. "list/" .. fields[3] .. "/" .. part
-      local category = get_category(fields[1])
-      read_urls(filename, category, fields[4] == "LOCK")
-    end
-  end
-end
-
--- start by reading in all classification data
--- "unknown" is created first so that it gets category id 0, the same id the
--- index below falls back to for entries without a category.
-get_category("unknown")
--- Two passes over the same configuration: the "domains" file of every list,
--- then the "urls" file, pruning redundant nodes after each pass.
-load_lists("lists.conf", "domains")
-prune_tree(all_domains, nil)
-load_lists("lists.conf", "urls")
-prune_tree(all_domains, nil)
-
--- generate database
-local db = squarkdb.new("squark.db")
--- The function passed in hands every entry to coroutine.yield, so
--- generate_hash presumably runs it as a coroutine - confirm against squarkdb.
--- Note that num_entries is assigned without `local` and is therefore a global.
-num_entries = db:generate_hash(function() enum_all(coroutine.yield) end)
-
--- write string literals
--- NOTE(review): the code below reads all_strings[...] as values to store,
--- so map_strings presumably replaces the `true` markers with string
--- references - confirm against squarkdb.
-db:map_strings(all_strings)
-
--- map category names and write the category section out
--- Each category record is replaced in place by the all_strings entry of its
--- description.
-for id, cdata in ipairs(all_categories_by_id) do
- all_categories_by_id[id] = all_strings[cdata.desc]
-end
-db:write_section("categories", all_categories_by_id)
-
--- create master index
--- Second full enumeration: one assign_index call per entry, keyed by the
--- hash of its full name.
-db:create_index(num_entries)
-enum_all(
- function(uri, parent_uri, component, category, childs, paths)
- if parent_uri == nil and type(component) == "number" then
- -- Embedded IPv4 address
- -- (enum_all passes the numeric address as `component`; -2 is passed
- -- where regular entries pass the parent hash)
- db:assign_index(db:hash(uri),
- category and category.id or 0,
- childs and true or false,
- paths and true or false,
- component,
- -2)
- else
- -- Regular entry
- -- (the component's all_strings entry, or 0 if it has none; the parent's
- -- hash, or -1 for entries without a parent)
- db:assign_index(db:hash(uri),
- category and category.id or 0,
- childs and true or false,
- paths and true or false,
- all_strings[component] or 0,
- parent_uri and db:hash(parent_uri) or -1)
- end
- end
-)