diff options
Diffstat (limited to 'sqdb-build.lua')
-rwxr-xr-x | sqdb-build.lua | 335 |
1 files changed, 0 insertions, 335 deletions
#!/usr/bin/lua

-- sqdb-build.lua: reads the domain/URL classification lists named in
-- lists.conf and writes them out as "squark.db" through the squarkdb module.
-- (Text recovered from the deletion diff of sqdb-build.lua, blob cd039e2.)

require("squarkdb")

-- Set of every string that has to be stored in the database (keys only;
-- the values start out as `true`).
local all_strings = {}
-- Root of the domain tree; nodes may carry .children, .paths, .category
-- and .locked fields.
local all_domains = {}
-- Records for hosts given as dotted-quad IP addresses, keyed by the
-- address text.
local all_ips = {}

-- Category records keyed by description, and the same records in
-- creation order.
local all_categories = {}
local all_categories_by_id = {}
local num_categories = 0

local strfind = string.find
local strsub = string.sub
local tinsert = table.insert

--- Split `text` on every match of the Lua pattern `delimiter`.
-- @param delimiter Lua pattern; must not match the empty string
-- @param text string to split
-- @return array of the pieces between delimiters (always at least one
--         element; pieces may be empty strings)
local function strsplit(delimiter, text)
	-- A delimiter that matches "" would never advance `pos` below and the
	-- loop would run forever, so reject it up front.
	if strfind("", delimiter, 1) then
		error("delimiter matches empty string!", 2)
	end

	local list = {}
	local pos = 1
	while true do
		local first, last = strfind(text, delimiter, pos)
		if first then -- found?
			tinsert(list, strsub(text, pos, first - 1))
			pos = last + 1
		else
			tinsert(list, strsub(text, pos))
			break
		end
	end
	return list
end

--- Remember that string `s` must be present in the database string table.
local function account_string(s)
	all_strings[s] = true
end

--- Return the category record for `category_text`, creating it on first use.
-- @param category_text category description
-- @return table with fields `desc` and `id`
local function get_category(category_text)
	local cat = all_categories[category_text]
	if cat ~= nil then
		return cat
	end

	-- start category ID's from zero
	cat = { desc = category_text, id = num_categories }
	all_categories[category_text] = cat
	num_categories = num_categories + 1

	-- but index them from one
	all_categories_by_id[num_categories] = cat

	account_string(category_text)

	return cat
end

--- Find or create the tree node for `domain`.
-- The name is split on "." and walked from the last label to the first,
-- creating missing nodes on the way.
-- @param domain host name
-- @param locked true when the caller is allowed to pass through locked nodes
-- @return the node for the full name, or nil when a node on the way is
--         locked and `locked` is not set
local function get_domain(domain, locked)
	local parts = strsplit("[.]", domain)
	local entry = all_domains
	local child

	for idx = #parts, 1, -1 do
		local label = parts[idx]
		if entry.children == nil then
			entry.children = {}
		end
		child = entry.children[label]
		if child == nil then
			child = {}
			entry.children[label] = child
		end
		if child.locked and not locked then
			return nil
		end
		entry = child
	end
	return child
end

--- Find or create the node for `path` below `domain_entry`.
-- The path is split on "/" and walked component by component.
-- @param domain_entry node returned by get_domain (or an all_ips record)
-- @param path path with "/" separated components
-- @param locked true when the caller is allowed to pass through locked nodes
-- @return the node of the last component, or nil when a node on the way is
--         locked and `locked` is not set
local function get_path(domain_entry, path, locked)
	local entry = domain_entry
	local node

	-- ipairs, not pairs: the components have to be visited in order, and
	-- pairs makes no ordering promise.
	for _, component in ipairs(strsplit("/", path)) do
		if entry.paths == nil then
			entry.paths = {}
		end
		node = entry.paths[component]
		if node == nil then
			node = {}
			entry.paths[component] = node
		end
		if node.locked and not locked then
			return nil
		end
		entry = node
	end
	return node
end

--- Tell whether `s` looks like a dotted-quad IPv4 address.
-- @return the matched text when `s` is four dot-separated digit groups,
--         nil otherwise
local function is_ip_addr(s)
	-- "%." is a literal dot. The previous "\." is not a valid string escape
	-- (Lua 5.2+ rejects it; 5.1 reads it as ".", i.e. "any character").
	return s:match("^%d+%.%d+%.%d+%.%d+$")
end

--- Read one list file and merge its entries into all_domains / all_ips.
-- Each line holds one URL or domain; "#" starts a comment. A missing file
-- only produces a warning.
-- @param filename file to read
-- @param category category record to assign to the entries
-- @param locked when true, entries set here are marked locked
local function read_urls(filename, category, locked)
	local fd = io.open(filename)
	if fd == nil then
		print("WARNING: File " .. filename .. " does not exist")
		return
	end
	print("Reading " .. filename)

	for url in fd:lines() do
		local domain, path, d

		url = url:gsub("#.*", "")
		-- Trim surrounding whitespace. The earlier pattern " *^" could only
		-- match a literal caret, so nothing was ever trimmed and e.g. the
		-- blank before a trailing comment ended up in the domain name.
		url = url:gsub("^%s+", "")
		url = url:gsub("%s+$", "")
		url = url:lower()
		-- drop a leading "www." / "wwwN." label when more labels follow
		url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")
		domain, path = url:match("([^/]*)/?(.*)")
		domain = domain:gsub(":.*", "") -- port
		domain = domain:gsub("[.]$", "") -- trailing dot

		if domain == "" then
			d = nil
		elseif not is_ip_addr(domain) then
			d = get_domain(domain, locked)
		else
			d = all_ips[domain]
			if d == nil then
				d = {}
				all_ips[domain] = d
			end
		end

		if d == nil then
			--if url ~= "" then
			--	print(url .. " ignored due to locked record")
			--end
		elseif path ~= "" then
			-- Only short paths without query-like characters are stored,
			-- and only when the host is not already in this category.
			if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then
				path = path:gsub("^/", "")
				path = path:gsub("/$", "")
				local p = get_path(d, path, locked)
				if p ~= nil then
					p.category = category
					if locked then
						p.locked = true
					end
				end
			end
		else
			-- the first list to classify a host wins
			if d.category == nil then
				d.category = category
				if locked then
					d.locked = true
				end
			end
		end
	end
	fd:close()
end

--- Call `cb` for every path node below `data`, parents before children.
-- @param cb function(full_path, parent, component, category, has_children, paths)
-- @param category category inherited from the parent
-- @param path full name of the parent
-- @param data table of path nodes keyed by component
local function enum_paths(cb, category, path, data)
	for cpath, cdata in pairs(data) do
		local fpath = path .. "/" .. cpath
		local cat = cdata.category or category
		cb(fpath, path, cpath, cat, false, cdata.paths)
		if cdata.paths then
			enum_paths(cb, cat, fpath, cdata.paths)
		end
	end
end

--- Call `cb` for every domain and path node below `data`.
-- @param cb same callback shape as in enum_paths
-- @param category category inherited from the parent
-- @param dns full name of `data`, or nil for the root
-- @param data domain tree node
local function enum_tree(cb, category, dns, data)
	if data.paths ~= nil then
		enum_paths(cb, category, dns, data.paths)
	end
	if data.children ~= nil then
		for cdns, cdata in pairs(data.children) do
			local fdns
			if dns ~= nil then
				fdns = cdns .. "." .. dns
			else
				fdns = cdns
			end
			local cat = cdata.category or category
			cb(fdns, dns, cdns, cat, cdata.children, cdata.paths)
			enum_tree(cb, cat, fdns, cdata)
		end
	end
end

--- Fold the digit groups of `str` into one number, 256 per group
-- ("1.2.3.4" -> 16909060).
-- Left as a global, as in the original script.
function iptonumber(str)
	local num = 0
	for elem in str:gmatch("%d+") do
		num = num * 256 + assert(tonumber(elem))
	end
	return num
end

--- Call `cb` for every entry: the whole domain tree first, then the IP hosts.
-- For an IP host the component argument is the numeric address and the
-- parent is nil.
local function enum_all(cb)
	-- enumerate all domains
	enum_tree(cb, nil, nil, all_domains)

	-- all IP addresses
	for ipaddr, data in pairs(all_ips) do
		if data.paths ~= nil then
			enum_paths(cb, data.category, ipaddr, data.paths)
		end
		cb(ipaddr, nil, iptonumber(ipaddr), data.category, nil, data.paths)
	end
end

--- Drop path nodes that add nothing over their parent and account the
-- component strings of the ones that stay.
-- @param paths table of path nodes keyed by component
-- @param category effective category of the parent
-- @return number of entries left in `paths`
local function prune_paths(paths, category)
	local num_paths = 0

	for path, pdata in pairs(paths) do
		local sub_paths = 0
		local cat = pdata.category or category

		if pdata.paths ~= nil then
			sub_paths = prune_paths(pdata.paths, cat)
			if sub_paths == 0 then
				pdata.paths = nil
			end
		end
		if cat == category and sub_paths == 0 then
			-- clearing an existing key while traversing with pairs is allowed
			paths[path] = nil
		else
			num_paths = num_paths + 1
			account_string(path)
		end
	end
	return num_paths
end

--- Prune the domain tree below `d` and account the labels that stay.
-- @param d domain tree node
-- @param pcategory effective category of the parent
-- @return true when `d` has the parent's category and no remaining
--         children or paths, i.e. the caller may drop it
local function prune_tree(d, pcategory)
	local num_childs = 0
	local num_paths = 0
	local cat = d.category or pcategory

	if d.children ~= nil then
		for n, child in pairs(d.children) do
			if prune_tree(child, cat) then
				d.children[n] = nil
			else
				num_childs = num_childs + 1
				account_string(n)
			end
		end
		if num_childs == 0 then
			d.children = nil
		end
	end
	--print(name, d.category, category, d.num_paths, num_childs)
	if d.paths ~= nil then
		num_paths = prune_paths(d.paths, cat)
		if num_paths == 0 then
			d.paths = nil
		end
	end
	if d.category == pcategory and num_paths == 0 and num_childs == 0 then
		--num_pruned_leafs = num_pruned_leafs + 1
		return true
	end
	return false
end

--- Read every list named in `conffile`.
-- Lines are split on single tabs/spaces into: category, list name,
-- sub-directory and an optional "LOCK" flag; "#" starts a comment and a
-- line whose first field is "STOP" ends processing.
-- @param conffile configuration file name
-- @param part file name inside each list directory ("domains" or "urls")
local function load_lists(conffile, part)
	for line in io.lines(conffile) do
		line = line:gsub("#(.*)", "")
		local fields = strsplit("[\t ]", line)
		if fields[1] == "STOP" then
			break
		end
		if fields[3] then
			read_urls("lists/" .. fields[2] .. "list/" .. fields[3] .. "/" .. part,
				  get_category(fields[1]),
				  fields[4] == "LOCK")
		end
	end
end

-- start by reading in all classification data
get_category("unknown")
load_lists("lists.conf", "domains")
prune_tree(all_domains, nil)
load_lists("lists.conf", "urls")
prune_tree(all_domains, nil)

-- generate database
local db = squarkdb.new("squark.db")
local num_entries = db:generate_hash(function() enum_all(coroutine.yield) end)

-- write string literals
-- NOTE(review): the lookups into all_strings below presumably rely on
-- map_strings replacing each value with the string's database reference;
-- confirm against squarkdb.
db:map_strings(all_strings)

-- map category names and write the category section out
for id, cdata in ipairs(all_categories_by_id) do
	all_categories_by_id[id] = all_strings[cdata.desc]
end
db:write_section("categories", all_categories_by_id)

-- create master index
db:create_index(num_entries)
enum_all(
	function(uri, parent_uri, component, category, childs, paths)
		if parent_uri == nil and type(component) == "number" then
			-- Embedded IPv4 address
			db:assign_index(db:hash(uri),
					category and category.id or 0,
					childs and true or false,
					paths and true or false,
					component,
					-2)
		else
			-- Regular entry
			db:assign_index(db:hash(uri),
					category and category.id or 0,
					childs and true or false,
					paths and true or false,
					all_strings[component] or 0,
					parent_uri and db:hash(parent_uri) or -1)
		end
	end
)