#!/usr/bin/lua

-- sqdb-build.lua: build the squark classification database.
--
-- Provenance: src/sqdb-build.lua as created by commit
-- 25593b5e6fea76ed7c08db586924032c0810c27e (Timo Teräs,
-- Sun, 7 Nov 2010 00:47:39 +0200, "squark: reorganize sources to src
-- directory"), recovered from a cgit v1.2.3 patch view.
--
-- The script reads "lists.conf", loads the domain and URL lists it names
-- into in-memory trees, prunes entries that carry no information beyond
-- their parent, and finally writes everything to "squark.db" through the
-- squarkdb module.

-- NOTE(review): the script relies on this require defining the global
-- `squarkdb`; the return value of require is deliberately not used.
require("squarkdb")

-- Set of every string that has to end up in the string section of the
-- database (keys are the strings, values start out as `true`).
local all_strings = {}
-- Root of the domain tree; nodes may carry .children, .paths, .category
-- and .locked.
local all_domains = {}
-- IPv4 hosts, keyed by their dotted-quad text.
local all_ips = {}

-- Categories by description text, and by (id + 1).
local all_categories = {}
local all_categories_by_id = {}
local num_categories = 0

local strfind = string.find
local strsub = string.sub
local tinsert = table.insert

-- Split `text` on every match of the Lua pattern `delimiter`.
--
-- Returns a sequence of the pieces between matches; it always holds at
-- least one element (the whole text when nothing matches).
--
-- Precondition: `delimiter` must not match the empty string, otherwise
-- the loop below never advances and never terminates.
local function strsplit(delimiter, text)
	local list = {}
	local pos = 1
	--if strfind("", delimiter, 1) then -- this would result in endless loops
	--	error("delimiter matches empty string!")
	--end
	while true do
		local first, last = strfind(text, delimiter, pos)
		if first then -- found?
			tinsert(list, strsub(text, pos, first - 1))
			pos = last + 1
		else
			tinsert(list, strsub(text, pos))
			break
		end
	end
	return list
end

-- Remember that string `s` must be present in the string section.
local function account_string(s)
	all_strings[s] = true
end

-- Return the category record for `category_text`, creating it on first
-- use. Records have the shape { desc = <text>, id = <number> }.
local function get_category(category_text)
	local cat = all_categories[category_text]
	if cat ~= nil then
		return cat
	end

	-- start category ID's from zero
	cat = { desc = category_text, id = num_categories }
	all_categories[category_text] = cat
	num_categories = num_categories + 1

	-- but index them from one
	all_categories_by_id[num_categories] = cat

	account_string(category_text)

	return cat
end

-- Find or create the tree node for `domain` below all_domains.
--
-- The domain is split on "." and walked from the last label to the
-- first, so "a.example.com" becomes com -> example -> a.
--
-- Returns the node, or nil when a node on the way is locked and the
-- caller did not pass a true `locked`.
local function get_domain(domain, locked)
	local parts = strsplit("[.]", domain)
	local entry = all_domains
	local child

	for idx = #parts, 1, -1 do
		local label = parts[idx]
		if entry.children == nil then
			entry.children = {}
		end
		child = entry.children[label]
		if child == nil then
			child = {}
			entry.children[label] = child
		end
		if child.locked and not locked then
			return nil
		end
		entry = child
	end
	return child
end

-- Find or create the node for `path` below `domain_entry`.
--
-- The path is split on "/" and each component becomes one level of
-- nested .paths tables.
--
-- Returns the node, or nil when a node on the way is locked and the
-- caller did not pass a true `locked`.
local function get_path(domain_entry, path, locked)
	local entry = domain_entry
	local p

	-- ipairs, not pairs: the components must be visited in order because
	-- each one is nested below the previous one.
	for _, component in ipairs(strsplit("/", path)) do
		if entry.paths == nil then
			entry.paths = {}
		end
		p = entry.paths[component]
		if p == nil then
			p = {}
			entry.paths[component] = p
		end
		if p.locked and not locked then
			return nil
		end
		entry = p
	end
	return p
end

-- Return a true value when `s` consists of four dot-separated groups of
-- decimal digits, nil otherwise.
local function is_ip_addr(s)
	-- "%." is a literal dot. "\." is not a valid Lua escape: Lua 5.1
	-- treats it as "." (any character), newer versions refuse to compile.
	return s:match("^%d+%.%d+%.%d+%.%d+$")
end

-- Load one list file into the domain/IP trees.
--
-- filename: file with one URL or domain per line; "#" starts a comment.
-- category: category record (see get_category) assigned to the entries.
-- locked:   when true, the entries written here are marked locked.
--
-- A missing file only produces a warning on stdout.
local function read_urls(filename, category, locked)
	local fd = io.open(filename)
	if fd == nil then
		print("WARNING: File " .. filename .. " does not exist")
		return
	end
	print("Reading " .. filename)

	for line in fd:lines() do
		local url, domain, path, d

		-- gsub returns two values; plain assignment keeps only the string.
		url = line:gsub("#.*", "")
		url = url:gsub("%s+$", "") -- trailing whitespace
		url = url:lower()
		-- drop a leading "www"/"www<digits>" label when another label follows
		url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")
		domain, path = url:match("([^/]*)/?(.*)")
		domain = domain:gsub(":.*", "") -- port
		domain = domain:gsub("[.]$", "") -- trailing dot

		if domain == "" then
			d = nil
		elseif not is_ip_addr(domain) then
			d = get_domain(domain, locked)
		else
			d = all_ips[domain]
			if d == nil then
				d = {}
				all_ips[domain] = d
			end
		end

		if d == nil then
			--if url ~= "" then
			--	print(url .. " ignored due to locked record")
			--end
		elseif path ~= "" then
			-- A path entry is skipped when the host already has this
			-- category, when the path is 100 characters or longer, or
			-- when it contains "?", ";" or "&".
			if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then
				path = path:gsub("^/", "")
				path = path:gsub("/$", "")
				local p = get_path(d, path, locked)
				if p ~= nil then
					p.category = category
					if locked then
						p.locked = true
					end
				end
			end
		else
			-- The first category assigned to a host wins.
			if d.category == nil then
				d.category = category
				if locked then
					d.locked = true
				end
			end
		end
	end
	fd:close()
end

-- Call `cb` for every path node in `data` (a .paths table), recursively.
--
-- cb receives (full_uri, parent_uri, component, category, false, paths);
-- `category` is the node's own category or the one inherited from above.
local function enum_paths(cb, category, path, data)
	for cpath, cdata in pairs(data) do
		local fpath = path .. "/" .. cpath
		local cat = cdata.category or category
		cb(fpath, path, cpath, cat, false, cdata.paths)
		if cdata.paths then
			enum_paths(cb, cat, fpath, cdata.paths)
		end
	end
end

-- Call `cb` for every node below the domain node `data`: first the paths
-- of `data` itself, then each child domain followed by its own subtree.
--
-- cb receives (full_name, parent_name, label, category, children, paths).
-- `dns` is the full name of `data`, or nil for the root.
local function enum_tree(cb, category, dns, data)
	if data.paths ~= nil then
		enum_paths(cb, category, dns, data.paths)
	end
	if data.children ~= nil then
		for cdns, cdata in pairs(data.children) do
			local fdns
			if dns ~= nil then
				fdns = cdns .. "." .. dns
			else
				fdns = cdns
			end
			local cat = cdata.category or category
			cb(fdns, dns, cdns, cat, cdata.children, cdata.paths)
			enum_tree(cb, cat, fdns, cdata)
		end
	end
end

-- Convert a dotted-quad string to a number, treating each run of digits
-- as one base-256 digit (most significant first).
-- Kept global, as in the original script.
function iptonumber(str)
	local num = 0
	for elem in str:gmatch("%d+") do
		num = num * 256 + assert(tonumber(elem))
	end
	return num
end

-- Call `cb` for every entry: the whole domain tree first, then every IP
-- host (its paths, followed by the host itself with the numeric address
-- passed as the component).
local function enum_all(cb)
	-- enumerate all domains
	enum_tree(cb, nil, nil, all_domains)

	-- all IP addresses
	for ipaddr, data in pairs(all_ips) do
		if data.paths ~= nil then
			enum_paths(cb, data.category, ipaddr, data.paths)
		end
		cb(ipaddr, nil, iptonumber(ipaddr), data.category, nil, data.paths)
	end
end

-- Remove path nodes that add nothing: same effective category as their
-- parent and no remaining sub-paths. Surviving components are registered
-- with account_string.
--
-- Returns the number of entries left in `paths`.
local function prune_paths(paths, category)
	local num_paths = 0

	-- Clearing existing keys while traversing with pairs is permitted.
	for path, pdata in pairs(paths) do
		local sub_paths = 0
		local cat = pdata.category or category

		if pdata.paths ~= nil then
			sub_paths = prune_paths(pdata.paths, cat)
			if sub_paths == 0 then
				pdata.paths = nil
			end
		end
		if cat == category and sub_paths == 0 then
			paths[path] = nil
		else
			num_paths = num_paths + 1
			account_string(path)
		end
	end
	return num_paths
end

-- Prune the domain subtree rooted at `d`.
--
-- pcategory: effective category of the parent node.
--
-- Children and paths are pruned first; labels of surviving children are
-- registered with account_string. Returns true when `d` itself can be
-- removed by the caller: its own category equals `pcategory` and it has
-- neither paths nor children left.
local function prune_tree(d, pcategory)
	local num_childs = 0
	local num_paths = 0
	local cat = d.category or pcategory

	if d.children ~= nil then
		for n, child in pairs(d.children) do
			if prune_tree(child, cat) then
				d.children[n] = nil
			else
				num_childs = num_childs + 1
				account_string(n)
			end
		end
		if num_childs == 0 then
			d.children = nil
		end
	end
	--print(name, d.category, category, d.num_paths, num_childs)
	if d.paths ~= nil then
		num_paths = prune_paths(d.paths, cat)
		if num_paths == 0 then
			d.paths = nil
		end
	end
	if d.category == pcategory and num_paths == 0 and num_childs == 0 then
		--num_pruned_leafs = num_pruned_leafs + 1
		return true
	end
	return false
end

-- Read the list configuration and load one kind of list file.
--
-- conffile: configuration file; "#" starts a comment, fields are split
--           on single tabs or spaces, and a line whose first field is
--           "STOP" ends processing.
-- part:     file name inside each list directory ("domains" or "urls"
--           in this script).
--
-- For every line with at least three fields this reads
-- "lists/<field2>list/<field3>/<part>" into category <field1>; a fourth
-- field equal to "LOCK" marks the entries locked.
local function load_lists(conffile, part)
	for raw in io.lines(conffile) do
		local line = raw:gsub("#(.*)", "")
		local fields = strsplit("[\t ]", line)
		if fields[1] == "STOP" then
			break
		end
		if fields[3] then
			read_urls("lists/" .. fields[2] .. "list/" .. fields[3] .. "/" .. part,
				  get_category(fields[1]),
				  fields[4] == "LOCK")
		end
	end
end

-- start by reading in all classification data
get_category("unknown")
load_lists("lists.conf", "domains")
prune_tree(all_domains, nil)
load_lists("lists.conf", "urls")
prune_tree(all_domains, nil)

-- generate database
local db = squarkdb.new("squark.db")
-- NOTE(review): generate_hash presumably runs this function as a
-- coroutine and collects the yielded entries -- confirm against squarkdb.
local num_entries = db:generate_hash(function() enum_all(coroutine.yield) end)

-- write string literals
-- NOTE(review): the lookups below read all_strings[...] as values, so
-- map_strings is assumed to replace the `true` markers -- confirm.
db:map_strings(all_strings)

-- map category names and write the category section out
for id, cdata in ipairs(all_categories_by_id) do
	all_categories_by_id[id] = all_strings[cdata.desc]
end
db:write_section("categories", all_categories_by_id)

-- create master index
db:create_index(num_entries)
enum_all(
	function(uri, parent_uri, component, category, childs, paths)
		if parent_uri == nil and type(component) == "number" then
			-- Embedded IPv4 address
			db:assign_index(db:hash(uri),
					category and category.id or 0,
					childs and true or false,
					paths and true or false,
					component,
					-2)
		else
			-- Regular entry
			db:assign_index(db:hash(uri),
					category and category.id or 0,
					childs and true or false,
					paths and true or false,
					all_strings[component] or 0,
					parent_uri and db:hash(parent_uri) or -1)
		end
	end
)