#!/usr/bin/lua

-- Builds "squark.db" from the domain and URL lists named in "lists.conf".
--
-- The script reads every list into an in-memory tree (domains keyed by DNS
-- label from the TLD downwards, URL paths keyed by path component below each
-- host), prunes entries that carry no information beyond their parent, and
-- then hands the result to the squarkdb module to be written out.

require("squarkdb")

-- Set of every string that has to be stored in the database (key = string).
local all_strings = {}
-- Root of the domain tree; nodes may have .children, .paths, .category, .locked.
local all_domains = {}
-- Records for hosts given as dotted IPv4 literals, keyed by the literal.
local all_ips = {}
-- Category records keyed by description text.
local all_categories = {}
-- The same category records, stored at index (id + 1).
local all_categories_by_id = {}
local num_categories = 0

local strfind = string.find
local strsub = string.sub
local tinsert = table.insert

-- Split `text` at every match of the Lua pattern `delimiter`.
-- Always returns a sequence with at least one element (the whole text when
-- the delimiter never matches). Raises an error if the delimiter matches the
-- empty string, because the scan position could then never advance.
local function strsplit(delimiter, text)
  local list = {}
  local pos = 1
  while true do
    local first, last = strfind(text, delimiter, pos)
    if not first then
      tinsert(list, strsub(text, pos))
      break
    end
    if last < first then
      -- empty match: without this guard the loop would never terminate
      error("delimiter matches empty string!")
    end
    tinsert(list, strsub(text, pos, first - 1))
    pos = last + 1
  end
  return list
end

-- Remember that string `s` must be present in the database string table.
local function account_string(s)
  all_strings[s] = true
end

-- Return the category record for `category_text`, creating it on first use.
-- A record is { desc = <text>, id = <number> }.
local function get_category(category_text)
  local cat = all_categories[category_text]
  if cat ~= nil then
    return cat
  end

  -- start category IDs from zero
  cat = { desc = category_text, id = num_categories }
  all_categories[category_text] = cat
  num_categories = num_categories + 1
  -- but index them from one
  all_categories_by_id[num_categories] = cat
  account_string(category_text)

  return cat
end

-- Find or create the tree node for `domain`, walking the labels from the
-- rightmost (TLD) to the leftmost. Returns nil when a node on the way is
-- locked and the caller is not itself adding a locked entry.
local function get_domain(domain, locked)
  local parts = strsplit("[.]", domain)
  local entry = all_domains
  local child

  for idx = #parts, 1, -1 do
    local label = parts[idx]
    if entry.children == nil then
      entry.children = {}
    end
    child = entry.children[label]
    if child == nil then
      child = {}
      entry.children[label] = child
    end
    if child.locked and not locked then
      return nil
    end
    entry = child
  end

  return child
end

-- Find or create the node for `path` (components separated by "/") below
-- `domain_entry`. Returns nil when a node on the way is locked and the caller
-- is not itself adding a locked entry.
local function get_path(domain_entry, path, locked)
  local entry = domain_entry
  local node

  -- ipairs: the components must be visited in their order within the path
  for _, component in ipairs(strsplit("/", path)) do
    if entry.paths == nil then
      entry.paths = {}
    end
    node = entry.paths[component]
    if node == nil then
      node = {}
      entry.paths[component] = node
    end
    if node.locked and not locked then
      return nil
    end
    entry = node
  end

  return node
end

-- Return a truthy value when `s` looks like a dotted-quad IPv4 literal.
-- "%." matches a literal dot; the octet range is not validated.
local function is_ip_addr(s)
  return s:match("^%d+%.%d+%.%d+%.%d+$")
end

-- Read one list file and record each of its entries under `category`.
-- Lines may hold a bare host or a host followed by "/path"; "#" starts a
-- comment. When `locked` is true the entries created are marked locked, which
-- stops later non-locked lists from adding anything at or below them.
-- A missing file only produces a warning.
local function read_urls(filename, category, locked)
  local fd = io.open(filename)
  if fd == nil then
    print("WARNING: File " .. filename .. " does not exist")
    return
  end
  print("Reading " .. filename)

  for url in fd:lines() do
    local domain, path, d, p

    url = url:gsub("#.*", "")
    -- Legacy substitution kept as-is: "^" is only an anchor at the start of
    -- a pattern, so this removes a literal caret (and blanks before it).
    url = url:gsub(" *^", "")
    -- Trim surrounding whitespace (including a trailing CR) so that blanks
    -- left over after comment stripping do not end up in the host or path.
    url = url:gsub("^%s+", "")
    url = url:gsub("%s+$", "")
    url = url:lower()
    -- drop a leading "www." / "wwwN." when something dotted follows it
    url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")

    domain, path = url:match("([^/]*)/?(.*)")
    domain = domain:gsub(":.*", "") -- strip ":port"
    domain = domain:gsub("[.]$", "") -- trailing dot

    if domain == "" then
      d = nil
    elseif not is_ip_addr(domain) then
      d = get_domain(domain, locked)
    else
      d = all_ips[domain]
      if d == nil then
        d = {}
        all_ips[domain] = d
      end
    end

    -- d == nil: empty line, or the entry sits behind a locked record
    if d ~= nil then
      if path ~= "" then
        -- Only record a path when it could differ from what the host already
        -- says; long paths and ones with query-like characters are skipped.
        if d.category ~= category and #path < 100
          and path:match("([?;&])") == nil then
          path = path:gsub("^/", "")
          path = path:gsub("/$", "")
          p = get_path(d, path, locked)
          if p ~= nil then
            p.category = category
            if locked then
              p.locked = true
            end
          end
        end
      elseif d.category == nil then
        -- first list to classify a host wins
        d.category = category
        if locked then
          d.locked = true
        end
      end
    end
  end

  fd:close()
end

-- Call `cb` for every path node below `data` (a .paths table), depth first.
-- `path` is the URI prefix of the parent and `category` the category
-- inherited from it.
-- cb(full_uri, parent_uri, component, category, false, sub_paths)
local function enum_paths(cb, category, path, data)
  for cpath, cdata in pairs(data) do
    local fpath = path .. "/" .. cpath
    local cat = cdata.category or category
    cb(fpath, path, cpath, cat, false, cdata.paths)
    if cdata.paths then
      enum_paths(cb, cat, fpath, cdata.paths)
    end
  end
end

-- Call `cb` for every path and child domain below the domain node `data`.
-- `dns` is the full name of `data` (nil for the root) and `category` the
-- category inherited from above.
-- cb(full_name, parent_name, label, category, children, paths)
local function enum_tree(cb, category, dns, data)
  if data.paths ~= nil then
    enum_paths(cb, category, dns, data.paths)
  end
  if data.children ~= nil then
    for cdns, cdata in pairs(data.children) do
      local fdns
      if dns ~= nil then
        fdns = cdns .. "." .. dns
      else
        fdns = cdns
      end
      local cat = cdata.category or category
      cb(fdns, dns, cdns, cat, cdata.children, cdata.paths)
      enum_tree(cb, cat, fdns, cdata)
    end
  end
end

-- Convert a dotted IPv4 string to a number, most significant octet first.
-- Left global as in the original file, in case anything outside this file
-- relies on it.
function iptonumber(str)
  local num = 0
  for elem in str:gmatch("%d+") do
    num = num * 256 + assert(tonumber(elem))
  end
  return num
end

-- Call `cb` for every record: the whole domain tree first, then every IP
-- host (its paths before the host itself). For an IP host the component
-- argument is the numeric address and the parent argument is nil.
local function enum_all(cb)
  -- enumerate all domains
  enum_tree(cb, nil, nil, all_domains)

  -- all IP addresses
  for ipaddr, data in pairs(all_ips) do
    if data.paths ~= nil then
      enum_paths(cb, data.category, ipaddr, data.paths)
    end
    cb(ipaddr, nil, iptonumber(ipaddr), data.category, nil, data.paths)
  end
end

-- Remove path nodes that have the same effective category as their parent
-- and no surviving sub-paths. Surviving components are registered as strings.
-- Returns the number of entries left in `paths`.
local function prune_paths(paths, category)
  local num_paths = 0

  -- clearing existing keys while traversing with pairs() is permitted
  for path, pdata in pairs(paths) do
    local sub_paths = 0
    local cat = pdata.category or category

    if pdata.paths ~= nil then
      sub_paths = prune_paths(pdata.paths, cat)
      if sub_paths == 0 then
        pdata.paths = nil
      end
    end

    if cat == category and sub_paths == 0 then
      paths[path] = nil
    else
      num_paths = num_paths + 1
      account_string(path)
    end
  end

  return num_paths
end

-- Prune the domain subtree rooted at `d`; `pcategory` is the category
-- inherited from the parent. Surviving child labels are registered as
-- strings. Returns true when `d` itself can be dropped by the caller.
local function prune_tree(d, pcategory)
  local num_childs = 0
  local num_paths = 0
  local cat = d.category or pcategory

  if d.children ~= nil then
    for n, child in pairs(d.children) do
      if prune_tree(child, cat) then
        d.children[n] = nil
      else
        num_childs = num_childs + 1
        account_string(n)
      end
    end
    if num_childs == 0 then
      d.children = nil
    end
  end

  if d.paths ~= nil then
    num_paths = prune_paths(d.paths, cat)
    if num_paths == 0 then
      d.paths = nil
    end
  end

  -- NOTE(review): this compares the node's own category, not the effective
  -- one, so an empty node without a category below a categorised parent is
  -- kept. prune_paths compares the effective category instead - confirm
  -- whether the difference is intended.
  return d.category == pcategory and num_paths == 0 and num_childs == 0
end

-- Read `conffile` and load the `part` file ("domains" or "urls") of every
-- list it names. Each line holds whitespace-separated fields:
--   <category> <fields[2]> <fields[3]> [LOCK]
-- which select the file "lists/<fields[2]>list/<fields[3]>/<part>".
-- "#" starts a comment; a line whose first field is STOP ends processing.
local function load_lists(conffile, part)
  for line in io.lines(conffile) do
    line = line:gsub("#(.*)", "")
    local fields = strsplit("[\t ]", line)
    if fields[1] == "STOP" then
      break
    end
    if fields[3] then
      read_urls("lists/" .. fields[2] .. "list/" .. fields[3] .. "/" .. part,
        get_category(fields[1]),
        fields[4] == "LOCK")
    end
  end
end

-- start by reading in all classification data
get_category("unknown") -- registered first, so it receives id 0
load_lists("lists.conf", "domains")
prune_tree(all_domains, nil)
load_lists("lists.conf", "urls")
prune_tree(all_domains, nil)

-- generate database
local db = squarkdb.new("squark.db")
local num_entries = db:generate_hash(function() enum_all(coroutine.yield) end)

-- write string literals
-- (presumably map_strings replaces each value in all_strings with a
-- reference to the stored string, since the values are used as such below -
-- confirm against the squarkdb module)
db:map_strings(all_strings)

-- map category names and write the category section out
for id, cdata in ipairs(all_categories_by_id) do
  all_categories_by_id[id] = all_strings[cdata.desc]
end
db:write_section("categories", all_categories_by_id)

-- create master index
db:create_index(num_entries)
enum_all(
  function(uri, parent_uri, component, category, childs, paths)
    local category_id = category and category.id or 0
    local has_childs = childs and true or false
    local has_paths = paths and true or false

    if parent_uri == nil and type(component) == "number" then
      -- Embedded IPv4 address: the numeric address goes in the component
      -- slot and -2 in the parent slot.
      db:assign_index(db:hash(uri),
        category_id,
        has_childs,
        has_paths,
        component,
        -2)
    else
      -- Regular entry: -1 in the parent slot when there is no parent.
      db:assign_index(db:hash(uri),
        category_id,
        has_childs,
        has_paths,
        all_strings[component] or 0,
        parent_uri and db:hash(parent_uri) or -1)
    end
  end
)