1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
|
#!/usr/bin/lua
require("squarkdb")
-- Set of every string literal that has to be stored in the database
-- (category names, domain labels, path components); filled by account_string.
local all_strings = {}
-- Root of the domain tree. Nodes are linked through `.children`, keyed by
-- one DNS label each; get_domain inserts labels starting from the rightmost.
local all_domains = {}
-- Records for hosts given as dotted-quad addresses, keyed by the address text.
local all_ips = {}
-- Category records ({ desc = ..., id = ... }) keyed by category name.
local all_categories = {}
-- Number of categories created so far; also the id of the newest category.
local num_categories = 0
-- Local aliases for the standard-library functions used by strsplit.
local strfind = string.find
local strsub = string.sub
local tinsert = table.insert
--- Split `text` at every match of the Lua pattern `delimiter`.
-- Empty fields are preserved, so the result always holds at least one
-- element (splitting "" yields { "" }).
-- @param delimiter Lua pattern describing the separator; must not be able
--                  to match the empty string
-- @param text      string to split
-- @return array of the substrings found between separators
-- Raises an error if the delimiter produces an empty match.
local function strsplit(delimiter, text)
  local list = {}
  local pos = 1
  while true do
    local first, last = strfind(text, delimiter, pos)
    if not first then
      -- no further separator: the remainder is the last field
      list[#list + 1] = strsub(text, pos)
      break
    end
    if last < first then
      -- An empty match never advances `pos`, so the loop would spin
      -- forever while growing `list`; fail loudly instead.
      error("delimiter matches empty string!", 2)
    end
    list[#list + 1] = strsub(text, pos, first - 1)
    pos = last + 1
  end
  return list
end
--- Remember that the string `s` has to be stored in the database.
-- all_strings is used as a set: the string is the key and `true` merely
-- marks membership, so accounting the same string twice is harmless.
local function account_string(s)
  all_strings[s] = true
end
--- Look up the category record for a category name, creating it on first use.
-- A new record receives the next sequential id (starting at 1) and its name
-- is registered as a database string.
-- @param category_text category name as written in the configuration
-- @return table { desc = category_text, id = <number> }; the same table is
--         returned for every later request with the same name
local function get_category(category_text)
  local known = all_categories[category_text]
  if known ~= nil then
    return known
  end

  num_categories = num_categories + 1
  local fresh = { id = num_categories, desc = category_text }
  all_categories[category_text] = fresh
  account_string(category_text)
  return fresh
end
--- Find (creating as needed) the tree node for a domain name.
-- The name is split on dots and the labels are followed from the rightmost
-- to the leftmost, starting at all_domains and descending through
-- `.children`.
-- @param domain dotted domain name
-- @param locked true when the caller is allowed to pass through locked nodes
-- @return the node of the leftmost label, or nil when a locked node was met
--         and `locked` is not set (nodes created before that point remain)
local function get_domain(domain, locked)
  local labels = strsplit("[.]", domain)
  local node = all_domains
  local leaf

  local i = #labels
  while i >= 1 do
    local label = labels[i]

    local kids = node.children
    if kids == nil then
      kids = {}
      node.children = kids
    end

    leaf = kids[label]
    if leaf == nil then
      leaf = {}
      kids[label] = leaf
    end

    -- a locked record may only be touched by another locked list
    if not locked and leaf.locked then
      return nil
    end

    node = leaf
    i = i - 1
  end

  return leaf
end
--- Find (creating as needed) the node for a URL path below a host record.
-- The path is split on "/" and each component is followed, in order,
-- through the `.paths` table of the previous node.
-- @param domain_entry host record the path belongs to
-- @param path         path without leading or trailing slash
-- @param locked       true when the caller may pass through locked nodes
-- @return the node of the last component, or nil when a locked node was met
--         and `locked` is not set (nodes created before that point remain)
local function get_path(domain_entry, path, locked)
  local entry = domain_entry
  local node

  -- ipairs, not pairs: the components must be visited strictly left to
  -- right, and pairs gives no ordering guarantee.
  for _, component in ipairs(strsplit("/", path)) do
    if entry.paths == nil then
      entry.paths = {}
    end

    node = entry.paths[component]
    if node == nil then
      node = {}
      entry.paths[component] = node
    end

    -- a locked record may only be touched by another locked list
    if node.locked and not locked then
      return nil
    end

    entry = node
  end

  return node
end
--- Test whether `s` has the shape of a dotted-quad IPv4 address.
-- Only the shape is checked: four runs of decimal digits separated by
-- literal dots. The numeric range of each part is not validated.
-- @param s host string to test
-- @return the matched string (truthy) when `s` has that shape, nil otherwise
local function is_ip_addr(s)
  -- "%." is how a Lua pattern spells a literal dot. A backslash would be
  -- consumed by the string literal and leave a bare ".", which matches
  -- any character.
  return s:match("^%d+%.%d+%.%d+%.%d+$")
end
--- Read one list file and merge its entries into the classification data.
-- Each line holds a host name or IP address, optionally followed by
-- "/path". Text after "#" is a comment. Before the entry is stored the line
-- is trimmed, lower-cased and stripped of a leading "www<digits>." prefix,
-- a ":port" suffix and a trailing dot on the host.
--
-- Host-only lines set the category of the host record unless it already has
-- one. Lines with a path add a path record, but only when the host itself is
-- not already in `category`, the path is shorter than 100 bytes and it
-- contains none of "?", ";" or "&".
--
-- @param filename file to read; a missing file only prints a warning
-- @param category category record assigned to the entries
-- @param locked   when true the touched records are marked locked, and
--                 records locked earlier may be traversed
local function read_urls(filename, category, locked)
  local fd = io.open(filename)
  if fd == nil then
    print("WARNING: File " .. filename .. " does not exist")
    return
  end

  print("Reading " .. filename)
  for line in fd:lines() do
    -- gsub returns (string, count); assigning to one variable keeps the string
    local url = line:gsub("#.*", "")
    -- Trim surrounding whitespace (including a stray "\r"). The pattern
    -- " *^" cannot do this: "^" is only an anchor at the very start of a
    -- pattern, anywhere else it is a literal caret.
    url = url:gsub("^%s+", "")
    url = url:gsub("%s+$", "")
    url = url:lower()
    url = url:gsub("^(www%d*[.])", "")

    local domain, path = url:match("([^/]*)/?(.*)")
    domain = domain:gsub(":.*", "")
    domain = domain:gsub("[.]$", "") -- trailing dot

    -- locate the host record; nil means "skip this line"
    local d
    if domain == "" then
      d = nil
    elseif is_ip_addr(domain) then
      d = all_ips[domain]
      if d == nil then
        d = {}
        all_ips[domain] = d
      end
    else
      -- returns nil when a locked record blocks this (unlocked) list
      d = get_domain(domain, locked)
    end

    if d ~= nil then
      if path ~= "" then
        if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then
          path = path:gsub("^/", "")
          path = path:gsub("/$", "")
          -- `p` is local: the record must not leak into the global table
          local p = get_path(d, path, locked)
          if p ~= nil then
            p.category = category
            if locked then
              p.locked = true
            end
          end
        end
      elseif d.category == nil then
        -- first list to name this host wins
        d.category = category
        if locked then
          d.locked = true
        end
      end
    end
  end
  fd:close()
end
--- Report every path record below `data`, depth first.
-- For each record the callback is invoked as
--   cb(full_uri, parent_uri, component, category, false, sub_paths)
-- where `category` is the record's own category or, failing that, the one
-- inherited from its parent. Sibling order is unspecified.
-- @param cb       callback receiving the six values above
-- @param category category inherited from the parent
-- @param path     URI of the parent (host, or host plus leading path)
-- @param data     table of path records keyed by path component
local function enum_paths(cb, category, path, data)
  for name, node in pairs(data) do
    local full = path .. "/" .. name

    local effective = node.category
    if not effective then
      effective = category
    end

    local sub = node.paths
    cb(full, path, name, effective, false, sub)
    if sub then
      enum_paths(cb, effective, full, sub)
    end
  end
end
--- Report every domain record below `data`, together with its paths.
-- For each child domain the callback is invoked as
--   cb(full_name, parent_name, label, category, children, paths)
-- where category, children and paths describe the child itself: its own
-- category (or the inherited one when it has none), its own `.children`
-- table and its own `.paths` table (nil when absent). Sibling order is
-- unspecified.
-- @param cb       callback receiving the six values above
-- @param category category inherited from the parent of `data`
-- @param dns      full name of `data`, or nil for the root of the tree
-- @param data     domain record whose paths and children are enumerated
local function enum_tree(cb, category, dns, data)
  local cat = data.category or category

  if data.paths ~= nil then
    enum_paths(cb, cat, dns, data.paths)
  end

  if data.children ~= nil then
    for cdns, cdata in pairs(data.children) do
      local fdns
      if dns ~= nil then
        fdns = cdns .. "." .. dns
      else
        fdns = cdns
      end
      -- Describe the child, not its parent: passing the parent's fields
      -- would never report a domain's own category and would flag every
      -- domain as having children.
      cb(fdns, dns, cdns, cdata.category or cat, cdata.children, cdata.paths)
      enum_tree(cb, cat, fdns, cdata)
    end
  end
end
--- Report every record of the classification data to `cb`.
-- The domain tree is walked first, then every IP-address record. For an IP
-- record its paths are reported before the record itself, and the record is
-- reported with no parent, component 0 and no children.
-- @param cb callback invoked as
--           cb(uri, parent_uri, component, category, children, paths)
local function enum_all(cb)
  -- all domains, starting at the root of the tree
  enum_tree(cb, nil, nil, all_domains)

  -- all IP addresses
  for addr, rec in pairs(all_ips) do
    local sub = rec.paths
    if sub ~= nil then
      enum_paths(cb, rec.category, addr, sub)
    end
    -- fixme, calculate ip as 32-bit value
    cb(addr, nil, 0, rec.category, nil, rec.paths)
  end
end
--- Remove path records that carry no information of their own.
-- A record is dropped when its effective category equals the inherited one
-- and no record survives below it. The component name of every surviving
-- record is registered as a database string, and a `.paths` table left
-- empty by the pruning is cleared.
-- @param paths    table of path records keyed by component (modified in place)
-- @param category category inherited from the parent
-- @return number of records left in `paths`
local function prune_paths(paths, category)
  local kept = 0

  for name, node in pairs(paths) do
    local effective = node.category or category

    local remaining = 0
    if node.paths ~= nil then
      remaining = prune_paths(node.paths, effective)
      if remaining == 0 then
        node.paths = nil
      end
    end

    local informative = remaining ~= 0 or effective ~= category
    if informative then
      kept = kept + 1
      account_string(name)
    else
      -- clearing an existing key during traversal is permitted
      paths[name] = nil
    end
  end

  return kept
end
--- Remove domain records that carry no information of their own.
-- Children are pruned first; a child that reports itself redundant is
-- removed, and the label of every surviving child is registered as a
-- database string. Afterwards the record's paths are pruned. `.children`
-- and `.paths` tables that end up empty are cleared.
-- @param d        domain record to prune (modified in place)
-- @param category category inherited from the parent
-- @return true when `d` is redundant (same effective category as its
--         parent, no children and no paths left), false otherwise
local function prune_tree(d, category)
  local num_childs = 0
  local num_paths = 0
  local cat = d.category or category

  if d.children ~= nil then
    for n, child in pairs(d.children) do
      if prune_tree(child, cat) then
        -- clearing an existing key during traversal is permitted
        d.children[n] = nil
      else
        num_childs = num_childs + 1
        account_string(n)
      end
    end
    if num_childs == 0 then
      d.children = nil
    end
  end

  if d.paths ~= nil then
    -- NOTE(review): paths are pruned against the record's own category,
    -- not the inherited `cat`; kept as is, confirm this is intended.
    num_paths = prune_paths(d.paths, d.category)
    if num_paths == 0 then
      -- mirror the handling of `.children`: no empty table stays behind
      d.paths = nil
    end
  end

  return cat == category and num_paths == 0 and num_childs == 0
end
--- Read the list configuration and load one file of every configured list.
-- Each configuration line is split on single tabs or spaces into
--   <category> <list group> <list name> [LOCK]
-- after text following "#" has been removed. Lines with fewer than three
-- fields are skipped, and a line whose first field is "STOP" ends
-- processing. The file read for a line is
--   lists/<list group>list/<list name>/<part>
-- @param conffile path of the configuration file
-- @param part     file name inside each list directory
local function load_lists(conffile, part)
  for raw in io.lines(conffile) do
    -- keep only the first gsub result (the string without its comment)
    local stripped = raw:gsub("#(.*)", "")
    local fields = strsplit("[\t ]", stripped)
    local keyword, group, name, flag = fields[1], fields[2], fields[3], fields[4]

    if keyword == "STOP" then
      break
    end

    if name then
      local list_file = "lists/" .. group .. "list/" .. name .. "/" .. part
      read_urls(list_file, get_category(keyword), flag == "LOCK")
    end
  end
end
-- start by reading in all classification data
-- (the "domains" file of every list first, then the "urls" file; redundant
-- records are pruned after each pass)
load_lists("lists.conf", "domains")
prune_tree(all_domains, nil)
load_lists("lists.conf", "urls")
prune_tree(all_domains, nil)
-- generate database
local db = squarkdb.new("squark.db")
-- every record is handed to the database by yielding it from enum_all;
-- `local` keeps the entry count out of the global table
local num_entries = db:generate_hash(function() enum_all(coroutine.yield) end)
-- write string literals
db:map_strings(all_strings)
-- create master index
db:create_index(num_entries)
-- second pass over the same records: fill the index slot of each URI
enum_all(
  function(uri, parent_uri, component, category, childs, paths)
    db:assign_index(db:hash(uri),
      category and category.id or 0,
      childs and true or false,
      paths and true or false,
      all_strings[component] or 0,
      parent_uri and db:hash(parent_uri) or 0)
  end
)
|