summaryrefslogtreecommitdiffstats
path: root/src/sqdb-build.lua
blob: cd039e2965c9bf7b6946159727a660fa89df3899 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#!/usr/bin/lua

require("squarkdb")

-- Set of every string literal that has to end up in the database
-- (keys are the strings; account_string() stores `true` as the value).
local all_strings = {}
-- Root node of the domain tree; get_domain() hangs nodes off `.children`.
local all_domains = {}
-- IP address entries, keyed by the dotted address string.
local all_ips = {}

-- Category records keyed by their description text ...
local all_categories = {}
-- ... and the same records as an array (indexed from one, see get_category).
local all_categories_by_id = {}
-- Number of categories created so far; also the next free category id.
local num_categories = 0

local strfind = string.find
local strsub = string.sub
local tinsert = table.insert

-- Split `text` at every match of the Lua pattern `delimiter`.
-- Returns an array of the pieces between the matches. The array always has
-- at least one element: the whole text when the delimiter never matches,
-- and a trailing "" when the text ends with a delimiter.
-- NOTE: a delimiter that can match the empty string makes this loop forever;
-- there is deliberately no check for it, so callers must not pass one.
local function strsplit(delimiter, text)
	local pieces = {}
	local cursor = 1
	local first, last = strfind(text, delimiter, cursor)
	while first do
		tinsert(pieces, strsub(text, cursor, first - 1))
		cursor = last + 1
		first, last = strfind(text, delimiter, cursor)
	end
	-- whatever follows the final delimiter (possibly the empty string)
	tinsert(pieces, strsub(text, cursor))
	return pieces
end

-- Register `s` in the all_strings set, which is later handed to
-- db:map_strings() so the string becomes part of the database.
local function account_string(s)
	all_strings[s] = true
end

-- Return the category record for `category_text`, creating it on first use.
-- A record is a table { desc = <text>, id = <number> }. Ids are handed out
-- sequentially starting at zero, whereas all_categories_by_id is filled
-- starting at index one (the record with id N lives at index N+1).
-- A newly created category also has its name registered as a string.
local function get_category(category_text)
	local known = all_categories[category_text]
	if known ~= nil then
		return known
	end

	local next_id = num_categories
	local fresh = { desc = category_text, id = next_id }
	all_categories[category_text] = fresh
	num_categories = next_id + 1
	all_categories_by_id[num_categories] = fresh

	account_string(category_text)

	return fresh
end

-- Find (creating missing nodes on the way) the tree node for `domain`.
-- The name is split on dots and walked from the last label (the TLD)
-- towards the first, descending through each node's `.children` table.
-- Returns the node of the full name, or nil as soon as a node on the way is
-- marked locked while the caller did not pass a true `locked` itself.
local function get_domain(domain, locked)
	local labels = strsplit("[.]", domain)
	local node = all_domains
	local leaf

	for i = #labels, 1, -1 do
		local label = labels[i]

		local kids = node.children
		if kids == nil then
			kids = {}
			node.children = kids
		end

		leaf = kids[label]
		if leaf == nil then
			leaf = {}
			kids[label] = leaf
		end

		-- locked records may only be touched by locked lists
		if not locked and leaf.locked then
			return nil
		end
		node = leaf
	end
	return leaf
end

-- Find (creating missing nodes on the way) the node for `path` below
-- `domain_entry`.
--   domain_entry: domain or IP node whose `.paths` subtree is extended
--   path:         URL path, split on "/" into components
--   locked:       true when the caller is allowed to touch locked records
-- Returns the node of the last component, or nil as soon as a component on
-- the way is marked locked and `locked` is not set.
local function get_path(domain_entry, path, locked)
	local entry = domain_entry
	local p

	-- The components must be visited strictly in order, so iterate with
	-- ipairs: the traversal order of pairs() is not specified.
	for _, component in ipairs(strsplit("/", path)) do
		if entry.paths == nil then
			entry.paths = {}
		end
		p = entry.paths[component]
		if p == nil then
			p = {}
			entry.paths[component] = p
		end
		if p.locked and not locked then
			return nil
		end
		entry = p
	end
	return p
end

-- Check whether `s` looks like a dotted-quad IPv4 address.
-- Returns the matched string (truthy) when `s` consists of exactly four
-- groups of digits separated by dots, nil otherwise. The value of each
-- group is not range-checked.
local function is_ip_addr(s)
	-- In Lua patterns a literal dot is written "%."; a backslash escape is
	-- not a pattern escape (and "\." is not even a valid string escape in
	-- Lua 5.2+), which would leave "." matching any character.
	return s:match("^%d+%.%d+%.%d+%.%d+$")
end

-- Read one list file and merge its entries into all_domains / all_ips.
--   filename: list file to read; a missing file only produces a warning
--   category: category record (see get_category) to assign to the entries
--   locked:   when true, records set from this file are marked locked and
--             this file is allowed to descend into already locked records
-- Each line holds one URL ("domain[:port][/path]"); everything after a "#"
-- is a comment.
local function read_urls(filename, category, locked)
	local fd = io.open(filename)
	if fd == nil then
		print("WARNING: File " .. filename .. " does not exist")
		return
	end
	print("Reading " .. filename)
	for line in fd:lines() do
		-- all per-line state is local (previously `p` leaked as a global)
		local url, domain, path, d, p

		-- normalise: drop the comment, trailing whitespace (including a CR
		-- from CRLF files) and case, then remove a leading "www." or
		-- "www<digits>." label when another label follows it
		url = line:gsub("#.*", "")
		url = url:gsub("%s+$", "")
		url = url:lower()
		url = url:gsub("^(www%d*[.])([^.]*[.])", "%2")

		domain, path = url:match("([^/]*)/?(.*)")
		domain = domain:gsub(":.*", "")		-- port number
		domain = domain:gsub("[.]$", "")	-- trailing dot

		if domain == "" then
			d = nil
		elseif is_ip_addr(domain) then
			d = all_ips[domain]
			if d == nil then
				d = {}
				all_ips[domain] = d
			end
		else
			-- nil when the domain is covered by a locked record
			d = get_domain(domain, locked)
		end

		if d == nil then
			-- empty line, or ignored due to a locked record
		elseif path ~= "" then
			-- A path entry is only useful when it differs from its host's
			-- own category; overly long paths and paths with query-like
			-- characters are skipped.
			if d.category ~= category and #path < 100 and path:match("([?;&])") == nil then
				path = path:gsub("^/", "")
				path = path:gsub("/$", "")
				p = get_path(d, path, locked)
				if p ~= nil then
					p.category = category
					if locked then
						p.locked = true
					end
				end
			end
		elseif d.category == nil then
			-- first list to classify a host wins
			d.category = category
			if locked then
				d.locked = true
			end
		end
	end
	fd:close()
end

-- Recursively report every node of a path subtree to `cb`.
--   cb:       called as cb(full_uri, parent_uri, component, category,
--             false, sub_paths) for each node, parents before descendants
--   category: category inherited from the parent
--   path:     URI of the parent; components are appended with "/"
--   data:     table mapping path component -> node
-- A node without a category of its own reports (and passes on) the
-- inherited one.
local function enum_paths(cb, category, path, data)
	for component, node in pairs(data) do
		local full = path .. "/" .. component

		local effective = node.category
		if not effective then
			effective = category
		end

		cb(full, path, component, effective, false, node.paths)

		local nested = node.paths
		if nested then
			enum_paths(cb, effective, full, nested)
		end
	end
end

-- Recursively report a domain node's paths and sub-domains to `cb`.
--   cb:       called as cb(full_name, parent_name, label, category,
--             children, paths) for every sub-domain node
--   category: category inherited from the parent
--   dns:      full name of `data`, or nil for the root of the tree
--   data:     node whose `.paths` and `.children` are enumerated
-- The node's own paths are reported first, then each child followed by that
-- child's subtree.
local function enum_tree(cb, category, dns, data)
	local own_paths = data.paths
	if own_paths ~= nil then
		enum_paths(cb, category, dns, own_paths)
	end

	local kids = data.children
	if kids == nil then
		return
	end

	for label, node in pairs(kids) do
		-- below the root the parent name is appended after a dot
		local full = label
		if dns ~= nil then
			full = label .. "." .. dns
		end
		local effective = node.category or category
		cb(full, dns, label, effective, node.children, node.paths)
		enum_tree(cb, effective, full, node)
	end
end

-- Convert a dotted IP address string into a single number by treating each
-- run of digits as one base-256 digit, most significant first
-- (e.g. "1.2.3.4" -> 16909060). A string without digits yields 0.
function iptonumber(str)
	local value = 0
	for octet in str:gmatch("%d+") do
		local digit = assert(tonumber(octet))
		value = value * 256 + digit
	end
	return value
end

-- Report every database entry to `cb`: first the whole domain tree, then
-- the IP address entries. For an IP entry the paths below it are reported
-- before the address itself, and the address is passed with its numeric
-- value (see iptonumber) as the component and without a parent.
local function enum_all(cb)
	-- all domains, starting at the root of the tree
	enum_tree(cb, nil, nil, all_domains)

	-- all IP addresses
	for ipaddr, data in pairs(all_ips) do
		local category = data.category
		local paths = data.paths
		if paths ~= nil then
			enum_paths(cb, category, ipaddr, paths)
		end
		cb(ipaddr, nil, iptonumber(ipaddr), category, nil, data.paths)
	end
end

-- Remove path nodes that carry no information of their own.
--   paths:    table mapping path component -> node (modified in place)
--   category: effective category of the parent
-- A node is dropped when its effective category equals the parent's and no
-- sub-path survives below it. The component name of every surviving node is
-- registered as a string. Returns the number of surviving nodes.
local function prune_paths(paths, category)
	local kept = 0

	for component, node in pairs(paths) do
		local effective = node.category or category
		local nested = 0

		if node.paths ~= nil then
			nested = prune_paths(node.paths, effective)
			if nested == 0 then
				node.paths = nil
			end
		end

		if nested ~= 0 or effective ~= category then
			kept = kept + 1
			account_string(component)
		else
			-- clearing an existing key during traversal is permitted
			paths[component] = nil
		end
	end
	return kept
end

-- Remove domain nodes that carry no information of their own.
--   d:         node to prune (its children and paths are modified in place)
--   pcategory: effective category of the parent, nil at the root
-- Returns true when `d` itself is redundant and may be removed by the
-- caller: its effective category equals the parent's and neither a child
-- nor a path survives below it. The label of every surviving child is
-- registered as a string.
local function prune_tree(d, pcategory)
	local num_childs = 0
	local num_paths = 0
	-- effective category: the node's own, or else the inherited one
	local cat = d.category or pcategory

	if d.children ~= nil then
		for n, child in pairs(d.children) do
			if prune_tree(child, cat) then
				-- clearing an existing key during traversal is permitted
				d.children[n] = nil
			else
				num_childs = num_childs + 1
				account_string(n)
			end
		end
		if num_childs == 0 then
			d.children = nil
		end
	end

	if d.paths ~= nil then
		num_paths = prune_paths(d.paths, cat)
		if num_paths == 0 then
			d.paths = nil
		end
	end

	-- Compare the effective category, as prune_paths does. Comparing the raw
	-- d.category kept uncategorised nodes alive below a categorised parent
	-- even after everything underneath them had been pruned.
	return cat == pcategory and num_paths == 0 and num_childs == 0
end

-- Process the list configuration file and read the `part` file of every
-- list it names.
--   conffile: configuration file; "#" starts a comment, fields are separated
--             by a single tab or space: <category> <list> <subdir> [LOCK]
--   part:     file name inside each list directory (e.g. "domains")
-- Lines with fewer than three fields are skipped, and a line whose first
-- field is "STOP" ends processing. Each list is read from
-- "lists/<list>list/<subdir>/<part>".
local function load_lists(conffile, part)
	for raw in io.lines(conffile) do
		local stripped = raw:gsub("#(.*)", "")
		local fields = strsplit("[\t ]", stripped)
		local cat_name, list_name, subdir, flag =
			fields[1], fields[2], fields[3], fields[4]

		if cat_name == "STOP" then
			break
		end

		if subdir then
			local listfile = "lists/" .. list_name .. "list/" .. subdir .. "/" .. part
			read_urls(listfile, get_category(cat_name), flag == "LOCK")
		end
	end
end

-- start by reading in all classification data
-- ("unknown" is registered first so that it receives category id 0, the id
-- used below for entries without a category)
get_category("unknown")
load_lists("lists.conf", "domains")
prune_tree(all_domains, nil)
load_lists("lists.conf", "urls")
prune_tree(all_domains, nil)

-- generate database
local db = squarkdb.new("squark.db")
-- every enumerated entry is handed to the generator through coroutine.yield;
-- kept local so the count does not leak into the global environment
local num_entries = db:generate_hash(function() enum_all(coroutine.yield) end)

-- write string literals
-- NOTE(review): presumably map_strings replaces the `true` markers in
-- all_strings with string references, since the values are used as such
-- below -- confirm against squarkdb
db:map_strings(all_strings)

-- map category names and write the category section out
for id, cdata in ipairs(all_categories_by_id) do
	all_categories_by_id[id] = all_strings[cdata.desc]
end
db:write_section("categories", all_categories_by_id)

-- create master index
db:create_index(num_entries)
enum_all(
	function(uri, parent_uri, component, category, childs, paths)
		local category_id = category and category.id or 0
		local has_childs = childs and true or false
		local has_paths = paths and true or false

		if parent_uri == nil and type(component) == "number" then
			-- Embedded IPv4 address: the numeric address itself is stored
			-- in place of a string reference
			db:assign_index(db:hash(uri),
					category_id,
					has_childs,
					has_paths,
					component,
					-2)
		else
			-- Regular entry; -1 is passed when there is no parent
			db:assign_index(db:hash(uri),
					category_id,
					has_childs,
					has_paths,
					all_strings[component] or 0,
					parent_uri and db:hash(parent_uri) or -1)
		end
	end
)