diff options
author | Natanael Copa <ncopa@alpinelinux.org> | 2009-10-23 08:55:25 +0000 |
---|---|---|
committer | Natanael Copa <ncopa@alpinelinux.org> | 2009-10-23 08:55:25 +0000 |
commit | 33459f27ce11a7450ea6775da6bfe39c2b609c24 (patch) | |
tree | 032ffc6d391ad458f7172d1f2fdbdc7a28346aa0 | |
parent | 12abd7f87902f16aac06fec36fc0903c4985adfd (diff) | |
download | acf-weblog-33459f27ce11a7450ea6775da6bfe39c2b609c24.tar.bz2 acf-weblog-33459f27ce11a7450ea6775da6bfe39c2b609c24.tar.xz |
use iterator functions to import logfiles
Rather than passing along a table holding the entire parsed logfile, we
pass along an iterator function.
This way we don't need to have the entire logfile in memory at the same
time, and we can handle extremely big logfiles without consuming lots of
memory.
-rw-r--r-- | weblog-model.lua | 97 |
1 files changed, 65 insertions, 32 deletions
diff --git a/weblog-model.lua b/weblog-model.lua index 4a2e16a..e28464b 100644 --- a/weblog-model.lua +++ b/weblog-model.lua @@ -180,7 +180,7 @@ end local importsquidlog = function(logentries, sourcename) con:execute("START TRANSACTION") - for i,entry in pairs(logentries) do + for entry in logentries do local sql = string.format("INSERT INTO weblog VALUES ('%s', '%s', '%s', '%s', '%s', '%s')", escape(sourcename), escape(entry.clientip), escape(entry.clientuserid:lower()), escape(entry.logdatetime), escape(entry.URL), escape(entry.bytes)) @@ -191,7 +191,7 @@ end local importdglog = function(logentries, sourcename) con:execute("START TRANSACTION") - for i,entry in pairs(logentries) do + for entry in logentries do local sql = string.format("INSERT INTO blocklog VALUES ('%s', '0.0.0.0', '%s', '%s', '%s', '%s', '%s', '%s', '%s')", escape(sourcename), escape(entry.clientuserid:lower()), escape(entry.logdatetime), escape(entry.URL), escape(entry.bytes), escape(entry.reason), escape(entry.score or "0"), escape(entry.shortreason)) @@ -552,39 +552,71 @@ end -- ################################################################################ -- LOG FILE FUNCTIONS -local parsesquidlog = function(f) - local logentries = {} - for line in f:lines() do - -- Format of squid log (space separated): - -- time elapsed remotehost code/status bytes method URL rfc931 peerstatus/peerhost - local words = {} - for word in string.gmatch(line, "%S+") do - words[#words+1] = word - end - local logentry = {logdatetime=words[1], elapsed=words[2], clientip=words[3], code=string.match(words[4], "^[^/]*"), status=string.match(words[4], "[^/]*$"), bytes=words[5], method=words[6], URL=words[7], clientuserid=words[8], peerstatus=string.match(words[9], "^[^/]*"), peerhost=string.match(words[9], "[^/]*$")} - logentry.logdatetime = os.date("%Y-%m-%d %H:%M:%S", logentry.logdatetime)..string.match(logentry.logdatetime, "%..*") - -- Don't care about local requests (from DG) - if logentry.clientip ~= 
"127.0.0.1" then - logentries[#logentries+1] = logentry +local function parsesquidlog_line(line) + -- Format of squid log (space separated): + -- time elapsed remotehost code/status bytes method URL rfc931 peerstatus/peerhost + local words = {} + for word in string.gmatch(line, "%S+") do + words[#words+1] = word + end + local logentry = {logdatetime=words[1], + elapsed=words[2], + clientip=words[3], + code=string.match(words[4], "^[^/]*"), + status=string.match(words[4], "[^/]*$"), + bytes=words[5], + method=words[6], + URL=words[7], + clientuserid=words[8], + peerstatus=string.match(words[9], "^[^/]*"), + peerhost=string.match(words[9], "[^/]*$")} + + logentry.logdatetime = os.date("%Y-%m-%d %H:%M:%S", logentry.logdatetime)..string.match(logentry.logdatetime, "%..*") + return logentry +end + +local function parsesquidlog_iter(f) + return function() + while true do + line = f:read("*line") + if line == nil then + return nil + end + local logentry = parsesquidlog_line(line) + -- Don't care about local requests (from DG) + if logentry.clientip ~= "127.0.0.1" then + return logentry + end end end - return logentries end -local parsedglog = function(f) - local logentries = {} - for line in f:lines() do - local words = format.string_to_table(line, "\t") - local logentry = {logdatetime=words[1], clientuserid=words[2], clientip=words[3], URL=words[4], reason=words[5], method=words[6], bytes=words[7], shortreason=words[9]} - if logentry.reason ~= "" then - if logentry.shortreason == "" then logentry.shortreason = logentry.reason end - logentry.score = string.match(logentry.reason, "^.*: ([0-9]+) ") - logentry.logdatetime = string.gsub(logentry.logdatetime, "%.", "-") +local function parsedglog_line(line) + local words = format.string_to_table(line, "\t") + return { logdatetime=words[1], clientuserid=words[2], clientip=words[3], + URL=words[4], reason=words[5], method=words[6], bytes=words[7], + shortreason=words[9]} +end + +local function parsedglog_iter(f) + return 
function() + while true do + line = f:read("*line") + if line == nil then + return nil + end + local logentry = parsedglog_line(line) - logentries[#logentries+1] = logentry + if logentry.reason ~= "" then + if logentry.shortreason == "" then + logentry.shortreason = logentry.reason + end + logentry.score = string.match(logentry.reason, "^.*: ([0-9]+) ") + logentry.logdatetime = string.gsub(logentry.logdatetime, "%.", "-") + return logentry + end end end - return logentries end -- ################################################################################ @@ -846,11 +878,12 @@ end -- import either squid or dg log file. -- delete logfile after -function importlogfile(source, cookiesfile, file, parselog_func, importlog_func) +function importlogfile(source, cookiesfile, file, parselog_iter, importlog_func) + local logentries logme("Processing " .. file ) logme("Getting " .. file ) loghandle = openlogfile(source, cookiesfile, file) - logentries = parselog_func(loghandle) + logentries = parselog_iter(loghandle) importlog_func(logentries, source.sourcename) loghandle:close() logme("Deleting " .. file ) @@ -876,10 +909,10 @@ function importlogs() for j,file in ipairs(files) do if string.match(file, "dansguardian/access%.log[%.%-]") then count = count + 1 - importlogfile(source, cookeisfile, file, parsedglog, importdglog) + importlogfile(source, cookeisfile, file, parsedglog_iter, importdglog) elseif string.match(file, "squid/access%.log[%.%-]") then count = count + 1 - importlogfile(source, cookeisfile, file, parsesquidlog, importsquidlog) + importlogfile(source, cookeisfile, file, parsesquidlog_iter, importsquidlog) end end end |