Posted to commits@whimsical.apache.org by cu...@apache.org on 2018/05/05 15:39:46 UTC

[whimsy] branch master updated: Simplistic error logs parser

This is an automated email from the ASF dual-hosted git repository.

curcuru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new 318905d  Simplistic error logs parser
318905d is described below

commit 318905dbc80317f9851fb4a495fa5c2b662f64cc
Author: Shane Curcuru <as...@shanecurcuru.org>
AuthorDate: Sat May 5 11:39:40 2018 -0400

    Simplistic error logs parser
---
 tools/logparser.rb | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 207 insertions(+)

diff --git a/tools/logparser.rb b/tools/logparser.rb
new file mode 100644
index 0000000..cfc0474
--- /dev/null
+++ b/tools/logparser.rb
@@ -0,0 +1,207 @@
+#!/usr/bin/env ruby
+# Gather simple statistics from whimsy server logs
+# TODO security check ASF::Auth.decode before reading log files
+$LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__))
+require 'whimsy/asf'
+require 'date'
+require 'json'
+require 'set'
+require 'stringio'
+require 'zlib'
+require 'yaml'
+
+# Utility methods to turn server logs into hashes of interesting data
+module LogParser
+  extend self
+  
+  # Constants and ignored regex for whimsy_access logs
+  RUSER = 'remote_user'
+  REFERER = 'referer'
+  REMAINDER = 'remainder'
+  HITTOTAL = 'total'
+  URIHIT = 'uri'
+  IGNORED_URIS = [ 
+    /\A\/whimsy\.svg/,
+    /\A\/favicon\.ico/,
+    /\A\/robots\.txt/,
+    /\A\/assets/, 
+    /\A\/fonts/,
+    /\A\/icons/,
+    /\.js\z/,
+    /\.css\z/,
+    /\.action\z/,
+    /\.zip\z/, # Below here are all from site scanners
+    /\..?ar\z/,
+    /\.tar\..{1}z.?\z/,
+    /\.bak\z/,
+    /\.sql\z/,
+    /\.7z\z/,
+    /\.asp\.?\z/i,
+    /\.txt\z/,
+    /\.php\z/,
+    /\.woff2/
+  ]
+  
+  # Related to timestamps in error log output
+  TRUNCATE = 6 # Ensure consistency in keys
+  TIME_OFFSET = 10000000.0 # Offset milliseconds slightly for array entries
+  
+  # Read a text or .gz file
+  # @param f filename: .log or .log.gz
+  # @return contents of the file as a String (gunzipped if .gz)
+  def read_logz(f)
+    if f.end_with? '.gz'
+      reader = Zlib::GzipReader.open(f)
+      logfile = reader.read
+      reader.close
+    else
+      logfile = File.read(f)
+    end
+    return logfile
+  end
+  
+  # Parse whimsy_access and return interesting entries
+  # @param f filename of whimsy_access.log or .gz
+  # @return array of reduced, scrubbed entries as hashes
+  def parse_whimsy_access(f)
+    access = read_logz(f).scan(/<%JSON:httpd_access%> (\{.*\})/).flatten
+    logs = JSON.parse('[' + access.join(',') + ']')
+      .reject{ |i| (i['useragent'] =~ /Ping My Box/) || (i['uri'] =~ Regexp.union(IGNORED_URIS)) || (i['status'] == 304) }
+    logs.each do |i|
+      %w(geo_country geo_long geo_lat geo_coords geo_city geo_combo duration request bytes vhost document request_method clientip query_string).each do |g|
+        i.delete(g)
+      end
+    end
+    return logs
+  end
+  
+  # Collate/partition whimsy_access entries by app areas
+  # @param logs full set of items to scan
+  # @return hash of apps categorized by area, with a REMAINDER entry for all other requests
+  def collate_whimsy_access(logs)
+    remainder = logs
+    apps = {}
+    %w(status roster board public secretary).each do |a|
+      apps[a] = Hash.new{|h,k| h[k] = [] }
+      apps[a][RUSER] = Hash.new{|h,k| h[k] = 0 }
+      apps[a][REFERER] = Hash.new{|h,k| h[k] = 0 }
+      apps[a][URIHIT] = Hash.new{|h,k| h[k] = 0 }
+    end
+    apps.each do |app, data|
+      items, remainder = remainder.partition{ |l| l['uri'] =~ /\A\/#{app}/ }
+      items.each do |l|
+        data[RUSER][l[RUSER]] += 1
+        data[REFERER][l[REFERER]] += 1
+        data[URIHIT][l[URIHIT]] += 1
+      end
+    end
+    apps[REMAINDER] = Hash.new{|h,k| h[k] = [] }
+    apps[REMAINDER][RUSER] = Hash.new{|h,k| h[k] = 0 }
+    apps[REMAINDER][REFERER] = Hash.new{|h,k| h[k] = 0 }
+    apps[REMAINDER][URIHIT] = Hash.new{|h,k| h[k] = 0 }
+    apps[REMAINDER]['useragent'] = Hash.new{|h,k| h[k] = 0 }
+    remainder.each do |l|
+      apps[REMAINDER][RUSER][l[RUSER]] += 1
+      apps[REMAINDER][REFERER][l[REFERER]] += 1
+      apps[REMAINDER][URIHIT][l[URIHIT]] += 1
+      apps[REMAINDER]['useragent'][l['useragent']] += 1
+    end
+    return apps
+  end
+  
+  # Get a simplistic hash report of access entries
+  # @param f filepath to whimsy_access.log
+  # @return app_report, misses_data
+  def get_access_reports(f)
+    access = parse_whimsy_access(f)
+    hits, miss = access.partition{ |l| l['status'] == 200 }
+    apps = collate_whimsy_access(hits)
+    return apps, miss
+  end
+  
+  # Parse error.log and return interesting entries
+  # @param f filename of error.log or .gz
+  # @param logs hash to append to (a new hash is created if none is passed)
+  # @return hash of string|array of interesting entries
+  #   "timestamp" => "Passenger restarts and messages", 
+  #   "timestamp" => ['_ERROR msg', '_WARN msg'... ]
+  def parse_error_log(f, logs = {})
+    last_time = 'uninitialized_time' # Cheap marker
+    read_logz(f).lines.each do |l|
+      begin
+        # Emit each interesting item in order we read it 
+        #   Include good-enough timestamping, even for un-timestamped items
+        # (Date.today.to_time + 4/100000.0).iso8601(TRUNCATE)
+        if l =~ /\[ . (.{24}) .+\]: (.+)/
+          last_time = $1
+          capture = $2
+          if capture =~ /Passenger/
+            logs[DateTime.parse(last_time).iso8601(TRUNCATE)] = capture
+          end
+        elsif l =~ /(_ERROR|_WARN  (.+)whimsy)/
+          if ! (l =~ /rack.rb/) # Don't need these
+            # Offset our time so it doesn't overwrite any Passenger entries
+            (logs[(DateTime.parse(last_time) + 1/TIME_OFFSET).iso8601(TRUNCATE)] ||= []) << l
+          end
+        end
+      rescue StandardError => e
+        puts e
+      end
+    end
+    return logs
+  end
+  
+  # Parse error.log* files in dir and return interesting entries
+  # @param d directory to scan for error.log*
+  # @return hash of interesting entries from all error.log* files found
+  def parse_error_logs(d, logs = {})
+    Dir[File.join(d, 'error.lo*')].each do |f|
+      parse_error_log(f, logs)
+    end
+    return logs
+  end
+  
+  # Parse whimsy_error.log and return interesting entries
+  # @param f filename of whimsy_error.log or .gz
+  # @return hash of string of interesting entries
+  #   "timestamp" => "AH01215: undefined method `map' for #<String:0x0000000240e1e0> (NoMethodError): /x1/srv/whimsy/www/status/errors.cgi"
+  def parse_whimsy_error(f, logs = {})
+    r = Regexp.new('\[(?<errdate>[^\]]*)\] \[cgi:error\] (\[([^\]]*)\] ){2}(?<errline>.+)')
+    read_logz(f).lines.each do |l|
+      if (m = r.match(l))
+        begin
+          logs[DateTime.parse(m['errdate']).iso8601(TRUNCATE)] = m['errline']
+        rescue StandardError
+          # Fall back to using the raw timestamp string as the key
+          logs[m['errdate']] = m['errline']
+        end
+      end
+    end
+    return logs
+  end
+  
+  # Parse whimsy_error.log* files in dir and return interesting entries
+  # @param d directory to scan for whimsy_error.log*
+  # @return hash of interesting entries from all whimsy_error.log* files found
+  def parse_whimsy_errors(d, logs = {})
+    Dir[File.join(d, 'whimsy_error.lo*')].each do |f|
+      parse_whimsy_error(f, logs)
+    end
+    return logs
+  end
+  
+  # Gather interesting entries from the current error logs (or from all available logs when current is false)
+  # @param d directory to scan for *error.log*
+  # @return hash of interesting entries, sorted by timestamp
+  def get_errors(d = '/x1/srv/whimsy/www/members/log', current = true)
+    if current
+      logs = LogParser.parse_whimsy_error(File.join(d, 'whimsy_error.log'))
+      LogParser.parse_error_log(File.join(d, 'error.log'), logs)
+    else
+      logs = LogParser.parse_whimsy_errors(d)
+      LogParser.parse_error_logs(d, logs)
+    end
+    return logs.sort.to_h # Sort by time order
+  end
+end
+
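
For reference, a minimal usage sketch for the module above (the log
directory and the whimsy_access.log filename are assumptions taken from
the defaults in get_errors and the parser comments, not part of this
commit):

    #!/usr/bin/env ruby
    # Illustrative driver; assumed to live alongside tools/logparser.rb
    require_relative 'logparser'
    require 'json'

    log_dir = '/x1/srv/whimsy/www/members/log' # assumed log directory

    # Collate successful hits per app area and report the non-200 misses
    apps, misses = LogParser.get_access_reports(File.join(log_dir, 'whimsy_access.log'))
    puts JSON.pretty_generate(apps)
    puts "Non-200 entries: #{misses.length}"

    # Interesting entries from the current error.log and whimsy_error.log, in time order
    LogParser.get_errors(log_dir).each do |stamp, entry|
      puts "#{stamp}: #{entry}"
    end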

-- 
To stop receiving notification emails like this one, please contact
curcuru@apache.org.