APacheDEX-1.6.2/apachedex/apachedex.js

function updateGraphTooltip(event, pos, item, previousIndex, tooltip, plot) {
  if (item) {
    if (previousIndex != item.dataIndex) {
      previousIndex = item.dataIndex;
      var plot_offset = plot.getPlotOffset();
      var offset = plot.offset();
      tooltip.find(".x").html(item.series.xaxis.tickFormatter(
        item.datapoint[0], item.series.xaxis));
      tooltip.find(".y").html(item.series.yaxis.options.axisLabel + " : " +
        item.datapoint[1]);
      tooltip.css("left", item.pageX - offset.left + plot_offset.left + 5 +
        "px");
      tooltip.show();
      // query offsetHeight *after* making the tooltip visible
      tooltip.css("top", item.pageY - offset.top + plot_offset.top - 5 -
        tooltip.prop("offsetHeight") + "px");
    }
  } else {
    if (previousIndex != null) {
      tooltip.hide();
      previousIndex = null;
    }
  }
  return previousIndex;
}

var scale_map = {
  log100To0: [
    function (v) { return -Math.log(101 - v); },
    function (v) { return 101 - Math.exp(-v); }
  ],
  log0ToAny: [
    function (v) { return Math.log(v + 1); },
    function (v) { return Math.exp(v) - 1; }
  ]
};

function updateAxisTransform(axis) {
  if (axis != undefined) {
    var transform_list = scale_map[axis.transform];
    if (transform_list == undefined) {
      return;
    }
    axis.transform = transform_list[0];
    axis.inverseTransform = transform_list[1];
  }
}

function renderGraph(container) {
  var container = $(container);
  var previousIndex = null;
  var tooltip = container.next(".tooltip");
  var options = $.parseJSON(container.attr("data-options"));
  updateAxisTransform(options.xaxis);
  updateAxisTransform(options.yaxis);
  var plot = $.plot(
    container,
    $.parseJSON(container.attr("data-points")),
    options
  );
  tooltip.detach();
  container.append(tooltip);
  container.bind("plothover", function (event, pos, item) {
    previousIndex = updateGraphTooltip(event, pos, item, previousIndex,
      tooltip, plot);
  });
}

function toggleGraph(node) {
  var container = $(node).parent().find(".container");
  // Note: toggling *after* rendering causes layout problems with flot.
  container.toggle();
  if (container.attr("data-rendered-marker") == null) {
    container.attr("data-rendered-marker", "rendered");
    container.find(".graph").each(function (i) { renderGraph(this); });
  }
}

function hideGraph(node) {
  $(node).parent().hide();
}

$(function() {
  $(".graph:visible").each(function (i) { renderGraph(this); });
  $(".hidden_graph .container").draggable();
});

APacheDEX-1.6.2/apachedex/jquery.flot.annotate.js

/* Annotation Plugin for flot.
http://github.com/vpelletier/flot-anotate
License: GPLv2+
*/
(function ($) {
  function init(plot) {
    plot.hooks.draw.push(function (plot, ctx) {
    });
  }

  $.plot.plugins.push({
    init: init,
    options: {
    },
    name: "annotate"
  });
})(jQuery);

APacheDEX-1.6.2/apachedex/jquery.flot.axislabels.js

/* Axis Labels Plugin for flot.
http://github.com/markrcote/flot-axislabels

Original code is Copyright (c) 2010 Xuan Luo.
Original code was released under the GPLv3 license by Xuan Luo, September 2010.
Original code was rereleased under the MIT license by Xuan Luo, April 2012.
Improvements by Mark Cote.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ (function ($) { var options = { }; function canvasSupported() { return !!document.createElement('canvas').getContext; } function canvasTextSupported() { if (!canvasSupported()) { return false; } var dummy_canvas = document.createElement('canvas'); var context = dummy_canvas.getContext('2d'); return typeof context.fillText == 'function'; } function css3TransitionSupported() { var div = document.createElement('div'); return typeof div.style.MozTransition != 'undefined' // Gecko || typeof div.style.OTransition != 'undefined' // Opera || typeof div.style.webkitTransition != 'undefined' // WebKit || typeof div.style.transition != 'undefined'; } function AxisLabel(axisName, position, padding, plot, opts) { this.axisName = axisName; this.position = position; this.padding = padding; this.plot = plot; this.opts = opts; this.width = 0; this.height = 0; } CanvasAxisLabel.prototype = new AxisLabel(); CanvasAxisLabel.prototype.constructor = CanvasAxisLabel; function CanvasAxisLabel(axisName, position, padding, plot, opts) { AxisLabel.prototype.constructor.call(this, axisName, position, padding, plot, opts); } CanvasAxisLabel.prototype.calculateSize = function() { if (!this.opts.axisLabelFontSizePixels) this.opts.axisLabelFontSizePixels = 14; if (!this.opts.axisLabelFontFamily) this.opts.axisLabelFontFamily = 'sans-serif'; var textWidth = this.opts.axisLabelFontSizePixels + this.padding; var textHeight = this.opts.axisLabelFontSizePixels + this.padding; if (this.position == 'left' || this.position == 'right') { this.width = this.opts.axisLabelFontSizePixels + this.padding; this.height = 0; } else { this.width = 0; this.height = this.opts.axisLabelFontSizePixels + this.padding; } }; CanvasAxisLabel.prototype.draw = function(box) { var ctx = this.plot.getCanvas().getContext('2d'); ctx.save(); ctx.font = this.opts.axisLabelFontSizePixels + 'px ' + this.opts.axisLabelFontFamily; var width = ctx.measureText(this.opts.axisLabel).width; var height = this.opts.axisLabelFontSizePixels; var x, y, angle = 0; if (this.position == 'top') { x = box.left + box.width/2 - width/2; y = box.top + height*0.72; } else if (this.position == 'bottom') { x = box.left + box.width/2 - width/2; y = box.top + box.height - height*0.72; } else if (this.position == 'left') { x = box.left + height*0.72; y = box.height/2 + box.top + width/2; angle = -Math.PI/2; } else if (this.position == 'right') { x = box.left + box.width - height*0.72; y = box.height/2 + box.top - width/2; angle = Math.PI/2; } ctx.translate(x, y); 
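    // Context for the two calls below: (x, y) was just set to the label's
    // anchor point for each axis position, so translating the canvas origin
    // there and then rotating lets fillText(label, 0, 0) draw the text along
    // the axis. The angle stays 0 for top/bottom labels and is -PI/2 / +PI/2
    // for the left / right vertical labels computed above.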
ctx.rotate(angle); ctx.fillText(this.opts.axisLabel, 0, 0); ctx.restore(); }; HtmlAxisLabel.prototype = new AxisLabel(); HtmlAxisLabel.prototype.constructor = HtmlAxisLabel; function HtmlAxisLabel(axisName, position, padding, plot, opts) { AxisLabel.prototype.constructor.call(this, axisName, position, padding, plot, opts); } HtmlAxisLabel.prototype.calculateSize = function() { var elem = $('
<div class="axisLabels" style="position:absolute;">' + this.opts.axisLabel + '</div>
'); this.plot.getPlaceholder().append(elem); // store height and width of label itself, for use in draw() this.labelWidth = elem.outerWidth(true); this.labelHeight = elem.outerHeight(true); elem.remove(); this.width = this.height = 0; if (this.position == 'left' || this.position == 'right') { this.width = this.labelWidth + this.padding; } else { this.height = this.labelHeight + this.padding; } }; HtmlAxisLabel.prototype.draw = function(box) { this.plot.getPlaceholder().find('#' + this.axisName + 'Label').remove(); var elem = $('
<div id="' + this.axisName + 'Label" class="axisLabels" style="position:absolute;">' + this.opts.axisLabel + '</div>
'); this.plot.getPlaceholder().append(elem); if (this.position == 'top') { elem.css('left', box.left + box.width/2 - this.labelWidth/2 + 'px'); elem.css('top', box.top + 'px'); } else if (this.position == 'bottom') { elem.css('left', box.left + box.width/2 - this.labelWidth/2 + 'px'); elem.css('top', box.top + box.height - this.labelHeight + 'px'); } else if (this.position == 'left') { elem.css('top', box.top + box.height/2 - this.labelHeight/2 + 'px'); elem.css('left', box.left + 'px'); } else if (this.position == 'right') { elem.css('top', box.top + box.height/2 - this.labelHeight/2 + 'px'); elem.css('left', box.left + box.width - this.labelWidth + 'px'); } }; CssTransformAxisLabel.prototype = new HtmlAxisLabel(); CssTransformAxisLabel.prototype.constructor = CssTransformAxisLabel; function CssTransformAxisLabel(axisName, position, padding, plot, opts) { HtmlAxisLabel.prototype.constructor.call(this, axisName, position, padding, plot, opts); } CssTransformAxisLabel.prototype.calculateSize = function() { HtmlAxisLabel.prototype.calculateSize.call(this); this.width = this.height = 0; if (this.position == 'left' || this.position == 'right') { this.width = this.labelHeight + this.padding; } else { this.height = this.labelHeight + this.padding; } }; CssTransformAxisLabel.prototype.transforms = function(degrees, x, y) { var stransforms = { '-moz-transform': '', '-webkit-transform': '', '-o-transform': '', '-ms-transform': '' }; if (x != 0 || y != 0) { var stdTranslate = ' translate(' + x + 'px, ' + y + 'px)'; stransforms['-moz-transform'] += stdTranslate; stransforms['-webkit-transform'] += stdTranslate; stransforms['-o-transform'] += stdTranslate; stransforms['-ms-transform'] += stdTranslate; } if (degrees != 0) { var rotation = degrees / 90; var stdRotate = ' rotate(' + degrees + 'deg)'; stransforms['-moz-transform'] += stdRotate; stransforms['-webkit-transform'] += stdRotate; stransforms['-o-transform'] += stdRotate; stransforms['-ms-transform'] += stdRotate; } var s = 'top: 0; left: 0; '; for (var prop in stransforms) { if (stransforms[prop]) { s += prop + ':' + stransforms[prop] + ';'; } } s += ';'; return s; }; CssTransformAxisLabel.prototype.calculateOffsets = function(box) { var offsets = { x: 0, y: 0, degrees: 0 }; if (this.position == 'bottom') { offsets.x = box.left + box.width/2 - this.labelWidth/2; offsets.y = box.top + box.height - this.labelHeight; } else if (this.position == 'top') { offsets.x = box.left + box.width/2 - this.labelWidth/2; offsets.y = box.top; } else if (this.position == 'left') { offsets.degrees = -90; offsets.x = box.left - this.labelWidth/2 + this.labelHeight/2; offsets.y = box.height/2 + box.top; } else if (this.position == 'right') { offsets.degrees = 90; offsets.x = box.left + box.width - this.labelWidth/2 - this.labelHeight/2; offsets.y = box.height/2 + box.top; } return offsets; }; CssTransformAxisLabel.prototype.draw = function(box) { this.plot.getPlaceholder().find("." + this.axisName + "Label").remove(); var offsets = this.calculateOffsets(box); var elem = $('
<div class="axisLabels ' + this.axisName + 'Label" style="position:absolute; ' + this.transforms(offsets.degrees, offsets.x, offsets.y) + '">' + this.opts.axisLabel + '</div>
'); this.plot.getPlaceholder().append(elem); }; IeTransformAxisLabel.prototype = new CssTransformAxisLabel(); IeTransformAxisLabel.prototype.constructor = IeTransformAxisLabel; function IeTransformAxisLabel(axisName, position, padding, plot, opts) { CssTransformAxisLabel.prototype.constructor.call(this, axisName, position, padding, plot, opts); this.requiresResize = false; } IeTransformAxisLabel.prototype.transforms = function(degrees, x, y) { // I didn't feel like learning the crazy Matrix stuff, so this uses // a combination of the rotation transform and CSS positioning. var s = ''; if (degrees != 0) { var rotation = degrees/90; while (rotation < 0) { rotation += 4; } s += ' filter: progid:DXImageTransform.Microsoft.BasicImage(rotation=' + rotation + '); '; // see below this.requiresResize = (this.position == 'right'); } if (x != 0) { s += 'left: ' + x + 'px; '; } if (y != 0) { s += 'top: ' + y + 'px; '; } return s; }; IeTransformAxisLabel.prototype.calculateOffsets = function(box) { var offsets = CssTransformAxisLabel.prototype.calculateOffsets.call( this, box); // adjust some values to take into account differences between // CSS and IE rotations. if (this.position == 'top') { // FIXME: not sure why, but placing this exactly at the top causes // the top axis label to flip to the bottom... offsets.y = box.top + 1; } else if (this.position == 'left') { offsets.x = box.left; offsets.y = box.height/2 + box.top - this.labelWidth/2; } else if (this.position == 'right') { offsets.x = box.left + box.width - this.labelHeight; offsets.y = box.height/2 + box.top - this.labelWidth/2; } return offsets; }; IeTransformAxisLabel.prototype.draw = function(box) { CssTransformAxisLabel.prototype.draw.call(this, box); if (this.requiresResize) { var elem = this.plot.getPlaceholder().find("." + this.axisName + "Label"); // Since we used CSS positioning instead of transforms for // translating the element, and since the positioning is done // before any rotations, we have to reset the width and height // in case the browser wrapped the text (specifically for the // y2axis). elem.css('width', this.labelWidth); elem.css('height', this.labelHeight); } }; function init(plot) { // This is kind of a hack. There are no hooks in Flot between // the creation and measuring of the ticks (setTicks, measureTickLabels // in setupGrid() ) and the drawing of the ticks and plot box // (insertAxisLabels in setupGrid() ). // // Therefore, we use a trick where we run the draw routine twice: // the first time to get the tick measurements, so that we can change // them, and then have it draw it again. 
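  // In outline, the two-pass flow implemented below is:
  //   draw hook fires (secondPass == false)
  //     -> pick a renderer per labelled axis, measure its label,
  //        inflate axis.labelWidth / axis.labelHeight to reserve room
  //     -> secondPass = true; plot.setupGrid(); plot.draw()
  //   draw hook fires again (secondPass == true)
  //     -> render each label inside the box reserved for its axis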
    var secondPass = false;
    var axisLabels = {};
    var axisOffsetCounts = { left: 0, right: 0, top: 0, bottom: 0 };
    var defaultPadding = 2;  // padding between axis and tick labels

    plot.hooks.draw.push(function (plot, ctx) {
      var hasAxisLabels = false;
      if (!secondPass) {
        // MEASURE AND SET OPTIONS
        $.each(plot.getAxes(), function(axisName, axis) {
          var opts = axis.options // Flot 0.7
            || plot.getOptions()[axisName]; // Flot 0.6
          if (!opts || !opts.axisLabel || !axis.show)
            return;
          hasAxisLabels = true;
          var renderer = null;

          if (!opts.axisLabelUseHtml &&
              navigator.appName == 'Microsoft Internet Explorer') {
            var ua = navigator.userAgent;
            var re = new RegExp("MSIE ([0-9]{1,}[\.0-9]{0,})");
            var rv = -1;  // declared so a failed match does not read an undeclared name
            if (re.exec(ua) != null) {
              rv = parseFloat(RegExp.$1);
            }
            if (rv >= 9 && !opts.axisLabelUseCanvas &&
                !opts.axisLabelUseHtml) {
              renderer = CssTransformAxisLabel;
            } else if (!opts.axisLabelUseCanvas && !opts.axisLabelUseHtml) {
              renderer = IeTransformAxisLabel;
            } else if (opts.axisLabelUseCanvas) {
              renderer = CanvasAxisLabel;
            } else {
              renderer = HtmlAxisLabel;
            }
          } else {
            if (opts.axisLabelUseHtml ||
                (!css3TransitionSupported() && !canvasTextSupported()) &&
                !opts.axisLabelUseCanvas) {
              renderer = HtmlAxisLabel;
            } else if (opts.axisLabelUseCanvas || !css3TransitionSupported()) {
              renderer = CanvasAxisLabel;
            } else {
              renderer = CssTransformAxisLabel;
            }
          }

          var padding = opts.axisLabelPadding === undefined ?
            defaultPadding : opts.axisLabelPadding;

          axisLabels[axisName] = new renderer(axisName, axis.position,
            padding, plot, opts);

          // flot interprets axis.labelHeight and .labelWidth as
          // the height and width of the tick labels. We increase
          // these values to make room for the axis label and
          // padding.
          axisLabels[axisName].calculateSize();

          // AxisLabel.height and .width are the size of the
          // axis label and padding.
          axis.labelHeight += axisLabels[axisName].height;
          axis.labelWidth += axisLabels[axisName].width;
          opts.labelHeight = axis.labelHeight;
          opts.labelWidth = axis.labelWidth;
        });
        // if there are axis labels, re-draw with new label widths and heights
        if (hasAxisLabels) {
          secondPass = true;
          plot.setupGrid();
          plot.draw();
        }
      } else {
        // DRAW
        $.each(plot.getAxes(), function(axisName, axis) {
          var opts = axis.options // Flot 0.7
            || plot.getOptions()[axisName]; // Flot 0.6
          if (!opts || !opts.axisLabel || !axis.show)
            return;
          axisLabels[axisName].draw(axis.box);
        });
      }
    });
  }

  $.plot.plugins.push({
    init: init,
    options: options,
    name: 'axisLabels',
    version: '2.0b0'
  });
})(jQuery);

APacheDEX-1.6.2/apachedex/__init__.py

#!/usr/bin/env python
##############################################################################
#
# Copyright (c) 2013 Nexedi SA and Contributors. All Rights Reserved.
# Vincent Pelletier
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # ############################################################################## from __future__ import print_function, division, absolute_import, \ unicode_literals from cgi import escape from collections import defaultdict, Counter from datetime import datetime, timedelta, date, tzinfo from functools import partial from operator import itemgetter from urllib import splittype, splithost import argparse import bz2 import calendar import codecs import functools import gzip import httplib import itertools import json import math import os import pkgutil import platform import re import shlex import sys import time import traceback try: import pytz except ImportError: pytz = None def getResource(name, encoding='utf-8'): return pkgutil.get_data(__name__, name).decode(encoding) def _wrapOpen(func): @functools.wraps(func) def wrapper(*args, **kw): encoding = kw.pop('encoding', None) info = codecs.lookup(encoding) errors = kw.pop('errors', 'strict') file_object = func(*args, **kw) if encoding is None: return file_object srw = codecs.StreamReaderWriter( file_object, info.streamreader, info.streamwriter, errors, ) srw.encoding = encoding return srw return wrapper lzma = None gzip_open = gzip.open if sys.version_info >= (3, 3): import lzma bz2_open = bz2.open _read_mode = 'rt' else: gzip_open = _wrapOpen(gzip_open) bz2_open = _wrapOpen(bz2.BZ2File) _read_mode = 'r' FILE_OPENER_LIST = [ (gzip_open, IOError), (bz2_open, IOError), ] if lzma is None: try: from backports import lzma except ImportError: pass if lzma is not None: FILE_OPENER_LIST.append((lzma.open, lzma.LZMAError)) # XXX: what encoding ? apache doesn't document one, but requests are supposed # to be urlencoded, so pure ascii. Are timestamps localised ? INPUT_ENCODING = 'ascii' MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1)) US_PER_S = 10 ** 6 N_SLOWEST = 20 N_ERROR_URL = 10 N_REFERRER_PER_ERROR_URL = 5 N_USER_AGENT = 20 ITEMGETTER0 = itemgetter(0) ITEMGETTER1 = itemgetter(1) APDEX_TOLERATING_COEF = 4 AUTO_PERIOD_COEF = 200 # Larger (x < LARGER_THAN_INTEGER_STR == True) than any string starting with # a number LARGER_THAN_INTEGER_STR = 'A' SMALLER_THAN_INTEGER_STR = '' def statusIsError(status): return status[0] > '3' def getClassForDuration(duration, threshold): if duration <= threshold: return '' if duration <= threshold * APDEX_TOLERATING_COEF: return 'warning' return 'problem' def getClassForStatusHit(hit, status): if hit and statusIsError(status): return 'problem' return '' def getDataPoints(apdex_dict, status_period_dict={}): period_error_dict = defaultdict(int) for status, period_dict in status_period_dict.iteritems(): if statusIsError(status): for period, hit in period_dict.iteritems(): period_error_dict[period] += hit # If there was an error, there was a hit, and apdex_dict must contain it # (at same date). 
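  # For reference: APDEXStats.getApdex() (defined below) implements the
  # standard Apdex formula, (satisfied + tolerating / 2) / hits, where
  # "satisfied" means duration <= threshold and "tolerating" means
  # duration <= threshold * APDEX_TOLERATING_COEF (i.e. 4x). Each point
  # returned here is (date, apdex in percent, hits, errors), sorted by
  # date; an illustrative bucket holding 3 satisfied hits plus 1 error
  # would yield ('2013/01/01', 75.0, 4, 1).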
assert len(set(period_error_dict) - set(apdex_dict)) == 0 return [ ( value_date, apdex.getApdex() * 100, apdex.hit, period_error_dict.get(value_date, 0), ) for value_date, apdex in sorted(apdex_dict.iteritems(), key=ITEMGETTER0) ] def prepareDataForGraph(daily_data, date_format, placeholder_delta, coefficient_callback, x_min=None, x_max=None): current_date = datetime.strptime(x_min or daily_data[0][0], date_format) new_daily_data = [] append = new_daily_data.append for (measure_date_string, apdex, hit, error_hit) in daily_data: measure_date = datetime.strptime(measure_date_string, date_format) if current_date < measure_date: append((current_date.strftime(date_format), 100, 0, 0)) placeholder_end_date = measure_date - placeholder_delta if placeholder_end_date > current_date: append((placeholder_end_date.strftime(date_format), 100, 0, 0)) coef = coefficient_callback(measure_date) append((measure_date_string, apdex, hit * coef, error_hit * coef)) current_date = measure_date + placeholder_delta if x_max is not None and current_date < datetime.strptime(x_max, date_format): append((current_date.strftime(date_format), 100, 0, 0)) append((x_max, 100, 0, 0)) return new_daily_data def graphPair(daily_data, date_format, graph_period, apdex_y_min=None, hit_y_min=None, hit_y_max=None, apdex_y_scale=None, hit_y_scale=None): date_list = [int(calendar.timegm(time.strptime(x[0], date_format)) * 1000) for x in daily_data] timeformat = '%Y/
<br/>%m/%d<br/>
%H:%M' # There is room for about 10 labels on the X axis. minTickSize = (max(1, (date_list[-1] - date_list[0]) / (60 * 60 * 1000 * 10)), 'hour') # Guesstimation: 6px per digit. If only em were allowed... yLabelWidth = max(int(math.log10(max(x[2] for x in daily_data))) + 1, 3) * 6 return graph('apdex', [zip(date_list, (round(x[1], 2) for x in daily_data))], { 'xaxis': { 'mode': 'time', 'timeformat': timeformat, 'minTickSize': minTickSize, }, 'yaxis': { 'min': apdex_y_min, 'max': 100, 'axisLabel': 'apdex (%)', 'labelWidth': yLabelWidth, 'transform': apdex_y_scale, }, 'lines': {'show': True}, 'grid': { 'hoverable': True, }, }, ) + graph('Hits (per %s)' % graph_period, [ { 'label': 'Errors', 'data': zip(date_list, (x[3] for x in daily_data)), 'color': 'red', }, { 'label': 'Hits', 'data': zip(date_list, (x[2] for x in daily_data)), }, ], { 'xaxis': { 'mode': 'time', 'timeformat': timeformat, 'minTickSize': minTickSize, }, 'yaxis': { 'min': hit_y_min, 'max': hit_y_max, 'axisLabel': 'Hits', 'labelWidth': yLabelWidth, 'tickDecimals': 0, 'transform': hit_y_scale, }, 'lines': {'show': True}, 'grid': { 'hoverable': True, }, 'legend': { 'backgroundOpacity': 0.25, }, }, ) def graph(title, data, options={}): result = [] append = result.append append('

%s

' '
' '
') return ''.join(result) class APDEXStats(object): def __init__(self, threshold, getDuration): threshold *= US_PER_S self.threshold = threshold self.threshold4 = threshold * APDEX_TOLERATING_COEF self.apdex_1 = 0 self.apdex_4 = 0 self.hit = 0 self.duration_total = 0 self.duration_max = 0 self.getDuration = getDuration def accumulate(self, match): duration = self.getDuration(match) self.duration_total += duration self.duration_max = max(self.duration_max, duration) if not statusIsError(match.group('status')): if duration <= self.threshold: self.apdex_1 += 1 elif duration <= self.threshold4: self.apdex_4 += 1 self.hit += 1 def accumulateFrom(self, other): for attribute in ('apdex_1', 'apdex_4', 'hit', 'duration_total'): setattr(self, attribute, getattr(self, attribute) + getattr(other, attribute)) self.duration_max = max(self.duration_max, other.duration_max) def getApdex(self): if self.hit: return (self.apdex_1 + self.apdex_4 * .5) / self.hit return 1 def getAverage(self): if self.hit: return float(self.duration_total) / (US_PER_S * self.hit) return 0 def getMax(self): return float(self.duration_max) / US_PER_S @staticmethod def asHTMLHeader(overall=False): return 'apdexhitsavg (s)' \ 'max (s)' % (overall and ' class="overall_right"' or '') def asHTML(self, threshold, overall=False): apdex = self.getApdex() average = self.getAverage() maximum = self.getMax() hit = self.hit if hit: extra_class = '' apdex_style = 'color: #%s; background-color: #%s' % ( (apdex < .5 and 'f' or '0') * 3, ('%x' % (apdex * 0xf)) * 3, ) else: extra_class = 'no_hit' apdex_style = '' if overall: extra_right_class = 'overall_right' else: extra_right_class = '' return '' \ '%(apdex)i%%%(hit)s' \ '%(average).2f' \ '%(max).2f' % { 'extra_class': extra_class, 'apdex_style': apdex_style, 'apdex': round(apdex * 100), 'hit': hit, 'average_class': getClassForDuration(average, threshold), 'average': average, 'max_class': getClassForDuration(maximum, threshold), 'max': maximum, 'extra_right_class': extra_right_class, } @classmethod def fromJSONState(cls, state, getDuration): result = cls(0, getDuration) result.__dict__.update(state) return result def asJSONState(self): result = self.__dict__.copy() del result['getDuration'] return result _APDEXDateDictAsJSONState = lambda date_dict: dict(((y, z.asJSONState()) for y, z in date_dict.iteritems())) class GenericSiteStats(object): def __init__(self, threshold, getDuration, suffix, error_detail=False, user_agent_detail=False): self.threshold = threshold self.suffix = suffix self.error_detail = error_detail self.status = defaultdict(partial(defaultdict, int)) if error_detail: # status -> url -> referrer -> count self.error_url_count = defaultdict(partial(defaultdict, Counter)) self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration)) self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration)) self.user_agent_detail = user_agent_detail self.user_agent_counter = Counter() def rescale(self, convert, getDuration): for status, date_dict in self.status.iteritems(): new_date_dict = defaultdict(int) for value_date, status_count in date_dict.iteritems(): new_date_dict[convert(value_date)] += status_count self.status[status] = new_date_dict new_apdex = defaultdict(partial(APDEXStats, self.threshold, getDuration)) for value_date, data in self.apdex.iteritems(): new_apdex[convert(value_date)].accumulateFrom(data) self.apdex = new_apdex def accumulate(self, match, url_match, value_date): self.apdex[value_date].accumulate(match) if url_match is None: url = 
match.group('request') else: url = url_match.group('url') # XXX: can eat memory if there are many different urls self.url_apdex[url.split('?', 1)[0]].accumulate(match) status = match.group('status') self.status[status][value_date] += 1 if self.error_detail and statusIsError(status): # XXX: can eat memory if there are many errors on many different urls self.error_url_count[status][url][match.group('referer')] += 1 if self.user_agent_detail: self.user_agent_counter[match.group('agent')] += 1 def getApdexData(self): return getDataPoints(self.apdex, self.status) def asHTML(self, date_format, placeholder_delta, graph_period, graph_coefficient, encoding, stat_filter=lambda x: x, x_min=None, x_max=None, apdex_y_min=None, hit_y_min=None, hit_y_max=None, apdex_y_scale=None, hit_y_scale=None, ): result = [] append = result.append apdex = APDEXStats(self.threshold, None) for data in self.apdex.itervalues(): apdex.accumulateFrom(data) append('

Overall

') append(APDEXStats.asHTMLHeader()) append('') append(apdex.asHTML(self.threshold)) append('

Hottest pages

') append(APDEXStats.asHTMLHeader()) append('') for url, data in sorted(self.url_apdex.iteritems(), key=lambda x: x[1].getAverage() * x[1].hit, reverse=True)[:N_SLOWEST]: append('') append(data.asHTML(self.threshold)) append('' % escape(url)) append('
url
%s
') if self.user_agent_detail: append('

User agents

' '') for user_agent, hit in self.user_agent_counter.most_common(N_USER_AGENT): append('' % (hit, escape(user_agent))) append('
hitsuser agent
%s%s
') column_set = set() filtered_status = defaultdict(partial(defaultdict, int)) for status, date_dict in self.status.iteritems(): filtered_date_dict = filtered_status[status] for value_date, value in date_dict.iteritems(): filtered_date_dict[stat_filter(value_date)] += value column_set.update(filtered_date_dict) column_list = sorted(column_set) append('

Hits per status code

' '') for column in column_list: append('' % column) append('') def hitTd(hit, status): return '' % (getClassForStatusHit(hit, status), hit) def statusAsHtml(status): try: definition = httplib.responses[int(status)] except KeyError: return status else: return '%s' % (definition, status) has_errors = False for status, data_dict in sorted(filtered_status.iteritems(), key=ITEMGETTER0): has_errors |= statusIsError(status) append('' % statusAsHtml(status)) append(hitTd(sum(data_dict.itervalues()), status)) for column in column_list: append(hitTd(data_dict[column], status)) append('') append('
statusoverall%s
%s
%s
') if self.error_detail and has_errors: def getHitForUrl(referer_counter): return sum(referer_counter.itervalues()) filtered_status_url = defaultdict(partial(defaultdict, dict)) for status, url_dict in self.error_url_count.iteritems(): filtered_status_url[status] = sorted(url_dict.iteritems(), key=lambda x: getHitForUrl(x[1]), reverse=True)[:N_ERROR_URL] append('

Error detail

' '') for status, url_list in sorted(filtered_status_url.iteritems(), key=ITEMGETTER0): append('' % (len(url_list), statusAsHtml(status))) first_url = True for url, referer_counter in url_list: if first_url: first_url = False else: append('') append('' '' % ( getHitForUrl(referer_counter), escape(url), '
'.join('%i: %s' % (hit, escape(referer)) for referer, hit in referer_counter.most_common( N_REFERRER_PER_ERROR_URL)), )) append('') append('
statushitsurlreferers
%s
%s%s%s
') return '\n'.join(result) @classmethod def fromJSONState(cls, state, getDuration, suffix): error_detail = state['error_detail'] result = cls(state['threshold'], getDuration, suffix, error_detail, state.get('user_agent_detail', True)) if error_detail: error_url_count = result.error_url_count for state_status, state_url_dict in state['error_url_count'].iteritems(): url_dict = error_url_count[state_status] for url, counter in state_url_dict.iteritems(): url_dict[url].update(counter) for attribute_id in ('url_apdex', 'apdex'): attribute = getattr(result, attribute_id) for key, apdex_state in state[attribute_id].iteritems(): attribute[key] = APDEXStats.fromJSONState(apdex_state, getDuration) status = result.status for status_code, date_dict in state['status'].iteritems(): status[status_code].update(date_dict) result.user_agent_counter.update(state['user_agent_counter']) return result def asJSONState(self): return { 'threshold': self.threshold, 'error_detail': self.error_detail, 'error_url_count': getattr(self, 'error_url_count', None), 'url_apdex': _APDEXDateDictAsJSONState(self.url_apdex), 'apdex': _APDEXDateDictAsJSONState(self.apdex), 'status': self.status, 'user_agent_counter': self.user_agent_counter, 'user_agent_detail': self.user_agent_detail, } def accumulateFrom(self, other): # XXX: ignoring: threshold, getDuration, suffix, error_detail, # user_agent_detail. # Assuming they are consistently set. if self.error_detail: for status, other_url_dict in other.error_url_count.iteritems(): url_dict = self.error_url_count[status] for url, referer_counter in other_url_dict.iteritems(): url_dict[url].update(referer_counter) for attribute_id in ('url_apdex', 'apdex'): self_attribute = getattr(self, attribute_id) for key, apdex_data in getattr(other, attribute_id).iteritems(): self_attribute[key].accumulateFrom(apdex_data) status = self.status for status_code, other_date_dict in other.status.iteritems(): date_dict = status[status_code] for status_date, count in other_date_dict.iteritems(): date_dict[status_date] += count self.user_agent_counter.update(other.user_agent_counter) class ERP5SiteStats(GenericSiteStats): """ Heuristic used: - ignore any GET parameter - If the first in-site url chunk ends with "_module", count line as belonging to a module - If a line belongs to a module and has at least 2 slashes after module, count line as belonging to a document of that module """ def __init__(self, threshold, getDuration, suffix, error_detail=False, user_agent_detail=False): super(ERP5SiteStats, self).__init__(threshold, getDuration, suffix, error_detail=error_detail, user_agent_detail=user_agent_detail) # Key levels: # - module id (string) # - is document (bool) # - date (string) self.module = defaultdict(partial(defaultdict, partial( defaultdict, partial(APDEXStats, threshold, getDuration)))) self.no_module = defaultdict(partial(APDEXStats, threshold, getDuration)) self.site_search = defaultdict(partial(APDEXStats, threshold, getDuration)) def rescale(self, convert, getDuration): super(ERP5SiteStats, self).rescale(convert, getDuration) threshold = self.threshold for document_dict in self.module.itervalues(): for is_document, date_dict in document_dict.iteritems(): new_date_dict = defaultdict(partial(APDEXStats, threshold, getDuration)) for value_date, data in date_dict.iteritems(): new_date_dict[convert(value_date)].accumulateFrom(data) document_dict[is_document] = new_date_dict for attribute_id in ('no_module', 'site_search'): attribute = defaultdict(partial(APDEXStats, threshold, getDuration)) 
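      # `convert` maps a bucket key from the old granularity to the new one;
      # main() builds such a helper (`rescale`: strptime with the old date
      # format, round, strftime with the new one) when the auto-detected
      # period grows, e.g. from hour to day buckets. Keys which collide
      # after conversion are merged below via accumulateFrom().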
for value_date, data in getattr(self, attribute_id).iteritems(): attribute[convert(value_date)].accumulateFrom(data) setattr(self, attribute_id, attribute) def accumulate(self, match, url_match, value_date): split = self.suffix(url_match.group('url')).split('?', 1)[0].split('/') if split and split[0].endswith('_module'): super(ERP5SiteStats, self).accumulate(match, url_match, value_date) module = split[0] self.module[module][ len(split) > 1 and (split[1] != 'view' and '_view' not in split[1]) ][value_date].accumulate(match) elif split and split[0] == 'ERP5Site_viewSearchResult': super(ERP5SiteStats, self).accumulate(match, url_match, value_date) self.site_search[value_date].accumulate(match) else: self.no_module[value_date].accumulate(match) def asHTML(self, date_format, placeholder_delta, graph_period, graph_coefficient, encoding, stat_filter=lambda x: x, x_min=None, x_max=None, apdex_y_min=None, hit_y_min=None, hit_y_max=None, apdex_y_scale=None, hit_y_scale=None, ): result = [] append = result.append append('

Stats per module

' '' '') module_document_overall = defaultdict(partial(APDEXStats, self.threshold, None)) filtered_module = defaultdict(partial(defaultdict, partial( defaultdict, partial(APDEXStats, self.threshold, None)))) filtered_no_module = defaultdict(partial(APDEXStats, self.threshold, None)) for value_date, value in self.no_module.iteritems(): filtered_no_module[stat_filter(value_date)].accumulateFrom(value) column_set = set(filtered_no_module) filtered_site_search = defaultdict(partial(APDEXStats, self.threshold, None)) for value_date, value in self.site_search.iteritems(): filtered_site_search[stat_filter(value_date)].accumulateFrom(value) column_set.update(filtered_site_search) for key, is_document_dict in self.module.iteritems(): filtered_is_document_dict = filtered_module[key] for key, data_dict in is_document_dict.iteritems(): filtered_data_dict = filtered_is_document_dict[key] module_document_apdex = module_document_overall[key] for value_date, value in data_dict.iteritems(): filtered_data_dict[stat_filter(value_date)].accumulateFrom(value) module_document_apdex.accumulateFrom(value) column_set.update(filtered_data_dict) column_list = sorted(column_set) for column in column_list: append('' % column) append('') for i in xrange(len(column_list) + 1): append(APDEXStats.asHTMLHeader(i == 0)) append('') def apdexAsColumns(data_dict): data_total = APDEXStats(self.threshold, None) for data in data_dict.itervalues(): data_total.accumulateFrom(data) append(data_total.asHTML(self.threshold, True)) for column in column_list: append(data_dict[column].asHTML(self.threshold)) return data_total def hiddenGraph(data_dict, title): append('') for module_id, data_dict in sorted(filtered_module.iteritems(), key=ITEMGETTER0): append('' '' % module_id) hiddenGraph(self.module[module_id][False], module_id + ' (module)') apdexAsColumns(data_dict[False]) append('') hiddenGraph(self.module[module_id][True], module_id + ' (document)') apdexAsColumns(data_dict[True]) append('') append('') hiddenGraph(self.site_search, 'site search') site_search_overall = apdexAsColumns(filtered_site_search) append('') hiddenGraph(self.no_module, 'other') no_module_overall = apdexAsColumns(filtered_no_module) append('
moduleoverall%s
') data = getDataPoints(data_dict) if len(data) > 1: append('+' '
' '
%s
' '
close
' % title ) append(graphPair( prepareDataForGraph( data, date_format, placeholder_delta, graph_coefficient, x_min=x_min, x_max=x_max, ), date_format, graph_period, apdex_y_min=apdex_y_min, hit_y_min=hit_y_min, hit_y_max=hit_y_max, apdex_y_scale=apdex_y_scale, hit_y_scale=hit_y_scale, )) append('
') append('
%smodule
document
site search' '
other

Per-level overall

' '') append(APDEXStats.asHTMLHeader()) append('') append(no_module_overall.asHTML(self.threshold)) append('') append(site_search_overall.asHTML(self.threshold)) append('') append(module_document_overall[False].asHTML(self.threshold)) append('') append(module_document_overall[True].asHTML(self.threshold)) append('
level
other
site search
module
document
') append(super(ERP5SiteStats, self).asHTML(date_format, placeholder_delta, graph_period, graph_coefficient, encoding, stat_filter=stat_filter, x_min=x_min, x_max=x_max, apdex_y_min=apdex_y_min, hit_y_min=hit_y_min, hit_y_max=hit_y_max, apdex_y_scale=apdex_y_scale, hit_y_scale=hit_y_scale, )) return '\n'.join(result) @classmethod def fromJSONState(cls, state, getDuration, suffix): result = super(ERP5SiteStats, cls).fromJSONState(state, getDuration, suffix) for module_id, module_dict_state in state['module'].iteritems(): module_dict = result.module[module_id] for is_document, date_dict_state in module_dict_state.iteritems(): date_dict = module_dict[is_document == 'true'] for value_date, apdex_state in date_dict_state.iteritems(): date_dict[value_date] = APDEXStats.fromJSONState(apdex_state, getDuration) for attribute_id in ('no_module', 'site_search'): attribute = getattr(result, attribute_id) for value_date, apdex_state in state[attribute_id].iteritems(): attribute[value_date] = APDEXStats.fromJSONState(apdex_state, getDuration) return result def asJSONState(self): result = super(ERP5SiteStats, self).asJSONState() result['module'] = module = {} for module_id, module_dict in self.module.iteritems(): module_dict_state = module[module_id] = {} for is_document, date_dict in module_dict.iteritems(): module_dict_state[is_document] = _APDEXDateDictAsJSONState(date_dict) for attribute_id in ('no_module', 'site_search'): result[attribute_id] = _APDEXDateDictAsJSONState(getattr(self, attribute_id)) return result def accumulateFrom(self, other): super(ERP5SiteStats, self).accumulateFrom(other) module = self.module for module_id, other_module_dict in other.module.iteritems(): module_dict = module[module_id] for is_document, other_date_dict in other_module_dict.iteritems(): date_dict = module_dict[is_document] for value_date, apdex in other_date_dict.iteritems(): date_dict[value_date].accumulateFrom(apdex) for attribute_id in ('no_module', 'site_search'): attribute = getattr(self, attribute_id) for value_date, apdex in getattr(other, attribute_id).iteritems(): attribute[value_date].accumulateFrom(apdex) DURATION_US_FORMAT = '%D' DURATION_S_FORMAT = '%T' server_name_group_dict = { '%v': lambda x, path: x.group('servername') + '/' + path, '%V': lambda x, path: x.group('canonical_servername') + '/' + path, } logformat_dict = { '%h': r'(?P[^ ]*)', '%b': r'(?P[0-9-]*?)', '%l': r'(?P[^ ]*)', '%u': r'(?P[^ ]*)', '%t': r'\[(?P[^\]]*)\]', '%r': r'(?P[^"]*)', # XXX: expected to be enclosed in ". 
# See also REQUEST_PATTERN
  '%>s': r'(?P<status>[0-9]*?)',
  '%O': r'(?P<size>[0-9-]*?)',
  '%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in "
  '%{REMOTE_USER}i': r'(?P<remote_user>[^"]*)', # XXX: expected to be enclosed in "
  '%{User-Agent}i': r'(?P<agent>[^"]*)', # XXX: expected to be enclosed in "
  DURATION_US_FORMAT: r'(?P<duration>[0-9]*)',
  DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)',
  '%%': r'%',
  '%v': r'(?P<servername>[^ ]*)',
  '%V': r'(?P<canonical_servername>[^ ]*)',
  # TODO: add more formats
}
# Expensive, but more robust, variants
expensive_logformat_dict = {
  '%r': r'(?P<request>(\\.|[^\\"])*)',
  '%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)',
  '%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)',
  '%{REMOTE_USER}i': r'(?P<remote_user>(\\.|[^\\"])*)',
}

REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)'
  '( (?P<protocol>.*))?')

class AggregateSiteUrl(argparse.Action):
  __argument_to_aggregator = {
    '--base': GenericSiteStats,
    '--erp5-base': ERP5SiteStats,
    '--skip-base': None,
  }

  def __call__(self, parser, namespace, values, option_string=None):
    action = base_action = self.__argument_to_aggregator[option_string]
    site_list, site_caption_dict = getattr(namespace, self.dest)
    next_value = iter(values).next
    while True:
      try:
        value = next_value()
      except StopIteration:
        break
      if value in site_caption_dict:
        raise ValueError('Duplicate base: %r' % value)
      if action is not None and value[0] == '+':
        caption = value[1:]
        try:
          value = next_value()
        except StopIteration:
          raise ValueError('No base follows caption %r' % value)
      else:
        caption = value
      site_caption_dict[value] = caption
      match = re.compile(value).match
      if base_action is not None:
        match_suffix = re.compile(value + '(?P<suffix>.*)').match
        # The default argument freezes the current pattern: a plain closure
        # would see match_suffix rebound on the next loop iteration.
        action = partial(base_action,
          suffix=lambda x, match_suffix=match_suffix:
            match_suffix(x).group('suffix'))
      site_list.append((value, match, action))

class ShlexArgumentParser(argparse.ArgumentParser):
  """
  Two objectives in this class:
  - use shlex to split config files
  - when recursively including files, do it from referer's path instead of
    current working directory, to facilitate relative inclusion.
  """
  # XXX: I would like to be able to hook inside _read_args_from_files, but
  # it would be dirtier. Instead, declare a private method doing similar
  # replacement before handing args to original parse_known_args.
  def __read_args_from_files(self, args, cwd):
    new_args = []
    append = new_args.append
    extend = new_args.extend
    args = iter(args)
    for arg in args:
      if arg[:1] in self.fromfile_prefix_chars:
        filepath = arg[1:]
        if not filepath:
          filepath = next(args)
        new_cwd = os.path.normpath(os.path.join(
          cwd,
          os.path.dirname(filepath),
        ))
        try:
          with open(os.path.join(new_cwd, os.path.basename(filepath))
              ) as in_file:
            extend(self.__read_args_from_files(
              shlex.split(in_file.read(), comments=True),
              new_cwd,
            ))
        except IOError, exc:
          self.error(str(exc))
      else:
        append(arg)
    return new_args

  def parse_known_args(self, args=None, namespace=None):
    if args is None:
      args = sys.argv[1:]
    else:
      args = list(args)
    args = self.__read_args_from_files(args, os.getcwd())
    return super(ShlexArgumentParser, self).parse_known_args(args=args,
      namespace=namespace)

_month_offset_cache = {}
def _asWeekString(dt):
  year = dt.year
  month = dt.month
  day = dt.day
  key = (year, month)
  try:
    offset = _month_offset_cache[key]
  except KeyError:
    # Subtract 1 to exclude first day of month, and 1 to prepare for next
    # operation (avoid subtracting on each run).
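    # Worked example (illustrative): for dt = 2013/02/10,
    # date(2013, 2, 1).timetuple().tm_yday is 32, so offset = 30 and
    # day_of_year = 10 + 30 = 40 below. 40 - (40 // 7 * 7) = 5, so day
    # becomes 10 - 5 = 5: the hit lands in the 7-day chunk starting
    # 2013/02/05 (chunks are counted from January 1st, not calendar weeks).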
    offset = date(year, month, 1).timetuple().tm_yday - 2
    _month_offset_cache[key] = offset
  day_of_year = day + offset
  day -= day_of_year - (day_of_year // 7 * 7)
  if day < 1:
    month -= 1
    day += calendar.monthrange(year, month)[1]
    assert day > 0 and month > 0, (dt, year, month, day)
  return '%04i/%02i/%02i' % (year, month, day)

def _weekStringAsQuarterString(timestamp):
  year, month, _ = timestamp.split('/')
  return '%s/%02i' % (year, (int(month) - 1) // 3 * 3 + 1)

def _roundWeek(dt):
  day_of_year = dt.timetuple().tm_yday
  return dt - timedelta(day_of_year - ((day_of_year - 1) // 7 * 7 + 1))

def _getWeekCoefficient(dt):
  if dt.month != 12:
    return 1
  # 32 = 31 days of December + 1 day so YYYY/12/31 is still 1 day of measure,
  # and return value is 7.
  return max(1, 7. / (32 - dt.day))

def _round6Hour(dt):
  return dt.replace(hour=dt.hour // 6 * 6)

def _hourAsWeekString(timestamp):
  dt = datetime.strptime(timestamp, '%Y/%m/%d %H')
  return (dt - timedelta(dt.weekday())).date().strftime('%Y/%m/%d')

def _asHalfDayString(timestamp):
  prefix, _ = timestamp.rsplit(':', 1)
  prefix, hours = prefix.split(' ')
  return '%s %02i' % (prefix, int(hours) // 12 * 12)

def _asQuarterHourString(timestamp):
  prefix, minute = timestamp.rsplit(':', 1)
  return '%s:%02i' % (prefix, int(minute) // 15 * 15)

# Key: argument (represents table granularity)
# Value:
# - cheap conversion from apache date format to graph granularity
#   must be sortable consistently with time flow
# - conversion from graph granularity to table granularity
# - graph granularity caption
# - format string to parse and generate graph granularity into/from
#   datetime.datetime instance
# - period during which a placeholder point will be added if there is no data
#   point
# - round a datetime.datetime instance so once represented using given format
#   string it is a valid graph-granularity date for period
# - coefficient to apply to hit count for given (graph granularity)
#   datetime.datetime. Most useful in case of "7 days", as last month's week
#   may be a single day, causing graph to display a value up to 7 times lower
#   than what it should be.
period_parser = {
  'year': (
    lambda x: x.strftime('%Y/%m'),
    lambda x: x.split('/', 1)[0],
    'month',
    '%Y/%m',
    # Longest month: 31 days
    timedelta(31),
    lambda x: x,
    # Error margin without correction: 3/31 = 10%
    lambda x: 31. / calendar.monthrange(x.year, x.month)[1],
  ),
  'quarter': (
    _asWeekString,
    _weekStringAsQuarterString,
    # Note: Not calendar weeks, but chunks of 7 days starting on first year's
    # day. Cheaper to compute than locating first sunday/monday of the year.
    '7 days',
    '%Y/%m/%d',
    timedelta(7),
    _roundWeek,
    # Error margin without correction: (366 % 7 = 2) 2/7 = 29%
    _getWeekCoefficient,
  ),
  'month': (
    lambda x: x.strftime('%Y/%m/%d'),
    lambda x: '/'.join(x.split('/', 2)[:2]),
    'day',
    '%Y/%m/%d',
    # Longest day: 24 hours + 1h DST (never more ?)
    timedelta(seconds=3600 * 25),
    lambda x: x,
    # Error margin without correction: (DST) 1/24 = 4%
    lambda x: 1,
  ),
  'week': (
    lambda x: x.strftime('%Y/%m/%d ') + '%02i' % (x.hour // 6 * 6),
    _hourAsWeekString,
    '6 hours',
    '%Y/%m/%d %H',
    timedelta(seconds=3600 * 6),
    _round6Hour,
    # Error margin without correction: (DST) 1/6 = 17%
    lambda x: 1,
  ),
  'day': (
    lambda x: x.strftime('%Y/%m/%d %H'),
    lambda x: x.split(' ')[0],
    'hour',
    '%Y/%m/%d %H',
    # Longest hour: 60 * 60 seconds + 1 leap second.
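    # For reference, main() unpacks each entry of this dict as:
    #   asDate, decimator, graph_period, date_format, placeholder_delta, \
    #     round_date, graph_coefficient = period_parser[period]
    # so for 'day', asDate buckets hits per hour for graphs while decimator
    # folds those keys back to whole days for tables.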
timedelta(seconds=3601), lambda x: x, # Error margin without correction: (leap) 1/3600 = .03% lambda x: 1, ), 'halfday': ( lambda x: x.strftime('%Y/%m/%d %H:') + '%02i' % (x.minute // 30 * 30), _asHalfDayString, '30 minutes', '%Y/%m/%d %H:%M', timedelta(seconds=30 * 60), lambda x: x.replace(minute=x.minute // 30 * 30), lambda x: 1, ), 'quarterhour': ( lambda x: x.strftime('%Y/%m/%d %H:%M'), _asQuarterHourString, 'minute', '%Y/%m/%d %H:%M', timedelta(seconds=60), lambda x: x, lambda x: 1, ), } apdex_y_scale_dict = { 'linear': None, 'log': 'log100To0', } hit_y_scale_dict = { 'linear': None, 'log': 'log0ToAny', } def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict, stats, site_caption_dict): period = period_parameter_dict['period'] decimator = period_parameter_dict['decimator'] date_format = period_parameter_dict['date_format'] placeholder_delta = period_parameter_dict['placeholder_delta'] graph_period = period_parameter_dict['graph_period'] graph_coefficient = period_parameter_dict['graph_coefficient'] hit_y_max = args.fixed_yrange if hit_y_max is not None: apdex_y_min = hit_y_min = 0 if hit_y_max < 0: hit_y_max = None else: apdex_y_min = hit_y_min = None out.write('\n' 'Stats' % encoding) js_path = args.js js_embed = js_path is None or args.js_embed if js_embed: out.write('') else: out.write('' % js_path) for script in ('jquery.js', 'jquery.flot.js', 'jquery.flot.time.js', 'jquery.flot.axislabels.js', 'jquery-ui.js', 'apachedex.js'): if js_embed: out.write('') else: out.write('' % ( js_path, script)) apdex_y_scale = apdex_y_scale_dict[args.apdex_yscale] hit_y_scale = hit_y_scale_dict[args.hit_yscale] out.write('

Overall

') site_list = list(enumerate(sorted(per_site.iteritems(), key=lambda x: site_caption_dict[x[0]]))) html_site_caption_dict = {} for i, (site_id, _) in site_list: html_site_caption_dict[site_id] = escape(site_caption_dict[site_id]) if len(per_site) > 1: out.write('

Index

    ') for i, (site_id, _) in site_list: out.write('
  1. %s
  2. ' % (i, escape(repr(site_id), quote=True), html_site_caption_dict[site_id])) out.write('
') out.write('

Parameters

') for caption, value in ( ('apdex threshold', '%.2fs' % args.apdex), ('period', args.period or (period + ' (auto)')), ('timezone', args.to_timezone or "(input's)") ): out.write('' % ( caption, value)) out.write('
%s%s

Hits per %s

' '' % period) hit_per_day = defaultdict(int) x_min = LARGER_THAN_INTEGER_STR x_max = SMALLER_THAN_INTEGER_STR for site_data in per_site.itervalues(): apdex_data_list = site_data.getApdexData() if apdex_data_list: x_min = min(x_min, apdex_data_list[0][0]) x_max = max(x_max, apdex_data_list[-1][0]) for hit_date, _, hit, _ in apdex_data_list: hit_per_day[decimator(hit_date)] += hit if x_min == LARGER_THAN_INTEGER_STR: x_min = None x_max = None for hit_date, hit in sorted(hit_per_day.iteritems(), key=ITEMGETTER0): out.write('' % (hit_date, hit)) out.write('
datehits
%s%s
') for i, (site_id, data) in site_list: out.write('

%s

' % (i, escape(repr(site_id), quote=True), html_site_caption_dict[site_id])) apdex_data = data.getApdexData() if apdex_data: out.write( graphPair( prepareDataForGraph( apdex_data, date_format, placeholder_delta, graph_coefficient, x_min=x_min, x_max=x_max, ), date_format, graph_period, apdex_y_min=apdex_y_min, hit_y_min=hit_y_min, hit_y_max=hit_y_max, apdex_y_scale=apdex_y_scale, hit_y_scale=hit_y_scale, ) ) out.write(data.asHTML(date_format, placeholder_delta, graph_period, graph_coefficient, encoding, decimator, x_min=x_min, x_max=x_max, apdex_y_min=apdex_y_min, hit_y_min=hit_y_min, hit_y_max=hit_y_max, apdex_y_scale=apdex_y_scale, hit_y_scale=hit_y_scale, )) end_stat_time = time.time() if args.stats: out.write('

Parsing stats

') buildno, builddate = platform.python_build() end_parsing_time = stats['end_parsing_time'] parsing_time = end_parsing_time - stats['parsing_start_time'] all_lines = stats['all_lines'] for caption, value in ( ('Execution date', datetime.now().isoformat()), ('Interpreter', '%s %s build %s (%s)' % ( platform.python_implementation(), platform.python_version(), buildno, builddate, )), ('State file count', stats['state_file_count']), ('State loading time', timedelta(seconds=stats['parsing_start_time'] - stats['loading_start_time'])), ('File count', stats['file_count']), ('Lines', all_lines), ('... malformed', stats['malformed_lines']), ('... URL-less', stats['no_url_lines']), ('... skipped (URL)', stats['skipped_lines']), ('... skipped (user agent)', stats['skipped_user_agent']), ('Parsing time', timedelta(seconds=parsing_time)), ('Parsing rate', '%i line/s' % (all_lines / parsing_time)), ('Rendering time', timedelta(seconds=( end_stat_time - end_parsing_time))), ): out.write('' % ( caption, value)) out.write('
%s%s
') out.write('') def asJSON(out, encoding, per_site, *_): json.dump([(x, y.asJSONState()) for x, y in per_site.iteritems()], out) format_generator = { 'html': (asHTML, 'utf-8'), 'json': (asJSON, 'ascii'), } ZERO_TIMEDELTA = timedelta(0, 0) class AutoTZInfo(tzinfo): """ Only for fixed UTC offsets ([+-]HHMM) Because datetime.strptime doesn't support %z. """ def __init__(self, name): assert len(name) == 5, repr(name) sign = name[0] assert sign in '+-', sign hour = int(name[1:3]) assert 0 <= hour <= 12, hour minute = int(name[3:]) assert 0 <= minute < 60, minute if sign == '-': hour = -hour minute = -minute self.offset = timedelta(hours=hour, minutes=minute) self.name = name def utcoffset(self, dt): return self.offset def dst(self, dt): return ZERO_TIMEDELTA def tzname(self, dt): return self.name _tz_cache = {} def getTZInfo(tz): try: return _tz_cache[tz] except KeyError: _tz_cache[tz] = tzi = AutoTZInfo(tz) return tzi def _gracefulExit(func): @functools.wraps(func) def wrapper(*args, **kw): try: return func(*args, **kw) except KeyboardInterrupt: sys.exit(1) return wrapper @_gracefulExit def main(): parser = ShlexArgumentParser(description='Compute Apdex out of ' 'apache-style log files', fromfile_prefix_chars='@') parser.add_argument('logfile', nargs='*', help='Log files to process. Use - for stdin.') parser.add_argument('-l', '--logformat', default='%h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i" %D', help='Apache LogFormat used to generate provided logs. ' 'Default: %(default)r') parser.add_argument('-o', '--out', default='-', help='Filename to write output to. Use - for stdout. Default: %(default)s') parser.add_argument('-q', '--quiet', action='store_true', help='Suppress warnings about malformed lines.') parser.add_argument('-Q', '--no-progress', action='store_true', help='Suppress progress indication (file being parsed, lines counter). ' 'Does not imply -q.') parser.add_argument('--state-file', nargs='+', default=[], help='Use given JSON files as initial state. Use - for stdin.') parser.add_argument('--to-timezone', help='Timezone to convert log ' 'timestamps to before splitting days. If not provided, no conversion ' 'happens. In addition to "Continent/City" format which know about DST ' 'but requires pytz module, fixed UTC offsets can be provided in the ' '+hhmm form (ex: -0700 for UTC-7). This form does not require pytz ' 'module.') group = parser.add_argument_group('generated content (all formats)') group.add_argument('-a', '--apdex', default=1.0, type=float, help='First threshold for Apdex computation, in seconds. ' 'Default: %(default).2fs') group.add_argument('-e', '--error-detail', action='store_true', help='Include detailed report (url & referers) for error statuses.') group.add_argument('-u', '--user-agent-detail', action='store_true', help='Include report of most frequent user agents.') group.add_argument('-f', '--format', choices=format_generator, default='html', help='Format in which output should be generated.') group.add_argument('-p', '--period', choices=period_parser, help='Periodicity of sampling buckets. 
Default: (decide from data).') group = parser.add_argument_group('generated content (html)') group.add_argument('-s', '--stats', action='store_true', help='Enable parsing stats (time spent parsing input, time spent ' 'generating output, ...)') group.add_argument('--js', help='Folder containing needed js files.') group.add_argument('--js-embed', action='store_true', help='Embed js files instead of linking to them.') group.add_argument('--fixed-yrange', nargs='?', type=int, const=-1, help='Fix graph vertical range: 0-100%% for apdex, 0-value for hits. ' 'Negative value means hit max is adapted to data (used when this ' 'argument is provided without value).') group.add_argument('--apdex-yscale', default='linear', choices=apdex_y_scale_dict, help='apdex graph ordinate scale. Default: %(default)s') group.add_argument('--hit-yscale', default='linear', choices=hit_y_scale_dict, help='hit graph ordinate scale. Default: %(default)s') group = parser.add_argument_group('site matching', 'Earlier arguments take ' 'precedence. Arguments are Python regexes, matching urlencoded strings.' 'Regex matches can be named by providing a "+"-prefixed string before ' 'regex.') group.add_argument('-d', '--default', help='Caption for lines matching no prefix, or skip them if not provided.') group.add_argument('--base', dest='path', default=([], {}), nargs='+', action=AggregateSiteUrl, help='Title (optional) and regexes matching parts of a site.') group.add_argument('--erp5-base', dest='path', nargs='+', action=AggregateSiteUrl, help='Similar to --base, but with specialised statistics. Ex: ' '"/erp5(/|$|\?)"') group.add_argument('--skip-base', dest='path', nargs='+', action=AggregateSiteUrl, help='Absolute base url(s) to ignore.') group.add_argument('--match-servername', choices=server_name_group_dict, help='Prefix URL with (canonical) server name.') group = parser.add_argument_group('filtering') group.add_argument('--skip-user-agent', nargs='+', default=[], action='append', help='List of user agents from which hits should be ' 'ignored. Useful to exclude monitoring systems.') args = parser.parse_args() if DURATION_US_FORMAT in args.logformat: getDuration = lambda x: int(x.group('duration')) elif DURATION_S_FORMAT in args.logformat: getDuration = lambda x: int(x.group('duration_s')) * US_PER_S else: parser.error('Neither %D nor %T are present in logformat, apdex ' 'cannot be computed.') if args.match_servername is not None and \ args.match_servername not in args.logformat: parser.error('--match-servername %s requested, but missing ' 'from logformat.' 
  get_url_prefix = server_name_group_dict.get(args.match_servername,
    lambda _, path: path)
  line_regex = ''
  expensive_line_regex = ''
  try:
    n = iter(args.logformat).next
    while True:
      key = None
      expensive_char = char = n()
      if char == '%':
        fmt = n()
        key = char + fmt
        if fmt == '{':
          # Consume the whole '%{...}X' directive: everything up to the
          # closing brace, plus the type letter following it. For example,
          # '%{User-Agent}i' and '%>s' each become a single lookup key.
          while fmt != '}':
            fmt = n()
            key += fmt
          key += n()
        elif fmt == '>':
          key += n()
        char = logformat_dict[key]
        expensive_char = expensive_logformat_dict.get(key, char)
      line_regex += char
      expensive_line_regex += expensive_char
  except StopIteration:
    assert not key, key
  matchline = re.compile(line_regex).match
  expensive_matchline = re.compile(expensive_line_regex).match
  matchrequest = REQUEST_PATTERN.match
  if args.period is None:
    next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
      sorted(period_parser.iteritems(), key=lambda x: x[1][4])).next
    period, to_next_period = next_period_data()
    original_period = period
    earliest_date = latest_date = None
    def getNextPeriod():
      # datetime is slow (compared to string operations), but not many choices
      return (datetime.strptime(earliest_date, date_format) + to_next_period
        ).strftime(date_format)
    def rescale(x):
      result = round_date(datetime.strptime(x, old_date_format)).strftime(
        date_format)
      return result
  else:
    to_next_period = None
    period = args.period
  def _matchToDateTime(match):
    dt, tz = match.group('timestamp').split()
    day, month, rest = dt.split('/', 2)
    return datetime.strptime(
      '%s/%02i/%s' % (day, MONTH_VALUE_DICT[month], rest),
      '%d/%m/%Y:%H:%M:%S').replace(tzinfo=getTZInfo(tz))
  if args.to_timezone:
    to_timezone = args.to_timezone
    if re.match(r'^[+-]\d{4}$', to_timezone):
      getTimezoneInfo = getTZInfo
    else:
      if pytz is None:
        raise ValueError('pytz is not available, cannot convert timezone.')
      getTimezoneInfo = pytz.timezone
    tz_info = getTimezoneInfo(to_timezone)
    matchToDateTime = lambda x: _matchToDateTime(x).astimezone(tz_info)
  else:
    matchToDateTime = _matchToDateTime
  asDate, decimator, graph_period, date_format, placeholder_delta, \
    round_date, graph_coefficient = period_parser[period]
  site_list, site_caption_dict = args.path
  default_site = args.default
  if default_site is None:
    default_action = None
    if not [None for _, _, x in site_list if x is not None]:
      parser.error('None of --default, --erp5-base and --base were '
        'specified, nothing to do.')
  else:
    default_action = partial(GenericSiteStats, suffix=lambda x: x)
  site_caption_dict[None] = default_site
  infile_list = args.logfile
  quiet = args.quiet
  threshold = args.apdex
  error_detail = args.error_detail
  user_agent_detail = args.user_agent_detail
  file_count = len(infile_list)
  per_site = {}
  if '-' in args.state_file and '-' in infile_list:
    parser.error('stdin cannot be used both as log and state input.')
  loading_start_time = time.time()
  for state_file_name in args.state_file:
    print('Loading %s...' % state_file_name, end='', file=sys.stderr)
    if state_file_name == '-':
      state_file = sys.stdin
    else:
      state_file = codecs.open(state_file_name, encoding='ascii')
    with state_file:
      load_start = time.time()
      state = json.load(state_file)
      for url, site_state in state:
        if url is None:
          site = None
          action = default_action
        else:
          for site, prefix_match, action in site_list:
            if site == url:
              break
          else:
            site = None
            action = default_action
        if action is None:
          print('Info: no prefix match %r, stats skipped' % url,
            file=sys.stderr)
          continue
        site_stats = action.func.fromJSONState(site_state,
          getDuration, action.keywords['suffix'])
        if site in per_site:
          per_site[site].accumulateFrom(site_stats)
        else:
          per_site[site] = site_stats
    print('done (%s)' % timedelta(seconds=time.time() - load_start),
      file=sys.stderr)
  skip_user_agent = [re.compile(x).match
    for x in itertools.chain(*args.skip_user_agent)]
  malformed_lines = 0
  skipped_lines = 0
  no_url_lines = 0
  all_lines = 0
  skipped_user_agent = 0
  show_progress = not args.no_progress
  parsing_start_time = time.time()
  for fileno, filename in enumerate(infile_list, 1):
    if show_progress:
      print('Processing %s [%i/%i]' % (filename, fileno, file_count),
        file=sys.stderr)
    if filename == '-':
      logfile = sys.stdin
    else:
      # Sniff the file's compression: try each opener, keep the first one
      # which can actually read a line, and rewind before real parsing.
      for opener, exc in FILE_OPENER_LIST:
        logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING)
        try:
          logfile.readline()
        except exc:
          continue
        else:
          logfile.seek(0)
          break
      else:
        logfile = codecs.open(filename, _read_mode,
          encoding=INPUT_ENCODING)
    lineno = 0
    for lineno, line in enumerate(logfile, 1):
      if show_progress and lineno % 5000 == 0:
        print(lineno, end='\r', file=sys.stderr)
      match = matchline(line)
      if match is None:
        match = expensive_matchline(line)
        if match is None:
          if not quiet:
            print('Malformed line at %s:%i: %r' % (filename, lineno, line),
              file=sys.stderr)
          malformed_lines += 1
          continue
      agent = match.group('agent')
      if any(x(agent) for x in skip_user_agent):
        skipped_user_agent += 1
        continue
      url_match = matchrequest(match.group('request'))
      if url_match is None:
        no_url_lines += 1
        continue
      url = url_match.group('url')
      if url.startswith('http'):
        url = splithost(splittype(url)[1])[1]
      url = get_url_prefix(match, url)
      for site, prefix_match, action in site_list:
        if prefix_match(url) is not None:
          break
      else:
        site = None
        action = default_action
      if action is None:
        skipped_lines += 1
        continue
      hit_date = asDate(matchToDateTime(match))
      if to_next_period is not None:
        if latest_date is None or latest_date < hit_date:
          latest_date = hit_date
        if earliest_date is None or hit_date < earliest_date:
          earliest_date = hit_date
        next_period = getNextPeriod()
        try:
          while latest_date > next_period:
            period, to_next_period = next_period_data()
            next_period = getNextPeriod()
        except StopIteration:
          to_next_period = None
        if original_period != period:
          original_period = period
          if show_progress:
            print('Increasing period to %s...' % period, end='',
              file=sys.stderr)
          old_date_format = date_format
          asDate, decimator, graph_period, date_format, placeholder_delta, \
            round_date, graph_coefficient = period_parser[period]
          latest_date = rescale(latest_date)
          earliest_date = rescale(earliest_date)
          period_increase_start = time.time()
          for site_data in per_site.itervalues():
            site_data.rescale(rescale, getDuration)
          if show_progress:
            print('done (%s)' % timedelta(
              seconds=time.time() - period_increase_start), file=sys.stderr)
          # Re-bucket the current hit with the new (coarser) period.
          hit_date = asDate(matchToDateTime(match))
      try:
        site_data = per_site[site]
      except KeyError:
        site_data = per_site[site] = action(threshold, getDuration,
          error_detail=error_detail, user_agent_detail=user_agent_detail)
      try:
        site_data.accumulate(match, url_match, hit_date)
      except Exception:
        if not quiet:
          print('Error analysing line at %s:%i: %r' % (filename, lineno,
            line), file=sys.stderr)
          traceback.print_exc(file=sys.stderr)
    all_lines += lineno
    if show_progress:
      print(lineno, file=sys.stderr)
  end_parsing_time = time.time()
  generator, out_encoding = format_generator[args.format]
  if args.out == '-':
    out = codecs.getwriter(out_encoding)(sys.stdout)
  else:
    out = codecs.open(args.out, 'w', encoding=out_encoding)
  with out:
    generator(out, out_encoding, per_site, args, default_site,
      {
        'period': period,
        'decimator': decimator,
        'date_format': date_format,
        'placeholder_delta': placeholder_delta,
        'graph_period': graph_period,
        'graph_coefficient': graph_coefficient,
      },
      {
        'state_file_count': len(args.state_file),
        'loading_start_time': loading_start_time,
        'parsing_start_time': parsing_start_time,
        'end_parsing_time': end_parsing_time,
        'file_count': file_count,
        'all_lines': all_lines,
        'malformed_lines': malformed_lines,
        'no_url_lines': no_url_lines,
        'skipped_lines': skipped_lines,
        'skipped_user_agent': skipped_user_agent,
      },
      site_caption_dict,
    )

if __name__ == '__main__':
  __resource_base = os.path.join(*os.path.split(__file__)[:-1])
  def getResource(name, encoding='utf-8'):
    return codecs.open(
      os.path.join(__resource_base, name),
      encoding=encoding,
    ).read()
  main()
APacheDEX-1.6.2/apachedex/apachedex.css0000644000175000017500000000300012130352723020710 0ustar vincentvincent00000000000000.stats th, .stats td { border: solid 1px #000; }
.stats th { text-align: center; }
.stats td { text-align: right; }
.stats th.text, .stats td.text { text-align: left; }
.stats td.no_hit { color: #ccc; }
.stats_erp5 td { border-style: dotted; }
.stats_erp5 tr.group_top td { border-top-style: solid; }
.stats_erp5 tr.group_bottom td { border-bottom-style: solid; }
.stats_erp5 td.group_left { border-left-style: solid; }
.stats_erp5 td.group_right { border-right-style: solid; }
.stats_erp5 .overall_right { border-right-width: .2em; }
.hidden_graph .positioner { position: absolute; left: 50%; }
.hidden_graph .container { display: none; position: absolute; left: -301px; background-color: #fff; border: 1px solid #000; }
.hidden_graph:hover .container { visibility: visible; }
.action { text-decoration: underline; color: blue; }
.hidden_graph .title { float: left; }
.hidden_graph .close { float: right; }
table.stats { border-collapse: collapse; }
.problem { background-color: #f00; color: white; }
.warning { background-color: #f80; color: white; }
h1 { background-color: #ccc; }
h2 { background-color: #eee; }
.axisLabels { color: rgb(84,84,84) !important; }
.flot-x-axis .tickLabel { text-align: center; }
.tooltip { position: absolute; display: none; padding: 0.1em; border: 1px solid #000; background-color: #fff; opacity: 0.80; }
.tooltip .x br { display: none; }
abbr { border-bottom: 1px dotted #000; cursor: help; }
APacheDEX-1.6.2/parallel_parse.sh0000755000175000017500000000207112201375326017666 0ustar vincentvincent00000000000000#!/bin/bash
usage() {
  echo "Usage:"
  echo " find [...] -print0 | $0 \\"
  echo " parallelism state_dir out_file command [arg1 [...]]"
  echo "Reads filenames to process from stdin, null-delimited."
  echo
  echo "Example: parsing any number of log files with up to 4"
  echo "processes in parallel with locally built pypy:"
  echo " $ mkdir state"
  echo " $ $0 4 state out.html /usr/local/bin/pypy \\"
  echo " bin/apachedex --period week"
}
if [ $# -lt 4 ]; then
  usage
  exit 1
fi
if [ "$1" = "-h" -o "$1" = "--help" ]; then
  usage
  exit 0
fi
PARALLELISM="$1"
shift
STATE_DIR="$1"
mkdir -p "$STATE_DIR" || exit $?
shift
OUT_FILE="$1"
shift
# XXX: any simpler way ?
xargs -0 -r -n 1 -P "$PARALLELISM" -I "@FILE@" -- "$SHELL" -c 'INFILE="$1";shift;STATE_DIR="$1";shift;echo -n .;exec "$@" -Q --format json --out "$STATE_DIR/$(sed s:/:@:g <<< "$INFILE").json" "$INFILE"' "$0" "@FILE@" "$STATE_DIR" "$@"
echo
# XXX: what if there are too many state files for a single execution ?
find "$STATE_DIR" -type f -print0 | xargs -0 -r "$@" --out "$OUT_FILE" --state-file
APacheDEX-1.6.2/README.rst0000644000175000017500000002306712135572501016040 0ustar vincentvincent00000000000000Compute APDEX from Apache-style logs.

Overview
========

Parses Apache-style logs and generates several statistics intended for a
website developer audience:

- APDEX (Application Performance inDEX, see http://www.apdex.org) ratio
  (plotted)

  Because you want to know how satisfied your users are.

- hit count (plotted)

  Because achieving 100% APDEX is easy when there is nobody around.

- HTTP status codes, with optional detailed output of the most frequent URLs
  per error status code, along with their most frequent referers

  Because you forgot to update a link to that conditionally-used browser
  compatibility javascript you renamed.

- Hottest pages (pages which use rendering time the most)

  Because you want to know where to invest time to get the highest user
  experience improvement.

- ERP5 sites: per-module statistics, with module and document views separated

  Because module and document types are not born equal in usage patterns.

Some parsing performance figures: on a 2.3GHz Core i5, apachedex achieves
97000 lines/s (pypy-c-jit-62994-bd32583a3f11-linux64) and 43000 lines/s
(CPython 2.7). Those were measured on a 3000000-hit logfile, with 3
--skip-base, 1 --erp5-base, 3 --base and --default set. --\*base values were
similar in simplicity to the ones provided in examples below.

What APacheDEX is not
=====================

APacheDEX does not produce website audience statistics like AWStats, Google
Analytics (etc) could do.

APacheDEX does not monitor website availability & resource usage like Zabbix,
Cacti, Ganglia, Nagios (etc) could do.

Requirements
============

Dependencies
------------

As such, apachedex has no strict dependencies outside of a standard python
2.7 installation. But generated output needs a few javascript files which
come from other projects:

- jquery.js
- jquery.flot.js
- jquery.flot.time.js (official flot plugin)
- jquery.flot.axislabels.js (third-party flot plugin)

If you installed apachedex (using an egg or with a distribution's package)
you should have them already. If you are running from repository, you need
to fetch them first::

  python setup.py deps

Also, apachedex can make use of backports.lzma
(http://pypi.python.org/pypi/backports.lzma/) if it's installed, to support
xz file compression.

Input
-----

All default "combined" log format fields are supported (more can easily be
added), plus %D.

Mandatory fields are (in any order) `%t`, `%r` (for request's URL), `%>s`,
`%{Referer}i`, `%D` (or `%T`). Just tell apachedex the value from your
apache log configuration (see `--logformat` argument documentation).
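
For reference, the default `--logformat` value corresponds to an Apache
configuration along these lines (a sketch: the "combined_duration" nickname
and the log path are arbitrary placeholders)::

  LogFormat "%h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %D" combined_duration
  CustomLog logs/access.log combined_duration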

Input files may be provided uncompressed or compressed in:

- bzip2
- gzip
- xz (if module backports.lzma is installed)

Input filename "-" is understood as stdin.

Output
------

The output is HTML + CSS + JS, so you need a web browser to read it.

Output filename "-" is understood as stdout.

Usage
=====

A few usage examples. See embedded help (`-h`/`--help`) for further options.

Most basic usage::

  apachedex --default website access.log

Generate stand-alone output (suitable for inclusion in a mail, for
example)::

  apachedex --default website --js-embed access.log --out attachment.html

A log file with requests for 2 websites for which individual stats are
desired, and hits outside those base urls are ignored::

  apachedex --base "/site1(/|$|\?)" "/site2(/|$|\?)"

A log file with a site section to ignore. Order does not matter::

  apachedex --skip-base "/ignored(/|$|\?)" --default website

A mix of both above examples. Order matters!::

  apachedex --skip-base "/site1/ignored(/|$|\?)" \
    --base "/site1(/|$|\?)" "/site2(/|$|\?)"

Matching non-ASCII urls works by using urlencoded strings::

  apachedex --base "/%E6%96%87%E5%AD%97%E5%8C%96%E3%81%91(/|$|\\?)" access.log

Naming websites so that report looks less intimidating, by interleaving
"+"-prefixed titles with regexes (title must be just before regex)::

  apachedex --default "Public website" --base "+Back office" \
    "/backoffice(/|$|\\?)" "+User access" "/secure(/|$|\\?)" access.log

Saving the result of an analysis for faster reuse::

  apachedex --default foo --format json --out save_state.json --period day \
    access.log

Although not required, it is strongly advised to provide the `--period`
argument, as mixing states saved with different periods (fixed or
auto-detected from data) gives hard-to-read results and can cause problems
if loaded data gets converted to a larger period.

Continuing a saved analysis, updating collected data::

  apachedex --default foo --format json --state-file save_state.json \
    --out save_state.json --period day access.2.log

Generating HTML output from two state files, aggregating their content
without parsing more logs::

  apachedex --default foo --state-file save_state.json save_state.2.json \
    --period day --out index.html
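
Converting log timestamps to a fixed UTC offset before days are split (a
sketch based on the `--to-timezone` option described in `--help`; the offset
is an arbitrary example, and "Continent/City" names are also accepted when
pytz is installed)::

  apachedex --default website --to-timezone -0700 access.log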

Configuration files
===================

Providing a filename prefixed by "@" puts the content of that file in place
of that argument, recursively. Each file is loaded relative to the
containing directory of referencing file, or current working directory for
command line.

- foo/dev.cfg::

    --error-detail
    @site.cfg
    --stats

- foo/site.cfg::

    --default Front-office
    # This is a comment
    --base "+Back office" "/back(/|$|\?)"
    # This is another comment
    --skip-base "/baz/ignored(/|$|\?)"
    --base +Something "/baz(/|$|\?)"

- command line::

    apachedex --skip-base "/ignored(/|$|\?)" @foo/dev.cfg --out index.html \
      access.log

This is equivalent to::

  apachedex --skip-base "/ignored(/|$|\?)" --error-detail \
    --default Front-office --base "+Back office" "/back(/|$|\?)" \
    --skip-base "/baz/ignored(/|$|\?)" --base +Something "/baz(/|$|\?)" \
    --stats --out index.html access.log

Portability note: the use of paths containing directory elements inside
configuration files is discouraged, as it's not portable. This may change
later (ex: deciding that import paths are URLs and applying their rules).

Periods
=======

When providing the `--period` argument, two related settings are affected:

- the period represented by each point in a graph (most important for the
  hit graph, as it represents the number of hits per such period)

- the period represented by each column in per-period tables (status codes
  per date, hits per day...)

Also, when `--period` is not provided, apachedex uses a threshold to tell
when to switch to the larger period. That threshold was chosen to correspond
to 200 graph points, which represents a varying number of table columns.

.. table :: Details of `--period` argument

  =========== ========== ========== ============== =========================
  --period    graph      table      to next period columns until next period
  =========== ========== ========== ============== =========================
  quarterhour minute     15 minutes 200 minutes    8 (3.3 hours)
  halfday     30 minutes 12 hours   100 hours      9 (4.1 days)
  day         hour       day        200 hours      9 (8.3 days)
  week        6 hours    week       1200 hours     8 (7.1 weeks)
  month       day        month      5000 hours     7 (~6.7 months)
  quarter     7 days     quarter    1400 days      16 (15.3 weeks)
  year        month      year       (n/a)          (infinity)
  =========== ========== ========== ============== =========================

The "7 days" periods used in `--period quarter` are not weeks strictly
speaking: a week starts on a Monday/Sunday, depending on the locale. "7
days" start on the first day of the year, for simplicity - and performance.
The "week" periods used for `--period week` are really weeks, although
starting on Monday independently from locale.

When there are no hits for more than a graph period, placeholders are
generated at 0 hit value (which is the reality) and 100% apdex (this is
arbitrary). Those placeholders only affect graphs, and do not affect
averages nor table content.

Because not all graph periods are actually equal in length (because of leap
seconds, DST, leap years, years containing a non-integer number of weeks),
some hit graph points are artificially corrected against these effects. Here
also, the correction affects only graphs, neither averages nor table
content. For example, on non-leap years, the last year's "7 days" period
lasts a single day. Plotted hit count is then multiplied by 7 (and 3.5 on
leap years).
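
For instance, reading the table above: with no `--period` and logs spanning
10 days (240 hours), apachedex starts at quarterhour, exceeds the
200-minute, 100-hour and 200-hour thresholds in turn, and settles on week,
whose own 1200-hour threshold is not reached.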

Performance
===========

For better performance...

- pipe decompressed files to apachedex instead of having apachedex
  decompress files itself::

    bzcat access.log.bz2 | apachedex [...] -

- when letting apachedex decide statistic granularity with multiple log
  files, provide earliest and latest log files first (whatever order) so
  apachedex can adapt its data structure to analysed time range before
  there is too much data::

    apachedex [...] access.log.1.gz access.log.99.gz access.log.2.gz \
      access.log.3.gz [...] access.98.gz

- parse log files in parallel processes, saving analysis output and
  aggregating them in the end::

    for LOG in access*.log; do
      apachedex "$@" --format json --out "$LOG.json" "$LOG" &
    done
    wait
    apachedex "$@" --out access.html --state-file access.*.json

  If you have bash and have an xargs implementation supporting `-P`, you
  may want to use `parallel_parse.sh` available in source distribution or
  from repository.

Notes
=====

Loading saved states generated with different sets of parameters is not
prevented, but can produce nonsense/unreadable results. Or it can save the
day if you do want to mix different parameters (ex: you have some logs
generated with %T, others with %D).

It is unclear how saved state format will evolve. Be prepared to have to
regenerate saved states when you upgrade APacheDEX.
APacheDEX-1.6.2/PKG-INFO0000644000175000017500000003121412323755130015437 0ustar vincentvincent00000000000000Metadata-Version: 1.1
Name: APacheDEX
Version: 1.6.2
Summary: Compute APDEX from Apache-style logs.
Home-page: http://git.erp5.org/gitweb/apachedex.git
Author: Vincent Pelletier
Author-email: vincent@nexedi.com
License: GPL 2+
Description: .. contents:: Compute APDEX from Apache-style logs.
Platform: any
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Topic :: System :: Logging
Classifier: Topic :: Text Processing :: Filters
Classifier: Topic :: Text Processing :: Markup :: HTML
APacheDEX-1.6.2/setup.py0000644000175000017500000000546012323754750016067 0ustar vincentvincent00000000000000from os.path import join, exists
from setuptools import setup, find_packages
import hashlib
import os
import sys

extra = {}
if sys.version_info >= (3, ):
  extra['use_2to3'] = True
  from urllib.request import urlretrieve
else:
  from urllib import urlretrieve

FLOT_SHA = 'aefe4e729b2d14efe6e8c0db359cb0e9aa6aae52'
FLOT_AXISLABELS_SHA = '80453cd7fb8a9cad084cf6b581034ada3339dbf8'
JQUERY_VERSION = '1.9.1'
JQUERY_UI_VERSION = '1.10.2'
DEPS = {
  'jquery.flot.js': (
    'http://raw.github.com/flot/flot/%s/jquery.flot.js' % FLOT_SHA,
    '7b599c575f19c33bf0d93a6bbac3af02',
  ),
  'jquery.flot.time.js': (
    'http://raw.github.com/flot/flot/%s/jquery.flot.time.js' % FLOT_SHA,
    'c0aec1608bf2fbb79f24d1905673e2c3',
  ),
  'jquery.flot.axislabels.js': (
    'http://raw.github.com/markrcote/flot-axislabels/%s/'
    'jquery.flot.axislabels.js' % FLOT_AXISLABELS_SHA,
    'a8526e0c1ed3b5cbc1a6b3ebb22bf334',
  ),
  'jquery.js': (
    'http://code.jquery.com/jquery-%s.min.js' % JQUERY_VERSION,
    '397754ba49e9e0cf4e7c190da78dda05',
  ),
  'jquery-ui.js': (
    'http://code.jquery.com/ui/%s/jquery-ui.min.js' % JQUERY_UI_VERSION,
    '3e6acb1e6426ef90d2e786a006a4ea28',
  ),
}
_file_dirname = os.path.dirname(__file__)

def download(url, filename, hexdigest):
  filename = join(_file_dirname, 'apachedex', filename)
  if not exists(filename):
    urlretrieve(url, filename)
  if hashlib.md5(open(filename, 'rb').read()).hexdigest() != hexdigest:
    raise EnvironmentError('Checksum mismatch downloading %r' % filename)

for filename, (url, hexdigest) in DEPS.items():
  download(url, filename, hexdigest)
# XXX: turn this into a setuptool command ?
if sys.argv[1:] == ['deps']:
  sys.exit(0)

description = open(join(_file_dirname, 'README.rst')).read()

setup(
  name='APacheDEX',
  version='1.6.2',
  description=next(x for x in description.splitlines() if x.strip()),
  long_description=".. contents::\n\n" + description,
  author='Vincent Pelletier',
  author_email='vincent@nexedi.com',
  url='http://git.erp5.org/gitweb/apachedex.git',
  license='GPL 2+',
  platforms=['any'],
  classifiers=[
    'Intended Audience :: Developers',
    'License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)',
    'Operating System :: OS Independent',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: Implementation :: PyPy',
    'Programming Language :: Python :: Implementation :: CPython',
    'Topic :: System :: Logging',
    'Topic :: Text Processing :: Filters',
    'Topic :: Text Processing :: Markup :: HTML',
  ],
  packages=find_packages(),
  entry_points = {
    'console_scripts': [
      'apachedex=apachedex:main',
    ],
  },
  package_data={
    'apachedex': list(DEPS.keys()) + ['apachedex.js', 'apachedex.css'],
  },
  zip_safe=True,
  **extra
)
APacheDEX-1.6.2/MANIFEST.in0000644000175000017500000000027012135564540016102 0ustar vincentvincent00000000000000include README.rst
include TODO
include COPYING
include apachedex/jquery*.js
include apachedex/apachedex.js
include apachedex/apachedex.css
include parallel_parse.sh
include stdeb.cfg
APacheDEX-1.6.2/TODO0000644000175000017500000000026012135566025015033 0ustar vincentvincent00000000000000- use some templating system instead of hardcoded html strings
- allow user to specify min & max dates
- move all N_* constants into command line arguments
- graph annotations
APacheDEX-1.6.2/COPYING0000644000175000017500000004310312127477154015406 0ustar vincentvincent00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. 
(Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. 
Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. APacheDEX-1.6.2/setup.cfg0000644000175000017500000000007312323755130016162 0ustar vincentvincent00000000000000[egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 APacheDEX-1.6.2/stdeb.cfg0000644000175000017500000000004412135564540016125 0ustar vincentvincent00000000000000[DEFAULT] XS-Python-Version: >= 2.7 APacheDEX-1.6.2/APacheDEX.egg-info/0000755000175000017500000000000012323755130017515 5ustar vincentvincent00000000000000APacheDEX-1.6.2/APacheDEX.egg-info/top_level.txt0000644000175000017500000000001212323755127022246 0ustar vincentvincent00000000000000apachedex APacheDEX-1.6.2/APacheDEX.egg-info/dependency_links.txt0000644000175000017500000000000112323755127023571 0ustar vincentvincent00000000000000 APacheDEX-1.6.2/APacheDEX.egg-info/SOURCES.txt0000644000175000017500000000103112323755130021374 0ustar vincentvincent00000000000000COPYING MANIFEST.in README.rst TODO parallel_parse.sh setup.py stdeb.cfg APacheDEX.egg-info/PKG-INFO APacheDEX.egg-info/SOURCES.txt APacheDEX.egg-info/dependency_links.txt APacheDEX.egg-info/entry_points.txt APacheDEX.egg-info/top_level.txt APacheDEX.egg-info/zip-safe apachedex/__init__.py apachedex/apachedex.css apachedex/apachedex.js apachedex/jquery-ui.js apachedex/jquery.flot.annotate.js apachedex/jquery.flot.axislabels.js apachedex/jquery.flot.js apachedex/jquery.flot.labels.js apachedex/jquery.flot.time.js apachedex/jquery.jsAPacheDEX-1.6.2/APacheDEX.egg-info/zip-safe0000644000175000017500000000000112133060516021141 0ustar vincentvincent00000000000000 APacheDEX-1.6.2/APacheDEX.egg-info/PKG-INFO0000644000175000017500000003121412323755127020621 0ustar vincentvincent00000000000000Metadata-Version: 1.1 Name: APacheDEX Version: 1.6.2 Summary: Compute APDEX from Apache-style logs. Home-page: http://git.erp5.org/gitweb/apachedex.git Author: Vincent Pelletier Author-email: vincent@nexedi.com License: GPL 2+ Description: .. contents:: Compute APDEX from Apache-style logs. 
Overview ======== Parses Apache-style logs and generates several statistics intended for a website developer audience: - APDEX (Application Performance inDEX, see http://www.apdex.org) ratio (plotted) Because you want to know how satisfied your users are. - hit count (plotted) Because achieving 100% APDEX is easy when there is nobody around. - HTTP status codes, with optional detailed output of the most frequent URLs per error status code, along with their most frequent referers Because your forgot to update a link to that conditionally-used browser compatibility javascript you renamed. - Hottest pages (pages which use rendering time the most) Because you want to know where to invest time to get highest user experience improvement. - ERP5 sites: per-module statistics, with module and document views separated Because module and document types are not born equal in usage patterns. Some parsing performance figures: On a 2.3Ghz Corei5, apachedex achieves 97000 lines/s ( pypy-c-jit-62994-bd32583a3f11-linux64) and 43000 lines/s (CPython 2.7). Those were measures on a 3000000-hits logfile, with 3 --skip-base, 1 --erp5-base, 3 --base and --default set. --\*base values were similar in simplicity to the ones provided in examples below. What APacheDEX is not ===================== APacheDEX does not produce website audience statistics like AWStats, Google Analytics (etc) could do. APacheDEX does not monitor website availability & resource usage like Zabbix, Cacti, Ganglia, Nagios (etc) could do. Requirements ============ Dependencies ------------ As such, apachedex has no strict dependencies outside of standard python 2.7 installation. But generated output needs a few javascript files which come from other projects: - jquery.js - jquery.flot.js - jquery.flot.time.js (official flot plugin) - jquery.flot.axislabels.js (third-party flot plugin) If you installed apachedex (using an egg or with a distribution's package) you should have them already. If you are running from repository, you need to fetch them first:: python setup.py deps Also, apachedex can make use of backports.lzma (http://pypi.python.org/pypi/backports.lzma/) if it's installed to support xz file compression. Input ----- All default "combined" log format fields are supported (more can easily be added), plus %D. Mandatory fields are (in any order) `%t`, `%r` (for request's URL), `%>s`, `%{Referer}i`, `%D`. Just tell apachedex the value from your apache log configuration (see `--logformat` argument documentation). Input files may be provided uncompressed or compressed in: - bzip - gzip2 - xz (if module backports.lzma is installed) Input filename "-" is understood as stdin. Output ------ The output is HTML + CSS + JS, so you need a web browser to read it. Output filename "-" is understood as stdout. Usage ===== A few usage examples. See embedded help (`-h`/`--help`) for further options. Most basic usage:: apachedex --default website access.log Generate stand-alone output (suitable for inclusion in a mail, for example):: apachedex --default website --js-embed access.log --out attachment.html A log file with requests for 2 websites for which individual stats are desired, and hits outside those base urls are ignored:: apachedex --base "/site1(/|$|\?)" "/site2(/|$|\?)" A log file with a site section to ignore. Order does not matter:: apachedex --skip-base "/ignored(/|$|\?)" --default website A mix of both above examples. 
Order matters !:: apachedex --skip-base "/site1/ignored(/|$|\?)" \ --base "/site1(/|$|\?)" "/site2(/|$|\?)" Matching non-ASCII urls works by using urlencoded strings:: apachedex --base "/%E6%96%87%E5%AD%97%E5%8C%96%E3%81%91(/|$|\\?)" access.log Naming websites so that report looks less intimidating, by interleaving "+"-prefixed titles with regexes (title must be just before regex):: apachedex --default "Public website" --base "+Back office" \ "/backoffice(/|$|\\?)" "+User access" "/secure(/|$|\\?)" access.log Saving the result of an analysis for faster reuse:: apachedex --default foo --format json --out save_state.json --period day \ access.log Although not required, it is strongly advised to provide `--period` argument, as mixing states saved with different periods (fixed or auto-detected from data) give hard-to-read results and can cause problems if loaded data gets converted to a larger period. Continuing a saved analysis, updating collected data:: apachedex --default foo --format json --state-file save_state.json \ --out save_state.json --period day access.2.log Generating HTML output from two state files, aggregating their content without parsing more logs:: apachedex --default foo --state-file save_state.json save_state.2.json \ --period day --out index.html Configuration files =================== Providing a filename prefixed by "@" puts the content of that file in place of that argument, recursively. Each file is loaded relative to the containing directory of referencing file, or current working directory for command line. - foo/dev.cfg:: --error-detail @site.cfg --stats - foo/site.cfg:: --default Front-office # This is a comment --prefix "+Back office" "/back(/|$|\?)" # This is another comment --skip-prefix "/baz/ignored(/|$|\?)" --prefix +Something "/baz(/|$|\?)" - command line:: apachedex --skip-base "/ignored(/|$|\?)" @foo/dev.cfg --out index.html \ access.log This is equivalent to:: apachedex --skip-base "/ignored(/|$|\?)" --error-detail \ --default Front-office --prefix "+Back office" "/back(/|$|\?)" \ --skip-prefix "/baz/ignored(/|$|\?)" --prefix +Something "/baz(/|$|\?)" \ --stats --out index.html access.log Portability note: the use of paths containing directory elements inside configuration files is discouraged, as it's not portable. This may change later (ex: deciding that import paths are URLs and applying their rules). Periods ======= When providing the `--period` argument, two related settings are affected: - the period represented by each point in a graph (most important for the hit graph, as it represents the number of hits per such period) - the period represented by each column in per-period tables (status codes per date, hits per day...) Also, when `--period` is not provided, apachedex uses a threshold to tell when to switch to the larger period. That period was chosen to correspond to 200 graph points, which represents a varying number of table columns. .. 
Configuration files
===================

Providing a filename prefixed by "@" puts the content of that file in place
of that argument, recursively. Each file is loaded relative to the directory
containing the referencing file, or to the current working directory for the
command line.

- foo/dev.cfg::

    --error-detail
    @site.cfg
    --stats

- foo/site.cfg::

    --default Front-office
    # This is a comment
    --prefix "+Back office" "/back(/|$|\?)"
    # This is another comment
    --skip-prefix "/baz/ignored(/|$|\?)" --prefix +Something "/baz(/|$|\?)"

- command line::

    apachedex --skip-base "/ignored(/|$|\?)" @foo/dev.cfg --out index.html \
      access.log

This is equivalent to::

  apachedex --skip-base "/ignored(/|$|\?)" --error-detail \
    --default Front-office --prefix "+Back office" "/back(/|$|\?)" \
    --skip-prefix "/baz/ignored(/|$|\?)" --prefix +Something "/baz(/|$|\?)" \
    --stats --out index.html access.log

Portability note: the use of paths containing directory elements inside
configuration files is discouraged, as it's not portable. This may change
later (ex: deciding that import paths are URLs and applying their rules).

Periods
=======

When providing the `--period` argument, two related settings are affected:

- the period represented by each point in a graph (most important for the
  hit graph, as it represents the number of hits per such period)

- the period represented by each column in per-period tables (status codes
  per date, hits per day...)

Also, when `--period` is not provided, apachedex uses a threshold to tell
when to switch to the larger period. That threshold was chosen to correspond
to 200 graph points, which represents a varying number of table columns.

.. table:: Details of `--period` argument

  =========== ========== ========== ============== =========================
  --period    graph      table      to next period columns until next period
  =========== ========== ========== ============== =========================
  quarterhour minute     15 minutes 200 minutes    8 (3.3 hours)
  halfday     30 minutes 12 hours   100 hours      9 (4.1 days)
  day         hour       day        200 hours      9 (8.3 days)
  week        6 hours    week       1200 hours     8 (7.1 weeks)
  month       day        month      5000 hours     7 (~6.7 months)
  quarter     7 days     quarter    1400 days      16 (15.3 weeks)
  year        month      year       (n/a)          (infinity)
  =========== ========== ========== ============== =========================

The "7 days" periods used in `--period quarter` are not weeks strictly
speaking: a week starts on Monday or Sunday, depending on the locale,
whereas "7 days" periods start on the first day of the year, for simplicity
- and performance. The "week" used for `--period week` does mean real weeks,
although they start on Monday independently of the locale.

When there are no hits for more than a graph period, placeholders are
generated at 0 hit value (which is the reality) and 100% apdex (this is
arbitrary). Those placeholders only affect graphs, and do not affect
averages nor table content.

Because not all graph periods are actually equal in length (because of leap
seconds, DST, leap years, and years containing a non-integer number of
weeks), some hit graph points are artificially corrected against these
effects. Here also, the correction only affects graphs, neither averages nor
table content. For example, on non-leap years, the year's last "7 days"
period lasts a single day. The plotted hit count is then multiplied by 7
(and by 3.5 on leap years).

Performance
===========

For better performance...

- pipe decompressed files to apachedex instead of having apachedex
  decompress files itself::

    bzcat access.log.bz2 | apachedex [...] -

- when letting apachedex decide statistic granularity with multiple log
  files, provide the earliest and latest log files first (in whatever order)
  so apachedex can adapt its data structure to the analysed time range
  before there is too much data::

    apachedex [...] access.log.1.gz access.log.99.gz access.log.2.gz \
      access.log.3.gz [...] access.log.98.gz

- parse log files in parallel processes, saving analysis output and
  aggregating them in the end::

    for LOG in access*.log; do
      apachedex "$@" --format json --out "$LOG.json" "$LOG" &
    done
    wait
    apachedex "$@" --out access.html --state-file access.*.json

If you have bash and an xargs implementation supporting `-P`, you may want
to use `parallel_parse.sh`, available in the source distribution or from the
repository.
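For illustration, roughly the same parallel workflow can also be expressed
with xargs. This is a sketch assuming GNU xargs (for `-P`/`-I`) and the
coreutils `nproc` command; file names and the `--default` title are
placeholders::

  # Parse each log in its own process, up to one per CPU core...
  printf '%s\n' access*.log | xargs -P "$(nproc)" -I {} \
    apachedex --default website --format json --out {}.json {}
  # ...then aggregate the saved states into a single HTML report.
  apachedex --default website --out access.html --state-file access*.log.json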
Notes
=====

Loading saved states generated with different sets of parameters is not
prevented, but can produce nonsense/unreadable results. Or it can save the
day if you do want to mix different parameters (ex: you have some logs
generated with %T, others with %D).

It is unclear how the saved state format will evolve. Be prepared to have to
regenerate saved states when you upgrade APacheDEX.

Platform: any
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Topic :: System :: Logging
Classifier: Topic :: Text Processing :: Filters
Classifier: Topic :: Text Processing :: Markup :: HTML
APacheDEX-1.6.2/APacheDEX.egg-info/entry_points.txt0000644000175000017500000000005612323755127023022 0ustar vincentvincent00000000000000[console_scripts]
apachedex = apachedex:main