#! /usr/bin/python

# logsummize.py
#
# Purpose: collects size and count statistics about common "static"
# files (like 'jpg','png','gif','css','js') from an Apache access log
# in combined format
#
# Usage: the script has to be passed the name of the log file and,
# optionally, a date to restrict the search to. The date is treated as a
# string, so pass it as "04/Apr/2009"
#
# Example: logsummize.py access.log 04/Apr/2009
#
# Author: Claudio Cicali <claudio@cicali.org>
# Date: 8/4/2009
#
# Copyright 2009 Claudio Cicali
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import sys, re

# File extensions counted as "static" content, in the order they are reported.
STATIC_EXTENSIONS = ('jpg', 'png', 'gif', 'css', 'js')

# Captures the text between the first "[" and the following ":" of a line,
# e.g. "04/Apr/2009" out of "[04/Apr/2009:10:00:00 +0200]".
DATE_PATTERN = re.compile(r'\[(.*?):')


def build_request_pattern(extensions):
    """Return a compiled regex matching GET requests for *extensions*.

    Match groups: 1 = requested path, 2 = extension, 3 = HTTP status,
    4 = response size in bytes.
    """
    # Escape each extension so it is matched literally inside the pattern.
    alternatives = '|'.join(re.escape(ext) for ext in extensions)
    # Matched fragment looks like: "GET /img/btnbg-2.png HTTP/1.1" 200 1234
    return re.compile(
        r'"GET (.*\.(%s)) HTTP/1\.\d" (\d{3}) (\d+)' % alternatives)


def summarize_log(lines, extensions=STATIC_EXTENSIONS, datelimit=''):
    """Accumulate hit count and byte size per extension.

    lines      -- iterable of access log lines (an open file works)
    extensions -- file extensions to collect statistics for
    datelimit  -- when not empty, only lines whose date string equals it
                  (e.g. "04/Apr/2009") are counted

    Only requests answered with status 200 are counted. Returns a dict
    mapping each extension to {'size': <bytes>, 'count': <hits>}.
    """
    totals = dict((ext, {'size': 0, 'count': 0}) for ext in extensions)
    if not totals:
        # Nothing to look for; an empty alternation would match bogus paths.
        return totals

    # Compiled once here instead of once per log line.
    request_pattern = build_request_pattern(extensions)

    for line in lines:
        if datelimit:
            stamp = DATE_PATTERN.search(line)
            # A line without a recognizable date is not filtered out;
            # only a date that differs from the limit skips the line.
            if stamp is not None and stamp.group(1) != datelimit:
                continue

        request = request_pattern.search(line)
        if request is None or request.group(3) != "200":
            continue

        entry = totals[request.group(2)]
        entry['count'] += 1
        entry['size'] += int(request.group(4))

    return totals


def format_report(totals, extensions=STATIC_EXTENSIONS):
    """Return the report as a list of text lines.

    One line per extension, in the order given by *extensions*, followed
    by a grand-total line (which starts with a newline so that it is
    separated from the rows by a blank line when printed).
    """
    report = []
    total_size = total_items = 0
    for ext in extensions:
        data = totals[ext]
        # Floor division keeps whole kilobytes on Python 2 and Python 3.
        report.append(
            "%d %s: %dKB" % (data['count'], ext, data['size'] // 1024))
        total_size += data['size']
        total_items += data['count']

    report.append(
        "\n%s items, %dKB total" % (total_items, total_size // 1024))
    return report


def main(argv):
    """Summarize the log named in argv[1], optionally limited to date argv[2].

    Prints the report to standard output. Exits with status -1 when the
    log file argument is missing or the file cannot be opened.
    """
    try:
        log = open(argv[1], 'r')
    except (IndexError, EnvironmentError):
        # IndexError: no file name given; EnvironmentError: open() failed.
        print("Log file missing or unreadable (first script parameter)")
        sys.exit(-1)

    datelimit = ''
    if len(argv) > 2:
        datelimit = argv[2]

    try:
        totals = summarize_log(log, STATIC_EXTENSIONS, datelimit)
    finally:
        # Release the file handle even if parsing fails.
        log.close()

    # print() gets a single argument so it behaves the same whether it is
    # parsed as a Python 2 statement or a Python 3 function call.
    print("\n".join(format_report(totals, STATIC_EXTENSIONS)))


if __name__ == "__main__":
    main(sys.argv)

