From 30a0b3c1b5e9dfe7bd68a001f174cb5ab470ba1c Mon Sep 17 00:00:00 2001 From: Matt Rendina Date: Wed, 28 Aug 2019 13:06:08 -0400 Subject: Several updates. Added hashing support of input files. Generalized logfile regex. Simplified filtering of packages without user agent. Additional fraction metrics output. --- logparse.py | 248 ++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 173 insertions(+), 75 deletions(-) diff --git a/logparse.py b/logparse.py index 4b8edb0..baee14d 100755 --- a/logparse.py +++ b/logparse.py @@ -6,6 +6,7 @@ import re from glob import glob import argparse from math import ceil +import hashlib import gzip import socket import pandas as pd @@ -14,39 +15,58 @@ import matplotlib.pyplot as plt import matplotlib.dates as mdates from dateutil import parser as dpar from collections import OrderedDict +import yaml + + +def md5(fname): + hash_md5 = hashlib.md5() + with open(fname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + # regex pattern to extract key values from each line of an apache/nginx access log # Accommodate PUTs as well as second URLs (normally "-") -patt = '(?P\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P