From 68c4adcc9b220947461128473e0c12c2049d7801 Mon Sep 17 00:00:00 2001 From: Matt Rendina Date: Thu, 25 Jul 2019 15:46:39 -0400 Subject: Initial commit --- logparse.py | 248 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100755 logparse.py diff --git a/logparse.py b/logparse.py new file mode 100755 index 0000000..4a21054 --- /dev/null +++ b/logparse.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 + +import re +import gzip +import socket +import pandas as pd +import datetime as dt +import matplotlib.pyplot as plt +import matplotlib.dates as mdates + +# Notes +# df.to_pickle(filename) for serializing a pandas data frame to disk. +# pd.read_pickle(filename) to get it back. + +# regex pattern to extract key values from each line of an apache/nginx access log +# Accommodate PUTs as well as second URLs (normally "-") +patt = '(?P\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) - - \\[(?P\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P