path: root/logparse.py
Diffstat (limited to 'logparse.py')
-rwxr-xr-x  logparse.py  226
1 file changed, 124 insertions, 102 deletions
diff --git a/logparse.py b/logparse.py
index 9d36278..4986dd1 100755
--- a/logparse.py
+++ b/logparse.py
@@ -4,6 +4,7 @@ import os
import sys
import re
from glob import glob
+import pickle
import argparse
from math import ceil
import hashlib
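The parsing below relies on the compiled named-group regex patt (truncated in the next hunk header), which captures ipaddress, date, time, path, status, and now size. A simplified sketch of that style of pattern against an invented access-log line; both the pattern and the sample line are illustrative only, not the script's actual values:

import re

# Illustrative pattern only; the script's real patt captures the same kinds
# of named groups from the full combined-log-format line.
sample_patt = (r'(?P<ipaddress>\d{1,3}(?:\.\d{1,3}){3}) \S+ \S+ '
               r'\[(?P<date>[^:]+):(?P<time>\S+) [^\]]+\] '
               r'"GET (?P<path>\S+) [^"]*" (?P<status>\d{3}) (?P<size>\d+|-)')
line = ('203.0.113.7 - - [15/Sep/2019:12:00:01 +0000] '
        '"GET /astroconda/linux-64/numpy-1.17.0-py37_0.tar.bz2 HTTP/1.1" 200 3145728')
m = re.match(sample_patt, line)
print(m.group('status'), m.group('size'))   # -> 200 3145728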
@@ -32,7 +33,7 @@ patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<da
logpattern = re.compile(patt)
-class logData():
+class LogData():
columns = {
'ipaddress': {},
@@ -41,17 +42,37 @@ class logData():
'time': {},
'path': {},
'status': {},
+ 'size': {},
'name': {}, # derived
}
def __init__(self,
+ dataset_name,
gethostnames=False,
ignore_hosts=[]):
+ '''dataset is a dict
+ dataframe - pandas dataframe containing digested log data
+ file_hashes - MD5 hashes of each file that was read to compose the dataframe'''
+ self.dataset_name = dataset_name
+ self.dataset = None
+ self.data = None
self.digest_path = 'digests'
self.gethostnames = gethostnames
self.hostnames = {}
self.ignore_hosts = ignore_hosts
+ try:
+ print('reading dataset...')
+ with open(self.dataset_name, 'rb') as f:
+ self.dataset = pickle.load(f)
+        except (FileNotFoundError, EOFError):
+            print(f'{self.dataset_name} not found. Creating empty dataset.')
+            # Touch the file so it exists on disk, then start from an empty frame.
+            open(self.dataset_name, 'a').close()
+            self.dataset = {'dataframe': pd.DataFrame(self.columns),
+                            'file_hashes': []}
+ self.data = self.dataset['dataframe']
+ self.hashes = self.dataset['file_hashes']
+
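The constructor's load-or-create behaviour amounts to the pattern sketched below; the helper name and the example file name are invented for illustration:

import pickle
import pandas as pd

def load_or_create_dataset(path, columns):
    # Hypothetical helper mirroring the constructor's try/except above.
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError):
        # Nothing usable on disk yet: start with an empty frame and no hashes.
        return {'dataframe': pd.DataFrame(columns), 'file_hashes': []}

dataset = load_or_create_dataset('dataset_example.pkl', {'ipaddress': {}, 'status': {}})
print(dataset['dataframe'].columns.tolist())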
def poll_hostnames(self):
if ipaddress not in self.hostnames.keys():
try:
@@ -79,7 +100,6 @@ class logData():
except(TypeError):
pass
-
try:
match = logpattern.match(line)
print(f'logpattern.match(line): {match}')
@@ -100,8 +120,8 @@ class logData():
tarball = re.sub(patt0, '', path)
namematch = patt1.match(tarball)
name = namematch.group('simplename')
-
status = match.group('status')
+ size = match.group('size')
hostname = ''
df = df.append({'ipaddress':ipaddress,
'hostname':hostname,
@@ -109,6 +129,7 @@ class logData():
'time':time,
'path':path,
'status':status,
+ 'size':size,
'name':name},
ignore_index=True)
except(AttributeError):
@@ -123,22 +144,13 @@ class logData():
and parses each that has not already been ingested.'''
# Create data frame for receiving all log data.
- locframe = pd.DataFrame(self.columns)
-
- # Track which files have been parsed by storing the MD5 hash of each
- # once it's been read.
- pfile = 'parsed_files.dat'
- if not os.path.exists(pfile):
- open(pfile, 'a').close()
- with open(pfile, 'r') as f:
- already_parsed = f.read().split()
- parsed = open(pfile, 'a')
+ newdata = pd.DataFrame(self.columns)
for log in sorted(logs):
# Compute MD5 hash of file and compare to list of files that
# have already been parsed. If new, parse, if not, skip.
hashval = md5(log)
- if hashval in already_parsed:
+ if hashval in self.hashes:
print(f'File {log} already parsed.')
continue
df = pd.DataFrame(self.columns)
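The md5() helper called here is defined elsewhere in the module and is not part of this diff; a chunked implementation along these lines would produce the per-file hash values stored in file_hashes (the block size and exact signature are assumptions):

import hashlib

def md5(path, blocksize=65536):
    # Hypothetical stand-in for the helper used above: hash the file in
    # chunks so large (or .gz) logs need not be read into memory at once.
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            h.update(chunk)
    return h.hexdigest()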
@@ -153,34 +165,58 @@ class logData():
with open(log, 'r') as f:
df = self.process_lines(f)
print(f'df shape = {df.shape}')
- locframe = locframe.append(df, ignore_index=True)
- print(locframe.shape)
- parsed.write(f'{hashval}\n')
-
- parsed.close()
- return(locframe)
-
-
-
-def filter_pkgs(df):
- '''Filter dataframe df down to just the rows the represent
- successful (HTTP 200) conda package (.bz2 files) downloads.'''
- print(df)
- inlen = len(df)
- out = df.loc[df['path'].str.contains('bz2')]
- out = out.loc[(out['status'] == '200') | (out['status'] == '302')]
- outlen = len(out)
- print(f'{inlen-outlen} rows removed to leave conda txns only')
- return(out)
-
+ newdata = newdata.append(df, ignore_index=True)
+ print(newdata.shape)
+ self.hashes.append(hashval)
+
+    # If any new log files were read, filter down to only conda package
+    # downloads, then sort by date and drop duplicates.
+ if len(newdata.index) != 0:
+ newdata = self.filter_pkgs(newdata)
+ newdata = newdata.sort_values(by='date')
+ newdata = newdata.drop_duplicates()
+ # Normalize any 'conda-dev' channel names to 'astroconda-dev'
+ newdata = newdata.replace('/conda-dev', '/astroconda-dev', regex=True)
+ # Add newdata to (potentially empty) existing data
+ self.data = self.data.append(newdata, ignore_index=True)
+ self.dataset['dataframe'] = self.data
+
+ def filter_pkgs(self, df):
+        '''Filter dataframe df down to just the rows that represent
+        successful (HTTP 200 or 302) conda package (.bz2 file) downloads.'''
+ inlen = len(df)
+ out = df.loc[df['path'].str.contains('bz2')]
+ out = out.loc[(out['status'] == '200') | (out['status'] == '302')]
+ outlen = len(out)
+ print(f'{inlen-outlen} rows removed to leave conda txns only')
+ return(out)
+
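A small, self-contained illustration of the same .bz2/status filtering, using two invented rows:

import pandas as pd

toy = pd.DataFrame({'path': ['/astroconda/linux-64/numpy-1.17.0-py37_0.tar.bz2',
                             '/favicon.ico'],
                    'status': ['200', '404']})
pkgs = toy.loc[toy['path'].str.contains('bz2')]
pkgs = pkgs.loc[(pkgs['status'] == '200') | (pkgs['status'] == '302')]
print(len(pkgs))   # -> 1: only the successful package download survives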
+ def write_dataset(self, dataset_name=None):
+        '''Serialize the working dataset and write it to disk, using the
+        optional filename if one is provided.
+
+ Parameters
+ ----------
+ dataset_name : string
+ Optional name to use for file when writing working dataset to disk.
+ If not provided, the current name of the working dataset file will
+ be used.'''
+ if dataset_name:
+ dsname = dataset_name
+ else:
+ dsname = self.dataset_name
+        with open(dsname, 'wb') as f:
+            pickle.dump(self.dataset, f)
def main():
- # TODO: Allow specification of a digested data file with fallback to a default.
ap = argparse.ArgumentParser(
prog='logparse.py',
description='Parse and digest apache/nginx access logs in either'
' raw or .gz format.')
+ ap.add_argument('dataset_name', type=str,
+ help='Name of dataset file. If file does not exist and'
+ ' log data file names are provided for parsing, this '
+ 'file will be created.')
ap.add_argument('--config',
'-c',
help='Configuration file used to adjust behavior of the '
@@ -195,7 +231,9 @@ def main():
'-w',
help='Restrict examination of data to the window of dates'
' provided.\n'
- ' Format: YYYY.MM.DD-YYYY.MM.DD')
+                    ' Format: YYYY.MM.DD-YYYY.MM.DD.'
+                    ' Omitting a date window operates on all data contained'
+                    ' within the given dataset.')
ap.add_argument('--ignorehosts',
'-i',
help='IP addresses of hosts to ignore when parsing logs.'
@@ -204,6 +242,9 @@ def main():
nargs='+')
args = ap.parse_args()
+ # Dataset filename
+ dataset_name = args.dataset_name
+
with open(args.config, 'r') as f:
config = yaml.safe_load(f)
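The configuration file is YAML; only the infrastructure_hosts key is visible in this diff, so the minimal config and command line below are assumptions for illustration:

import yaml

cfg_text = '''
infrastructure_hosts:        # hosts whose downloads are tallied separately
  - 192.0.2.10
  - 192.0.2.11
'''
config = yaml.safe_load(cfg_text)
print(config['infrastructure_hosts'])

# Hypothetical invocation (file names made up):
#   python logparse.py dataset.pkl -c config.yaml -w 2019.09.15-2019.09.21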
@@ -218,58 +259,37 @@ def main():
else:
files.append(expanded)
except(TypeError):
+ print('No log files provided.')
+ print(f'Importing existing dataset {dataset_name}.')
pass
inf_hosts = config['infrastructure_hosts']
num_inf_hosts = len(inf_hosts)
- # Read in any pre-existing parsed data.
- datfile = 'dataframe.dat'
- if not os.path.exists(datfile):
- open(datfile, 'a').close()
-
- with open(datfile, 'r') as f:
- try:
- data = pd.read_pickle(datfile)
- except:
- data = pd.DataFrame(logData.columns)
-
# TODO: Should host filtering take place here?
# It leaves a disconnect between the pickled data which _may_ have
# been culled and the actual data being referenced.
- logproc = logData(ignore_hosts=args.ignorehosts)
- newdata = logproc.read_logs(files)
-
- # If any new log files were read, filter down to only conda package downloads
- # Then sort by date.
- if len(newdata.index) != 0:
- newdata = filter_pkgs(newdata)
- newdata = newdata.sort_values(by='date')
- # Add newdata to existing data (potentially empty)
- data = data.append(newdata, ignore_index=True)
- print('.0')
-
- # Remove any duplicate rows in data:
- data = data.drop_duplicates()
- print('.2')
-
- # Normalize all conda-dev channel names to astroconda-dev
- data = data.replace('/conda-dev', '/astroconda-dev', regex=True)
- print(data)
- print('.3')
-
- # Dump data to disk for use during subsequent runs.
- data.to_pickle(datfile)
+ logproc = LogData(dataset_name, ignore_hosts=args.ignorehosts)
+ logproc.read_logs(files)
+ print('writing (potentially updated) dataset')
+ logproc.write_dataset()
+
+ # Filtering and analysis begins here
+ data = logproc.data
print(f'num full data rows = {len(data.index)}')
# Filter out a particular time period for examination
- window_start = pd.to_datetime('2019-09-15')
- window_end = pd.to_datetime('2019-09-21')
- print(f'Filtering based on window {window_start} - {window_end}.')
- data = data[pd.to_datetime(data['date']) >= window_start]
- data = data[pd.to_datetime(data['date']) <= window_end]
- print(f'num windowed data rows = {len(data.index)}')
+ # Set limits on a time period to examine
+ if args.window:
+ start = args.window.split('-')[0].replace('.', '-')
+ end = args.window.split('-')[1].replace('.', '-')
+ window_start = pd.to_datetime(start)
+ window_end = pd.to_datetime(end)
+ print(f'Filtering based on window {window_start} - {window_end}.')
+ data = data[pd.to_datetime(data['date']) >= window_start]
+ data = data[pd.to_datetime(data['date']) <= window_end]
+ print(f'num windowed data rows = {len(data.index)}')
all_unique_hosts = list(set(data['ipaddress']))
#for host in all_unique_hosts:
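The new --window handling reduces to date parsing plus a boolean mask; a toy version with invented dates:

import pandas as pd

window = '2019.09.15-2019.09.21'          # format the --window flag expects
start_str, end_str = window.split('-')
window_start = pd.to_datetime(start_str.replace('.', '-'))
window_end = pd.to_datetime(end_str.replace('.', '-'))

toy = pd.DataFrame({'date': ['2019-09-14', '2019-09-18', '2019-09-30']})
dates = pd.to_datetime(toy['date'])
print(toy[(dates >= window_start) & (dates <= window_end)])   # keeps only 2019-09-18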
@@ -367,16 +387,13 @@ def main():
num_onsite_hosts = len(set(onsite['ipaddress']))
print(f'num unique on-site hosts: {num_onsite_hosts}')
+ infra = pkgs[pkgs['ipaddress'].str.contains('|'.join(inf_hosts))]
+
# Fraction of downloads to off-site hosts
# Totals of unique software titles
# i.e. name without version, hash, py or build iteration values
# Extract simple package titles from 'path' column of data frame.
- #names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
- #repl = lambda m: m.group('simplename')
- #names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
- # repl,
- # regex=True))
names = list(pkgs['name'])
unique_names = list(set(names))
name_statsums = []
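The 'name' values used below are derived in process_lines via patt1.match(...).group('simplename'); patt1 itself is not shown in this diff, but the commented-out pattern removed above suggests a derivation roughly like this (the tarball name is invented):

import re

tarball = 'astropy-4.0-py37h0123abc_0.tar.bz2'      # made-up file name
m = re.match(r'(?P<simplename>.*)-.*-.*\.tar\.bz2$', tarball)
print(m.group('simplename'))                        # -> astropy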
@@ -384,12 +401,16 @@ def main():
statsum = {}
statsum['name'] = name
statsum['total'] = names.count(name)
-
# Sum on- and off-site transactions for each package name
+ # 'on-site' means transactions to non-infrastructure hosts.
name_txns = pkgs[pkgs['name'] == name]
on_txns = name_txns[name_txns['ipaddress'].str.contains(
'|'.join(int_host_patterns), regex=True)]
+ # Filter out hosts designated as infrastructure hosts in config file.
+ on_txns = on_txns[~on_txns['ipaddress'].str.contains(
+ '|'.join(inf_hosts))]
+
num_onsite_txns = len(on_txns.index)
statsum['onsite'] = num_onsite_txns
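The on-site/off-site/infrastructure split relies on joining host patterns into a single regex for str.contains; a toy version with documentation-range addresses standing in for the real config values:

import pandas as pd

int_host_patterns = [r'^192\.0\.2\.', r'^198\.51\.100\.']   # assumed on-site ranges
inf_hosts = ['192.0.2.10']                                  # assumed infrastructure host

txns = pd.DataFrame({'ipaddress': ['192.0.2.10', '198.51.100.7', '203.0.113.9']})
on = txns[txns['ipaddress'].str.contains('|'.join(int_host_patterns), regex=True)]
# Exclude infrastructure hosts so 'onsite' counts only ordinary user machines.
on = on[~on['ipaddress'].str.contains('|'.join(inf_hosts))]
print(len(on))   # -> 1 (only 198.51.100.7)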
@@ -398,38 +419,39 @@ def main():
num_offsite_txns = len(off_txns.index)
statsum['offsite'] = num_offsite_txns
+ infra_txns = name_txns[name_txns['ipaddress'].str.contains(
+ '|'.join(inf_hosts))]
+ num_infra_txns = len(infra_txns.index)
+ statsum['infra'] = num_infra_txns
+
name_statsums.append(statsum)
name_statsums.sort(key=lambda x: x['total'], reverse=True)
- y = []
- y = [i['total'] for i in name_statsums]
- y_onsite = [i['onsite'] for i in name_statsums]
- print(f'y_onsite: {y_onsite}')
- y_offsite = [i['offsite'] for i in name_statsums]
- print(f'y_offisite: {y_offsite}')
- x = [i['name'] for i in name_statsums]
- print('name_statsums')
- print(name_statsums)
-
- # Calculate fractions of properties of each unique package title
- # for stacked bar plot purposes.
-
+ x_onsite = [i['onsite'] for i in name_statsums]
+ x_infra = [i['infra'] for i in name_statsums]
+ x_offsite = [i['offsite'] for i in name_statsums]
+ y = [i['name'] for i in name_statsums]
print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
    # For each unique software name, sum the number of transactions from internal hosts.
- width = 5.0
fig, axes = plt.subplots(figsize=(10,25))
plt.grid(which='major', axis='x')
- plt.title(f'{start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
+ plt.title(f'{chan} -- {start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
plt.xlabel('Number of downloads')
- axes.set_ylim(0,len(name_statsums))
+ axes.set_ylim(-1,len(name_statsums))
plt.gca().invert_yaxis()
width = 1
- barlist = axes.barh(x, y_onsite, width, edgecolor='black')
- barlist = axes.barh(x, y_offsite, width, left=y_onsite, edgecolor='black')
- #for id in iraf_ids:
- # barlist[id].set_color('grey')
+ from operator import add
+ # Horizontal stacked bar chart with off-site, on-site, and infrastructure transactions.
+ barlist = axes.barh(y, x_offsite, width, edgecolor='white', color='tab:blue')
+ barlist = axes.barh(y, x_onsite, width, left=x_offsite, edgecolor='white', color='tab:green')
+ # Sum bars up to this point to correctly stack the subsequent one(s).
+ offset = list(map(add, x_offsite, x_onsite))
+ barlist = axes.barh(y, x_infra, width, left=offset, edgecolor='white', color='tab:olive')
+
+ axes.legend(['off-site', 'on-site', 'infrastructure'])
+
plt.tight_layout()
plt.savefig(f'{chan}.png')
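The stacked horizontal bar logic at the end of main() generalizes to any three aligned series; a self-contained sketch with invented counts, saved to a throwaway PNG:

import matplotlib
matplotlib.use('Agg')                 # render without a display
import matplotlib.pyplot as plt
from operator import add

names = ['numpy', 'astropy', 'scipy']           # made-up package names
offsite, onsite, infra = [5, 3, 2], [2, 4, 1], [1, 0, 2]

fig, ax = plt.subplots()
ax.barh(names, offsite, 1, edgecolor='white', color='tab:blue')
ax.barh(names, onsite, 1, left=offsite, edgecolor='white', color='tab:green')
# The third layer starts where the first two end.
ax.barh(names, infra, 1, left=list(map(add, offsite, onsite)),
        edgecolor='white', color='tab:olive')
ax.legend(['off-site', 'on-site', 'infrastructure'])
fig.savefig('example_stack.png')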