From 9db19630aaf79b5986a4eb69af97c11ddf196fc5 Mon Sep 17 00:00:00 2001 From: Matt Rendina Date: Tue, 8 Oct 2019 11:50:25 -0400 Subject: Consolidate file hashes and dataframe into one dataset; Accept date window on command line; adjust bar graph for consistency. (#2) --- logparse.py | 226 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 124 insertions(+), 102 deletions(-) diff --git a/logparse.py b/logparse.py index 9d36278..4986dd1 100755 --- a/logparse.py +++ b/logparse.py @@ -4,6 +4,7 @@ import os import sys import re from glob import glob +import pickle import argparse from math import ceil import hashlib @@ -32,7 +33,7 @@ patt = '(?P\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P= window_start] - data = data[pd.to_datetime(data['date']) <= window_end] - print(f'num windowed data rows = {len(data.index)}') + # Set limits on a time period to examine + if args.window: + start = args.window.split('-')[0].replace('.', '-') + end = args.window.split('-')[1].replace('.', '-') + window_start = pd.to_datetime(start) + window_end = pd.to_datetime(end) + print(f'Filtering based on window {window_start} - {window_end}.') + data = data[pd.to_datetime(data['date']) >= window_start] + data = data[pd.to_datetime(data['date']) <= window_end] + print(f'num windowed data rows = {len(data.index)}') all_unique_hosts = list(set(data['ipaddress'])) #for host in all_unique_hosts: @@ -367,16 +387,13 @@ def main(): num_onsite_hosts = len(set(onsite['ipaddress'])) print(f'num unique on-site hosts: {num_onsite_hosts}') + infra = pkgs[pkgs['ipaddress'].str.contains('|'.join(inf_hosts))] + # Fraction of downloads to off-site hosts # Totals of unique software titles # i.e. name without version, hash, py or build iteration values # Extract simple package titles from 'path' column of data frame. - #names = pkgs['path'].str.replace('/.*/.*/', '', regex=True) - #repl = lambda m: m.group('simplename') - #names = list(names.str.replace('(?P.*)-.*-.*\.tar\.bz2$', - # repl, - # regex=True)) names = list(pkgs['name']) unique_names = list(set(names)) name_statsums = [] @@ -384,12 +401,16 @@ def main(): statsum = {} statsum['name'] = name statsum['total'] = names.count(name) - # Sum on- and off-site transactions for each package name + # 'on-site' means transactions to non-infrastructure hosts. name_txns = pkgs[pkgs['name'] == name] on_txns = name_txns[name_txns['ipaddress'].str.contains( '|'.join(int_host_patterns), regex=True)] + # Filter out hosts designated as infrastructure hosts in config file. + on_txns = on_txns[~on_txns['ipaddress'].str.contains( + '|'.join(inf_hosts))] + num_onsite_txns = len(on_txns.index) statsum['onsite'] = num_onsite_txns @@ -398,38 +419,39 @@ def main(): num_offsite_txns = len(off_txns.index) statsum['offsite'] = num_offsite_txns + infra_txns = name_txns[name_txns['ipaddress'].str.contains( + '|'.join(inf_hosts))] + num_infra_txns = len(infra_txns.index) + statsum['infra'] = num_infra_txns + name_statsums.append(statsum) name_statsums.sort(key=lambda x: x['total'], reverse=True) - y = [] - y = [i['total'] for i in name_statsums] - y_onsite = [i['onsite'] for i in name_statsums] - print(f'y_onsite: {y_onsite}') - y_offsite = [i['offsite'] for i in name_statsums] - print(f'y_offisite: {y_offsite}') - x = [i['name'] for i in name_statsums] - print('name_statsums') - print(name_statsums) - - # Calculate fractions of properties of each unique package title - # for stacked bar plot purposes. - + x_onsite = [i['onsite'] for i in name_statsums] + x_infra = [i['infra'] for i in name_statsums] + x_offsite = [i['offsite'] for i in name_statsums] + y = [i['name'] for i in name_statsums] print(f'Number of unique {chan} titles downloaded: {len(unique_names)}') # For each unique softare name, sum the number of transactions from internal hosts. - width = 5.0 fig, axes = plt.subplots(figsize=(10,25)) plt.grid(which='major', axis='x') - plt.title(f'{start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}') + plt.title(f'{chan} -- {start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}') plt.xlabel('Number of downloads') - axes.set_ylim(0,len(name_statsums)) + axes.set_ylim(-1,len(name_statsums)) plt.gca().invert_yaxis() width = 1 - barlist = axes.barh(x, y_onsite, width, edgecolor='black') - barlist = axes.barh(x, y_offsite, width, left=y_onsite, edgecolor='black') - #for id in iraf_ids: - # barlist[id].set_color('grey') + from operator import add + # Horizontal stacked bar chart with off-site, on-site, and infrastructure transactions. + barlist = axes.barh(y, x_offsite, width, edgecolor='white', color='tab:blue') + barlist = axes.barh(y, x_onsite, width, left=x_offsite, edgecolor='white', color='tab:green') + # Sum bars up to this point to correctly stack the subsequent one(s). + offset = list(map(add, x_offsite, x_onsite)) + barlist = axes.barh(y, x_infra, width, left=offset, edgecolor='white', color='tab:olive') + + axes.legend(['off-site', 'on-site', 'infrastructure']) + plt.tight_layout() plt.savefig(f'{chan}.png') -- cgit