Diffstat (limited to 'logparse.py')
-rwxr-xr-x | logparse.py | 226 |
1 file changed, 124 insertions, 102 deletions
diff --git a/logparse.py b/logparse.py
index 9d36278..4986dd1 100755
--- a/logparse.py
+++ b/logparse.py
@@ -4,6 +4,7 @@ import os
 import sys
 import re
 from glob import glob
+import pickle
 import argparse
 from math import ceil
 import hashlib
@@ -32,7 +33,7 @@ patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<da
 logpattern = re.compile(patt)
 
 
-class logData():
+class LogData():
 
     columns = {
         'ipaddress': {},
@@ -41,17 +42,37 @@ class logData():
         'time': {},
         'path': {},
         'status': {},
+        'size': {},
         'name': {}, # derived
         }
 
     def __init__(self,
+                 dataset_name,
                  gethostnames=False,
                  ignore_hosts=[]):
+        '''dataset is a dict
+        dataframe - pandas dataframe containing digested log data
+        file_hashes - MD5 hashes of each file that was read to compose the dataframe'''
+        self.dataset_name = dataset_name
+        self.dataset = None
+        self.data = None
         self.digest_path = 'digests'
        self.gethostnames = gethostnames
         self.hostnames = {}
         self.ignore_hosts = ignore_hosts
+        try:
+            print('reading dataset...')
+            with open(self.dataset_name, 'rb') as f:
+                self.dataset = pickle.load(f)
+        except:
+            print(f'{self.dataset_name} not found. Creating empty dataset.')
+            with open(self.dataset_name, 'a') as f:
+                self.dataset = {'dataframe':pd.DataFrame(self.columns),
+                                'file_hashes': []}
+        self.data = self.dataset['dataframe']
+        self.hashes = self.dataset['file_hashes']
+
 
     def poll_hostnames(self):
         if ipaddress not in self.hostnames.keys():
             try:
@@ -79,7 +100,6 @@ class logData():
             except(TypeError):
                 pass
 
-
             try:
                 match = logpattern.match(line)
                 print(f'logpattern.match(line): {match}')
@@ -100,8 +120,8 @@ class logData():
                 tarball = re.sub(patt0, '', path)
                 namematch = patt1.match(tarball)
                 name = namematch.group('simplename')
-
                 status = match.group('status')
+                size = match.group('size')
                 hostname = ''
                 df = df.append({'ipaddress':ipaddress,
                                 'hostname':hostname,
@@ -109,6 +129,7 @@ class logData():
                                 'time':time,
                                 'path':path,
                                 'status':status,
+                                'size':size,
                                 'name':name},
                                ignore_index=True)
             except(AttributeError):
@@ -123,22 +144,13 @@ class logData():
        and parses each that has not already been ingested.'''
 
         # Create data frame for receiving all log data.
-        locframe = pd.DataFrame(self.columns)
-
-        # Track which files have been parsed by storing the MD5 hash of each
-        # once it's been read.
-        pfile = 'parsed_files.dat'
-        if not os.path.exists(pfile):
-            open(pfile, 'a').close()
-        with open(pfile, 'r') as f:
-            already_parsed = f.read().split()
-        parsed = open(pfile, 'a')
+        newdata = pd.DataFrame(self.columns)
 
         for log in sorted(logs):
             # Compute MD5 hash of file and compare to list of files that
             # have already been parsed. If new, parse, if not, skip.
             hashval = md5(log)
-            if hashval in already_parsed:
+            if hashval in self.hashes:
                 print(f'File {log} already parsed.')
                 continue
             df = pd.DataFrame(self.columns)
@@ -153,34 +165,58 @@ class logData():
             with open(log, 'r') as f:
                 df = self.process_lines(f)
             print(f'df shape = {df.shape}')
-            locframe = locframe.append(df, ignore_index=True)
-            print(locframe.shape)
-            parsed.write(f'{hashval}\n')
-
-        parsed.close()
-        return(locframe)
-
-
-
-def filter_pkgs(df):
-    '''Filter dataframe df down to just the rows the represent
-    successful (HTTP 200) conda package (.bz2 files) downloads.'''
-    print(df)
-    inlen = len(df)
-    out = df.loc[df['path'].str.contains('bz2')]
-    out = out.loc[(out['status'] == '200') | (out['status'] == '302')]
-    outlen = len(out)
-    print(f'{inlen-outlen} rows removed to leave conda txns only')
-    return(out)
-
+            newdata = newdata.append(df, ignore_index=True)
+            print(newdata.shape)
+            self.hashes.append(hashval)
+
+        # If any new log files were read, filter down to only conda package downloads
+        # Then sort by date.
+        if len(newdata.index) != 0:
+            newdata = self.filter_pkgs(newdata)
+            newdata = newdata.sort_values(by='date')
+            newdata = newdata.drop_duplicates()
+            # Normalize any 'conda-dev' channel names to 'astroconda-dev'
+            newdata = newdata.replace('/conda-dev', '/astroconda-dev', regex=True)
+            # Add newdata to (potentially empty) existing data
+            self.data = self.data.append(newdata, ignore_index=True)
+            self.dataset['dataframe'] = self.data
+
+    def filter_pkgs(self, df):
+        '''Filter dataframe df down to just the rows the represent
+        successful (HTTP 200) conda package (.bz2 files) downloads.'''
+        inlen = len(df)
+        out = df.loc[df['path'].str.contains('bz2')]
+        out = out.loc[(out['status'] == '200') | (out['status'] == '302')]
+        outlen = len(out)
+        print(f'{inlen-outlen} rows removed to leave conda txns only')
+        return(out)
+
+    def write_dataset(self, dataset_name=None):
+        '''Serialize working dataset and write it to disk using a filename
+        provided, if requested.
+
+        Parameters
+        ----------
+        dataset_name : string
+            Optional name to use for file when writing working dataset to disk.
+            If not provided, the current name of the working dataset file will
+            be used.'''
+        if dataset_name:
+            dsname = dataset_name
+        else:
+            dsname = self.dataset_name
+        pickle.dump(self.dataset, open(dsname, 'wb'))
 
 
 def main():
-    # TODO: Allow specification of a digested data file with fallback to a default.
     ap = argparse.ArgumentParser(
         prog='logparse.py',
         description='Parse and digest apache/nginx access logs in either'
         ' raw or .gz format.')
+    ap.add_argument('dataset_name', type=str,
+                    help='Name of dataset file. If file does not exist and'
+                    ' log data file names are provided for parsing, this '
+                    'file will be created.')
     ap.add_argument('--config',
                     '-c',
                     help='Configuration file used to adjust behavior of the '
@@ -195,7 +231,9 @@ def main():
                     '-w',
                     help='Restrict examination of data to the window of dates'
                     ' provided.\n'
-                    ' Format: YYYY.MM.DD-YYYY.MM.DD')
+                    ' Format: YYYY.MM.DD-YYYY.MM.DD'
+                    ' Omitting a date window will operate on all data contained'
+                    ' within the given dataset.')
     ap.add_argument('--ignorehosts',
                     '-i',
                     help='IP addresses of hosts to ignore when parsing logs.'
@@ -204,6 +242,9 @@ def main():
                     nargs='+')
     args = ap.parse_args()
 
+    # Dataset filename
+    dataset_name = args.dataset_name
+
     with open(args.config, 'r') as f:
         config = yaml.safe_load(f)
 
@@ -218,58 +259,37 @@ def main():
             else:
                 files.append(expanded)
     except(TypeError):
+        print('No log files provided.')
+        print(f'Importing existing dataset {dataset_name}.')
         pass
 
     inf_hosts = config['infrastructure_hosts']
     num_inf_hosts = len(inf_hosts)
 
-    # Read in any pre-existing parsed data.
-    datfile = 'dataframe.dat'
-    if not os.path.exists(datfile):
-        open(datfile, 'a').close()
-
-    with open(datfile, 'r') as f:
-        try:
-            data = pd.read_pickle(datfile)
-        except:
-            data = pd.DataFrame(logData.columns)
-
     # TODO: Should host filtering take place here?
     # It leaves a disconnect between the pickled data which _may_ have
     # been culled and the actual data being referenced.
-    logproc = logData(ignore_hosts=args.ignorehosts)
-    newdata = logproc.read_logs(files)
-
-    # If any new log files were read, filter down to only conda package downloads
-    # Then sort by date.
-    if len(newdata.index) != 0:
-        newdata = filter_pkgs(newdata)
-        newdata = newdata.sort_values(by='date')
-        # Add newdata to existing data (potentially empty)
-        data = data.append(newdata, ignore_index=True)
-        print('.0')
-
-    # Remove any duplicate rows in data:
-    data = data.drop_duplicates()
-    print('.2')
-
-    # Normalize all conda-dev channel names to astroconda-dev
-    data = data.replace('/conda-dev', '/astroconda-dev', regex=True)
-    print(data)
-    print('.3')
-
-    # Dump data to disk for use during subsequent runs.
-    data.to_pickle(datfile)
+    logproc = LogData(dataset_name, ignore_hosts=args.ignorehosts)
+    logproc.read_logs(files)
+    print('writing (potentially updated) dataset')
+    logproc.write_dataset()
+
+    # Filtering and analysis begins here
+    data = logproc.data
     print(f'num full data rows = {len(data.index)}')
 
     # Filter out a particular time period for examination
-    window_start = pd.to_datetime('2019-09-15')
-    window_end = pd.to_datetime('2019-09-21')
-    print(f'Filtering based on window {window_start} - {window_end}.')
-    data = data[pd.to_datetime(data['date']) >= window_start]
-    data = data[pd.to_datetime(data['date']) <= window_end]
-    print(f'num windowed data rows = {len(data.index)}')
+    # Set limits on a time period to examine
+    if args.window:
+        start = args.window.split('-')[0].replace('.', '-')
+        end = args.window.split('-')[1].replace('.', '-')
+        window_start = pd.to_datetime(start)
+        window_end = pd.to_datetime(end)
+        print(f'Filtering based on window {window_start} - {window_end}.')
+        data = data[pd.to_datetime(data['date']) >= window_start]
+        data = data[pd.to_datetime(data['date']) <= window_end]
+        print(f'num windowed data rows = {len(data.index)}')
 
     all_unique_hosts = list(set(data['ipaddress']))
     #for host in all_unique_hosts:
 
@@ -367,16 +387,13 @@ def main():
     num_onsite_hosts = len(set(onsite['ipaddress']))
     print(f'num unique on-site hosts: {num_onsite_hosts}')
 
+    infra = pkgs[pkgs['ipaddress'].str.contains('|'.join(inf_hosts))]
+
     # Fraction of downloads to off-site hosts
 
     # Totals of unique software titles
     # i.e. name without version, hash, py or build iteration values
     # Extract simple package titles from 'path' column of data frame.
-    #names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
-    #repl = lambda m: m.group('simplename')
-    #names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
-    #                               repl,
-    #                               regex=True))
     names = list(pkgs['name'])
     unique_names = list(set(names))
     name_statsums = []
@@ -384,12 +401,16 @@ def main():
         statsum = {}
         statsum['name'] = name
         statsum['total'] = names.count(name)
-        # Sum on- and off-site transactions for each package name
+        # 'on-site' means transactions to non-infrastructure hosts.
         name_txns = pkgs[pkgs['name'] == name]
         on_txns = name_txns[name_txns['ipaddress'].str.contains(
             '|'.join(int_host_patterns), regex=True)]
 
+        # Filter out hosts designated as infrastructure hosts in config file.
+        on_txns = on_txns[~on_txns['ipaddress'].str.contains(
+            '|'.join(inf_hosts))]
+
         num_onsite_txns = len(on_txns.index)
         statsum['onsite'] = num_onsite_txns
 
@@ -398,38 +419,39 @@ def main():
         num_offsite_txns = len(off_txns.index)
         statsum['offsite'] = num_offsite_txns
 
+        infra_txns = name_txns[name_txns['ipaddress'].str.contains(
+            '|'.join(inf_hosts))]
+        num_infra_txns = len(infra_txns.index)
+        statsum['infra'] = num_infra_txns
+
         name_statsums.append(statsum)
 
     name_statsums.sort(key=lambda x: x['total'], reverse=True)
-    y = []
-    y = [i['total'] for i in name_statsums]
-    y_onsite = [i['onsite'] for i in name_statsums]
-    print(f'y_onsite: {y_onsite}')
-    y_offsite = [i['offsite'] for i in name_statsums]
-    print(f'y_offisite: {y_offsite}')
-    x = [i['name'] for i in name_statsums]
-    print('name_statsums')
-    print(name_statsums)
-
-    # Calculate fractions of properties of each unique package title
-    # for stacked bar plot purposes.
-
+    x_onsite = [i['onsite'] for i in name_statsums]
+    x_infra = [i['infra'] for i in name_statsums]
+    x_offsite = [i['offsite'] for i in name_statsums]
+    y = [i['name'] for i in name_statsums]
     print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
 
     # For each unique softare name, sum the number of transactions from internal hosts.
-    width = 5.0
     fig, axes = plt.subplots(figsize=(10,25))
     plt.grid(which='major', axis='x')
-    plt.title(f'{start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
+    plt.title(f'{chan} -- {start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
     plt.xlabel('Number of downloads')
-    axes.set_ylim(0,len(name_statsums))
+    axes.set_ylim(-1,len(name_statsums))
     plt.gca().invert_yaxis()
     width = 1
-    barlist = axes.barh(x, y_onsite, width, edgecolor='black')
-    barlist = axes.barh(x, y_offsite, width, left=y_onsite, edgecolor='black')
-    #for id in iraf_ids:
-    #    barlist[id].set_color('grey')
+    from operator import add
+    # Horizontal stacked bar chart with off-site, on-site, and infrastructure transactions.
+    barlist = axes.barh(y, x_offsite, width, edgecolor='white', color='tab:blue')
+    barlist = axes.barh(y, x_onsite, width, left=x_offsite, edgecolor='white', color='tab:green')
+    # Sum bars up to this point to correctly stack the subsequent one(s).
+    offset = list(map(add, x_offsite, x_onsite))
+    barlist = axes.barh(y, x_infra, width, left=offset, edgecolor='white', color='tab:olive')
+
+    axes.legend(['off-site', 'on-site', 'infrastructure'])
+    plt.tight_layout()
     plt.savefig(f'{chan}.png')
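Usage note (not part of the patch): this commit replaces the separate parsed_files.dat and dataframe.dat files with a single pickled dict, {'dataframe': ..., 'file_hashes': [...]}, owned by the new LogData class, so the dataset file name is now supplied by the caller. Below is a minimal sketch of driving the new interface from another script; it assumes logparse.py is importable from the working directory, and the dataset name 'downloads.pkl', the access_log glob, and the ignore-list entry are hypothetical examples rather than anything fixed by this commit.

# Sketch only: exercises the LogData workflow introduced in this patch.
# 'downloads.pkl', the log glob, and the ignore list are hypothetical.
from glob import glob

from logparse import LogData

logs = sorted(glob('/var/log/nginx/access.log*'))   # hypothetical log location

proc = LogData('downloads.pkl', ignore_hosts=['192.0.2.10'])
proc.read_logs(logs)     # parses only files whose MD5 hash is not already in the dataset
proc.write_dataset()     # pickles {'dataframe': ..., 'file_hashes': [...]} back to disk

# Digested conda download transactions are then available as a pandas DataFrame:
print(proc.data[['date', 'name', 'status', 'size']].head())

On the command line, the script itself now takes the dataset file name as its first positional argument, and the date window (-w YYYY.MM.DD-YYYY.MM.DD) is optional; per the new help text, omitting it operates on all data contained in the given dataset.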