Diffstat (limited to 'logparse.py')
-rwxr-xr-x | logparse.py | 226 |
1 file changed, 124 insertions, 102 deletions
diff --git a/logparse.py b/logparse.py
index 9d36278..4986dd1 100755
--- a/logparse.py
+++ b/logparse.py
@@ -4,6 +4,7 @@ import os
 import sys
 import re
 from glob import glob
+import pickle
 import argparse
 from math import ceil
 import hashlib
@@ -32,7 +33,7 @@ patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<da
 logpattern = re.compile(patt)
 
 
-class logData():
+class LogData():
 
     columns = {
         'ipaddress': {},
@@ -41,17 +42,37 @@ class logData():
         'time': {},
         'path': {},
         'status': {},
+        'size': {},
         'name': {}, # derived
         }
 
     def __init__(self,
+                 dataset_name,
                  gethostnames=False,
                  ignore_hosts=[]):
+        '''dataset is a dict
+        dataframe - pandas dataframe containing digested log data
+        file_hashes - MD5 hashes of each file that was read to compose the dataframe'''
+        self.dataset_name = dataset_name
+        self.dataset = None
+        self.data = None
         self.digest_path = 'digests'
        self.gethostnames = gethostnames
         self.hostnames = {}
         self.ignore_hosts = ignore_hosts
+        try:
+            print('reading dataset...')
+            with open(self.dataset_name, 'rb') as f:
+                self.dataset = pickle.load(f)
+        except:
+            print(f'{self.dataset_name} not found. Creating empty dataset.')
+            with open(self.dataset_name, 'a') as f:
+                self.dataset = {'dataframe':pd.DataFrame(self.columns),
+                                'file_hashes': []}
+        self.data = self.dataset['dataframe']
+        self.hashes = self.dataset['file_hashes']
+
 
     def poll_hostnames(self):
         if ipaddress not in self.hostnames.keys():
             try:
@@ -79,7 +100,6 @@ class logData():
             except(TypeError):
                 pass
 
-
             try:
                 match = logpattern.match(line)
                 print(f'logpattern.match(line): {match}')
@@ -100,8 +120,8 @@ class logData():
                 tarball = re.sub(patt0, '', path)
                 namematch = patt1.match(tarball)
                 name = namematch.group('simplename')
-
                 status = match.group('status')
+                size = match.group('size')
                 hostname = ''
                 df = df.append({'ipaddress':ipaddress,
                                 'hostname':hostname,
@@ -109,6 +129,7 @@ class logData():
                                 'time':time,
                                 'path':path,
                                 'status':status,
+                                'size':size,
                                 'name':name},
                                ignore_index=True)
             except(AttributeError):
@@ -123,22 +144,13 @@ class logData():
        and parses each that has not already been ingested.'''
 
         # Create data frame for receiving all log data.
-        locframe = pd.DataFrame(self.columns)
-
-        # Track which files have been parsed by storing the MD5 hash of each
-        # once it's been read.
-        pfile = 'parsed_files.dat'
-        if not os.path.exists(pfile):
-            open(pfile, 'a').close()
-        with open(pfile, 'r') as f:
-            already_parsed = f.read().split()
-        parsed = open(pfile, 'a')
+        newdata = pd.DataFrame(self.columns)
 
         for log in sorted(logs):
             # Compute MD5 hash of file and compare to list of files that
             # have already been parsed. If new, parse, if not, skip.
             hashval = md5(log)
-            if hashval in already_parsed:
+            if hashval in self.hashes:
                 print(f'File {log} already parsed.')
                 continue
             df = pd.DataFrame(self.columns)
@@ -153,34 +165,58 @@ class logData():
             with open(log, 'r') as f:
                 df = self.process_lines(f)
             print(f'df shape = {df.shape}')
-            locframe = locframe.append(df, ignore_index=True)
-            print(locframe.shape)
-            parsed.write(f'{hashval}\n')
-
-        parsed.close()
-        return(locframe)
-
-
-
-def filter_pkgs(df):
-    '''Filter dataframe df down to just the rows the represent
-    successful (HTTP 200) conda package (.bz2 files) downloads.'''
-    print(df)
-    inlen = len(df)
-    out = df.loc[df['path'].str.contains('bz2')]
-    out = out.loc[(out['status'] == '200') | (out['status'] == '302')]
-    outlen = len(out)
-    print(f'{inlen-outlen} rows removed to leave conda txns only')
-    return(out)
-
+            newdata = newdata.append(df, ignore_index=True)
+            print(newdata.shape)
+            self.hashes.append(hashval)
+
+        # If any new log files were read, filter down to only conda package downloads
+        # Then sort by date.
+        if len(newdata.index) != 0:
+            newdata = self.filter_pkgs(newdata)
+            newdata = newdata.sort_values(by='date')
+            newdata = newdata.drop_duplicates()
+            # Normalize any 'conda-dev' channel names to 'astroconda-dev'
+            newdata = newdata.replace('/conda-dev', '/astroconda-dev', regex=True)
+            # Add newdata to (potentially empty) existing data
+            self.data = self.data.append(newdata, ignore_index=True)
+            self.dataset['dataframe'] = self.data
+
+    def filter_pkgs(self, df):
+        '''Filter dataframe df down to just the rows the represent
+        successful (HTTP 200) conda package (.bz2 files) downloads.'''
+        inlen = len(df)
+        out = df.loc[df['path'].str.contains('bz2')]
+        out = out.loc[(out['status'] == '200') | (out['status'] == '302')]
+        outlen = len(out)
+        print(f'{inlen-outlen} rows removed to leave conda txns only')
+        return(out)
+
+    def write_dataset(self, dataset_name=None):
+        '''Serialize working dataset and write it to disk using a filename
+        provided, if requested.
+
+        Parameters
+        ----------
+        dataset_name : string
+            Optional name to use for file when writing working dataset to disk.
+            If not provided, the current name of the working dataset file will
+            be used.'''
+        if dataset_name:
+            dsname = dataset_name
+        else:
+            dsname = self.dataset_name
+        pickle.dump(self.dataset, open(dsname, 'wb'))
 
 
 def main():
-    # TODO: Allow specification of a digested data file with fallback to a default.
     ap = argparse.ArgumentParser(
         prog='logparse.py',
         description='Parse and digest apache/nginx access logs in either'
         ' raw or .gz format.')
+    ap.add_argument('dataset_name', type=str,
+                    help='Name of dataset file. If file does not exist and'
+                    ' log data file names are provided for parsing, this '
+                    'file will be created.')
     ap.add_argument('--config',
                     '-c',
                     help='Configuration file used to adjust behavior of the '
@@ -195,7 +231,9 @@ def main():
                     '-w',
                     help='Restrict examination of data to the window of dates'
                     ' provided.\n'
-                    ' Format: YYYY.MM.DD-YYYY.MM.DD')
+                    ' Format: YYYY.MM.DD-YYYY.MM.DD'
+                    ' Omitting a date window will operate on all data contained'
+                    ' within the given dataset.')
     ap.add_argument('--ignorehosts',
                     '-i',
                     help='IP addresses of hosts to ignore when parsing logs.'
@@ -204,6 +242,9 @@ def main():
                     nargs='+')
     args = ap.parse_args()
 
+    # Dataset filename
+    dataset_name = args.dataset_name
+
     with open(args.config, 'r') as f:
         config = yaml.safe_load(f)
 
@@ -218,58 +259,37 @@ def main():
             else:
                 files.append(expanded)
     except(TypeError):
+        print('No log files provided.')
+        print(f'Importing existing dataset {dataset_name}.')
         pass
 
     inf_hosts = config['infrastructure_hosts']
     num_inf_hosts = len(inf_hosts)
 
-    # Read in any pre-existing parsed data.
-    datfile = 'dataframe.dat'
-    if not os.path.exists(datfile):
-        open(datfile, 'a').close()
-
-    with open(datfile, 'r') as f:
-        try:
-            data = pd.read_pickle(datfile)
-        except:
-            data = pd.DataFrame(logData.columns)
-
     # TODO: Should host filtering take place here?
     # It leaves a disconnect between the pickled data which _may_ have
     # been culled and the actual data being referenced.
-    logproc = logData(ignore_hosts=args.ignorehosts)
-    newdata = logproc.read_logs(files)
-
-    # If any new log files were read, filter down to only conda package downloads
-    # Then sort by date.
-    if len(newdata.index) != 0:
-        newdata = filter_pkgs(newdata)
-        newdata = newdata.sort_values(by='date')
-        # Add newdata to existing data (potentially empty)
-        data = data.append(newdata, ignore_index=True)
-        print('.0')
-
-    # Remove any duplicate rows in data:
-    data = data.drop_duplicates()
-    print('.2')
-
-    # Normalize all conda-dev channel names to astroconda-dev
-    data = data.replace('/conda-dev', '/astroconda-dev', regex=True)
-    print(data)
-    print('.3')
-
-    # Dump data to disk for use during subsequent runs.
-    data.to_pickle(datfile)
+    logproc = LogData(dataset_name, ignore_hosts=args.ignorehosts)
+    logproc.read_logs(files)
+    print('writing (potentially updated) dataset')
+    logproc.write_dataset()
+
+    # Filtering and analysis begins here
+    data = logproc.data
     print(f'num full data rows = {len(data.index)}')
 
     # Filter out a particular time period for examination
-    window_start = pd.to_datetime('2019-09-15')
-    window_end = pd.to_datetime('2019-09-21')
-    print(f'Filtering based on window {window_start} - {window_end}.')
-    data = data[pd.to_datetime(data['date']) >= window_start]
-    data = data[pd.to_datetime(data['date']) <= window_end]
-    print(f'num windowed data rows = {len(data.index)}')
+    # Set limits on a time period to examine
+    if args.window:
+        start = args.window.split('-')[0].replace('.', '-')
+        end = args.window.split('-')[1].replace('.', '-')
+        window_start = pd.to_datetime(start)
+        window_end = pd.to_datetime(end)
+        print(f'Filtering based on window {window_start} - {window_end}.')
+        data = data[pd.to_datetime(data['date']) >= window_start]
+        data = data[pd.to_datetime(data['date']) <= window_end]
+        print(f'num windowed data rows = {len(data.index)}')
 
     all_unique_hosts = list(set(data['ipaddress']))
     #for host in all_unique_hosts:
 
@@ -367,16 +387,13 @@ def main():
     num_onsite_hosts = len(set(onsite['ipaddress']))
     print(f'num unique on-site hosts: {num_onsite_hosts}')
 
+    infra = pkgs[pkgs['ipaddress'].str.contains('|'.join(inf_hosts))]
+
     # Fraction of downloads to off-site hosts
 
     # Totals of unique software titles
     # i.e. name without version, hash, py or build iteration values
     # Extract simple package titles from 'path' column of data frame.
-    #names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
-    #repl = lambda m: m.group('simplename')
-    #names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
-    #                               repl,
-    #                               regex=True))
     names = list(pkgs['name'])
     unique_names = list(set(names))
     name_statsums = []
@@ -384,12 +401,16 @@ def main():
         statsum = {}
         statsum['name'] = name
         statsum['total'] = names.count(name)
-        # Sum on- and off-site transactions for each package name
+        # 'on-site' means transactions to non-infrastructure hosts.
         name_txns = pkgs[pkgs['name'] == name]
         on_txns = name_txns[name_txns['ipaddress'].str.contains(
             '|'.join(int_host_patterns), regex=True)]
 
+        # Filter out hosts designated as infrastructure hosts in config file.
+        on_txns = on_txns[~on_txns['ipaddress'].str.contains(
+            '|'.join(inf_hosts))]
+
         num_onsite_txns = len(on_txns.index)
         statsum['onsite'] = num_onsite_txns
 
@@ -398,38 +419,39 @@ def main():
         num_offsite_txns = len(off_txns.index)
         statsum['offsite'] = num_offsite_txns
 
+        infra_txns = name_txns[name_txns['ipaddress'].str.contains(
+            '|'.join(inf_hosts))]
+        num_infra_txns = len(infra_txns.index)
+        statsum['infra'] = num_infra_txns
+
         name_statsums.append(statsum)
 
     name_statsums.sort(key=lambda x: x['total'], reverse=True)
-    y = []
-    y = [i['total'] for i in name_statsums]
-    y_onsite = [i['onsite'] for i in name_statsums]
-    print(f'y_onsite: {y_onsite}')
-    y_offsite = [i['offsite'] for i in name_statsums]
-    print(f'y_offisite: {y_offsite}')
-    x = [i['name'] for i in name_statsums]
-    print('name_statsums')
-    print(name_statsums)
-
-    # Calculate fractions of properties of each unique package title
-    # for stacked bar plot purposes.
-
+    x_onsite = [i['onsite'] for i in name_statsums]
+    x_infra = [i['infra'] for i in name_statsums]
+    x_offsite = [i['offsite'] for i in name_statsums]
+    y = [i['name'] for i in name_statsums]
     print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
 
     # For each unique softare name, sum the number of transactions from internal hosts.
-    width = 5.0
     fig, axes = plt.subplots(figsize=(10,25))
     plt.grid(which='major', axis='x')
-    plt.title(f'{start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
+    plt.title(f'{chan} -- {start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
     plt.xlabel('Number of downloads')
-    axes.set_ylim(0,len(name_statsums))
+    axes.set_ylim(-1,len(name_statsums))
     plt.gca().invert_yaxis()
     width = 1
-    barlist = axes.barh(x, y_onsite, width, edgecolor='black')
-    barlist = axes.barh(x, y_offsite, width, left=y_onsite, edgecolor='black')
-    #for id in iraf_ids:
-    #    barlist[id].set_color('grey')
+    from operator import add
+    # Horizontal stacked bar chart with off-site, on-site, and infrastructure transactions.
+    barlist = axes.barh(y, x_offsite, width, edgecolor='white', color='tab:blue')
+    barlist = axes.barh(y, x_onsite, width, left=x_offsite, edgecolor='white', color='tab:green')
+    # Sum bars up to this point to correctly stack the subsequent one(s).
+    offset = list(map(add, x_offsite, x_onsite))
+    barlist = axes.barh(y, x_infra, width, left=offset, edgecolor='white', color='tab:olive')
+
+    axes.legend(['off-site', 'on-site', 'infrastructure'])
+    plt.tight_layout()
     plt.savefig(f'{chan}.png')
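Usage note (not part of the patch): this commit replaces the separate parsed_files.dat and dataframe.dat files with a single pickled dict, {'dataframe': ..., 'file_hashes': [...]}, owned by the new LogData class, so the dataset file name is now supplied by the caller. Below is a minimal sketch of driving the new interface from another script; it assumes logparse.py is importable from the working directory, and the dataset name 'downloads.pkl', the access_log glob, and the ignore-list entry are hypothetical examples rather than anything fixed by this commit.

# Sketch only: exercises the LogData workflow introduced in this patch.
# 'downloads.pkl', the log glob, and the ignore list are hypothetical.
from glob import glob

from logparse import LogData

logs = sorted(glob('/var/log/nginx/access.log*'))   # hypothetical log location

proc = LogData('downloads.pkl', ignore_hosts=['192.0.2.10'])
proc.read_logs(logs)     # parses only files whose MD5 hash is not already in the dataset
proc.write_dataset()     # pickles {'dataframe': ..., 'file_hashes': [...]} back to disk

# Digested conda download transactions are then available as a pandas DataFrame:
print(proc.data[['date', 'name', 'status', 'size']].head())

On the command line, the script itself now takes the dataset file name as its first positional argument, and the date window (-w YYYY.MM.DD-YYYY.MM.DD) is optional; per the new help text, omitting it operates on all data contained in the given dataset.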