Diffstat (limited to 'logparse.py')
-rwxr-xr-x | logparse.py | 136
1 files changed, 88 insertions, 48 deletions
diff --git a/logparse.py b/logparse.py
index 185f6ae..4c1c7ea 100755
--- a/logparse.py
+++ b/logparse.py
@@ -30,7 +30,7 @@ def md5(fname):
 
 # Accommodate PUTs as well as second URLs (normally "-")
 patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ".* (?P<path>.*?) .*" (?P<status>\\d*) (?P<size>\\d*)'
-p = re.compile(patt)
+logpattern = re.compile(patt)
 
 
 class logData():
@@ -41,7 +41,7 @@ class logData():
         'time': {},
         'path': {},
         'status': {},
-        'agent': {},
+        'name': {},  # derived
         }
 
     def __init__(self,
@@ -78,8 +78,11 @@ class logData():
                 continue
             except(TypeError):
                 pass
+
+
             try:
-                match = p.match(line)
+                match = logpattern.match(line)
+                print(f'logpattern.match(line): {match}')
             except:
                 line_errors += 1
                 print(f'Line parse error: {line}')
@@ -90,17 +93,24 @@ class logData():
                 dateobj = dpar.parse(date)
                 time = match.group('time')
                 path = match.group('path')
+
+                # Extract simple package titles from 'path' column of data frame.
+                patt0 = re.compile('/.*/.*/')
+                patt1 = re.compile('(?P<simplename>.*)-.*-.*\.tar\.bz2$')
+                tarball = re.sub(patt0, '', path)
+                namematch = patt1.match(tarball)
+                name = namematch.group('simplename')
+
                 status = match.group('status')
-                #agent = match.group('agent')
                 hostname = ''
                 df = df.append({'ipaddress':ipaddress,
                                 'hostname':hostname,
                                 'date':dateobj,
                                 'time':time,
                                 'path':path,
-                                'status':status},
+                                'status':status,
+                                'name':name},
                                ignore_index=True)
-                               #'agent':agent}, ignore_index=True)
             except(AttributeError):
                 unparseable += 1
         print(f'unparseable lines : {unparseable}')
@@ -157,8 +167,6 @@ def filter_pkgs(df):
    successful (HTTP 200) conda package (.bz2 files) downloads.'''
    print(df)
    inlen = len(df)
-   ##out = df.loc[df['agent'].str.contains('conda')]
-   ##out = out.loc[out['path'].str.contains('bz2')]
    out = df.loc[df['path'].str.contains('bz2')]
    out = out.loc[(out['status'] == '200') | (out['status'] == '302')]
    outlen = len(out)
@@ -168,6 +176,7 @@ def filter_pkgs(df):
 
 
 def main():
+    # TODO: Allow specification of a digested data file with fallback to a default.
    ap = argparse.ArgumentParser(
        prog='logparse.py',
        description='Parse and digest apache/nginx access logs in either'
@@ -199,14 +208,17 @@ def main():
        config = yaml.safe_load(f)
 
    files = []
-   for filespec in args.files:
-       expanded = glob(filespec)
-       expanded.sort()
-       if isinstance(expanded, list):
-           for name in expanded:
-               files.append(name)
-       else:
-           files.append(expanded)
+   try:
+       for filespec in args.files:
+           expanded = glob(filespec)
+           expanded.sort()
+           if isinstance(expanded, list):
+               for name in expanded:
+                   files.append(name)
+           else:
+               files.append(expanded)
+   except(TypeError):
+       pass
 
    inf_hosts = config['infrastructure_hosts']
    num_inf_hosts = len(inf_hosts)
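For reference, here is a minimal standalone sketch (not part of this commit) of how the renamed logpattern regex and the new package-title extraction behave together. The regexes are copied from the patch; the access-log line, host, path, and size below are invented examples.

import re

# Pattern copied from the patch; matches one apache/nginx access-log line.
patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ".* (?P<path>.*?) .*" (?P<status>\\d*) (?P<size>\\d*)'
logpattern = re.compile(patt)

line = ('10.0.0.1 - - [12/Sep/2019:08:15:00 -0400] '
        '"GET /astroconda/linux-64/astropy-3.2.1-py37_0.tar.bz2 HTTP/1.1" 200 5242880')
match = logpattern.match(line)
print(match.group('ipaddress'), match.group('status'))   # 10.0.0.1 200

# Title extraction as added in the patch: strip the channel/platform
# directories, then drop the version and build fields of the tarball name.
patt0 = re.compile('/.*/.*/')
patt1 = re.compile('(?P<simplename>.*)-.*-.*\.tar\.bz2$')
tarball = re.sub(patt0, '', match.group('path'))          # astropy-3.2.1-py37_0.tar.bz2
print(patt1.match(tarball).group('simplename'))           # astropy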
@@ -233,24 +245,27 @@ def main():
    if len(newdata.index) != 0:
        newdata = filter_pkgs(newdata)
        newdata = newdata.sort_values(by='date')
-
-   # Append newdata to existing data (potentially empty)
-   data = data.append(newdata, ignore_index=True)
+   # Add newdata to existing data (potentially empty)
+   data = data.append(newdata, ignore_index=True)
+   print('.0')
 
    # Remove any duplicate rows in data:
    data = data.drop_duplicates()
-
-   # Dump data to disk for use during subsequent runs.
-   data.to_pickle(datfile)
+   print('.2')
 
    # Normalize all conda-dev channel names to astroconda-dev
    data = data.replace('/conda-dev', '/astroconda-dev', regex=True)
+   print(data)
+   print('.3')
+
+   # Dump data to disk for use during subsequent runs.
+   data.to_pickle(datfile)
 
    print(f'num full data rows = {len(data.index)}')
 
    # Filter out a particular time period for examination
-   window_start = pd.to_datetime('2019-08-22')
-   window_end = pd.to_datetime('2019-08-30')
+   window_start = pd.to_datetime('2019-09-12')
+   window_end = pd.to_datetime('2019-09-19')
    print(f'Filtering based on window {window_start} - {window_end}.')
    data = data[pd.to_datetime(data['date']) >= window_start]
    data = data[pd.to_datetime(data['date']) <= window_end]
@@ -303,8 +318,6 @@ def main():
        # Downloads per day over time frame
        for date in dates:
            bydate[date] = len(pkgs[pkgs['date'] == date])
-       #for date in bydate:
-       #    print(f'{date} : {bydate[date]}')
 
        chan_downloads = len(pkgs.index)
        print(f'Downloads: {chan_downloads}')
@@ -335,6 +348,8 @@ def main():
        #for i in range(top):
        #    print(pkg_totals[i])
 
+       # What is the fraction of downloads for each OS?
+
        # What fraction of total downloads come from non-infrastructure on-site hosts?
        noninf = pkgs[~pkgs['ipaddress'].isin(config['infrastructure_hosts'])]
        total_noninf = len(noninf.index)
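For reference, the deduplication, inclusive date-window filter, and non-infrastructure accounting above are plain pandas operations. A minimal standalone sketch (not part of this commit) with an invented frame and host list; the real script loads these from its pickle file and YAML config.

import pandas as pd

# Invented stand-in for the parsed log frame.
data = pd.DataFrame({'date': ['2019-09-10', '2019-09-13', '2019-09-13', '2019-09-18'],
                     'ipaddress': ['10.0.0.1', '10.0.0.2', '10.0.0.2', '10.0.0.1']})
data = data.drop_duplicates()               # collapses the repeated 09-13 row

# Same inclusive date-window filter as the patch.
window_start = pd.to_datetime('2019-09-12')
window_end = pd.to_datetime('2019-09-19')
data = data[pd.to_datetime(data['date']) >= window_start]
data = data[pd.to_datetime(data['date']) <= window_end]

# Fraction of remaining rows from non-infrastructure hosts, as in main().
inf_hosts = ['10.0.0.1']                    # assumed config['infrastructure_hosts'] value
noninf = data[~data['ipaddress'].isin(inf_hosts)]
print(len(noninf.index) / len(data.index))  # 0.5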
@@ -357,38 +372,63 @@ def main():
        # Totals of unique software titles
        # i.e. name without version, hash, py or build iteration values
        # Extract simple package titles from 'path' column of data frame.
-       names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
-       repl = lambda m: m.group('simplename')
-       names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
-                                      repl,
-                                      regex=True))
+       #names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
+       #repl = lambda m: m.group('simplename')
+       #names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
+       #                               repl,
+       #                               regex=True))
+       names = list(set(pkgs['name']))
+       print('*')
+       print(names)
+       print('*')
        unique_names = list(set(names))
-       name_totals = []
+       name_statsums = []
        for name in unique_names:
-           total = names.count(name)
-           name_totals.append([name, total])
-       name_totals.sort(key=lambda x: x[1], reverse=True)
+           statsum = {}
+           statsum['name'] = name
+           statsum['total'] = names.count(name)
+           # Not correct. Need to match up unique_names with host associated with
+           # each full package name to avoid over broad matches of unique names
+           # with substrings of longer names that contain the unique name.
+           #statsum['onsite'] = len(pkgs[pkgs['ipaddress'].str.contains(
+           #    '|'.join(int_host_patterns), regex=True)])
+           #statsum['offsite'] = len(pkgs[~pkgs['ipaddress'].str.contains(
+           #    '|'.join(int_host_patterns), regex=True)])
+           name_statsums.append(statsum)
+
+       # Sum on-site transactions for each package name
+       #npkgs = pkgs[pkgs['name'
+
+
+       name_statsums.sort(key=lambda x: x['total'], reverse=True)
        y = []
-       x = [x[0] for x in name_totals]
-       #print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
-       for total in name_totals:
-           y.append(total[1])
-           #print(f'{total[0]}: {total[1]}')
+       y = [i['total'] for i in name_statsums]
+       x = [i['name'] for i in name_statsums]
+       print(name_statsums)
+
+       # Calculate fractions of properties of each unique package title
+       # for stacked bar plot purposes.
+
+
+       print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
+       # For each unique softare name, sum the number of transactions from internal hosts.
 
        width = 5.0
        fig, axes = plt.subplots(figsize=(10,25))
        plt.grid(which='major', axis='x')
        plt.title(f'{start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
        plt.xlabel('Number of downloads')
 
-       axes.set_ylim(0,len(name_totals))
-       iraf_ids = []
-       for i,name in enumerate(x):
-           if 'iraf' in name:
-               iraf_ids.append(i)
+       axes.set_ylim(0,len(name_statsums))
+
+       #iraf_ids = []
+       #for i,name in enumerate(x):
+       #    if 'iraf' in name:
+       #        iraf_ids.append(i)
        plt.gca().invert_yaxis()
-       barlist = axes.barh(x,y,1,edgecolor='black')
-       for id in iraf_ids:
-           barlist[id].set_color('grey')
+       width = 1
+       barlist = axes.barh(x, y, width, edgecolor='black')
+       #for id in iraf_ids:
+       #    barlist[id].set_color('grey')
        plt.tight_layout()
        plt.savefig(f'{chan}.png')
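For reference, the plotting calls at the end of the hunk reduce to a standard matplotlib horizontal bar chart built from the name_statsums records. A minimal standalone sketch (not part of this commit); the names, totals, figure size, and output filename are placeholders.

import matplotlib
matplotlib.use('Agg')            # render off-screen, since the script only saves PNGs
import matplotlib.pyplot as plt

# Invented records mirroring the name_statsums structure built above.
name_statsums = [{'name': 'astropy', 'total': 120},
                 {'name': 'numpy', 'total': 80},
                 {'name': 'iraf', 'total': 15}]
name_statsums.sort(key=lambda x: x['total'], reverse=True)
x = [i['name'] for i in name_statsums]
y = [i['total'] for i in name_statsums]

fig, axes = plt.subplots(figsize=(10, 5))
plt.grid(which='major', axis='x')
plt.xlabel('Number of downloads')
axes.set_ylim(0, len(name_statsums))
plt.gca().invert_yaxis()         # largest total plotted at the top
axes.barh(x, y, 1, edgecolor='black')
plt.tight_layout()
plt.savefig('downloads.png')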