Diffstat (limited to 'logparse.py')
-rwxr-xr-x  logparse.py  367
1 file changed, 194 insertions(+), 173 deletions(-)
diff --git a/logparse.py b/logparse.py
index 4a21054..4b8edb0 100755
--- a/logparse.py
+++ b/logparse.py
@@ -1,58 +1,32 @@
#!/usr/bin/env python3
+import os
+import sys
import re
+from glob import glob
+import argparse
+from math import ceil
import gzip
import socket
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
-
-# Notes
-# df.to_pickle(filename) for serializing a pandas data frame to disk.
-# df.read_pickle(filename) to get it back.
+from dateutil import parser as dpar
+from collections import OrderedDict
# regex pattern to extract key values from each line of an apache/nginx access log
# Accommodate PUTs as well as second URLs (normally "-")
-patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) - - \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ("(GET|POST|PUT) )(?P<path>.*?) HTTP/1.1" (?P<status>\\d*) \\d* ".*" "(?P<agent>.*)"'
+patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ".* (?P<path>.*?) .*" (?P<status>\\d*) \\d* ".*" "(?P<agent>.*)"'
p = re.compile(patt)
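The snippet below is a quick, illustrative check of the new pattern (the log line is invented, not taken from real data); it uses the p compiled above and shows what each named group captures.

sample = ('10.0.0.5 - - [15/Jul/2019:06:25:31 -0400] '
          '"GET /astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2 HTTP/1.1" '
          '200 4094 "-" "conda/4.7.5 CPython/3.7.3 Linux/3.10.0 centos/7.6 glibc/2.17"')
m = p.match(sample)
if m:
    print(m.group('ipaddress'))   # 10.0.0.5
    print(m.group('date'))        # 15/Jul/2019
    print(m.group('path'))        # /astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2
    print(m.group('status'))      # 200
    print(m.group('agent'))       # conda/4.7.5 CPython/3.7.3 ...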
-columns = {
- 'ipaddress': {},
- 'date': {},
- 'time': {},
- 'path': {},
- 'status': {},
- 'agent': {},
- }
-
-df = pd.DataFrame(columns)
-
-#files = [
-# 'astroconda_access.log'
-# ]
-
-files = [
- 'ssb.stsci.edu.access.log-20190715.gz',
- #'ssb.stsci.edu.access.log-20190716.gz',
- #'ssb.stsci.edu.access.log-20190717.gz',
- #'ssb.stsci.edu.access.log-20190718.gz',
- #'ssb.stsci.edu.access.log-20190719.gz',
- #'ssb.stsci.edu.access.log-20190720.gz',
- #'ssb.stsci.edu.access.log-20190721.gz',
- #'ssb.stsci.edu.access.log-20190722.gz',
- #'ssb.stsci.edu.access.log',
- ]
-
-# Addresses for hosts that should be ignored, such
-# as those from which security scan connections come.
-ignore_address = '10.128.19.7' # Security scan host.
-
class logData():
- def __init__(self, hostnames=False):
+ def __init__(self,
+ gethostnames=False,
+ ignore_hosts=[]):
self.columns = {
'ipaddress': {},
'hostname': {},
@@ -62,187 +36,234 @@ class logData():
'status': {},
'agent': {},
}
- self.df = pd.DataFrame(columns)
+ self.dframe = pd.DataFrame(self.columns)
self.digest_path = 'digests'
+ self.gethostnames = gethostnames
+ self.ignore_hosts = ignore_hosts
-
- def process_lines(f):
+ def process_lines(self, f):
+ print('process lines')
+ df = pd.DataFrame(self.columns)
+ unparseable = 0
for line in f.readlines():
+ print(line)
try:
line = str(line.decode("utf-8"))
except(AttributeError):
pass
- if ignore_address in line:
+ # Ignore transactions from particular IP addresses as requested.
+ try:
+                if any(host in line for host in self.ignore_hosts):
+                    continue
+ except(TypeError):
continue
try:
match = p.match(line)
except:
+                unparseable += 1
pass
- ipaddress = match.group('ipaddress')
- date = match.group('date')
- time = match.group('time')
- path = match.group('path')
- status = match.group('status')
- agent = match.group('agent')
+ print(match)
+ try:
+ ipaddress = match.group('ipaddress')
+ date = match.group('date')
+ dateobj = dpar.parse(date)
+ time = match.group('time')
+ path = match.group('path')
+ status = match.group('status')
+ agent = match.group('agent')
+ except(AttributeError):
+ unparseable += 1
# Selective polling of hostnames here.
hostname = '?'
- self.df = df.append({'ipaddress':ipaddress,
+ df = df.append({'ipaddress':ipaddress,
'hostname':hostname,
- 'date':date,
+ 'date':dateobj,
'time':time,
'path':path,
'status':status,
'agent':agent}, ignore_index=True)
+ print(f'unparseable lines : {unparseable}')
return(df)
-
- def digest_log(logfile):
- '''Read in either a text log file or a gzipped log file, extract key values
- and store them in a pandas data frame, which is returned.'''
- if '.gz' in fname:
- with gzip.open(fname, 'r') as f:
- df = process_lines(df, f)
- else:
- with open(fname, 'r') as f:
- df = process_lines(df, f)
- return(df)
-
-
- def read_logs(logs):
+ def read_logs(self, logs):
'''Accepts:
- a pandas dataframe to which the log data will be appended.
a list of apache/nginx access log files, either raw or .gz,
and parses each that does not already have a corresponding digested
data frame in the 'digests' subdir.'''
# Create data frame for receiving log data
- columns = {
- 'ipaddress': {},
- 'hostname': {},
- 'date': {},
- 'time': {},
- 'path': {},
- 'status': {},
- 'agent': {},
- }
- dframe = pd.DataFrame(columns)
+ df = pd.DataFrame(self.columns)
+ locframe = pd.DataFrame(self.columns)
# Sort list of logs before processing so data will be appended in
# chronological order.
- for log in logs:
- print(log)
- setname = re.sub(log, '\.gz$', '')
- try:
- dframe = pd.read_pickle(f'digests/{setname}')
- except(FileNotFoundError):
+ for log in sorted(logs):
+ setname = re.sub('\.gz$', '', log)
+ setpath = os.path.join(self.digest_path, setname)
+ pklpath = os.path.join(self.digest_path, f'{setname}.pkl')
+ print(f'ingesting dataset = {setname}')
+ if os.path.isfile(pklpath):
+ df = pd.read_pickle(pklpath)
+ else:
+ print('parsing log file')
if '.gz' in log:
with gzip.open(log, 'r') as f:
- dframe = process_lines(df, f)
+ df = self.process_lines(f)
else:
with open(log, 'r') as f:
- dframe = process_lines(df, f)
- dframe.append(df, ignore_index=True)
- return(dframe)
-
+ df = self.process_lines(f)
+ print(f'df shape = {df.shape}')
+ # Dump digested log data to disk for more efficient repeated use.
+ df.to_pickle(f'{setpath}.pkl')
+ locframe = locframe.append(df, ignore_index=True)
+ print(locframe.shape)
+ return(locframe)
+
+
+
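A short usage sketch for the class above; the file names are hypothetical, and the 'digests' subdirectory is expected to exist, since read_logs() writes a .pkl digest there for each log it parses.

proc = logData(ignore_hosts=['10.128.19.7'])
frame = proc.read_logs(['ssb.stsci.edu.access.log-20190715.gz',
                        'ssb.stsci.edu.access.log-20190716.gz'])
print(frame.shape)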
+def filter_pkgs(df):
+    '''Filter dataframe df down to just the rows that represent
+ successful (HTTP 200) conda package (.bz2 files) downloads.'''
+ inlen = len(df)
+ out = df.loc[df['agent'].str.contains('conda')]
+ print(out)
+ out = out.loc[out['path'].str.contains('bz2')]
+ out = out.loc[out['status'].str.contains('200')]
+ outlen = len(out)
+ print(f'{inlen-outlen} rows removed to leave conda txns only')
+ return(out)
+
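As a rough illustration of the filter above (all values invented), only the first row of this toy frame survives: it alone has a conda user agent, a .bz2 path, and a 200 status.

toy = pd.DataFrame({'agent':  ['conda/4.7.5', 'Mozilla/5.0', 'conda/4.6.14'],
                    'path':   ['/astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2',
                               '/astroconda/linux-64/repodata.json',
                               '/astroconda/noarch/astropy-3.2.1-py_0.tar.bz2'],
                    'status': ['200', '200', '404']})
print(filter_pkgs(toy))  # keeps only the numpy row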
-"""
-# If a stored data frame already exists, load it, otherwise set about
-# parsing the log files and creating one.
-try:
- print('Looking for pickled data frame...')
- raise(FileNotFoundError)
- #pkg_txns = pd.read_pickle('data.pkl')
- #pkg_txns = pd.read_pickle('astroconda.org.pkl')
-except(FileNotFoundError):
- # iterate over log files and read in values to a master data frame.
- for fname in files:
- print(fname)
- if '.gz' in fname:
- with gzip.open(fname, 'r') as f:
- df = process_lines(df, f)
+
+
+def main():
+ ap = argparse.ArgumentParser(
+ prog='logparse.py',
+ description='Parse and digest apache/nginx access logs in either'
+ ' raw or .gz format.')
+ ap.add_argument('--files',
+ '-f',
+ help='List of log files to parse, raw or .gz are accepted.'
+ ' glob syntax is also honored.',
+ nargs='+')
+ ap.add_argument('--ignorehosts',
+ '-i',
+ help='IP addresses of hosts to ignore when parsing logs.'
+ ' Useful for saving time by not reading in transactions '
+ 'from security scans, etc.',
+ nargs='+')
+ args = ap.parse_args()
+
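+    # A hypothetical invocation of the finished script might look like:
+    #   ./logparse.py -f ssb.stsci.edu.access.log-2019071*.gz -i 10.128.19.7
+    # args.files then holds the file specs (expanded below via glob) and
+    # args.ignorehosts is ['10.128.19.7'].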
+ files = []
+ for filespec in args.files:
+ expanded = glob(filespec)
+ expanded.sort()
+ if isinstance(expanded, list):
+ for name in expanded:
+ files.append(name)
else:
- with open(fname, 'r') as f:
- df = process_lines(df, f)
-
- # Create frame with only package downloads from conda.
-
- # Conda transactions
- conda_txns = df.loc[df['agent'].str.contains('conda')]
+ files.append(expanded)
+
+ # TODO: Should host filtering take place here?
+ # It leaves a disconnect between the pickled data which _may_ have been
+ # culled and the actual data being referenced by the inclusion of a file
+    # that has data from an excluded host within it.
+ logproc = logData(ignore_hosts=args.ignorehosts)
+ data = logproc.read_logs(files)
+
+ allpkgs = filter_pkgs(data)
+ allpkgs = allpkgs.sort_values(by='date')
+
+ start_date = allpkgs.iloc[0]['date']
+ end_date = allpkgs.iloc[-1]['date']
+ time_range = end_date - start_date
+ days_elapsed = time_range.days
+ if days_elapsed == 0:
+ days_elapsed = 1
- # Package transactions
- pkg_txns = conda_txns.loc[conda_txns['path'].str.contains('bz2')]
- pkg_txns = pkg_txns.loc[pkg_txns['status'].str.contains('200')]
+ print(f'Over the period {start_date.strftime("%m-%d-%Y")} '
+ f'to {end_date.strftime("%m-%d-%Y")}')
+ print(f'{days_elapsed} days')
+ # Normalize all conda-dev channel names to astroconda-dev
+ allpkgs = allpkgs.replace('/conda-dev', '/astroconda-dev', regex=True)
-df = pkg_txns
+ # All packages in a dictionary by channel.
+ chans = [path.split('/')[1] for path in allpkgs['path']]
+ chans = set(chans)
+ chan_pkgs = {}
+ for chan in chans:
+ # Trailing '/' added to ensure only a single channel gets stored for each
+ # due to matching overlap depending on length of substring.
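+        # For example (channel names here are hypothetical): without the slash,
+        # the substring 'astroconda' would also match '/astroconda-dev/...' paths,
+        # while 'astroconda/' matches only the shorter channel name.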
+ chan_pkgs[chan] = allpkgs[allpkgs['path'].str.contains(chan+'/')]
-# Of package downloads, compile a list of downloads/day
+ # For each channel, generate summary report of the download activity.
+ for chan in chan_pkgs.keys():
+ print(f'\n\nSummary for channel: {chan}')
+ print('-----------------------------')
-totals = []
-dates = list(set(df['date']))
-dates.sort()
-x = [dt.datetime.strptime(d, '%d/%b/%Y').date() for d in dates]
-y = []
-print(f'length of x list {len(x)}')
-for date in dates:
- num = len(pkg_txns[pkg_txns.date == date])
- total = {date:num}
- totals.append(total)
- y.append(num)
+ pkgs = chan_pkgs[chan]
+ # Unique days
+ dates = set(pkgs['date'])
+ dates = list(dates)
+ dates.sort()
+ bydate = OrderedDict()
-#plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
-#plt.gca().xaxis.set_major_locator(mdates.DayLocator())
-#plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
-plt.plot(x,y)
-plt.figure(figsize=(2,2))
-plt.savefig('astroconda_org.png')
-##plt.show()
-###plt.gcf().autofmt_xdate()
+ # Downloads per day over time frame
+ for date in dates:
+ bydate[date] = len(pkgs[pkgs['date'] == date])
+ #for date in bydate:
+ # print(f'{date} : {bydate[date]}')
-internal = pkg_txns[pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.')]
-external = pkg_txns[~(pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.'))]
+ total_downloads = len(pkgs.index)
+ print(f'Total downloads: {total_downloads}')
+        # Average downloads per day over the time frame
+ print(f'Average downloads per day: {ceil(total_downloads / days_elapsed)}')
-
+ # Number of unique hosts and geographic location
+ unique_hosts = set(pkgs['ipaddress'])
+ print(f'Unique hosts {len(unique_hosts)}')
+ ## Unique packages
+ unique_pkgs = set(pkgs['path'])
+ print(f'Unique packages {len(unique_pkgs)}')
+
+ # Totals of unique package files
+ #pkg_totals = []
+ #for pkg in unique_pkgs:
+ # total = len(pkgs[pkgs['path'] == pkg].index)
+ # pkg_totals.append([pkg, total])
+ #pkg_totals.sort(key=lambda x: x[1], reverse=True)
+ #if len(unique_pkgs) > 5:
+ # top = 10
+ #else:
+ # top = len(unique_pkgs)
+ #print(f'Top {top} {chan} package filenames:')
+ #for i in range(top):
+ # print(pkg_totals[i])
+
+ # Totals of unique software names
+ # i.e. name without version, hash, py or build iteration values
+ # Extract simple package titles from 'path' column of data frame.
+ names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
+ repl = lambda m: m.group('simplename')
+ names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
+ repl,
+ regex=True))
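+        # For a hypothetical path such as
+        # '/astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2', the first replace
+        # strips the channel/platform directories (leaving
+        # 'numpy-1.16.4-py37_0.tar.bz2') and the second keeps only the leading
+        # title, 'numpy'.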
+ unique_names = set(names)
+ print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
+ name_totals = []
+ for name in unique_names:
+ total = names.count(name)
+ name_totals.append([name, total])
+ name_totals.sort(key=lambda x: x[1], reverse=True)
+ for total in name_totals:
+ print(f'{total[0]}: {total[1]}')
+
+
+if __name__ == "__main__":
+ main()
-def downloads_by_host(downloads):
- '''Show hostnames of all currently online hosts whose address appears in
- the logs.'''
- dls_by_host = []
- for addy in set(downloads.ipaddress):
- tmp = {}
- pkgs = downloads.path[downloads.ipaddress == addy]
- tmp['ipaddress'] = addy
- tmp['downloads'] = len(pkgs)
- path = pkgs.iloc[0] # Assuming all packages requested by a given host are for the same platform.
- if 'linux-64' in path: # index here is not the right way to do it
- tmp['os'] = 'linux'
- elif 'osx-64' in path:
- tmp['os'] = 'osx'
- else:
- tmp['os'] = 'os?'
- try:
- tmp['hostname'] = socket.gethostbyaddr(addy)[0]
- #tmp['hostname'] = '?'
- except:
- tmp['hostname'] = 'Not online?'
- dls_by_host.append(tmp)
- return(dls_by_host)
-
-internal_by_host = downloads_by_host(internal)
-internal_by_host = sorted(internal_by_host, key = lambda k: k['downloads'])
-internal_by_host.reverse()
-print('Internal')
-for host in internal_by_host:
- print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
-
-
-external_by_host = downloads_by_host(external)
-external_by_host = sorted(external_by_host, key = lambda k: k['downloads'])
-external_by_host.reverse()
-print('External')
-for host in external_by_host:
- print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
-"""