| author | Matt Rendina <mrendina@stsci.edu> | 2019-08-06 11:31:59 -0400 |
|---|---|---|
| committer | Matt Rendina <mrendina@stsci.edu> | 2019-08-06 11:31:59 -0400 |
| commit | a8c187c28b335b648437b8e19edd150c3bf32132 (patch) | |
| tree | 9a43cf034130026328f555330fd2cdf63599175c /logparse.py | |
| parent | 68c4adcc9b220947461128473e0c12c2049d7801 (diff) | |
| download | conmets-a8c187c28b335b648437b8e19edd150c3bf32132.tar.gz | |
Generate primary report from log files.
Diffstat (limited to 'logparse.py')
-rwxr-xr-x | logparse.py | 367 |
1 file changed, 194 insertions, 173 deletions
diff --git a/logparse.py b/logparse.py
index 4a21054..4b8edb0 100755
--- a/logparse.py
+++ b/logparse.py
@@ -1,58 +1,32 @@
 #!/usr/bin/env python3
+import os
+import sys
 import re
+from glob import glob
+import argparse
+from math import ceil
 import gzip
 import socket
 import pandas as pd
 import datetime as dt
 import matplotlib.pyplot as plt
 import matplotlib.dates as mdates
-
-# Notes
-# df.to_pickle(filename) for serializing a pandas data frame to disk.
-# df.read_pickle(filename) to get it back.
+from dateutil import parser as dpar
+from collections import OrderedDict
 
 # regex pattern to extract key values from each line of an apache/nginx access log
 # Accommodate PUTs as well as second URLs (normally "-")
-patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) - - \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ("(GET|POST|PUT) )(?P<path>.*?) HTTP/1.1" (?P<status>\\d*) \\d* ".*" "(?P<agent>.*)"'
+patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ".* (?P<path>.*?) .*" (?P<status>\\d*) \\d* ".*" "(?P<agent>.*)"'
 
 p = re.compile(patt)
 
-columns = {
-    'ipaddress': {},
-    'date': {},
-    'time': {},
-    'path': {},
-    'status': {},
-    'agent': {},
-    }
-
-df = pd.DataFrame(columns)
-
-#files = [
-#    'astroconda_access.log'
-#    ]
-
-files = [
-    'ssb.stsci.edu.access.log-20190715.gz',
-    #'ssb.stsci.edu.access.log-20190716.gz',
-    #'ssb.stsci.edu.access.log-20190717.gz',
-    #'ssb.stsci.edu.access.log-20190718.gz',
-    #'ssb.stsci.edu.access.log-20190719.gz',
-    #'ssb.stsci.edu.access.log-20190720.gz',
-    #'ssb.stsci.edu.access.log-20190721.gz',
-    #'ssb.stsci.edu.access.log-20190722.gz',
-    #'ssb.stsci.edu.access.log',
-    ]
-
-# Addresses for hosts that should be ignored, such
-# as those from which security scan connections come.
-ignore_address = '10.128.19.7' # Security scan host.
-
 
 class logData():
 
-    def __init__(self, hostnames=False):
+    def __init__(self,
+                 gethostnames=False,
+                 ignore_hosts=[]):
         self.columns = {
             'ipaddress': {},
             'hostname': {},
@@ -62,187 +36,234 @@ class logData():
             'status': {},
             'agent': {},
             }
-        self.df = pd.DataFrame(columns)
+        self.dframe = pd.DataFrame(self.columns)
         self.digest_path = 'digests'
+        self.gethostnames = gethostnames
+        self.ignore_hosts = ignore_hosts
 
-
-    def process_lines(f):
+    def process_lines(self, f):
+        print('process lines')
+        df = pd.DataFrame(self.columns)
+        unparseable = 0
         for line in f.readlines():
+            print(line)
             try:
                 line = str(line.decode("utf-8"))
             except(AttributeError):
                 pass
-            if ignore_address in line:
+            # Ignore transactions from particular IP addresses as requested.
+            try:
+                for host in self.ignore_hosts:
+                    if host in line:
+                        continue
+            except(TypeError):
                 continue
 
            try:
                 match = p.match(line)
             except:
+                line_errors += 1
                 pass
-            ipaddress = match.group('ipaddress')
-            date = match.group('date')
-            time = match.group('time')
-            path = match.group('path')
-            status = match.group('status')
-            agent = match.group('agent')
+            print(match)
+            try:
+                ipaddress = match.group('ipaddress')
+                date = match.group('date')
+                dateobj = dpar.parse(date)
+                time = match.group('time')
+                path = match.group('path')
+                status = match.group('status')
+                agent = match.group('agent')
+            except(AttributeError):
+                unparseable += 1
 
             # Selective polling of hostnames here.
             hostname = '?'
-            self.df = df.append({'ipaddress':ipaddress,
+            df = df.append({'ipaddress':ipaddress,
                             'hostname':hostname,
-                            'date':date,
+                            'date':dateobj,
                             'time':time,
                             'path':path,
                             'status':status,
                             'agent':agent},
                             ignore_index=True)
+        print(f'unparseable lines : {unparseable}')
         return(df)
 
-
-    def digest_log(logfile):
-        '''Read in either a text log file or a gzipped log file, extract key values
-        and store them in a pandas data frame, which is returned.'''
-        if '.gz' in fname:
-            with gzip.open(fname, 'r') as f:
-                df = process_lines(df, f)
-        else:
-            with open(fname, 'r') as f:
-                df = process_lines(df, f)
-        return(df)
-
-
-    def read_logs(logs):
+    def read_logs(self, logs):
         '''Accepts:
-           a pandas dataframe to which the log data will be appended.
            a list of apache/nginx access log files, either raw or .gz,
           and parses each that does not already have a corresponding digested
          data frame in the 'digests' subdir.'''
 
         # Create data frame for receiving log data
-        columns = {
-            'ipaddress': {},
-            'hostname': {},
-            'date': {},
-            'time': {},
-            'path': {},
-            'status': {},
-            'agent': {},
-            }
-        dframe = pd.DataFrame(columns)
+        df = pd.DataFrame(self.columns)
+        locframe = pd.DataFrame(self.columns)
 
         # Sort list of logs before processing so data will be appended in
         # chronological order.
-        for log in logs:
-            print(log)
-            setname = re.sub(log, '\.gz$', '')
-            try:
-                dframe = pd.read_pickle(f'digests/{setname}')
-            except(FileNotFoundError):
+        for log in sorted(logs):
+            setname = re.sub('\.gz$', '', log)
+            setpath = os.path.join(self.digest_path, setname)
+            pklpath = os.path.join(self.digest_path, f'{setname}.pkl')
+            print(f'ingesting dataset = {setname}')
+            if os.path.isfile(pklpath):
+                df = pd.read_pickle(pklpath)
+            else:
+                print('parsing log file')
                 if '.gz' in log:
                     with gzip.open(log, 'r') as f:
-                        dframe = process_lines(df, f)
+                        df = self.process_lines(f)
                 else:
                     with open(log, 'r') as f:
-                        dframe = process_lines(df, f)
-            dframe.append(df, ignore_index=True)
-        return(dframe)
-
+                        df = self.process_lines(f)
+                print(f'df shape = {df.shape}')
+                # Dump digested log data to disk for more efficient repeated use.
+                df.to_pickle(f'{setpath}.pkl')
+            locframe = locframe.append(df, ignore_index=True)
+            print(locframe.shape)
+        return(locframe)
+
+
+
+def filter_pkgs(df):
+    '''Filter dataframe df down to just the rows the represent
+    successful (HTTP 200) conda package (.bz2 files) downloads.'''
+    inlen = len(df)
+    out = df.loc[df['agent'].str.contains('conda')]
+    print(out)
+    out = out.loc[out['path'].str.contains('bz2')]
+    out = out.loc[out['status'].str.contains('200')]
+    outlen = len(out)
+    print(f'{inlen-outlen} rows removed to leave conda txns only')
+    return(out)
+
 
-"""
-# If a stored data frame already exists, load it, otherwise set about
-# parsing the log files and creating one.
-try:
-    print('Looking for pickled data frame...')
-    raise(FileNotFoundError)
-    #pkg_txns = pd.read_pickle('data.pkl')
-    #pkg_txns = pd.read_pickle('astroconda.org.pkl')
-except(FileNotFoundError):
-    # iterate over log files and read in values to a master data frame.
-    for fname in files:
-        print(fname)
-        if '.gz' in fname:
-            with gzip.open(fname, 'r') as f:
-                df = process_lines(df, f)
+
+
+def main():
+    ap = argparse.ArgumentParser(
+            prog='logparse.py',
+            description='Parse and digest apache/nginx access logs in either'
+            ' raw or .gz format.')
+    ap.add_argument('--files',
+                    '-f',
+                    help='List of log files to parse, raw or .gz are accepted.'
+                    ' glob syntax is also honored.',
+                    nargs='+')
+    ap.add_argument('--ignorehosts',
+                    '-i',
+                    help='IP addresses of hosts to ignore when parsing logs.'
+                    ' Useful for saving time by not reading in transactions '
+                    'from security scans, etc.',
+                    nargs='+')
+    args = ap.parse_args()
+
+    files = []
+    for filespec in args.files:
+        expanded = glob(filespec)
+        expanded.sort()
+        if isinstance(expanded, list):
+            for name in expanded:
+                files.append(name)
         else:
-            with open(fname, 'r') as f:
-                df = process_lines(df, f)
-
-    # Create frame with only package downloads from conda.
-
-    # Conda transactions
-    conda_txns = df.loc[df['agent'].str.contains('conda')]
+            files.append(expanded)
+
+    # TODO: Should host filtering take place here?
+    # It leaves a disconnect between the pickled data which _may_ have been
+    # culled and the actual data being referenced by the inclusion of a file
+    # that has data from an exluded host within it.
+    logproc = logData(ignore_hosts=args.ignorehosts)
+    data = logproc.read_logs(files)
+
+    allpkgs = filter_pkgs(data)
+    allpkgs = allpkgs.sort_values(by='date')
+
+    start_date = allpkgs.iloc[0]['date']
+    end_date = allpkgs.iloc[-1]['date']
+    time_range = end_date - start_date
+    days_elapsed = time_range.days
+    if days_elapsed == 0:
+        days_elapsed = 1
 
-    # Package transactions
-    pkg_txns = conda_txns.loc[conda_txns['path'].str.contains('bz2')]
-    pkg_txns = pkg_txns.loc[pkg_txns['status'].str.contains('200')]
+    print(f'Over the period {start_date.strftime("%m-%d-%Y")} '
+          f'to {end_date.strftime("%m-%d-%Y")}')
+    print(f'{days_elapsed} days')
+    # Normalize all conda-dev channel names to astroconda-dev
+    allpkgs = allpkgs.replace('/conda-dev', '/astroconda-dev', regex=True)
 
-df = pkg_txns
+    # All packages in a dictionary by channel.
+    chans = [path.split('/')[1] for path in allpkgs['path']]
+    chans = set(chans)
+    chan_pkgs = {}
+    for chan in chans:
+        # Trailing '/' added to ensure only a single channel gets stored for each
+        # due to matching overlap depending on length of substring.
+        chan_pkgs[chan] = allpkgs[allpkgs['path'].str.contains(chan+'/')]
 
-# Of package downloads, compile a list of downloads/day
+    # For each channel, generate summary report of the download activity.
+    for chan in chan_pkgs.keys():
+        print(f'\n\nSummary for channel: {chan}')
+        print('-----------------------------')
 
-totals = []
-dates = list(set(df['date']))
-dates.sort()
-x = [dt.datetime.strptime(d, '%d/%b/%Y').date() for d in dates]
-y = []
-print(f'length of x list {len(x)}')
-for date in dates:
-    num = len(pkg_txns[pkg_txns.date == date])
-    total = {date:num}
-    totals.append(total)
-    y.append(num)
+        pkgs = chan_pkgs[chan]
+        # Unique days
+        dates = set(pkgs['date'])
+        dates = list(dates)
+        dates.sort()
+        bydate = OrderedDict()
 
-#plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
-#plt.gca().xaxis.set_major_locator(mdates.DayLocator())
-#plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
-plt.plot(x,y)
-plt.figure(figsize=(2,2))
-plt.savefig('astroconda_org.png')
-##plt.show()
-###plt.gcf().autofmt_xdate()
+        # Downloads per day over time frame
+        for date in dates:
+            bydate[date] = len(pkgs[pkgs['date'] == date])
+        #for date in bydate:
+        #    print(f'{date} : {bydate[date]}')
 
-internal = pkg_txns[pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.')]
-external = pkg_txns[~(pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.'))]
+        total_downloads = len(pkgs.index)
+        print(f'Total downloads: {total_downloads}')
+        # Downloads per week over time frame
+        print(f'Average downloads per day: {ceil(total_downloads / days_elapsed)}')
-
+        # Number of unique hosts and geographic location
+        unique_hosts = set(pkgs['ipaddress'])
+        print(f'Unique hosts {len(unique_hosts)}')
+        ## Unique packages
+        unique_pkgs = set(pkgs['path'])
+        print(f'Unique packages {len(unique_pkgs)}')
+
+        # Totals of unique package files
+        #pkg_totals = []
+        #for pkg in unique_pkgs:
+        #    total = len(pkgs[pkgs['path'] == pkg].index)
+        #    pkg_totals.append([pkg, total])
+        #pkg_totals.sort(key=lambda x: x[1], reverse=True)
+        #if len(unique_pkgs) > 5:
+        #    top = 10
+        #else:
+        #    top = len(unique_pkgs)
+        #print(f'Top {top} {chan} package filenames:')
+        #for i in range(top):
+        #    print(pkg_totals[i])
+
+        # Totals of unique software names
+        # i.e. name without version, hash, py or build iteration values
+        # Extract simple package titles from 'path' column of data frame.
+        names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
+        repl = lambda m: m.group('simplename')
+        names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
+                                       repl,
+                                       regex=True))
+        unique_names = set(names)
+        print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
+        name_totals = []
+        for name in unique_names:
+            total = names.count(name)
+            name_totals.append([name, total])
+        name_totals.sort(key=lambda x: x[1], reverse=True)
+        for total in name_totals:
+            print(f'{total[0]}: {total[1]}')
+
+
+if __name__ == "__main__":
+    main()
 
-def downloads_by_host(downloads):
-    '''Show hostnames of all currently online hosts whose address appears in
-    the logs.'''
-    dls_by_host = []
-    for addy in set(downloads.ipaddress):
-        tmp = {}
-        pkgs = downloads.path[downloads.ipaddress == addy]
-        tmp['ipaddress'] = addy
-        tmp['downloads'] = len(pkgs)
-        path = pkgs.iloc[0] # Assuming all packages requested by a given host are for the same platform.
-        if 'linux-64' in path: # index here is not the right way to do it
-            tmp['os'] = 'linux'
-        elif 'osx-64' in path:
-            tmp['os'] = 'osx'
-        else:
-            tmp['os'] = 'os?'
-        try:
-            tmp['hostname'] = socket.gethostbyaddr(addy)[0]
-            #tmp['hostname'] = '?'
-        except:
-            tmp['hostname'] = 'Not online?'
-        dls_by_host.append(tmp)
-    return(dls_by_host)
-
-internal_by_host = downloads_by_host(internal)
-internal_by_host = sorted(internal_by_host, key = lambda k: k['downloads'])
-internal_by_host.reverse()
-print('Internal')
-for host in internal_by_host:
-    print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
-
-
-external_by_host = downloads_by_host(external)
-external_by_host = sorted(external_by_host, key = lambda k: k['downloads'])
-external_by_host.reverse()
-print('External')
-for host in external_by_host:
-    print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
-"""
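For orientation, here is a minimal usage sketch of the reworked module after this commit; it is not part of the commit itself. The log filename and the ignored address are illustrative only (the address is the security-scan host that was hard-coded in the previous revision), and the `digests/` directory is created up front because `read_logs()` pickles each digested log there.

```python
# Sketch only -- roughly equivalent to running:
#   ./logparse.py -f ssb.stsci.edu.access.log-20190715.gz -i 10.128.19.7
import os
from logparse import logData, filter_pkgs

os.makedirs('digests', exist_ok=True)  # read_logs() stores .pkl digests here

logproc = logData(ignore_hosts=['10.128.19.7'])  # example host to skip
data = logproc.read_logs(['ssb.stsci.edu.access.log-20190715.gz'])

pkgs = filter_pkgs(data)  # keep only successful (HTTP 200) conda .bz2 downloads
pkgs = pkgs.sort_values(by='date')
print(pkgs[['date', 'ipaddress', 'path']].head())
```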