author | Matt Rendina <mrendina@stsci.edu> | 2019-07-25 15:46:39 -0400 |
---|---|---|
committer | Matt Rendina <mrendina@stsci.edu> | 2019-07-25 15:46:39 -0400 |
commit | 68c4adcc9b220947461128473e0c12c2049d7801 (patch) | |
tree | f6f0e491d2cb56c9ddc23017e90ee58f39a736e1 /logparse.py | |
parent | 23894a9368db2acafbfc40c07d6848fe4885bcce (diff) | |
download | conmets-68c4adcc9b220947461128473e0c12c2049d7801.tar.gz | |
Initial commit
Diffstat (limited to 'logparse.py')
-rwxr-xr-x | logparse.py | 248 |
1 file changed, 248 insertions, 0 deletions
diff --git a/logparse.py b/logparse.py
new file mode 100755
index 0000000..4a21054
--- /dev/null
+++ b/logparse.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+
+import re
+import gzip
+import socket
+import pandas as pd
+import datetime as dt
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+
+# Notes
+# df.to_pickle(filename) serializes a pandas data frame to disk.
+# pd.read_pickle(filename) reads it back.
+
+# Regex pattern to extract key values from each line of an apache/nginx
+# access log. Accommodates PUTs as well as second URLs (normally "-").
+patt = r'(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<date>\d{2}/[a-zA-Z]{3}/\d{4}):(?P<time>\d{2}:\d{2}:\d{2}) (\+|-)\d{4}] ("(GET|POST|PUT) )(?P<path>.*?) HTTP/1.1" (?P<status>\d*) \d* ".*" "(?P<agent>.*)"'
+
+p = re.compile(patt)
+
+columns = {
+    'ipaddress': {},
+    'date': {},
+    'time': {},
+    'path': {},
+    'status': {},
+    'agent': {},
+    }
+
+df = pd.DataFrame(columns)
+
+#files = [
+#    'astroconda_access.log'
+#    ]
+
+files = [
+    'ssb.stsci.edu.access.log-20190715.gz',
+    #'ssb.stsci.edu.access.log-20190716.gz',
+    #'ssb.stsci.edu.access.log-20190717.gz',
+    #'ssb.stsci.edu.access.log-20190718.gz',
+    #'ssb.stsci.edu.access.log-20190719.gz',
+    #'ssb.stsci.edu.access.log-20190720.gz',
+    #'ssb.stsci.edu.access.log-20190721.gz',
+    #'ssb.stsci.edu.access.log-20190722.gz',
+    #'ssb.stsci.edu.access.log',
+    ]
+
+# Addresses of hosts that should be ignored, such as
+# those from which security scan connections come.
+ignore_address = '10.128.19.7'  # Security scan host.
+
+
+class logData():
+
+    def __init__(self, hostnames=False):
+        self.columns = {
+            'ipaddress': {},
+            'hostname': {},
+            'date': {},
+            'time': {},
+            'path': {},
+            'status': {},
+            'agent': {},
+            }
+        self.df = pd.DataFrame(self.columns)
+        self.digest_path = 'digests'
+
+    def process_lines(self, f):
+        '''Extract key values from each line of an open log file and
+        append them to this instance's data frame, which is returned.'''
+        for line in f.readlines():
+            try:
+                line = str(line.decode('utf-8'))
+            except AttributeError:
+                pass  # Line was already a str.
+            if ignore_address in line:
+                continue
+            match = p.match(line)
+            if match is None:
+                continue  # Skip lines the pattern does not recognize.
+            ipaddress = match.group('ipaddress')
+            date = match.group('date')
+            time = match.group('time')
+            path = match.group('path')
+            status = match.group('status')
+            agent = match.group('agent')
+            # Selective polling of hostnames here.
+            hostname = '?'
+            self.df = self.df.append({'ipaddress': ipaddress,
+                                      'hostname': hostname,
+                                      'date': date,
+                                      'time': time,
+                                      'path': path,
+                                      'status': status,
+                                      'agent': agent}, ignore_index=True)
+        return self.df
+
+    def digest_log(self, logfile):
+        '''Read in either a text log file or a gzipped log file, extract
+        key values and store them in a pandas data frame, which is
+        returned.'''
+        if '.gz' in logfile:
+            with gzip.open(logfile, 'r') as f:
+                df = self.process_lines(f)
+        else:
+            with open(logfile, 'r') as f:
+                df = self.process_lines(f)
+        return df
+
+    def read_logs(self, logs):
+        '''Accepts a list of apache/nginx access log files, either raw or
+        .gz, and parses each one that does not already have a corresponding
+        digested data frame in the 'digests' subdir. Returns a data frame
+        holding the combined log data.'''
+        # Create data frame for receiving log data.
+        columns = {
+            'ipaddress': {},
+            'hostname': {},
+            'date': {},
+            'time': {},
+            'path': {},
+            'status': {},
+            'agent': {},
+            }
+        dframe = pd.DataFrame(columns)
+        # Sort the list of logs before processing so data will be
+        # appended in chronological order.
+        for log in sorted(logs):
+            print(log)
+            setname = re.sub(r'\.gz$', '', log)
+            try:
+                part = pd.read_pickle(f'{self.digest_path}/{setname}')
+            except FileNotFoundError:
+                part = self.digest_log(log)
+            dframe = dframe.append(part, ignore_index=True)
+        return dframe
+
+
+"""
+# If a stored data frame already exists, load it, otherwise set about
+# parsing the log files and creating one.
+try:
+    print('Looking for pickled data frame...')
+    raise(FileNotFoundError)
+    #pkg_txns = pd.read_pickle('data.pkl')
+    #pkg_txns = pd.read_pickle('astroconda.org.pkl')
+except(FileNotFoundError):
+    # Iterate over log files and read in values to a master data frame.
+    for fname in files:
+        print(fname)
+        if '.gz' in fname:
+            with gzip.open(fname, 'r') as f:
+                df = process_lines(df, f)
+        else:
+            with open(fname, 'r') as f:
+                df = process_lines(df, f)
+
+# Create frame with only package downloads from conda.
+
+# Conda transactions
+conda_txns = df.loc[df['agent'].str.contains('conda')]
+
+# Package transactions
+pkg_txns = conda_txns.loc[conda_txns['path'].str.contains('bz2')]
+pkg_txns = pkg_txns.loc[pkg_txns['status'].str.contains('200')]
+
+df = pkg_txns
+
+# Of package downloads, compile a list of downloads/day.
+totals = []
+dates = list(set(df['date']))
+dates.sort()
+x = [dt.datetime.strptime(d, '%d/%b/%Y').date() for d in dates]
+y = []
+print(f'length of x list {len(x)}')
+for date in dates:
+    num = len(pkg_txns[pkg_txns.date == date])
+    total = {date: num}
+    totals.append(total)
+    y.append(num)
+
+#plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
+#plt.gca().xaxis.set_major_locator(mdates.DayLocator())
+#plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
+plt.plot(x, y)
+plt.figure(figsize=(2, 2))
+plt.savefig('astroconda_org.png')
+##plt.show()
+###plt.gcf().autofmt_xdate()
+
+internal = pkg_txns[pkg_txns.ipaddress.str.startswith('10.') |
+                    pkg_txns.ipaddress.str.startswith('172.')]
+external = pkg_txns[~(pkg_txns.ipaddress.str.startswith('10.') |
+                      pkg_txns.ipaddress.str.startswith('172.'))]
+
+
+def downloads_by_host(downloads):
+    '''Show hostnames of all currently online hosts whose address
+    appears in the logs.'''
+    dls_by_host = []
+    for addy in set(downloads.ipaddress):
+        tmp = {}
+        pkgs = downloads.path[downloads.ipaddress == addy]
+        tmp['ipaddress'] = addy
+        tmp['downloads'] = len(pkgs)
+        # Assuming all packages requested by a given host are for the
+        # same platform.
+        path = pkgs.iloc[0]
+        if 'linux-64' in path:  # index here is not the right way to do it
+            tmp['os'] = 'linux'
+        elif 'osx-64' in path:
+            tmp['os'] = 'osx'
+        else:
+            tmp['os'] = 'os?'
+        try:
+            tmp['hostname'] = socket.gethostbyaddr(addy)[0]
+            #tmp['hostname'] = '?'
+        except:
+            tmp['hostname'] = 'Not online?'
+        dls_by_host.append(tmp)
+    return dls_by_host
+
+
+internal_by_host = downloads_by_host(internal)
+internal_by_host = sorted(internal_by_host, key=lambda k: k['downloads'])
+internal_by_host.reverse()
+print('Internal')
+for host in internal_by_host:
+    print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
+
+external_by_host = downloads_by_host(external)
+external_by_host = sorted(external_by_host, key=lambda k: k['downloads'])
+external_by_host.reverse()
+print('External')
+for host in external_by_host:
+    print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
+"""
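For reference, here is how the compiled pattern behaves on a single access-log line. This is a minimal sketch: the sample line, its field values, and the 203.0.113.5 address (a documentation range) are made up for illustration; only `patt` and `p` come from the file above.

```python
import re

patt = r'(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<date>\d{2}/[a-zA-Z]{3}/\d{4}):(?P<time>\d{2}:\d{2}:\d{2}) (\+|-)\d{4}] ("(GET|POST|PUT) )(?P<path>.*?) HTTP/1.1" (?P<status>\d*) \d* ".*" "(?P<agent>.*)"'
p = re.compile(patt)

# Hypothetical access-log line in the combined format the pattern expects.
line = ('203.0.113.5 - - [15/Jul/2019:06:25:19 -0400] '
        '"GET /astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2 HTTP/1.1" '
        '200 4506122 "-" "conda/4.7.5 requests/2.21.0"')

match = p.match(line)
print(match.group('ipaddress'))  # 203.0.113.5
print(match.group('date'))       # 15/Jul/2019
print(match.group('path'))       # /astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2
print(match.group('agent'))      # conda/4.7.5 requests/2.21.0
```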
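A minimal usage sketch for the class itself, assuming the method signatures above: the log filename is the one left uncommented in `files`, and the conda/bz2/200 filters mirror the commented-out analysis block at the bottom of the file.

```python
ld = logData()
frame = ld.read_logs(['ssb.stsci.edu.access.log-20190715.gz'])

# Keep only successful conda package downloads.
conda_txns = frame.loc[frame['agent'].str.contains('conda')]
pkg_txns = conda_txns.loc[conda_txns['path'].str.contains('bz2')]
pkg_txns = pkg_txns.loc[pkg_txns['status'].str.contains('200')]
print(f'{len(pkg_txns)} conda package downloads')
```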
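The commented-out block tallies downloads per day with a manual loop over unique dates; a pandas groupby gives the same counts more directly. A sketch assuming the `pkg_txns` frame from the previous example, with `date` strings like `15/Jul/2019`.

```python
import pandas as pd
import matplotlib.pyplot as plt

# Count package downloads per calendar day, then sort chronologically.
per_day = pkg_txns.groupby('date').size()
per_day.index = pd.to_datetime(per_day.index, format='%d/%b/%Y')
per_day = per_day.sort_index()

fig, ax = plt.subplots()
ax.plot(per_day.index, per_day.values)
ax.set_ylabel('package downloads/day')
fig.autofmt_xdate()
fig.savefig('downloads_per_day.png')
```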