Diffstat (limited to 'logparse.py')
-rwxr-xr-x  logparse.py  248
1 file changed, 248 insertions(+), 0 deletions(-)
diff --git a/logparse.py b/logparse.py
new file mode 100755
index 0000000..4a21054
--- /dev/null
+++ b/logparse.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+
+import re
+import gzip
+import socket
+import pandas as pd
+import datetime as dt
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+
+# Notes
+# df.to_pickle(filename) serializes a pandas data frame to disk.
+# pd.read_pickle(filename) reads it back.
+
+# Regex pattern to extract key values from each line of an apache/nginx
+# access log. Accommodates PUTs as well as second URLs (normally "-").
+patt = (r'(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - '
+        r'\[(?P<date>\d{2}/[a-zA-Z]{3}/\d{4}):(?P<time>\d{2}:\d{2}:\d{2}) '
+        r'[+-]\d{4}\] "(GET|POST|PUT) (?P<path>.*?) HTTP/1\.1" '
+        r'(?P<status>\d*) \d* ".*" "(?P<agent>.*)"')
+
+p = re.compile(patt)
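+# Example match against a made-up log line (illustration only; not a real
+# address or package):
+#   m = p.match('203.0.113.5 - - [15/Jul/2019:12:34:56 -0400] '
+#               '"GET /astroconda/linux-64/numpy-1.16.4.tar.bz2 HTTP/1.1" '
+#               '200 1024 "-" "conda/4.7.5"')
+#   m.group('path')  ->  '/astroconda/linux-64/numpy-1.16.4.tar.bz2'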
+
+columns = {
+ 'ipaddress': {},
+ 'date': {},
+ 'time': {},
+ 'path': {},
+ 'status': {},
+ 'agent': {},
+ }
+
+df = pd.DataFrame(columns)
+
+#files = [
+# 'astroconda_access.log'
+# ]
+
+files = [
+ 'ssb.stsci.edu.access.log-20190715.gz',
+ #'ssb.stsci.edu.access.log-20190716.gz',
+ #'ssb.stsci.edu.access.log-20190717.gz',
+ #'ssb.stsci.edu.access.log-20190718.gz',
+ #'ssb.stsci.edu.access.log-20190719.gz',
+ #'ssb.stsci.edu.access.log-20190720.gz',
+ #'ssb.stsci.edu.access.log-20190721.gz',
+ #'ssb.stsci.edu.access.log-20190722.gz',
+ #'ssb.stsci.edu.access.log',
+ ]
+
+# Addresses for hosts that should be ignored, such
+# as those from which security scan connections come.
+ignore_address = '10.128.19.7' # Security scan host.
+
+
+class logData():
+
+ def __init__(self, hostnames=False):
+ self.columns = {
+ 'ipaddress': {},
+ 'hostname': {},
+ 'date': {},
+ 'time': {},
+ 'path': {},
+ 'status': {},
+ 'agent': {},
+ }
+        self.df = pd.DataFrame(self.columns)
+ self.digest_path = 'digests'
+
+
+    def process_lines(self, f):
+        '''Parse an open log file handle line by line and return a data
+        frame holding the extracted fields.'''
+        df = pd.DataFrame(self.columns)
+        for line in f.readlines():
+            try:
+                line = str(line.decode("utf-8"))
+            except AttributeError:
+                pass  # Already a str; the file was opened in text mode.
+            if ignore_address in line:
+                continue
+            match = p.match(line)
+            if match is None:
+                continue  # Not a recognizable access log line.
+            # Selective polling of hostnames here.
+            hostname = '?'
+            df = df.append({'ipaddress': match.group('ipaddress'),
+                            'hostname': hostname,
+                            'date': match.group('date'),
+                            'time': match.group('time'),
+                            'path': match.group('path'),
+                            'status': match.group('status'),
+                            'agent': match.group('agent')},
+                           ignore_index=True)
+        return df
+
+
+    def digest_log(self, logfile):
+        '''Read in either a text log file or a gzipped log file, extract key
+        values and store them in a pandas data frame, which is returned.'''
+        if logfile.endswith('.gz'):
+            with gzip.open(logfile, 'r') as f:
+                df = self.process_lines(f)
+        else:
+            with open(logfile, 'r') as f:
+                df = self.process_lines(f)
+        return df
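+    # Hypothetical digest caching (digest_path is defined in __init__ but
+    # nothing writes digests yet); inside digest_log one might add:
+    #   setname = re.sub(r'\.gz$', '', logfile)
+    #   df.to_pickle(f'{self.digest_path}/{setname}')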
+
+
+    def read_logs(self, logs):
+        '''Accepts a list of apache/nginx access log files, either raw or
+        .gz, and parses each one that does not already have a corresponding
+        digested data frame in the 'digests' subdir. Returns a data frame
+        holding the combined log data.'''
+
+        # Create a data frame for receiving the combined log data.
+        dframe = pd.DataFrame(self.columns)
+
+        # Sort the list of logs before processing so data will be appended
+        # in chronological order.
+        for log in sorted(logs):
+            print(log)
+            setname = re.sub(r'\.gz$', '', log)
+            try:
+                digest = pd.read_pickle(f'{self.digest_path}/{setname}')
+            except FileNotFoundError:
+                digest = self.digest_log(log)
+            dframe = dframe.append(digest, ignore_index=True)
+        return dframe
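+
+# Minimal usage sketch (assumes the log files listed in `files` above are
+# present in the working directory):
+#   parser = logData()
+#   frame = parser.read_logs(files)
+#   print(frame.head())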
+
+
+"""
+# If a stored data frame already exists, load it, otherwise set about
+# parsing the log files and creating one.
+try:
+ print('Looking for pickled data frame...')
+    raise FileNotFoundError  # Force re-parsing of the raw logs for now.
+ #pkg_txns = pd.read_pickle('data.pkl')
+ #pkg_txns = pd.read_pickle('astroconda.org.pkl')
+except FileNotFoundError:
+    # Iterate over the log files and read their values into a master
+    # data frame, using the logData class defined above.
+    df = logData().read_logs(files)
+
+ # Create frame with only package downloads from conda.
+
+ # Conda transactions
+ conda_txns = df.loc[df['agent'].str.contains('conda')]
+
+ # Package transactions
+ pkg_txns = conda_txns.loc[conda_txns['path'].str.contains('bz2')]
+ pkg_txns = pkg_txns.loc[pkg_txns['status'].str.contains('200')]
+
+
+df = pkg_txns
+
+# Of package downloads, compile a list of downloads/day
+
+totals = []
+dates = list(set(df['date']))
+dates.sort()
+x = [dt.datetime.strptime(d, '%d/%b/%Y').date() for d in dates]
+y = []
+print(f'length of x list {len(x)}')
+for date in dates:
+ num = len(pkg_txns[pkg_txns.date == date])
+ total = {date:num}
+ totals.append(total)
+ y.append(num)
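+
+# A more idiomatic per-day tally (a sketch; should agree with the loop
+# above, assuming 'date' holds '%d/%b/%Y' date strings):
+#   per_day = pkg_txns.groupby('date').size()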
+
+# Create the figure before plotting so savefig() captures the plot rather
+# than a new, empty figure.
+plt.figure(figsize=(2,2))
+#plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
+#plt.gca().xaxis.set_major_locator(mdates.DayLocator())
+#plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
+plt.plot(x,y)
+plt.savefig('astroconda_org.png')
+##plt.show()
+###plt.gcf().autofmt_xdate()
+
+internal = pkg_txns[pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.')]
+external = pkg_txns[~(pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.'))]
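+# Note: matching on '10.' and '172.' is a rough heuristic; '172.' is broader
+# than the RFC 1918 block (172.16.0.0/12), but it is close enough for a
+# coarse internal/external split here.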
+
+
+
+
+
+def downloads_by_host(downloads):
+ '''Show hostnames of all currently online hosts whose address appears in
+ the logs.'''
+ dls_by_host = []
+ for addy in set(downloads.ipaddress):
+ tmp = {}
+ pkgs = downloads.path[downloads.ipaddress == addy]
+ tmp['ipaddress'] = addy
+ tmp['downloads'] = len(pkgs)
+ path = pkgs.iloc[0] # Assuming all packages requested by a given host are for the same platform.
+ if 'linux-64' in path: # index here is not the right way to do it
+ tmp['os'] = 'linux'
+ elif 'osx-64' in path:
+ tmp['os'] = 'osx'
+ else:
+ tmp['os'] = 'os?'
+        try:
+            tmp['hostname'] = socket.gethostbyaddr(addy)[0]
+        except (socket.herror, socket.gaierror):
+            tmp['hostname'] = 'Not online?'
+ dls_by_host.append(tmp)
+    return dls_by_host
+
+internal_by_host = downloads_by_host(internal)
+internal_by_host = sorted(internal_by_host, key=lambda k: k['downloads'],
+                          reverse=True)
+print('Internal')
+for host in internal_by_host:
+ print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
+
+
+external_by_host = downloads_by_host(external)
+external_by_host = sorted(external_by_host, key=lambda k: k['downloads'],
+                          reverse=True)
+print('External')
+for host in external_by_host:
+ print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
+"""