Diffstat (limited to 'logparse.py')
-rwxr-xr-x  logparse.py  367
1 file changed, 194 insertions(+), 173 deletions(-)
diff --git a/logparse.py b/logparse.py
index 4a21054..4b8edb0 100755
--- a/logparse.py
+++ b/logparse.py
@@ -1,58 +1,32 @@
#!/usr/bin/env python3
+import os
+import sys
import re
+from glob import glob
+import argparse
+from math import ceil
import gzip
import socket
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
-
-# Notes
-# df.to_pickle(filename) for serializing a pandas data frame to disk.
-# df.read_pickle(filename) to get it back.
+from dateutil import parser as dpar
+from collections import OrderedDict
# regex pattern to extract key values from each line of an apache/nginx access log
# Accommodate PUTs as well as second URLs (normally "-")
-patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) - - \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ("(GET|POST|PUT) )(?P<path>.*?) HTTP/1.1" (?P<status>\\d*) \\d* ".*" "(?P<agent>.*)"'
+patt = '(?P<ipaddress>\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}) .* .* \\[(?P<date>\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}):(?P<time>\\d{2}:\\d{2}:\\d{2}) (\\+|\\-)\\d{4}] ".* (?P<path>.*?) .*" (?P<status>\\d*) \\d* ".*" "(?P<agent>.*)"'
p = re.compile(patt)
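The snippet below is a quick, illustrative check of the new pattern (the log line is invented, not taken from real data); it uses the p compiled above and shows what each named group captures.

sample = ('10.0.0.5 - - [15/Jul/2019:06:25:31 -0400] '
          '"GET /astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2 HTTP/1.1" '
          '200 4094 "-" "conda/4.7.5 CPython/3.7.3 Linux/3.10.0 centos/7.6 glibc/2.17"')
m = p.match(sample)
if m:
    print(m.group('ipaddress'))   # 10.0.0.5
    print(m.group('date'))        # 15/Jul/2019
    print(m.group('path'))        # /astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2
    print(m.group('status'))      # 200
    print(m.group('agent'))       # conda/4.7.5 CPython/3.7.3 ...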
-columns = {
- 'ipaddress': {},
- 'date': {},
- 'time': {},
- 'path': {},
- 'status': {},
- 'agent': {},
- }
-
-df = pd.DataFrame(columns)
-
-#files = [
-# 'astroconda_access.log'
-# ]
-
-files = [
- 'ssb.stsci.edu.access.log-20190715.gz',
- #'ssb.stsci.edu.access.log-20190716.gz',
- #'ssb.stsci.edu.access.log-20190717.gz',
- #'ssb.stsci.edu.access.log-20190718.gz',
- #'ssb.stsci.edu.access.log-20190719.gz',
- #'ssb.stsci.edu.access.log-20190720.gz',
- #'ssb.stsci.edu.access.log-20190721.gz',
- #'ssb.stsci.edu.access.log-20190722.gz',
- #'ssb.stsci.edu.access.log',
- ]
-
-# Addresses for hosts that should be ignored, such
-# as those from which security scan connections come.
-ignore_address = '10.128.19.7' # Security scan host.
-
class logData():
- def __init__(self, hostnames=False):
+ def __init__(self,
+ gethostnames=False,
+ ignore_hosts=[]):
self.columns = {
'ipaddress': {},
'hostname': {},
@@ -62,187 +36,234 @@ class logData():
'status': {},
'agent': {},
}
- self.df = pd.DataFrame(columns)
+ self.dframe = pd.DataFrame(self.columns)
self.digest_path = 'digests'
+ self.gethostnames = gethostnames
+ self.ignore_hosts = ignore_hosts
-
- def process_lines(f):
+ def process_lines(self, f):
+ print('process lines')
+ df = pd.DataFrame(self.columns)
+ unparseable = 0
for line in f.readlines():
+ print(line)
try:
line = str(line.decode("utf-8"))
except(AttributeError):
pass
- if ignore_address in line:
+ # Ignore transactions from particular IP addresses as requested.
+ try:
+                if any(host in line for host in self.ignore_hosts):
+                    continue
+ except(TypeError):
continue
try:
match = p.match(line)
except:
+                unparseable += 1
pass
- ipaddress = match.group('ipaddress')
- date = match.group('date')
- time = match.group('time')
- path = match.group('path')
- status = match.group('status')
- agent = match.group('agent')
+ print(match)
+ try:
+ ipaddress = match.group('ipaddress')
+ date = match.group('date')
+ dateobj = dpar.parse(date)
+ time = match.group('time')
+ path = match.group('path')
+ status = match.group('status')
+ agent = match.group('agent')
+ except(AttributeError):
+ unparseable += 1
# Selective polling of hostnames here.
hostname = '?'
- self.df = df.append({'ipaddress':ipaddress,
+ df = df.append({'ipaddress':ipaddress,
'hostname':hostname,
- 'date':date,
+ 'date':dateobj,
'time':time,
'path':path,
'status':status,
'agent':agent}, ignore_index=True)
+ print(f'unparseable lines : {unparseable}')
return(df)
-
- def digest_log(logfile):
- '''Read in either a text log file or a gzipped log file, extract key values
- and store them in a pandas data frame, which is returned.'''
- if '.gz' in fname:
- with gzip.open(fname, 'r') as f:
- df = process_lines(df, f)
- else:
- with open(fname, 'r') as f:
- df = process_lines(df, f)
- return(df)
-
-
- def read_logs(logs):
+ def read_logs(self, logs):
'''Accepts:
- a pandas dataframe to which the log data will be appended.
a list of apache/nginx access log files, either raw or .gz,
and parses each that does not already have a corresponding digested
data frame in the 'digests' subdir.'''
# Create data frame for receiving log data
- columns = {
- 'ipaddress': {},
- 'hostname': {},
- 'date': {},
- 'time': {},
- 'path': {},
- 'status': {},
- 'agent': {},
- }
- dframe = pd.DataFrame(columns)
+ df = pd.DataFrame(self.columns)
+ locframe = pd.DataFrame(self.columns)
# Sort list of logs before processing so data will be appended in
# chronological order.
- for log in logs:
- print(log)
- setname = re.sub(log, '\.gz$', '')
- try:
- dframe = pd.read_pickle(f'digests/{setname}')
- except(FileNotFoundError):
+ for log in sorted(logs):
+ setname = re.sub('\.gz$', '', log)
+ setpath = os.path.join(self.digest_path, setname)
+ pklpath = os.path.join(self.digest_path, f'{setname}.pkl')
+ print(f'ingesting dataset = {setname}')
+ if os.path.isfile(pklpath):
+ df = pd.read_pickle(pklpath)
+ else:
+ print('parsing log file')
if '.gz' in log:
with gzip.open(log, 'r') as f:
- dframe = process_lines(df, f)
+ df = self.process_lines(f)
else:
with open(log, 'r') as f:
- dframe = process_lines(df, f)
- dframe.append(df, ignore_index=True)
- return(dframe)
-
+ df = self.process_lines(f)
+ print(f'df shape = {df.shape}')
+ # Dump digested log data to disk for more efficient repeated use.
+ df.to_pickle(f'{setpath}.pkl')
+ locframe = locframe.append(df, ignore_index=True)
+ print(locframe.shape)
+ return(locframe)
+
+
+
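A short usage sketch for the class above; the file names are hypothetical, and the 'digests' subdirectory is expected to exist, since read_logs() writes a .pkl digest there for each log it parses.

proc = logData(ignore_hosts=['10.128.19.7'])
frame = proc.read_logs(['ssb.stsci.edu.access.log-20190715.gz',
                        'ssb.stsci.edu.access.log-20190716.gz'])
print(frame.shape)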
+def filter_pkgs(df):
+    '''Filter dataframe df down to just the rows that represent
+ successful (HTTP 200) conda package (.bz2 files) downloads.'''
+ inlen = len(df)
+ out = df.loc[df['agent'].str.contains('conda')]
+ print(out)
+ out = out.loc[out['path'].str.contains('bz2')]
+ out = out.loc[out['status'].str.contains('200')]
+ outlen = len(out)
+ print(f'{inlen-outlen} rows removed to leave conda txns only')
+ return(out)
+
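As a rough illustration of the filter above (all values invented), only the first row of this toy frame survives: it alone has a conda user agent, a .bz2 path, and a 200 status.

toy = pd.DataFrame({'agent':  ['conda/4.7.5', 'Mozilla/5.0', 'conda/4.6.14'],
                    'path':   ['/astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2',
                               '/astroconda/linux-64/repodata.json',
                               '/astroconda/noarch/astropy-3.2.1-py_0.tar.bz2'],
                    'status': ['200', '200', '404']})
print(filter_pkgs(toy))  # keeps only the numpy row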
-"""
-# If a stored data frame already exists, load it, otherwise set about
-# parsing the log files and creating one.
-try:
- print('Looking for pickled data frame...')
- raise(FileNotFoundError)
- #pkg_txns = pd.read_pickle('data.pkl')
- #pkg_txns = pd.read_pickle('astroconda.org.pkl')
-except(FileNotFoundError):
- # iterate over log files and read in values to a master data frame.
- for fname in files:
- print(fname)
- if '.gz' in fname:
- with gzip.open(fname, 'r') as f:
- df = process_lines(df, f)
+
+
+def main():
+ ap = argparse.ArgumentParser(
+ prog='logparse.py',
+ description='Parse and digest apache/nginx access logs in either'
+ ' raw or .gz format.')
+ ap.add_argument('--files',
+ '-f',
+ help='List of log files to parse, raw or .gz are accepted.'
+ ' glob syntax is also honored.',
+ nargs='+')
+ ap.add_argument('--ignorehosts',
+ '-i',
+ help='IP addresses of hosts to ignore when parsing logs.'
+ ' Useful for saving time by not reading in transactions '
+ 'from security scans, etc.',
+ nargs='+')
+ args = ap.parse_args()
+
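+    # A hypothetical invocation of the finished script might look like:
+    #   ./logparse.py -f ssb.stsci.edu.access.log-2019071*.gz -i 10.128.19.7
+    # args.files then holds the file specs (expanded below via glob) and
+    # args.ignorehosts is ['10.128.19.7'].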
+ files = []
+ for filespec in args.files:
+ expanded = glob(filespec)
+ expanded.sort()
+ if isinstance(expanded, list):
+ for name in expanded:
+ files.append(name)
else:
- with open(fname, 'r') as f:
- df = process_lines(df, f)
-
- # Create frame with only package downloads from conda.
-
- # Conda transactions
- conda_txns = df.loc[df['agent'].str.contains('conda')]
+ files.append(expanded)
+
+ # TODO: Should host filtering take place here?
+ # It leaves a disconnect between the pickled data which _may_ have been
+ # culled and the actual data being referenced by the inclusion of a file
+    # that has data from an excluded host within it.
+ logproc = logData(ignore_hosts=args.ignorehosts)
+ data = logproc.read_logs(files)
+
+ allpkgs = filter_pkgs(data)
+ allpkgs = allpkgs.sort_values(by='date')
+
+ start_date = allpkgs.iloc[0]['date']
+ end_date = allpkgs.iloc[-1]['date']
+ time_range = end_date - start_date
+ days_elapsed = time_range.days
+ if days_elapsed == 0:
+ days_elapsed = 1
- # Package transactions
- pkg_txns = conda_txns.loc[conda_txns['path'].str.contains('bz2')]
- pkg_txns = pkg_txns.loc[pkg_txns['status'].str.contains('200')]
+ print(f'Over the period {start_date.strftime("%m-%d-%Y")} '
+ f'to {end_date.strftime("%m-%d-%Y")}')
+ print(f'{days_elapsed} days')
+ # Normalize all conda-dev channel names to astroconda-dev
+ allpkgs = allpkgs.replace('/conda-dev', '/astroconda-dev', regex=True)
-df = pkg_txns
+ # All packages in a dictionary by channel.
+ chans = [path.split('/')[1] for path in allpkgs['path']]
+ chans = set(chans)
+ chan_pkgs = {}
+ for chan in chans:
+ # Trailing '/' added to ensure only a single channel gets stored for each
+ # due to matching overlap depending on length of substring.
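+        # For example (channel names here are hypothetical): without the slash,
+        # the substring 'astroconda' would also match '/astroconda-dev/...' paths,
+        # while 'astroconda/' matches only the shorter channel name.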
+ chan_pkgs[chan] = allpkgs[allpkgs['path'].str.contains(chan+'/')]
-# Of package downloads, compile a list of downloads/day
+ # For each channel, generate summary report of the download activity.
+ for chan in chan_pkgs.keys():
+ print(f'\n\nSummary for channel: {chan}')
+ print('-----------------------------')
-totals = []
-dates = list(set(df['date']))
-dates.sort()
-x = [dt.datetime.strptime(d, '%d/%b/%Y').date() for d in dates]
-y = []
-print(f'length of x list {len(x)}')
-for date in dates:
- num = len(pkg_txns[pkg_txns.date == date])
- total = {date:num}
- totals.append(total)
- y.append(num)
+ pkgs = chan_pkgs[chan]
+ # Unique days
+ dates = set(pkgs['date'])
+ dates = list(dates)
+ dates.sort()
+ bydate = OrderedDict()
-#plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
-#plt.gca().xaxis.set_major_locator(mdates.DayLocator())
-#plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=7))
-plt.plot(x,y)
-plt.figure(figsize=(2,2))
-plt.savefig('astroconda_org.png')
-##plt.show()
-###plt.gcf().autofmt_xdate()
+ # Downloads per day over time frame
+ for date in dates:
+ bydate[date] = len(pkgs[pkgs['date'] == date])
+ #for date in bydate:
+ # print(f'{date} : {bydate[date]}')
-internal = pkg_txns[pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.')]
-external = pkg_txns[~(pkg_txns.ipaddress.str.startswith('10.') | pkg_txns.ipaddress.str.startswith('172.'))]
+ total_downloads = len(pkgs.index)
+ print(f'Total downloads: {total_downloads}')
+        # Average downloads per day over the time frame
+ print(f'Average downloads per day: {ceil(total_downloads / days_elapsed)}')
-
+ # Number of unique hosts and geographic location
+ unique_hosts = set(pkgs['ipaddress'])
+ print(f'Unique hosts {len(unique_hosts)}')
+ ## Unique packages
+ unique_pkgs = set(pkgs['path'])
+ print(f'Unique packages {len(unique_pkgs)}')
+
+ # Totals of unique package files
+ #pkg_totals = []
+ #for pkg in unique_pkgs:
+ # total = len(pkgs[pkgs['path'] == pkg].index)
+ # pkg_totals.append([pkg, total])
+ #pkg_totals.sort(key=lambda x: x[1], reverse=True)
+ #if len(unique_pkgs) > 5:
+ # top = 10
+ #else:
+ # top = len(unique_pkgs)
+ #print(f'Top {top} {chan} package filenames:')
+ #for i in range(top):
+ # print(pkg_totals[i])
+
+ # Totals of unique software names
+ # i.e. name without version, hash, py or build iteration values
+ # Extract simple package titles from 'path' column of data frame.
+ names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
+ repl = lambda m: m.group('simplename')
+ names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
+ repl,
+ regex=True))
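+        # For a hypothetical path such as
+        # '/astroconda/linux-64/numpy-1.16.4-py37_0.tar.bz2', the first replace
+        # strips the channel/platform directories (leaving
+        # 'numpy-1.16.4-py37_0.tar.bz2') and the second keeps only the leading
+        # title, 'numpy'.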
+ unique_names = set(names)
+ print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
+ name_totals = []
+ for name in unique_names:
+ total = names.count(name)
+ name_totals.append([name, total])
+ name_totals.sort(key=lambda x: x[1], reverse=True)
+ for total in name_totals:
+ print(f'{total[0]}: {total[1]}')
+
+
+if __name__ == "__main__":
+ main()
-def downloads_by_host(downloads):
- '''Show hostnames of all currently online hosts whose address appears in
- the logs.'''
- dls_by_host = []
- for addy in set(downloads.ipaddress):
- tmp = {}
- pkgs = downloads.path[downloads.ipaddress == addy]
- tmp['ipaddress'] = addy
- tmp['downloads'] = len(pkgs)
- path = pkgs.iloc[0] # Assuming all packages requested by a given host are for the same platform.
- if 'linux-64' in path: # index here is not the right way to do it
- tmp['os'] = 'linux'
- elif 'osx-64' in path:
- tmp['os'] = 'osx'
- else:
- tmp['os'] = 'os?'
- try:
- tmp['hostname'] = socket.gethostbyaddr(addy)[0]
- #tmp['hostname'] = '?'
- except:
- tmp['hostname'] = 'Not online?'
- dls_by_host.append(tmp)
- return(dls_by_host)
-
-internal_by_host = downloads_by_host(internal)
-internal_by_host = sorted(internal_by_host, key = lambda k: k['downloads'])
-internal_by_host.reverse()
-print('Internal')
-for host in internal_by_host:
- print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
-
-
-external_by_host = downloads_by_host(external)
-external_by_host = sorted(external_by_host, key = lambda k: k['downloads'])
-external_by_host.reverse()
-print('External')
-for host in external_by_host:
- print(f"{host['downloads']:<6} {host['ipaddress']:<17} {host['os']:<5} {host['hostname']}")
-"""