Diffstat (limited to 'conmets/main.py')
-rw-r--r--  conmets/main.py  282
1 file changed, 282 insertions, 0 deletions
diff --git a/conmets/main.py b/conmets/main.py
new file mode 100644
index 0000000..b7d02bb
--- /dev/null
+++ b/conmets/main.py
@@ -0,0 +1,282 @@
+import argparse
+# Wildcard import is expected to supply LogData, glob, pd, OrderedDict, ceil, and plt used below.
+from conmets.conmets import *
+import yaml
+import urllib.request
+from operator import add  # used to stack bar-chart offsets
+from urllib.error import HTTPError
+
+def main():
+ ap = argparse.ArgumentParser(
+ prog='conmets',
+ description='Parse and digest Apache/nginx access logs in either'
+ ' raw or .gz format and produce conda package download stats '
+ 'summaries.')
+ ap.add_argument('dataset_name', type=str,
+ help='Name of the dataset file. If the file does not exist'
+ ' and log data files are provided for parsing, it will'
+ ' be created.')
+ ap.add_argument('--config',
+ '-c',
+ help='Configuration file used to adjust behavior of the '
+ 'program',
+ required=True)
+ ap.add_argument('--files',
+ '-f',
+ help='List of log files to parse; raw or .gz files are accepted.'
+ ' Glob syntax is also honored.',
+ nargs='+')
+ ap.add_argument('--window',
+ '-w',
+ help='Restrict examination of data to the window of dates'
+ ' provided. Format: YYYY.MM.DD-YYYY.MM.DD.'
+ ' Omitting a date window will operate on all data contained'
+ ' within the given dataset.')
+ ap.add_argument('--ignorehosts',
+ '-i',
+ help='IP addresses of hosts to ignore when parsing logs.'
+ ' Useful for saving time by not reading in transactions '
+ 'from security scans, etc.',
+ nargs='+')
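+ # Example invocation (hypothetical paths, shown for illustration only):
+ #   conmets stats.pkl -c config.yml -f logs/access*.log.gz -w 2020.01.01-2020.03.31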
+ args = ap.parse_args()
+
+ # Dataset filename
+ dataset_name = args.dataset_name
+
+ with open(args.config, 'r') as f:
+ config = yaml.safe_load(f)
+
+ files = []
+ try:
+ for filespec in args.files:
+ # glob() returns a list; sort each expansion for deterministic ordering.
+ expanded = sorted(glob(filespec))
+ files.extend(expanded)
+ except TypeError:
+ # args.files is None when no --files argument was given.
+ print('No log files provided.')
+ print(f'Importing existing dataset {dataset_name}.')
+
+ inf_hosts = config['infrastructure_hosts']
+ num_inf_hosts = len(inf_hosts)
+
+ # TODO: Should host filtering take place here?
+ # It leaves a disconnect between the pickled data which _may_ have
+ # been culled and the actual data being referenced.
+ logproc = LogData(dataset_name, ignore_hosts=args.ignorehosts)
+ logproc.read_logs(files)
+
+ print('writing (potentially updated) dataset')
+ logproc.write_dataset()
+
+ # Filtering and analysis begins here
+ data = logproc.data
+ print(f'num full data rows = {len(data.index)}')
+
+ # Filter out a particular time period for examination
+ # Set limits on a time period to examine
+ if args.window:
+ start = args.window.split('-')[0].replace('.', '-')
+ end = args.window.split('-')[1].replace('.', '-')
+ window_start = pd.to_datetime(start)
+ window_end = pd.to_datetime(end)
+ print(f'Filtering based on window {window_start} - {window_end}.')
+ data = data[pd.to_datetime(data['date']) >= window_start]
+ data = data[pd.to_datetime(data['date']) <= window_end]
+ print(f'num windowed data rows = {len(data.index)}')
+
+ all_unique_hosts = list(set(data['ipaddress']))
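+ # Optional reverse-DNS lookup of each unique host (left commented out).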
+ #for host in all_unique_hosts:
+ # try:
+ # print(f'{host} {socket.gethostbyaddr(host)[0]}')
+ # except:
+ # print(f'{host} offline?')
+
+ # All packages in a dictionary by channel.
+ chans = [path.split('/')[1] for path in data['path']]
+ chans = list(set(chans))
+ chans.sort()
+ chan_pkgs = OrderedDict()
+ for chan in chans:
+ # Trailing '/' added to ensure only a single channel gets stored for each
+ # due to matching overlap depending on length of substring.
+ chan_pkgs[chan] = data[data['path'].str.contains(chan+'/')]
+
+ total_downloads = 0
+ for chan in chan_pkgs.keys():
+ total_downloads += len(chan_pkgs[chan].index)
+ print(f'TOTAL downloads = {total_downloads}')
+
+ # For each channel, generate summary report of the download activity.
+ for chan in chan_pkgs.keys():
+ print(f'\n\nSummary for channel: {chan}')
+ print('-----------------------------')
+
+ pkgs = chan_pkgs[chan]
+ # Unique days
+ dates = set(pkgs['date'])
+ dates = list(dates)
+ dates.sort()
+ bydate = OrderedDict()
+
+ start_date = dates[0]
+ end_date = dates[-1]
+ time_range = end_date - start_date
+ # Count days inclusively so a single-day window is reported as one day.
+ days_elapsed = time_range.days + 1
+ print(f'\nOver the period {start_date.strftime("%m-%d-%Y")} '
+ f'to {end_date.strftime("%m-%d-%Y")}')
+ print(f'{days_elapsed} day{"s" if days_elapsed > 1 else ""}')
+
+ # Downloads per day over time frame
+ for date in dates:
+ bydate[date] = len(pkgs[pkgs['date'] == date])
+
+ chan_downloads = len(pkgs.index)
+ print(f'Downloads: {chan_downloads}')
+
+ print(f'Average downloads per day: {ceil(chan_downloads / days_elapsed)}')
+
+ # Total bandwidth consumed by this channel's use over time frame.
+ bytecount = pkgs['size'].sum()
+ gib = bytecount / 2**30  # bytes -> GiB (1 GiB = 2**30 bytes)
+ print(f'Data transferred: {gib:.2f} GiB')
+
+ # Number of unique hosts and geographic location
+ unique_hosts = set(pkgs['ipaddress'])
+ num_unique_hosts = len(unique_hosts)
+ print(f'Unique hosts {num_unique_hosts}')
+
+ ## Unique packages
+ unique_pkgs = set(pkgs['path'])
+ print(f'Unique full package names {len(unique_pkgs)}')
+
+ # What is the fraction of downloads for each OS?
+ num_linux_txns = len(pkgs[pkgs['path'].str.contains('linux-64')].index)
+ num_osx_txns = len(pkgs[pkgs['path'].str.contains('osx-64')].index)
+ pcnt_linux_txns = (num_linux_txns / float(chan_downloads))*100
+ pcnt_osx_txns = (num_osx_txns / float(chan_downloads))*100
+
+ # What fraction of total downloads come from non-infrastructure on-site hosts?
+ noninf = pkgs[~pkgs['ipaddress'].isin(config['infrastructure_hosts'])]
+ total_noninf = len(noninf.index)
+ print(f'Non-infrastructure downloads: {total_noninf}')
+ print(f'Percentage noninf downloads: {(total_noninf/chan_downloads)*100:.1f}%')
+
+ # What fraction of total downloads come from off-site hosts?
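+ # Anchor each internal-host spec at the start of the address so it matches only leading octets.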
+ int_host_patterns = ['^'+s for s in config['internal_host_specs']]
+ offsite = pkgs[~pkgs['ipaddress'].str.contains(
+ '|'.join(int_host_patterns), regex=True)]
+ num_offsite_hosts = len(set(offsite['ipaddress']))
+ print(f'num unique off-site hosts: {num_offsite_hosts}')
+ onsite = pkgs[pkgs['ipaddress'].str.contains(
+ '|'.join(int_host_patterns), regex=True)]
+ num_onsite_hosts = len(set(onsite['ipaddress']))
+ print(f'num unique on-site hosts: {num_onsite_hosts}')
+
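+ # All transactions originating from hosts configured as infrastructure.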
+ infra = pkgs[pkgs['ipaddress'].str.contains('|'.join(inf_hosts))]
+
+ # Totals of unique software titles
+ # i.e. name without version, hash, py or build iteration values
+ # Extract simple package titles from 'path' column of data frame.
+ names = list(pkgs['name'])
+ unique_names = list(set(names))
+ name_statsums = []
+ for name in unique_names:
+ statsum = {}
+ statsum['name'] = name
+ statsum['total'] = names.count(name)
+ # Sum on- and off-site transactions for each package name
+ # 'on-site' means transactions to non-infrastructure hosts.
+ name_txns = pkgs[pkgs['name'] == name]
+
+ on_txns = name_txns[name_txns['ipaddress'].str.contains(
+ '|'.join(int_host_patterns), regex=True)]
+ # Filter out hosts designated as infrastructure hosts in config file.
+ on_txns = on_txns[~on_txns['ipaddress'].str.contains(
+ '|'.join(inf_hosts))]
+
+ num_onsite_txns = len(on_txns.index)
+ statsum['onsite'] = num_onsite_txns
+
+ off_txns = name_txns[~name_txns['ipaddress'].str.contains(
+ '|'.join(int_host_patterns), regex=True)]
+ num_offsite_txns = len(off_txns.index)
+ statsum['offsite'] = num_offsite_txns
+
+ infra_txns = name_txns[name_txns['ipaddress'].str.contains(
+ '|'.join(inf_hosts))]
+ num_infra_txns = len(infra_txns.index)
+ statsum['infra'] = num_infra_txns
+
+ ## Determine which packages are also available via PyPI.
+ # A 200 response from the PyPI JSON API means a project of this name exists; a 404 raises HTTPError.
+ url = f'https://pypi.org/pypi/{name}/json'
+ try:
+ urllib.request.urlopen(url).close()
+ statsum['pypi'] = True
+ except HTTPError:
+ statsum['pypi'] = False
+
+ name_statsums.append(statsum)
+
+ name_statsums.sort(key=lambda x: x['total'], reverse=True)
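+ # Per-title series for the stacked bar chart, ordered most-downloaded first.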
+ x_onsite = [i['onsite'] for i in name_statsums]
+ x_infra = [i['infra'] for i in name_statsums]
+ x_offsite = [i['offsite'] for i in name_statsums]
+ y = [i['name'] for i in name_statsums]
+
+ print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
+ # Figure setup: one horizontal bar per software title.
+ fig, axes = plt.subplots(figsize=(10,25))
+ plt.grid(which='major', axis='x')
+ plt.title(f'{chan} -- {start_date.strftime("%Y%m%d")} - {end_date.strftime("%Y%m%d")}')
+ plt.xlabel('Downloads')
+ axes.set_ylim(-1,len(name_statsums))
+ axes.tick_params(labeltop=True)
+
+ plt.gca().invert_yaxis()
+ width = 1
+ barlists = []
+ # Horizontal stacked bar chart with off-site, on-site, and infrastructure transactions.
+ barlists.append(axes.barh(y, x_offsite, width, edgecolor='white', color='tab:blue'))
+ barlists.append(axes.barh(y, x_onsite, width, left=x_offsite, edgecolor='white', color='tab:green'))
+ # Sum bars up to this point to correctly stack the subsequent one(s).
+ offset = list(map(add, x_offsite, x_onsite))
+ barlists.append(axes.barh(y, x_infra, width, left=offset, edgecolor='white', color='tab:olive'))
+
+ for i, statsum in enumerate(name_statsums):
+ if statsum['pypi']:
+ axes.get_yticklabels()[i].set_color('orange')
+ axes.get_yticklabels()[i].set_weight('bold')
+
+ # Annotate plot with additional stats
+ props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
+ plural = ''
+ if days_elapsed > 1:
+ plural = 's'
+ stats_text = (f'{days_elapsed} day{plural}\n'
+ f'Total Downloads: {chan_downloads}\n'
+ f'Average downloads per day: {ceil(chan_downloads / days_elapsed)}\n'
+ f'Unique titles: {len(unique_names)}\n'
+ f'Data transferred: {gib:.2f} GiB\n'
+ f'Linux transactions: {pcnt_linux_txns:.1f}%\n'
+ f'macOS transactions: {pcnt_osx_txns:.1f}%\n'
+ f'Unique on-site hosts: {num_onsite_hosts}\n'
+ f'Unique off-site hosts: {num_offsite_hosts}\n\n'
+ f' Orange titles are available on PyPI.')
+ axes.text(0.45, 0.05, stats_text, transform=axes.transAxes, fontsize=14, bbox=props)
+ axes.legend(['off-site', 'on-site', 'on-site infrastructure'])
+
+ plt.tight_layout()
+ short_startdate = start_date.strftime('%Y%m%d')
+ short_enddate = end_date.strftime('%Y%m%d')
+ plt.savefig(f'{chan}-{short_startdate}-{short_enddate}.png')