From 341ede470217f39059dbf0a9fa03e1a516e66696 Mon Sep 17 00:00:00 2001
From: Matt Rendina
Date: Wed, 16 Oct 2019 10:31:55 -0400
Subject: Add main

---
 conmets/main.py | 282 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 282 insertions(+)
 create mode 100644 conmets/main.py

diff --git a/conmets/main.py b/conmets/main.py
new file mode 100644
index 0000000..b7d02bb
--- /dev/null
+++ b/conmets/main.py
@@ -0,0 +1,282 @@
+import argparse
+import re
+import urllib.request
+from operator import add
+from urllib.error import HTTPError
+
+import yaml
+
+# glob, pd (pandas), plt (matplotlib.pyplot), OrderedDict, ceil, and LogData
+# are expected to be provided by this star import.
+from conmets.conmets import *
+
+
+def main():
+    ap = argparse.ArgumentParser(
+        prog='conmets',
+        description='Parse and digest apache/nginx access logs in either'
+                    ' raw or .gz format and produce conda package download'
+                    ' stats summaries.')
+    ap.add_argument('dataset_name', type=str,
+                    help='Name of the dataset file. If the file does not'
+                         ' exist and log data file names are provided for'
+                         ' parsing, this file will be created.')
+    ap.add_argument('--config',
+                    '-c',
+                    help='Configuration file used to adjust the behavior of'
+                         ' the program.',
+                    required=True)
+    ap.add_argument('--files',
+                    '-f',
+                    help='List of log files to parse; raw or .gz files are'
+                         ' accepted. Glob syntax is also honored.',
+                    nargs='+')
+    ap.add_argument('--window',
+                    '-w',
+                    help='Restrict examination of the data to the window of'
+                         ' dates provided, in the format'
+                         ' YYYY.MM.DD-YYYY.MM.DD. Omitting the window'
+                         ' operates on all data contained within the given'
+                         ' dataset.')
+    ap.add_argument('--ignorehosts',
+                    '-i',
+                    help='IP addresses of hosts to ignore when parsing'
+                         ' logs. Useful for saving time by not reading in'
+                         ' transactions from security scans, etc.',
+                    nargs='+')
+    args = ap.parse_args()
+
+    # Dataset filename.
+    dataset_name = args.dataset_name
+
+    with open(args.config, 'r') as f:
+        config = yaml.safe_load(f)
+
+    files = []
+    try:
+        for filespec in args.files:
+            # glob() always returns a list (empty if nothing matched).
+            expanded = glob(filespec)
+            expanded.sort()
+            files.extend(expanded)
+    except TypeError:
+        print('No log files provided.')
+        print(f'Importing existing dataset {dataset_name}.')
+
+    inf_hosts = config['infrastructure_hosts']
+    # Escape the dots so that IP addresses are matched literally rather
+    # than as regex wildcards.
+    inf_host_pattern = '|'.join(re.escape(h) for h in inf_hosts)
+
+    # TODO: Should host filtering take place here?
+    # It leaves a disconnect between the pickled data, which _may_ have
+    # been culled, and the actual data being referenced.
+    logproc = LogData(dataset_name, ignore_hosts=args.ignorehosts)
+    logproc.read_logs(files)
+
+    print('Writing (potentially updated) dataset.')
+    logproc.write_dataset()
+
+    # Filtering and analysis begin here.
+    data = logproc.data
+    print(f'num full data rows = {len(data.index)}')
+
+    # Restrict examination of the data to the window of dates, if one was
+    # provided.
+    if args.window:
+        start, end = args.window.split('-')
+        window_start = pd.to_datetime(start.replace('.', '-'))
+        window_end = pd.to_datetime(end.replace('.', '-'))
+        print(f'Filtering based on window {window_start} - {window_end}.')
+        dates = pd.to_datetime(data['date'])
+        data = data[(dates >= window_start) & (dates <= window_end)]
+        print(f'num windowed data rows = {len(data.index)}')
+
+    all_unique_hosts = list(set(data['ipaddress']))
+    # Optional reverse-DNS report (slow for large host lists):
+    # for host in all_unique_hosts:
+    #     try:
+    #         print(f'{host} {socket.gethostbyaddr(host)[0]}')
+    #     except socket.herror:
+    #         print(f'{host} offline?')
+
+    # Group all packages into a dictionary keyed by channel.
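+    # Worked example (hypothetical path; the actual layout depends on the
+    # server's URL scheme):
+    #   '/conda-dev/linux-64/conmets-0.1-py37_0.tar.bz2'.split('/')[1]
+    #     -> 'conda-dev'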
+    chans = sorted(set(path.split('/')[1] for path in data['path']))
+    chan_pkgs = OrderedDict()
+    for chan in chans:
+        # The trailing '/' ensures each row matches exactly one channel,
+        # even when one channel name is a substring of another.
+        chan_pkgs[chan] = data[data['path'].str.contains(chan + '/')]
+
+    total_downloads = sum(len(pkgs.index) for pkgs in chan_pkgs.values())
+    print(f'TOTAL downloads = {total_downloads}')
+
+    # For each channel, generate a summary report of the download activity.
+    for chan in chan_pkgs.keys():
+        print(f'\n\nSummary for channel: {chan}')
+        print('-----------------------------')
+
+        pkgs = chan_pkgs[chan]
+        # Unique days, in chronological order.
+        dates = sorted(set(pkgs['date']))
+        bydate = OrderedDict()
+
+        start_date = dates[0]
+        end_date = dates[-1]
+        # Days in the examined period, inclusive of both endpoints.
+        days_elapsed = (end_date - start_date).days + 1
+        print(f'\nOver the period {start_date.strftime("%m-%d-%Y")} '
+              f'to {end_date.strftime("%m-%d-%Y")}')
+        print(f'{days_elapsed} days')
+
+        # Downloads per day over the time frame.
+        for date in dates:
+            bydate[date] = len(pkgs[pkgs['date'] == date])
+
+        chan_downloads = len(pkgs.index)
+        print(f'Downloads: {chan_downloads}')
+        print(f'Average downloads per day: '
+              f'{ceil(chan_downloads / days_elapsed)}')
+
+        # Total bandwidth consumed by this channel's use over the time
+        # frame. A GiB is 2**30 bytes; dividing by 1e9 would give GB.
+        bytecount = pkgs['size'].sum()
+        gib = bytecount / 2**30
+        print(f'Data transferred: {gib:.2f} GiB')
+
+        # Number of unique hosts.
+        unique_hosts = set(pkgs['ipaddress'])
+        num_unique_hosts = len(unique_hosts)
+        print(f'Unique hosts: {num_unique_hosts}')
+
+        # Unique packages.
+        unique_pkgs = set(pkgs['path'])
+        print(f'Unique full package names: {len(unique_pkgs)}')
+
+        # What fraction of downloads does each OS account for?
+        num_linux_txns = len(pkgs[pkgs['path'].str.contains('linux-64')].index)
+        num_osx_txns = len(pkgs[pkgs['path'].str.contains('osx-64')].index)
+        pcnt_linux_txns = (num_linux_txns / float(chan_downloads)) * 100
+        pcnt_osx_txns = (num_osx_txns / float(chan_downloads)) * 100
+
+        # What fraction of total downloads comes from non-infrastructure
+        # on-site hosts?
+        noninf = pkgs[~pkgs['ipaddress'].isin(inf_hosts)]
+        total_noninf = len(noninf.index)
+        print(f'Non-infrastructure downloads: {total_noninf}')
+        print(f'Percentage noninf downloads: '
+              f'{(total_noninf / chan_downloads) * 100:.1f}%')
+
+        # What fraction of total downloads comes from off-site hosts?
+        int_host_patterns = ['^' + s for s in config['internal_host_specs']]
+        offsite = pkgs[~pkgs['ipaddress'].str.contains(
+            '|'.join(int_host_patterns), regex=True)]
+        num_offsite_hosts = len(set(offsite['ipaddress']))
+        print(f'num unique off-site hosts: {num_offsite_hosts}')
+        onsite = pkgs[pkgs['ipaddress'].str.contains(
+            '|'.join(int_host_patterns), regex=True)]
+        num_onsite_hosts = len(set(onsite['ipaddress']))
+        print(f'num unique on-site hosts: {num_onsite_hosts}')
+
+        # Totals of unique software titles, i.e. the package name without
+        # version, hash, python version, or build iteration values, as
+        # found in the 'name' column of the data frame.
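+        # Worked example (hypothetical package file): the path
+        # '/conda-dev/linux-64/conmets-0.1-py37_0.tar.bz2' reduces to the
+        # simple title 'conmets'.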
+        names = list(pkgs['name'])
+        unique_names = list(set(names))
+        name_statsums = []
+        for name in unique_names:
+            statsum = {}
+            statsum['name'] = name
+            statsum['total'] = names.count(name)
+            # Sum on- and off-site transactions for each package name.
+            # 'on-site' means transactions from internal hosts that are
+            # not infrastructure hosts.
+            name_txns = pkgs[pkgs['name'] == name]
+
+            on_txns = name_txns[name_txns['ipaddress'].str.contains(
+                '|'.join(int_host_patterns), regex=True)]
+            # Filter out hosts designated as infrastructure hosts in the
+            # config file.
+            on_txns = on_txns[~on_txns['ipaddress'].str.contains(
+                inf_host_pattern)]
+            statsum['onsite'] = len(on_txns.index)
+
+            off_txns = name_txns[~name_txns['ipaddress'].str.contains(
+                '|'.join(int_host_patterns), regex=True)]
+            statsum['offsite'] = len(off_txns.index)
+
+            infra_txns = name_txns[name_txns['ipaddress'].str.contains(
+                inf_host_pattern)]
+            statsum['infra'] = len(infra_txns.index)
+
+            # Determine which packages are also available via PyPI; the
+            # JSON endpoint returns 404 for names PyPI does not host.
+            url = f'https://pypi.org/pypi/{name}/json'
+            try:
+                urllib.request.urlopen(url)
+                statsum['pypi'] = True
+            except HTTPError:
+                statsum['pypi'] = False
+
+            name_statsums.append(statsum)
+
+        name_statsums.sort(key=lambda x: x['total'], reverse=True)
+        x_onsite = [i['onsite'] for i in name_statsums]
+        x_infra = [i['infra'] for i in name_statsums]
+        x_offsite = [i['offsite'] for i in name_statsums]
+        y = [i['name'] for i in name_statsums]
+
+        print(f'Number of unique {chan} titles downloaded: '
+              f'{len(unique_names)}')
+
+        # Horizontal stacked bar chart of downloads per title, split into
+        # off-site, on-site, and infrastructure transactions.
+        fig, axes = plt.subplots(figsize=(10, 25))
+        plt.grid(which='major', axis='x')
+        plt.title(f'{chan} -- {start_date.strftime("%Y%m%d")} - '
+                  f'{end_date.strftime("%Y%m%d")}')
+        plt.xlabel('Downloads')
+        axes.set_ylim(-1, len(name_statsums))
+        axes.tick_params(labeltop=True)
+        plt.gca().invert_yaxis()
+
+        width = 1
+        barlists = []
+        barlists.append(axes.barh(y, x_offsite, width, edgecolor='white',
+                                  color='tab:blue'))
+        barlists.append(axes.barh(y, x_onsite, width, left=x_offsite,
+                                  edgecolor='white', color='tab:green'))
+        # Sum the bars plotted so far to correctly stack the subsequent
+        # one(s).
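+        # Worked example (made-up counts): x_offsite = [5, 2] and
+        # x_onsite = [3, 4] give offset = [8, 6], so each title's
+        # infrastructure bar begins where its on-site bar ends.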
+        offset = list(map(add, x_offsite, x_onsite))
+        barlists.append(axes.barh(y, x_infra, width, left=offset,
+                                  edgecolor='white', color='tab:olive'))
+
+        # Highlight the titles that are also available on PyPI.
+        for i, statsum in enumerate(name_statsums):
+            if statsum['pypi']:
+                axes.get_yticklabels()[i].set_color('orange')
+                axes.get_yticklabels()[i].set_weight('bold')
+
+        # Annotate the plot with additional stats.
+        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
+        plural = 's' if days_elapsed > 1 else ''
+        stats_text = (f'{days_elapsed} day{plural}\n'
+                      f'Total Downloads: {chan_downloads}\n'
+                      f'Average downloads per day: '
+                      f'{ceil(chan_downloads / days_elapsed)}\n'
+                      f'Unique titles: {len(unique_names)}\n'
+                      f'Data transferred: {gib:.2f} GiB\n'
+                      f'Linux transactions: {pcnt_linux_txns:.1f}%\n'
+                      f'macOS transactions: {pcnt_osx_txns:.1f}%\n'
+                      f'Unique on-site hosts: {num_onsite_hosts}\n'
+                      f'Unique off-site hosts: {num_offsite_hosts}\n\n'
+                      f'  Orange titles are available on PyPI.')
+        axes.text(0.45, 0.05, stats_text, transform=axes.transAxes,
+                  fontsize=14, bbox=props)
+        axes.legend(['off-site', 'on-site', 'on-site infrastructure'])
+
+        plt.tight_layout()
+        short_startdate = start_date.strftime('%Y%m%d')
+        short_enddate = end_date.strftime('%Y%m%d')
+        plt.savefig(f'{chan}-{short_startdate}-{short_enddate}.png')
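+
+
+# Allow the module to be run directly. (A convenience added here on the
+# assumption that the installed console script invokes main() as its
+# entry point.)
+if __name__ == '__main__':
+    main()
-- cgit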