Diffstat (limited to 'conmets')
-rwxr-xr-x  conmets/conmets.py | 279 ----------------------------------------
1 file changed, 0 insertions(+), 279 deletions(-)
diff --git a/conmets/conmets.py b/conmets/conmets.py
index b083ede..f92c235 100755
--- a/conmets/conmets.py
+++ b/conmets/conmets.py
@@ -203,282 +203,3 @@ class LogData():
dsname = self.dataset_name
pickle.dump(self.dataset, open(dsname, 'wb'))
-
-#def main():
-# ap = argparse.ArgumentParser(
-# prog='logparse.py',
-# description='Parse and digest apache/nginx access logs in either'
-# ' raw or .gz format.')
-# ap.add_argument('dataset_name', type=str,
-# help='Name of dataset file. If file does not exist and'
-# ' log data file names are provided for parsing, this '
-# 'file will be created.')
-# ap.add_argument('--config',
-# '-c',
-# help='Configuration file used to adjust behavior of the '
-# 'program',
-# required=True)
-# ap.add_argument('--files',
-# '-f',
-#                    help='List of log files to parse; raw or .gz are accepted.'
-#                    ' Glob syntax is also honored.',
-# nargs='+')
-# ap.add_argument('--window',
-# '-w',
-# help='Restrict examination of data to the window of dates'
-# ' provided.\n'
-# ' Format: YYYY.MM.DD-YYYY.MM.DD'
-# ' Omitting a date window will operate on all data contained'
-# ' within the given dataset.')
-# ap.add_argument('--ignorehosts',
-# '-i',
-# help='IP addresses of hosts to ignore when parsing logs.'
-# ' Useful for saving time by not reading in transactions '
-# 'from security scans, etc.',
-# nargs='+')
-# args = ap.parse_args()
-#
-# # Dataset filename
-# dataset_name = args.dataset_name
-#
-# with open(args.config, 'r') as f:
-# config = yaml.safe_load(f)
-#
-# files = []
-# try:
-# for filespec in args.files:
-# expanded = glob(filespec)
-# expanded.sort()
-# if isinstance(expanded, list):
-# for name in expanded:
-# files.append(name)
-# else:
-# files.append(expanded)
-#    except TypeError:
-# print('No log files provided.')
-# print(f'Importing existing dataset {dataset_name}.')
-# pass
-#
-# inf_hosts = config['infrastructure_hosts']
-# num_inf_hosts = len(inf_hosts)
-#
-# # TODO: Should host filtering take place here?
-# # It leaves a disconnect between the pickled data which _may_ have
-# # been culled and the actual data being referenced.
-# logproc = LogData(dataset_name, ignore_hosts=args.ignorehosts)
-# logproc.read_logs(files)
-#
-# print('writing (potentially updated) dataset')
-# logproc.write_dataset()
-#
-# # Filtering and analysis begins here
-# data = logproc.data
-# print(f'num full data rows = {len(data.index)}')
-#
-# # Filter out a particular time period for examination
-# # Set limits on a time period to examine
-# if args.window:
-# start = args.window.split('-')[0].replace('.', '-')
-# end = args.window.split('-')[1].replace('.', '-')
-# window_start = pd.to_datetime(start)
-# window_end = pd.to_datetime(end)
-# print(f'Filtering based on window {window_start} - {window_end}.')
-# data = data[pd.to_datetime(data['date']) >= window_start]
-# data = data[pd.to_datetime(data['date']) <= window_end]
-# print(f'num windowed data rows = {len(data.index)}')
-#
-# all_unique_hosts = list(set(data['ipaddress']))
-# #for host in all_unique_hosts:
-# # try:
-# # print(f'{host} {socket.gethostbyaddr(host)[0]}')
-# # except:
-# # print(f'{host} offline?')
-#
-# # All packages in a dictionary by channel.
-# chans = [path.split('/')[1] for path in data['path']]
-# chans = list(set(chans))
-# chans.sort()
-# chan_pkgs = OrderedDict()
-# for chan in chans:
-# # Trailing '/' added to ensure only a single channel gets stored for each
-# # due to matching overlap depending on length of substring.
-# chan_pkgs[chan] = data[data['path'].str.contains(chan+'/')]
-#
-# total_downloads = 0
-# for chan in chan_pkgs.keys():
-# total_downloads += len(chan_pkgs[chan].index)
-# print(f'TOTAL downloads = {total_downloads}')
-#
-# # For each channel, generate summary report of the download activity.
-# for chan in chan_pkgs.keys():
-# print(f'\n\nSummary for channel: {chan}')
-# print('-----------------------------')
-#
-# pkgs = chan_pkgs[chan]
-# # Unique days
-# dates = set(pkgs['date'])
-# dates = list(dates)
-# dates.sort()
-# bydate = OrderedDict()
-#
-# start_date = dates[0]
-# end_date = dates[-1]
-# time_range = end_date - start_date
-# days_elapsed = time_range.days
-# if days_elapsed == 0:
-# days_elapsed = 1
-# print(f'\nOver the period {start_date.strftime("%m-%d-%Y")} '
-# f'to {end_date.strftime("%m-%d-%Y")}')
-# print(f'{days_elapsed} days')
-#
-# # Downloads per day over time frame
-# for date in dates:
-# bydate[date] = len(pkgs[pkgs['date'] == date])
-#
-# chan_downloads = len(pkgs.index)
-# print(f'Downloads: {chan_downloads}')
-#
-# print(f'Average downloads per day: {ceil(chan_downloads / days_elapsed)}')
-#
-# # Total bandwidth consumed by this channel's use over time frame.
-# bytecount = pkgs['size'].sum()
-#        gib = bytecount / 2**30  # bytes -> GiB
-# print(f'Data transferred: {gib:.2f} GiB')
-#
-# # Number of unique hosts and geographic location
-# unique_hosts = set(pkgs['ipaddress'])
-# num_unique_hosts = len(unique_hosts)
-# print(f'Unique hosts {num_unique_hosts}')
-#
-# ## Unique packages
-# unique_pkgs = set(pkgs['path'])
-# print(f'Unique full package names {len(unique_pkgs)}')
-#
-# # What is the fraction of downloads for each OS?
-# num_linux_txns = len(pkgs[pkgs['path'].str.contains('linux-64')].index)
-# num_osx_txns = len(pkgs[pkgs['path'].str.contains('osx-64')].index)
-# pcnt_linux_txns = (num_linux_txns / float(chan_downloads))*100
-# pcnt_osx_txns = (num_osx_txns / float(chan_downloads))*100
-#
-# # What fraction of total downloads come from non-infrastructure on-site hosts?
-# noninf = pkgs[~pkgs['ipaddress'].isin(config['infrastructure_hosts'])]
-# total_noninf = len(noninf.index)
-# print(f'Non-infrastructure downloads: {total_noninf}')
-# print(f'Percentage noninf downloads: {(total_noninf/chan_downloads)*100:.1f}%')
-#
-# # What fraction of total downloads come from off-site hosts?
-# int_host_patterns = ['^'+s for s in config['internal_host_specs']]
-# offsite = pkgs[~pkgs['ipaddress'].str.contains(
-# '|'.join(int_host_patterns), regex=True)]
-# num_offsite_hosts = len(set(offsite['ipaddress']))
-# print(f'num unique off-site hosts: {num_offsite_hosts}')
-# onsite = pkgs[pkgs['ipaddress'].str.contains(
-# '|'.join(int_host_patterns), regex=True)]
-# num_onsite_hosts = len(set(onsite['ipaddress']))
-# print(f'num unique on-site hosts: {num_onsite_hosts}')
-#
-# infra = pkgs[pkgs['ipaddress'].str.contains('|'.join(inf_hosts))]
-#
-# # Totals of unique software titles
-# # i.e. name without version, hash, py or build iteration values
-# # Extract simple package titles from 'path' column of data frame.
-# names = list(pkgs['name'])
-# unique_names = list(set(names))
-# name_statsums = []
-# for name in unique_names:
-# statsum = {}
-# statsum['name'] = name
-# statsum['total'] = names.count(name)
-# # Sum on- and off-site transactions for each package name
-# # 'on-site' means transactions to non-infrastructure hosts.
-# name_txns = pkgs[pkgs['name'] == name]
-#
-# on_txns = name_txns[name_txns['ipaddress'].str.contains(
-# '|'.join(int_host_patterns), regex=True)]
-# # Filter out hosts designated as infrastructure hosts in config file.
-# on_txns = on_txns[~on_txns['ipaddress'].str.contains(
-# '|'.join(inf_hosts))]
-#
-# num_onsite_txns = len(on_txns.index)
-# statsum['onsite'] = num_onsite_txns
-#
-# off_txns = name_txns[~name_txns['ipaddress'].str.contains(
-# '|'.join(int_host_patterns), regex=True)]
-# num_offsite_txns = len(off_txns.index)
-# statsum['offsite'] = num_offsite_txns
-#
-# infra_txns = name_txns[name_txns['ipaddress'].str.contains(
-# '|'.join(inf_hosts))]
-# num_infra_txns = len(infra_txns.index)
-# statsum['infra'] = num_infra_txns
-#
-# ## Determine which packages are also available via PyPI
-# url = f'https://pypi.org/pypi/{name}/json'
-# try:
-# rq = urllib.request.urlopen(url)
-# #pl = f.read().decode('utf-8')
-# #piinfo = json.loads(pl)
-# statsum['pypi'] = True
-#            except HTTPError:
-# statsum['pypi'] = False
-# #statsum['pypi'] = False
-#
-# name_statsums.append(statsum)
-#
-# name_statsums.sort(key=lambda x: x['total'], reverse=True)
-# x_onsite = [i['onsite'] for i in name_statsums]
-# x_infra = [i['infra'] for i in name_statsums]
-# x_offsite = [i['offsite'] for i in name_statsums]
-# y = [i['name'] for i in name_statsums]
-#
-# print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
-#        # For each unique software name, sum the number of transactions from internal hosts.
-# fig, axes = plt.subplots(figsize=(10,25))
-# plt.grid(which='major', axis='x')
-# plt.title(f'{chan} -- {start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
-# plt.xlabel('Downloads')
-# axes.set_ylim(-1,len(name_statsums))
-# axes.tick_params(labeltop=True)
-#
-# plt.gca().invert_yaxis()
-# width = 1
-# from operator import add
-# barlists = []
-# # Horizontal stacked bar chart with off-site, on-site, and infrastructure transactions.
-# barlists.append(axes.barh(y, x_offsite, width, edgecolor='white', color='tab:blue'))
-# barlists.append(axes.barh(y, x_onsite, width, left=x_offsite, edgecolor='white', color='tab:green'))
-# # Sum bars up to this point to correctly stack the subsequent one(s).
-# offset = list(map(add, x_offsite, x_onsite))
-# barlists.append(axes.barh(y, x_infra, width, left=offset, edgecolor='white', color='tab:olive'))
-#
-# for i,statsum in enumerate(name_statsums):
-# if statsum['pypi'] == True:
-# axes.get_yticklabels()[i].set_color('orange')
-# axes.get_yticklabels()[i].set_weight('bold')
-#
-# # Annotate plot with additional stats
-# props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
-# plural = ''
-# if days_elapsed > 1:
-# plural = 's'
-# stats_text = (f'{days_elapsed} day{plural}\n'
-# f'Total Downloads: {chan_downloads}\n'
-# f'Average downloads per day: {ceil(chan_downloads / days_elapsed)}\n'
-# f'Unique titles: {len(unique_names)}\n'
-# f'Data transferred: {gib:.2f} GiB\n'
-# f'Linux transactions: {pcnt_linux_txns:.1f}%\n'
-#                      f'macOS transactions: {pcnt_osx_txns:.1f}%\n'
-# f'Unique on-site hosts: {num_onsite_hosts}\n'
-# f'Unique off-site hosts: {num_offsite_hosts}\n')
-# axes.text(0.45, 0.05, stats_text, transform=axes.transAxes, fontsize=14, bbox=props)
-# axes.legend(['off-site', 'on-site', 'on-site infrastructure'])
-#
-# plt.tight_layout()
-# short_startdate = start_date.strftime('%Y%m%d')
-# short_enddate = end_date.strftime('%Y%m%d')
-# plt.savefig(f'{chan}-{short_startdate}-{short_enddate}.png')
-#
-#
-#if __name__ == "__main__":
-# main()
-