-rwxr-xr-x  logparse.py  53
1 file changed, 42 insertions, 11 deletions
diff --git a/logparse.py b/logparse.py
index baee14d..185f6ae 100755
--- a/logparse.py
+++ b/logparse.py
@@ -182,6 +182,11 @@ def main():
                     help='List of log files to parse, raw or .gz are accepted.'
                     ' glob syntax is also honored.',
                     nargs='+')
+    ap.add_argument('--window',
+                    '-w',
+                    help='Restrict examination of data to the window of dates'
+                    ' provided.\n'
+                    ' Format: YYYY.MM.DD-YYYY.MM.DD')
     ap.add_argument('--ignorehosts',
                     '-i',
                     help='IP addresses of hosts to ignore when parsing logs.'
@@ -238,10 +243,19 @@ def main():
     # Dump data to disk for use during subsequent runs.
     data.to_pickle(datfile)
 
-    # Normalize all conda-dev channel names to astroconda-dev
     data = data.replace('/conda-dev', '/astroconda-dev', regex=True)
+    print(f'num full data rows = {len(data.index)}')
+
+    # Filter out a particular time period for examination
+    window_start = pd.to_datetime('2019-08-22')
+    window_end = pd.to_datetime('2019-08-30')
+    print(f'Filtering based on window {window_start} - {window_end}.')
+    data = data[pd.to_datetime(data['date']) >= window_start]
+    data = data[pd.to_datetime(data['date']) <= window_end]
+    print(f'num windowed data rows = {len(data.index)}')
+
 
     all_unique_hosts = list(set(data['ipaddress']))
 
     #for host in all_unique_hosts:
     #    try:
@@ -305,7 +319,7 @@ def main():
 
     ## Unique packages
     unique_pkgs = set(pkgs['path'])
-    print(f'Unique packages {len(unique_pkgs)}')
+    print(f'Unique full package names {len(unique_pkgs)}')
 
     # Totals of unique package files
     #pkg_totals = []
@@ -332,13 +346,15 @@ def main():
     offsite = pkgs[~pkgs['ipaddress'].str.contains(
         '|'.join(int_host_patterns), regex=True)]
     num_offsite_hosts = len(set(offsite['ipaddress']))
-    print(f'num off-site hosts: {num_offsite_hosts}')
+    print(f'num unique off-site hosts: {num_offsite_hosts}')
     onsite = pkgs[pkgs['ipaddress'].str.contains(
         '|'.join(int_host_patterns), regex=True)]
     num_onsite_hosts = len(set(onsite['ipaddress']))
-    print(f'num on-site hosts: {num_onsite_hosts}')
+    print(f'num unique on-site hosts: {num_onsite_hosts}')
+
+    # Fraction of downloads to off-site hosts
 
-    # Totals of unique software names
+    # Totals of unique software titles
     # i.e. name without version, hash, py or build iteration values
     # Extract simple package titles from 'path' column of data frame.
     names = pkgs['path'].str.replace('/.*/.*/', '', regex=True)
@@ -346,20 +362,35 @@ def main():
     names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
                                    repl,
                                    regex=True))
-    unique_names = set(names)
-    print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
+    unique_names = list(set(names))
     name_totals = []
     for name in unique_names:
         total = names.count(name)
         name_totals.append([name, total])
     name_totals.sort(key=lambda x: x[1], reverse=True)
     y = []
-    x = range(0,len(name_totals))
+    x = [x[0] for x in name_totals]
+    #print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
     for total in name_totals:
         y.append(total[1])
-        print(f'{total[0]}: {total[1]}')
-    plt.plot(x, y)
-    plt.savefig('ding.png')
+        #print(f'{total[0]}: {total[1]}')
+    width = 5.0
+    fig, axes = plt.subplots(figsize=(10,25))
+    plt.grid(which='major', axis='x')
+    plt.title(f'{start_date.strftime("%m-%d-%Y")} - {end_date.strftime("%m-%d-%Y")}')
+    plt.xlabel('Number of downloads')
+    axes.set_ylim(0,len(name_totals))
+    iraf_ids = []
+    for i,name in enumerate(x):
+        if 'iraf' in name:
+            iraf_ids.append(i)
+
+    plt.gca().invert_yaxis()
+    barlist = axes.barh(x,y,1,edgecolor='black')
+    for id in iraf_ids:
+        barlist[id].set_color('grey')
+    plt.tight_layout()
+    plt.savefig(f'{chan}.png')
 
 
 if __name__ == "__main__":
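
Note: this commit adds the --window command-line option, but the filtering block above
still hardcodes window_start and window_end. A minimal sketch of how the option's
'YYYY.MM.DD-YYYY.MM.DD' value could drive the same pandas filter follows; the
parse_window helper and the args wiring are illustrative assumptions, not part of this
commit:

    import pandas as pd

    def parse_window(window):
        # Split a 'YYYY.MM.DD-YYYY.MM.DD' string into start/end pandas Timestamps.
        start_str, end_str = window.split('-')
        start = pd.to_datetime(start_str, format='%Y.%m.%d')
        end = pd.to_datetime(end_str, format='%Y.%m.%d')
        return start, end

    # Hypothetical use inside main(), after args = ap.parse_args():
    # if args.window:
    #     window_start, window_end = parse_window(args.window)
    #     dates = pd.to_datetime(data['date'])
    #     data = data[(dates >= window_start) & (dates <= window_end)]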