about summary refs log tree commit diff
path: root/logparse.py
diff options
context:
space:
mode:
author Matt Rendina <mrendina@stsci.edu> 2019-10-03 09:28:43 -0400
committer Matt Rendina <mrendina@stsci.edu> 2019-10-03 09:28:43 -0400
commit f926d27e96c09c932c12ba92854b6a34dc5088b8 (patch)
tree 69bf67109265cf795ab3fd0bc901c7178f5ec2d1 /logparse.py
parent 710636e32cd57e3f4e3b5a9e86d8162aa376ec1d (diff)
parent 8a79c8190fb6482fce5f3419720dcd626bbb4657 (diff)
download conmets-f926d27e96c09c932c12ba92854b6a34dc5088b8.tar.gz
syncing
Diffstat (limited to 'logparse.py')
-rwxr-xr-x logparse.py 47
1 files changed, 24 insertions, 23 deletions
diff --git a/logparse.py b/logparse.py
index 4c1c7ea..9d36278 100755
--- a/logparse.py
+++ b/logparse.py
@@ -264,8 +264,8 @@ def main():
print(f'num full data rows = {len(data.index)}')
# Filter out a particular time period for examination
- window_start = pd.to_datetime('2019-09-12')
- window_end = pd.to_datetime('2019-09-19')
+ window_start = pd.to_datetime('2019-09-15')
+ window_end = pd.to_datetime('2019-09-21')
print(f'Filtering based on window {window_start} - {window_end}.')
data = data[pd.to_datetime(data['date']) >= window_start]
data = data[pd.to_datetime(data['date']) <= window_end]
@@ -377,33 +377,38 @@ def main():
#names = list(names.str.replace('(?P<simplename>.*)-.*-.*\.tar\.bz2$',
# repl,
# regex=True))
- names = list(set(pkgs['name']))
- print('*')
- print(names)
- print('*')
+ names = list(pkgs['name'])
unique_names = list(set(names))
name_statsums = []
for name in unique_names:
statsum = {}
statsum['name'] = name
statsum['total'] = names.count(name)
- # Not correct. Need to match up unique_names with host associated with
- # each full package name to avoid over broad matches of unique names
- # with substrings of longer names that contain the unique name.
- #statsum['onsite'] = len(pkgs[pkgs['ipaddress'].str.contains(
- # '|'.join(int_host_patterns), regex=True)])
- #statsum['offsite'] = len(pkgs[~pkgs['ipaddress'].str.contains(
- # '|'.join(int_host_patterns), regex=True)])
- name_statsums.append(statsum)
- # Sum on-site transactions for each package name
- #npkgs = pkgs[pkgs['name'
-
+ # Sum on- and off-site transactions for each package name
+ name_txns = pkgs[pkgs['name'] == name]
+
+ on_txns = name_txns[name_txns['ipaddress'].str.contains(
+ '|'.join(int_host_patterns), regex=True)]
+ num_onsite_txns = len(on_txns.index)
+ statsum['onsite'] = num_onsite_txns
+
+ off_txns = name_txns[~name_txns['ipaddress'].str.contains(
+ '|'.join(int_host_patterns), regex=True)]
+ num_offsite_txns = len(off_txns.index)
+ statsum['offsite'] = num_offsite_txns
+
+ name_statsums.append(statsum)
name_statsums.sort(key=lambda x: x['total'], reverse=True)
y = []
y = [i['total'] for i in name_statsums]
+ y_onsite = [i['onsite'] for i in name_statsums]
+ print(f'y_onsite: {y_onsite}')
+ y_offsite = [i['offsite'] for i in name_statsums]
+ print(f'y_offisite: {y_offsite}')
x = [i['name'] for i in name_statsums]
+ print('name_statsums')
print(name_statsums)
# Calculate fractions of properties of each unique package title
@@ -419,14 +424,10 @@ def main():
plt.xlabel('Number of downloads')
axes.set_ylim(0,len(name_statsums))
- #iraf_ids = []
- #for i,name in enumerate(x):
- # if 'iraf' in name:
- # iraf_ids.append(i)
-
plt.gca().invert_yaxis()
width = 1
- barlist = axes.barh(x, y, width, edgecolor='black')
+ barlist = axes.barh(x, y_onsite, width, edgecolor='black')
+ barlist = axes.barh(x, y_offsite, width, left=y_onsite, edgecolor='black')
#for id in iraf_ids:
# barlist[id].set_color('grey')
plt.tight_layout()