path: root/conmets/main.py
blob: b7d02bb6f52236a734d16776a1595124daa04a82
import argparse
from operator import add
from conmets.conmets import *
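# The wildcard import above is expected to supply the helpers used unqualified
# below (e.g. glob, pd, OrderedDict, ceil, plt, and LogData).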
import yaml
import urllib.request
from urllib.error import HTTPError

def main():
    ap = argparse.ArgumentParser(
            prog='conmets',
            description='Parse and digest apache/nginx access logs in either'
            ' raw or .gz format and produce conda package download stats '
            'summaries.')
    ap.add_argument('dataset_name', type=str,
                    help='Name of dataset file. If file does not exist and'
                    ' log data file names are provided for parsing, this '
                    'file will be created.')
    ap.add_argument('--config',
                    '-c',
                    help='Configuration file used to adjust behavior of the '
                    'program',
                    required=True)
    ap.add_argument('--files',
                    '-f',
                    help='List of log files to parse; raw or .gz files are accepted.'
                    ' Glob syntax is also honored.',
                    nargs='+')
    ap.add_argument('--window',
                    '-w',
                    help='Restrict examination of data to the window of dates'
                    ' provided.\n'
                    ' Format: YYYY.MM.DD-YYYY.MM.DD\n'
                    ' Omitting a date window will operate on all data contained'
                    ' within the given dataset.')
    ap.add_argument('--ignorehosts',
                    '-i',
                    help='IP addresses of hosts to ignore when parsing logs.'
                    ' Useful for saving time by not reading in transactions '
                    'from security scans, etc.',
                    nargs='+')
    args = ap.parse_args()

    # Dataset filename
    dataset_name = args.dataset_name

    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)
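    # Config keys consumed below: 'infrastructure_hosts' and 'internal_host_specs'.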

    files = []
    try:
        for filespec in args.files:
            # glob() returns a (possibly empty) list; sort for deterministic order.
            files.extend(sorted(glob(filespec)))
    except TypeError:
        # args.files is None when no --files argument was supplied.
        print('No log files provided.')
        print(f'Importing existing dataset {dataset_name}.')

    inf_hosts = config['infrastructure_hosts']
    num_inf_hosts = len(inf_hosts)

    # TODO: Should host filtering take place here?
    #       It leaves a disconnect between the pickled data which _may_ have
    #       been culled and the actual data being referenced.
    logproc = LogData(dataset_name, ignore_hosts=args.ignorehosts)
    logproc.read_logs(files)
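    # read_logs() is expected to fold any newly parsed files into the dataset
    # named on the command line (created on first use, updated thereafter).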

    print('writing (potentially updated) dataset')
    logproc.write_dataset()

    # Filtering and analysis begins here
    data = logproc.data
    print(f'num full data rows = {len(data.index)}')

    # Filter out a particular time period for examination
    # Set limits on a time period to examine
    if args.window:
        start = args.window.split('-')[0].replace('.', '-')
        end = args.window.split('-')[1].replace('.', '-')
        window_start = pd.to_datetime(start)
        window_end = pd.to_datetime(end)
        print(f'Filtering based on window {window_start} - {window_end}.')
        data = data[pd.to_datetime(data['date']) >= window_start]
        data = data[pd.to_datetime(data['date']) <= window_end]
        print(f'num windowed data rows = {len(data.index)}')

    all_unique_hosts = list(set(data['ipaddress']))
    #for host in all_unique_hosts:
    #    try:
    #        print(f'{host} {socket.gethostbyaddr(host)[0]}')
    #    except:
    #        print(f'{host} offline?')

    # All packages in a dictionary by channel.
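    # Paths are assumed to look like '/<channel>/<platform>/<package-file>',
    # so the second path component is taken as the channel name.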
    chans = [path.split('/')[1] for path in data['path']]
    chans = list(set(chans))
    chans.sort()
    chan_pkgs = OrderedDict()
    for chan in chans:
        # A trailing '/' is appended so that a channel whose name is a prefix of
        # another channel's name does not also match that channel's paths.
        chan_pkgs[chan] = data[data['path'].str.contains(chan+'/')]

    total_downloads = 0
    for chan in chan_pkgs.keys():
        total_downloads += len(chan_pkgs[chan].index)
    print(f'TOTAL downloads = {total_downloads}')

    # For each channel, generate summary report of the download activity.
    for chan in chan_pkgs.keys():
        print(f'\n\nSummary for channel: {chan}')
        print('-----------------------------')

        pkgs = chan_pkgs[chan]
        # Unique days
        dates = set(pkgs['date'])
        dates = list(dates)
        dates.sort()
        bydate = OrderedDict()

        start_date = dates[0]
        end_date = dates[-1]
        time_range = end_date - start_date
        # Inclusive day count: a window covering a single date counts as one day.
        days_elapsed = time_range.days + 1
        print(f'\nOver the period {start_date.strftime("%m-%d-%Y")} '
              f'to {end_date.strftime("%m-%d-%Y")}')
        print(f'{days_elapsed} days')

        # Downloads per day over time frame
        for date in dates:
            bydate[date] = len(pkgs[pkgs['date'] == date])

        chan_downloads = len(pkgs.index)
        print(f'Downloads: {chan_downloads}')

        print(f'Average downloads per day: {ceil(chan_downloads / days_elapsed)}')

        # Total bandwidth consumed by this channel's use over time frame.
        bytecount = pkgs['size'].sum()
        # 1 GiB = 2**30 bytes.
        gib = bytecount / 2**30
        print(f'Data transferred: {gib:.2f} GiB')

        # Number of unique hosts.
        unique_hosts = set(pkgs['ipaddress'])
        num_unique_hosts = len(unique_hosts)
        print(f'Unique hosts: {num_unique_hosts}')

        ## Unique packages
        unique_pkgs = set(pkgs['path'])
        print(f'Unique full package names {len(unique_pkgs)}')

        # What is the fraction of downloads for each OS?
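        # Only the linux-64 and osx-64 platform directories are tallied here;
        # any other platform directories are not broken out.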
        num_linux_txns = len(pkgs[pkgs['path'].str.contains('linux-64')].index)
        num_osx_txns = len(pkgs[pkgs['path'].str.contains('osx-64')].index)
        pcnt_linux_txns = (num_linux_txns / float(chan_downloads))*100
        pcnt_osx_txns = (num_osx_txns / float(chan_downloads))*100

        # What fraction of total downloads come from non-infrastructure on-site hosts?
        noninf = pkgs[~pkgs['ipaddress'].isin(config['infrastructure_hosts'])]
        total_noninf = len(noninf.index)
        print(f'Non-infrastructure downloads: {total_noninf}')
        print(f'Percentage noninf downloads: {(total_noninf/chan_downloads)*100:.1f}%')

        # What fraction of total downloads come from off-site hosts?
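        # Each internal_host_specs entry is treated as an IP address prefix;
        # the leading '^' anchors it at the start of the address string.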
        int_host_patterns = ['^'+s for s in config['internal_host_specs']]
        offsite = pkgs[~pkgs['ipaddress'].str.contains(
            '|'.join(int_host_patterns), regex=True)]
        num_offsite_hosts = len(set(offsite['ipaddress']))
        print(f'num unique off-site hosts: {num_offsite_hosts}')
        onsite = pkgs[pkgs['ipaddress'].str.contains(
            '|'.join(int_host_patterns), regex=True)]
        num_onsite_hosts = len(set(onsite['ipaddress']))
        print(f'num unique on-site hosts: {num_onsite_hosts}')

        # Transactions originating from the configured infrastructure hosts.
        infra = pkgs[pkgs['ipaddress'].str.contains('|'.join(inf_hosts))]

        # Totals of unique software titles
        # i.e. name without version, hash, py or build iteration values
        # Extract simple package titles from 'path' column of data frame.
        names = list(pkgs['name'])
        unique_names = list(set(names))
        name_statsums = []
        for name in unique_names:
            statsum = {}
            statsum['name'] = name
            statsum['total'] = names.count(name)
            # Sum on- and off-site transactions for each package name
            # 'on-site' means transactions to non-infrastructure hosts.
            name_txns = pkgs[pkgs['name'] == name]

            on_txns = name_txns[name_txns['ipaddress'].str.contains(
                '|'.join(int_host_patterns), regex=True)]
            # Filter out hosts designated as infrastructure hosts in config file.
            on_txns = on_txns[~on_txns['ipaddress'].str.contains(
                '|'.join(inf_hosts))]

            num_onsite_txns = len(on_txns.index)
            statsum['onsite'] = num_onsite_txns

            off_txns = name_txns[~name_txns['ipaddress'].str.contains(
                '|'.join(int_host_patterns), regex=True)]
            num_offsite_txns = len(off_txns.index)
            statsum['offsite'] = num_offsite_txns

            infra_txns = name_txns[name_txns['ipaddress'].str.contains(
                '|'.join(inf_hosts))]
            num_infra_txns = len(infra_txns.index)
            statsum['infra'] = num_infra_txns

            ## Determine which packages are also available via PyPI.
            # A successful response is taken to mean a project of the same name
            # exists on PyPI; an HTTPError (e.g. a 404) means it does not.
            url = f'https://pypi.org/pypi/{name}/json'
            try:
                urllib.request.urlopen(url)
                statsum['pypi'] = True
            except HTTPError:
                statsum['pypi'] = False

            name_statsums.append(statsum)

        name_statsums.sort(key=lambda x: x['total'], reverse=True)
        x_onsite = [i['onsite'] for i in name_statsums]
        x_infra = [i['infra'] for i in name_statsums]
        x_offsite = [i['offsite'] for i in name_statsums]
        y = [i['name'] for i in name_statsums]

        print(f'Number of unique {chan} titles downloaded: {len(unique_names)}')
        # For each unique software name, sum the number of transactions from internal hosts.
        fig, axes = plt.subplots(figsize=(10,25))
        plt.grid(which='major', axis='x')
        plt.title(f'{chan} -- {start_date.strftime("%Y%m%d")} - {end_date.strftime("%Y%m%d")}')
        plt.xlabel('Downloads')
        axes.set_ylim(-1,len(name_statsums))
        axes.tick_params(labeltop=True)

        plt.gca().invert_yaxis()
        width = 1
        barlists = []
        # Horizontal stacked bar chart with off-site, on-site, and infrastructure transactions.
        barlists.append(axes.barh(y, x_offsite, width, edgecolor='white', color='tab:blue'))
        barlists.append(axes.barh(y, x_onsite, width, left=x_offsite, edgecolor='white', color='tab:green'))
        # Sum bars up to this point to correctly stack the subsequent one(s).
        offset = list(map(add, x_offsite, x_onsite))
        barlists.append(axes.barh(y, x_infra, width, left=offset, edgecolor='white', color='tab:olive'))

        for i, statsum in enumerate(name_statsums):
            if statsum['pypi']:
                axes.get_yticklabels()[i].set_color('orange')
                axes.get_yticklabels()[i].set_weight('bold')

        # Annotate plot with additional stats
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        plural = ''
        if days_elapsed > 1:
            plural = 's'
        stats_text = (f'{days_elapsed} day{plural}\n'
                     f'Total Downloads: {chan_downloads}\n'
                     f'Average downloads per day: {ceil(chan_downloads / days_elapsed)}\n'
                     f'Unique titles: {len(unique_names)}\n'
                     f'Data transferred: {gib:.2f} GiB\n'
                     f'Linux transactions: {pcnt_linux_txns:.1f}%\n'
                     f'macOS transactions: {pcnt_osx_txns:.1f}%\n'
                     f'Unique on-site hosts: {num_onsite_hosts}\n'
                     f'Unique off-site hosts: {num_offsite_hosts}\n\n'
                     f'   Orange titles are available on PyPI.')
        axes.text(0.45, 0.05, stats_text, transform=axes.transAxes, fontsize=14, bbox=props)
        axes.legend(['off-site', 'on-site', 'on-site infrastructure'])

        plt.tight_layout()
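        # One PNG is written per channel: <channel>-<start YYYYMMDD>-<end YYYYMMDD>.png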
        short_startdate = start_date.strftime('%Y%m%d')
        short_enddate = end_date.strftime('%Y%m%d')
        plt.savefig(f'{chan}-{short_startdate}-{short_enddate}.png')