author     Joseph Hunkeler <jhunkeler@gmail.com>   2018-03-19 14:01:24 -0400
committer  Joseph Hunkeler <jhunkeler@gmail.com>   2018-03-19 14:01:24 -0400
commit     a9deed01d113b9f0c24afe8f39db3f440f1dfe3c (patch)
tree       82aff1e99164f021f27b497a23e4d0ea74f5e7ac
download   pipeline-backup-a9deed01d113b9f0c24afe8f39db3f440f1dfe3c.tar.gz
Initial commit
-rw-r--r--   LICENSE.txt           29
-rw-r--r--   README.md              6
-rwxr-xr-x   pipeline_backup.py   212
3 files changed, 247 insertions, 0 deletions
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..3f1b241
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,29 @@
+Copyright (C) 2018 Association of Universities for Research in Astronomy (AURA)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    3. The name of AURA and its representatives may not be used to
+      endorse or promote products derived from this software without
+      specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY AURA ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL AURA BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f46a8f8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# pipeline_backup
+
+Scans the current directory, or a named directory, for spec files matching a pattern. If a valid spec file is found, each component package is downloaded into a path relative to its URL base.
+
+For example, `http://example.com/channel/main/linux-64/package.tar.bz2` becomes `destination/main/linux-64/package.tar.bz2` on the local filesystem.
+
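The README's example keeps the two URL path components preceding the filename (the channel subdirectory and the platform directory) and mirrors them under the destination. A minimal sketch of that mapping, using a hypothetical helper name and urllib.parse rather than the slash-indexing approach the script below actually uses:

import os
from urllib.parse import urlsplit

def local_path_for(url, destination):
    # Keep the two path components before the filename, plus the filename:
    # http://example.com/channel/main/linux-64/package.tar.bz2
    #   -> destination/main/linux-64/package.tar.bz2
    parts = urlsplit(url).path.strip('/').split('/')
    if len(parts) < 3:
        raise ValueError('URL has too few path components')
    return os.path.join(destination, *parts[-3:])

print(local_path_for('http://example.com/channel/main/linux-64/package.tar.bz2', 'destination'))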
diff --git a/pipeline_backup.py b/pipeline_backup.py
new file mode 100755
index 0000000..b32260f
--- /dev/null
+++ b/pipeline_backup.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+import fnmatch
+import os
+import sys
+import time
+from urllib.request import urlopen
+from urllib.error import HTTPError
+
+
+__version__ = '1.0.0'
+
+
+class PipelineSpecError(Exception):
+    pass
+
+
+class PipelineSpec:
+    def __init__(self, filename):
+        self.filename = filename
+        self.data = list()
+        self._read()
+
+    def replace(self, old, new):
+        for idx, record in enumerate(self.data):
+            if old in record:
+                self.data[idx] = record.replace(old, new)
+
+    def search(self, pattern):
+        for record in self.data:
+            if fnmatch.fnmatch(record, pattern):
+                yield record
+
+    def verify(self):
+        with open(self.filename, 'r') as fp:
+            if '@EXPLICIT' not in fp.readlines():
+                return False
+
+        return True
+
+    def _read(self):
+        if not self.verify():
+            raise PipelineSpecError('Invalid spec file: {}'.format(self.filename))
+
+        with open(self.filename, 'r') as fp:
+            data = list()
+            for line in fp:
+                line = line.strip()
+                if not line or line.startswith('#') or line.startswith('@'):
+                    continue
+                data.append(line)
+            self.data = data
+
+
+class Backup:
+    def __init__(self, data, destination):
+        assert isinstance(data, list)
+        assert isinstance(destination, str)
+        self.block_size = 0xFFFF
+        self.destination = os.path.normpath(destination)
+        self.data = data
+        self.stats = dict(
+            read=0,
+            written=0,
+            success=0,
+            skipped=0,
+            fatal=list(),
+            fail=list(),
+        )
+
+    def run(self):
+        for url in self.data:
+            self._download(url)
+
+    def show_stats(self):
+        print("### Statistics ###")
+        for key, value in self.stats.items():
+            fmt = '{:<10s}: {:<20d}'
+            if isinstance(value, list):
+                if len(value) != 0:
+                    fmt += '\n=>'
+                print(fmt.format(key, len(value)))
+                for url, reason in value:
+                    print('    [{}] {}'.format(reason, url))
+                continue
+
+            if key == 'read' or key == 'written':
+                value = value / (1024 ** 2)
+                fmt = '{:<10s}: {:<.02f}MB'
+
+            print(fmt.format(key, value))
+
+    def _download(self, url):
+        path, filename = self._determine_local_path(url)
+        dirpath = os.path.join(self.destination, path)
+        fullpath = os.path.join(dirpath, filename)
+        block_size = self.block_size
+
+        if not os.path.exists(dirpath):
+            os.makedirs(dirpath)
+        elif os.path.exists(fullpath):
+            self.stats['skipped'] += 1
+            return
+
+        try:
+            with urlopen(url) as data:
+                with open(fullpath, 'w+b') as fp:
+                    if self.verbose:
+                        print("Writing: {}".format(fullpath))
+
+                    chunk = data.read(block_size)
+                    self.stats['read'] += len(chunk)
+                    while chunk:
+                        fp.write(chunk)
+                        self.stats['written'] += len(chunk)
+                        chunk = data.read(block_size)
+                        self.stats['read'] += len(chunk)
+        except HTTPError as reason:
+            self.stats['fail'].append([url, reason])
+            return
+        except Exception as reason:
+            self.stats['fatal'].append([url, reason])
+
+        self.stats['success'] += 1
+
+    def _determine_local_path(self, record):
+        assert isinstance(record, str)
+        filename = os.path.basename(record)
+        markers = []
+
+        for i, ch in enumerate(record):
+            if ch == '/':
+                markers.append(i)
+
+        markers_len = len(markers)
+        if markers_len < 3:
+            raise ValueError('Invalid URL part length')
+
+        begin = markers[markers_len - 3] + 1  # start after leading slash
+        end = markers[markers_len - 1]
+
+        local_path = os.path.normpath(os.path.join(
+            self.destination, record[begin:end]))
+        return local_path, filename
+
+
+def find_specs(search_path, pattern):
+    """ Compile list of spec file paths
+    """
+    for root, dirs, files in os.walk(search_path):
+        for filename in files:
+            filename = os.path.join(root, filename)
+            if fnmatch.fnmatch(filename, pattern):
+                yield filename
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--basedir', '-d', default='.',
+                        help='Search for spec files under this path')
+
+    parser.add_argument('--extension', '-e', default='*-py*.txt',
+                        help='Match spec file extension by glob')
+
+    parser.add_argument('--search-pattern', '-s', default='*',
+                        help='Return packages from spec files matching glob pattern')
+
+    parser.add_argument('--replace-pattern', '-r', action='append',
+                        default=list(), nargs='*',
+                        help='Replace pattern in package output strings')
+
+    parser.add_argument('--backup', '-b',
+                        help='Backup packages to root directory (preserve relative tree)')
+    parser.add_argument('--version', action='store_true')
+
+    args = parser.parse_args()
+
+    if args.version:
+        print(__version__)
+        exit(0)
+
+    info = list()
+    for spec in find_specs(args.basedir, args.extension):
+        pspec = PipelineSpec(spec)
+
+        for pattern in args.replace_pattern:
+            old, new = pattern
+            pspec.replace(old, new)
+
+        if args.search_pattern:
+            info += pspec.search(args.search_pattern)
+        else:
+            info += pspec.data
+
+    if not info:
+        print("No spec files found (extension: '{}')".format(args.extension), file=sys.stderr)
+        exit(0)
+
+    info = sorted(set(info))
+
+    if args.backup:
+        if not os.path.exists(args.backup):
+            os.makedirs(args.backup)
+
+        backup = Backup(info, args.backup)
+        backup.verbose = True
+        backup.run()
+        backup.show_stats()
+
+    else:
+        for x in info:
+            print(x)
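For reference, the flow wired up in the __main__ block can also be driven directly from Python. The sketch below is a usage example, not part of the commit: the spec file name delivery-py36.txt and its contents are placeholders, and it assumes a spec file that passes PipelineSpec.verify() (it must carry an @EXPLICIT marker) with one package URL per non-comment line.

from pipeline_backup import PipelineSpec, Backup

# Hypothetical spec file contents (conda "explicit" style):
#   # platform: linux-64
#   @EXPLICIT
#   http://example.com/channel/main/linux-64/package-1.0-py36_0.tar.bz2

spec = PipelineSpec('delivery-py36.txt')        # raises PipelineSpecError if verification fails
spec.replace('http://', 'https://')             # optional substring rewrite of package URLs
urls = sorted(set(spec.search('*linux-64*')))   # glob filter over the remaining URLs

backup = Backup(urls, 'destination')
backup.verbose = True                           # set as an attribute, as in the __main__ block
backup.run()                                    # mirrors destination/main/linux-64/... locally
backup.show_stats()                             # prints read/written/success/skipped/fail/fatal

On the command line, the equivalent run would look something like: ./pipeline_backup.py --basedir . --extension '*-py*.txt' --search-pattern '*linux-64*' --backup destination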