diff options
author | Joseph Hunkeler <jhunkeler@gmail.com> | 2019-09-20 23:25:24 -0400 |
---|---|---|
committer | Joseph Hunkeler <jhunkeler@gmail.com> | 2019-09-20 23:25:24 -0400 |
commit | bf6adac4d192164cded9f0b71fe03461417a5162 (patch) | |
tree | a119dfc3b65160faa566f429a58a337f91ddd8a2 /entomb.py | |
parent | 8c9a82e2d80586e6464b1d0a09bb12c769307fc7 (diff) | |
download | entomb-bf6adac4d192164cded9f0b71fe03461417a5162.tar.gz |
* Added statistics
* Only create directories when they don't exist
* Added verbose mode
Diffstat (limited to 'entomb.py')
-rwxr-xr-x | entomb.py | 52 |
1 file changed, 48 insertions, 4 deletions
```diff
@@ -4,6 +4,31 @@ import os
 import fnmatch
 import requests
 import shutil
+import sys
+import time
+
+
+VERBOSE = False
+ERRORS = []
+STATS = {
+    'files': 0,
+    'urls': 0,
+    'processed': 0,
+    'skipped': 0,
+    'failed': 0,
+    'time_sec': ''
+}
+
+
+def post_info():
+    if ERRORS:
+        print("# Errors:")
+        for msg in ERRORS:
+            print(msg)
+
+    print("# Statistics:")
+    for k, v in STATS.items():
+        print("{:<20s}: {:>10}".format(k, v))
 
 
 def channel_dir(d):
@@ -15,12 +40,16 @@ def channel_dir(d):
 
 def download(url, destdir='.', clobber=True, in_memory=False):
     filename = url.split('/')[-1]
-    if destdir != '.':
+    if destdir != '.' and not os.path.exists(destdir):
+        if VERBOSE:
+            print("Creating directory: {}".format(destdir))
         os.makedirs(destdir, mode=0o775, exist_ok=True)
 
     outfile = os.path.join(destdir, filename);
+
     if not clobber and os.path.exists(outfile):
         print("Skipping: {}".format(outfile))
+        STATS['skipped'] += 1
         return outfile
 
     if not url.startswith('http'):
@@ -35,16 +64,24 @@ def download(url, destdir='.', clobber=True, in_memory=False):
 
     r = requests.get(url, stream = True)
     if r.status_code != 200:
-        print("HTTP ERROR[{}]: Could not download: {}".format(r.status_code, url))
+        msg = "HTTP ERROR[{}]: Could not download: {}".format(r.status_code, url)
+        print(msg, file=sys.stderr)
+        STATS['failed'] += 1
+        ERRORS.append(msg)
         return ""
 
     if in_memory:
         return r.contents
 
-    with open(outfile,"w+b") as fp:
+    if VERBOSE:
+        print("Writing to: {}".format(outfile))
+
+    with open(outfile, "w+b") as fp:
         for chunk in r.iter_content(chunk_size=0xFFFF):
             if chunk:
                 fp.write(chunk)
+
+    STATS['processed'] += 1
     return outfile
 
 
@@ -77,15 +114,18 @@ if __name__ == '__main__':
     parser.add_argument('-o', '--output-dir', required=True, help='Path to output directory')
     parser.add_argument('-c', '--clobber', action='store_true', help='Overwrite existing packages')
     parser.add_argument('-p', '--pattern', action='append', help='Search tree for directories and filenames matching patterns (e.g. \'*/latest-*\')')
+    parser.add_argument('-v', '--verbose', action='store_true', help='Be verbose')
 
     args = parser.parse_args()
     input_dir = args.input_dir
     output_dir = args.output_dir
+    VERBOSE = args.verbose
 
     pattern = ['*']
     if args.pattern:
         pattern = args.pattern
 
+    start_time = time.time()
     for spec in spec_search(input_dir, pattern):
         urls = spec_read(spec)
         channel_parent = channel_dir(spec)
@@ -93,5 +133,9 @@ if __name__ == '__main__':
             channel_sibling = channel_dir(url)
             new_channel = os.path.join(output_dir, channel_parent, channel_sibling)
             download(url, destdir=new_channel, clobber=args.clobber);
+        STATS['urls'] += len(urls)
+        STATS['files'] += 1
 
-
+    stop_time = (time.time() - start_time)
+    STATS['time_sec'] = '{:0.3f}'.format(stop_time)
+    post_info()
```