From a95ac10538f3232f23dd2f78ae75a119eb9daee7 Mon Sep 17 00:00:00 2001
From: Matt Rendina
Date: Thu, 10 Oct 2019 15:17:19 -0400
Subject: Add dataset conversion tool

---
Reviewer note: the script below differs from the originally committed
version (the output file is now closed via a context manager and the output
path is a named constant), so the blob id on the "index" line refers to the
original content, not to the content added here.

 convertdata.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100755 convertdata.py

diff --git a/convertdata.py b/convertdata.py
new file mode 100755
index 0000000..5ae8b9a
--- /dev/null
+++ b/convertdata.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# Used to merge old-format pickled dataset and separate log file hashes
+# into the newer joined format.
+
+import pickle
+import pandas as pd
+
+datfile = 'dataframe.dat'
+hashfile = 'parsed_files.dat'
+outfile = 'data.p'
+
+# Read dataframe
+frame = pd.read_pickle(datfile)
+
+# Read MD5 list
+# NOTE(review): readlines() keeps each line's trailing newline; confirm the
+# joined format expects the hashes that way.
+with open(hashfile, 'r') as f:
+    hashes = f.readlines()
+
+# Store both in a dict and pickle that dict.
+data = {'file_hashes': hashes,
+        'dataframe': frame}
+
+print(data)
+print('Pickling dict...')
+# Use a context manager so the output file is flushed and closed
+# deterministically instead of relying on garbage collection.
+with open(outfile, 'wb') as out:
+    pickle.dump(data, out)
-- 
cgit