From 0247e7567454684d6eb03e89452d61e397149983 Mon Sep 17 00:00:00 2001 From: Fabian Date: Fri, 19 Nov 2021 11:47:15 -0500 Subject: [PATCH] Fix fs2json.py and copy-to-sha256.py for the new shorter hash --- tools/copy-to-sha256.py | 22 ++++++++++++---------- tools/fs2json.py | 26 ++++++++++++++++++++------ 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/tools/copy-to-sha256.py b/tools/copy-to-sha256.py index 1839bd09..c1b8a01c 100755 --- a/tools/copy-to-sha256.py +++ b/tools/copy-to-sha256.py @@ -8,12 +8,13 @@ import hashlib import shutil import tarfile +HASH_LENGTH = 8 -def hash_file(filename): +def hash_file(filename) -> str: with open(filename, "rb", buffering=0) as f: return hash_fileobj(f) -def hash_fileobj(f): +def hash_fileobj(f) -> str: h = hashlib.sha256() for b in iter(lambda: f.read(128*1024), b""): h.update(b) @@ -42,9 +43,9 @@ def main(): if tar: handle_tar(logger, tar, to_path) else: - handle_dir(logger, path, to_path) + handle_dir(logger, from_path, to_path) -def handle_dir(logger, from_path, to_path): +def handle_dir(logger, from_path: str, to_path: str): def onerror(oserror): logger.warning(oserror) @@ -62,8 +63,9 @@ def handle_dir(logger, from_path, to_path): if stat.S_ISLNK(mode) or stat.S_ISCHR(mode) or stat.S_ISBLK(mode) or stat.S_ISFIFO(mode) or stat.S_ISSOCK(mode): continue - sha256 = hash_file(absname) - to_abs = os.path.join(to_path, sha256) + file_hash = hash_file(absname) + filename = file_hash[0:HASH_LENGTH] + ".bin" + to_abs = os.path.join(to_path, filename) if os.path.exists(to_abs): logger.info("Exists, skipped {} ({})".format(to_abs, absname)) @@ -71,13 +73,13 @@ def handle_dir(logger, from_path, to_path): logger.info("cp {} {}".format(absname, to_abs)) shutil.copyfile(absname, to_abs) -def handle_tar(logger, tar, to_path): +def handle_tar(logger, tar, to_path: str): for member in tar.getmembers(): if member.isfile() or member.islnk(): f = tar.extractfile(member) - sha256 = hash_fileobj(f) - - to_abs = os.path.join(to_path, sha256) + file_hash = hash_fileobj(f) + filename = file_hash[0:HASH_LENGTH] + ".bin" + to_abs = os.path.join(to_path, filename) if os.path.exists(to_abs): logger.info("Exists, skipped {} ({})".format(to_abs, member.name)) diff --git a/tools/fs2json.py b/tools/fs2json.py index 99a95ef4..1a786515 100755 --- a/tools/fs2json.py +++ b/tools/fs2json.py @@ -26,19 +26,21 @@ IDX_GID = 5 # target for symbolic links # child nodes for directories -# sha256 for files +# filename for files IDX_TARGET = 6 -IDX_SHA256 = 6 +IDX_FILENAME = 6 + +HASH_LENGTH = 8 S_IFLNK = 0xA000 S_IFREG = 0x8000 S_IFDIR = 0x4000 -def hash_file(filename): +def hash_file(filename) -> str: with open(filename, "rb", buffering=0) as f: return hash_fileobj(f) -def hash_fileobj(f): +def hash_fileobj(f) -> str: h = hashlib.sha256() for b in iter(lambda: f.read(128*1024), b""): h.update(b) @@ -115,6 +117,7 @@ def handle_dir(logger, path, exclude): prevpath = [] mainroot = [] + filename_to_hash = {} total_size = 0 rootstack = [mainroot] @@ -193,7 +196,12 @@ def handle_dir(logger, path, exclude): target = os.readlink(absname) obj[IDX_TARGET] = target elif isfile: - obj[IDX_SHA256] = hash_file(absname) + file_hash = hash_file(absname) + filename = file_hash[0:HASH_LENGTH] + ".bin" + existing = filename_to_hash.get(filename) + assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash) + filename_to_hash[filename] = file_hash + obj[IDX_FILENAME] = filename while obj[-1] is None: obj.pop() @@ -206,6 +214,7 @@ def handle_dir(logger, path, exclude): def handle_tar(logger, tar): mainroot = [] + filename_to_hash = {} total_size = 0 for member in tar.getmembers(): @@ -230,7 +239,12 @@ def handle_tar(logger, tar): if member.isfile() or member.islnk(): obj[IDX_MODE] |= S_IFREG f = tar.extractfile(member) - obj[IDX_SHA256] = hash_fileobj(f) + file_hash = hash_fileobj(f) + filename = file_hash[0:HASH_LENGTH] + ".bin" + existing = filename_to_hash.get(filename) + assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash) + filename_to_hash[filename] = file_hash + obj[IDX_FILENAME] = filename if member.islnk(): # fix size for hard links f.seek(0, os.SEEK_END)