Fix fs2json.py and copy-to-sha256.py for the new shorter hash

This commit is contained in:
Fabian 2021-11-19 11:47:15 -05:00
parent 7853d8f191
commit 0247e75674
2 changed files with 32 additions and 16 deletions

View file

@ -8,12 +8,13 @@ import hashlib
import shutil
import tarfile
# Number of leading characters of the file hash string that are kept when
# building the destination filename (file_hash[0:HASH_LENGTH] + ".bin").
HASH_LENGTH = 8
# Hash the file at `filename`: open it in unbuffered binary mode and return
# the result of hash_fileobj for the open file object.
# (The two `def` lines below are the old and new signature from the diff;
# the new one only adds the `-> str` return annotation.)
def hash_file(filename):
def hash_file(filename) -> str:
with open(filename, "rb", buffering=0) as f:
return hash_fileobj(f)
def hash_fileobj(f):
def hash_fileobj(f) -> str:
h = hashlib.sha256()
for b in iter(lambda: f.read(128*1024), b""):
h.update(b)
@ -42,9 +43,9 @@ def main():
if tar:
handle_tar(logger, tar, to_path)
else:
handle_dir(logger, path, to_path)
handle_dir(logger, from_path, to_path)
def handle_dir(logger, from_path, to_path):
def handle_dir(logger, from_path: str, to_path: str):
def onerror(oserror):
logger.warning(oserror)
@ -62,8 +63,9 @@ def handle_dir(logger, from_path, to_path):
if stat.S_ISLNK(mode) or stat.S_ISCHR(mode) or stat.S_ISBLK(mode) or stat.S_ISFIFO(mode) or stat.S_ISSOCK(mode):
continue
sha256 = hash_file(absname)
to_abs = os.path.join(to_path, sha256)
file_hash = hash_file(absname)
filename = file_hash[0:HASH_LENGTH] + ".bin"
to_abs = os.path.join(to_path, filename)
if os.path.exists(to_abs):
logger.info("Exists, skipped {} ({})".format(to_abs, absname))
@ -71,13 +73,13 @@ def handle_dir(logger, from_path, to_path):
logger.info("cp {} {}".format(absname, to_abs))
shutil.copyfile(absname, to_abs)
def handle_tar(logger, tar, to_path):
def handle_tar(logger, tar, to_path: str):
for member in tar.getmembers():
if member.isfile() or member.islnk():
f = tar.extractfile(member)
sha256 = hash_fileobj(f)
to_abs = os.path.join(to_path, sha256)
file_hash = hash_fileobj(f)
filename = file_hash[0:HASH_LENGTH] + ".bin"
to_abs = os.path.join(to_path, filename)
if os.path.exists(to_abs):
logger.info("Exists, skipped {} ({})".format(to_abs, member.name))

View file

@ -26,19 +26,21 @@ IDX_GID = 5
# target for symbolic links
# child nodes for directories
# sha256 for files
# filename for files (first HASH_LENGTH characters of the file hash + ".bin")
IDX_TARGET = 6
IDX_SHA256 = 6
IDX_FILENAME = 6
# Number of leading characters of the file hash string that are kept when
# building the filename stored at IDX_FILENAME (file_hash[0:HASH_LENGTH] + ".bin").
HASH_LENGTH = 8
S_IFLNK = 0xA000
S_IFREG = 0x8000
S_IFDIR = 0x4000
# Hash the file at `filename`: open it in unbuffered binary mode and return
# the result of hash_fileobj for the open file object.
# (The two `def` lines below are the old and new signature from the diff;
# the new one only adds the `-> str` return annotation.)
def hash_file(filename):
def hash_file(filename) -> str:
with open(filename, "rb", buffering=0) as f:
return hash_fileobj(f)
def hash_fileobj(f):
def hash_fileobj(f) -> str:
h = hashlib.sha256()
for b in iter(lambda: f.read(128*1024), b""):
h.update(b)
@ -115,6 +117,7 @@ def handle_dir(logger, path, exclude):
prevpath = []
mainroot = []
filename_to_hash = {}
total_size = 0
rootstack = [mainroot]
@ -193,7 +196,12 @@ def handle_dir(logger, path, exclude):
target = os.readlink(absname)
obj[IDX_TARGET] = target
elif isfile:
obj[IDX_SHA256] = hash_file(absname)
file_hash = hash_file(absname)
filename = file_hash[0:HASH_LENGTH] + ".bin"
existing = filename_to_hash.get(filename)
assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
filename_to_hash[filename] = file_hash
obj[IDX_FILENAME] = filename
while obj[-1] is None:
obj.pop()
@ -206,6 +214,7 @@ def handle_dir(logger, path, exclude):
def handle_tar(logger, tar):
mainroot = []
filename_to_hash = {}
total_size = 0
for member in tar.getmembers():
@ -230,7 +239,12 @@ def handle_tar(logger, tar):
if member.isfile() or member.islnk():
obj[IDX_MODE] |= S_IFREG
f = tar.extractfile(member)
obj[IDX_SHA256] = hash_fileobj(f)
file_hash = hash_fileobj(f)
filename = file_hash[0:HASH_LENGTH] + ".bin"
existing = filename_to_hash.get(filename)
assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
filename_to_hash[filename] = file_hash
obj[IDX_FILENAME] = filename
if member.islnk():
# fix size for hard links
f.seek(0, os.SEEK_END)