Fix fs2json.py and copy-to-sha256.py for the new shorter hash
This commit is contained in:
parent 7853d8f191
commit 0247e75674
|
@@ -8,12 +8,13 @@ import hashlib
|
||||||
import shutil
|
import shutil
|
||||||
import tarfile
|
import tarfile
|
||||||
|
|
||||||
|
HASH_LENGTH = 8
|
||||||
|
|
||||||
def hash_file(filename):
|
def hash_file(filename) -> str:
|
||||||
with open(filename, "rb", buffering=0) as f:
|
with open(filename, "rb", buffering=0) as f:
|
||||||
return hash_fileobj(f)
|
return hash_fileobj(f)
|
||||||
|
|
||||||
def hash_fileobj(f):
|
def hash_fileobj(f) -> str:
|
||||||
h = hashlib.sha256()
|
h = hashlib.sha256()
|
||||||
for b in iter(lambda: f.read(128*1024), b""):
|
for b in iter(lambda: f.read(128*1024), b""):
|
||||||
h.update(b)
|
h.update(b)
|
||||||
|
@@ -42,9 +43,9 @@ def main():
|
||||||
if tar:
|
if tar:
|
||||||
handle_tar(logger, tar, to_path)
|
handle_tar(logger, tar, to_path)
|
||||||
else:
|
else:
|
||||||
handle_dir(logger, path, to_path)
|
handle_dir(logger, from_path, to_path)
|
||||||
|
|
||||||
def handle_dir(logger, from_path, to_path):
|
def handle_dir(logger, from_path: str, to_path: str):
|
||||||
def onerror(oserror):
|
def onerror(oserror):
|
||||||
logger.warning(oserror)
|
logger.warning(oserror)
|
||||||
|
|
||||||
|
@@ -62,8 +63,9 @@ def handle_dir(logger, from_path, to_path):
|
||||||
if stat.S_ISLNK(mode) or stat.S_ISCHR(mode) or stat.S_ISBLK(mode) or stat.S_ISFIFO(mode) or stat.S_ISSOCK(mode):
|
if stat.S_ISLNK(mode) or stat.S_ISCHR(mode) or stat.S_ISBLK(mode) or stat.S_ISFIFO(mode) or stat.S_ISSOCK(mode):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
sha256 = hash_file(absname)
|
file_hash = hash_file(absname)
|
||||||
to_abs = os.path.join(to_path, sha256)
|
filename = file_hash[0:HASH_LENGTH] + ".bin"
|
||||||
|
to_abs = os.path.join(to_path, filename)
|
||||||
|
|
||||||
if os.path.exists(to_abs):
|
if os.path.exists(to_abs):
|
||||||
logger.info("Exists, skipped {} ({})".format(to_abs, absname))
|
logger.info("Exists, skipped {} ({})".format(to_abs, absname))
|
||||||
|
@@ -71,13 +73,13 @@ def handle_dir(logger, from_path, to_path):
|
||||||
logger.info("cp {} {}".format(absname, to_abs))
|
logger.info("cp {} {}".format(absname, to_abs))
|
||||||
shutil.copyfile(absname, to_abs)
|
shutil.copyfile(absname, to_abs)
|
||||||
|
|
||||||
def handle_tar(logger, tar, to_path):
|
def handle_tar(logger, tar, to_path: str):
|
||||||
for member in tar.getmembers():
|
for member in tar.getmembers():
|
||||||
if member.isfile() or member.islnk():
|
if member.isfile() or member.islnk():
|
||||||
f = tar.extractfile(member)
|
f = tar.extractfile(member)
|
||||||
sha256 = hash_fileobj(f)
|
file_hash = hash_fileobj(f)
|
||||||
|
filename = file_hash[0:HASH_LENGTH] + ".bin"
|
||||||
to_abs = os.path.join(to_path, sha256)
|
to_abs = os.path.join(to_path, filename)
|
||||||
|
|
||||||
if os.path.exists(to_abs):
|
if os.path.exists(to_abs):
|
||||||
logger.info("Exists, skipped {} ({})".format(to_abs, member.name))
|
logger.info("Exists, skipped {} ({})".format(to_abs, member.name))
|
||||||
|
|
|
@@ -26,19 +26,21 @@ IDX_GID = 5
|
||||||
|
|
||||||
# target for symbolic links
|
# target for symbolic links
|
||||||
# child nodes for directories
|
# child nodes for directories
|
||||||
# sha256 for files
|
# filename for files
|
||||||
IDX_TARGET = 6
|
IDX_TARGET = 6
|
||||||
IDX_SHA256 = 6
|
IDX_FILENAME = 6
|
||||||
|
|
||||||
|
HASH_LENGTH = 8
|
||||||
|
|
||||||
S_IFLNK = 0xA000
|
S_IFLNK = 0xA000
|
||||||
S_IFREG = 0x8000
|
S_IFREG = 0x8000
|
||||||
S_IFDIR = 0x4000
|
S_IFDIR = 0x4000
|
||||||
|
|
||||||
def hash_file(filename):
|
def hash_file(filename) -> str:
|
||||||
with open(filename, "rb", buffering=0) as f:
|
with open(filename, "rb", buffering=0) as f:
|
||||||
return hash_fileobj(f)
|
return hash_fileobj(f)
|
||||||
|
|
||||||
def hash_fileobj(f):
|
def hash_fileobj(f) -> str:
|
||||||
h = hashlib.sha256()
|
h = hashlib.sha256()
|
||||||
for b in iter(lambda: f.read(128*1024), b""):
|
for b in iter(lambda: f.read(128*1024), b""):
|
||||||
h.update(b)
|
h.update(b)
|
||||||
|
@@ -115,6 +117,7 @@ def handle_dir(logger, path, exclude):
|
||||||
prevpath = []
|
prevpath = []
|
||||||
|
|
||||||
mainroot = []
|
mainroot = []
|
||||||
|
filename_to_hash = {}
|
||||||
total_size = 0
|
total_size = 0
|
||||||
rootstack = [mainroot]
|
rootstack = [mainroot]
|
||||||
|
|
||||||
|
@@ -193,7 +196,12 @@ def handle_dir(logger, path, exclude):
|
||||||
target = os.readlink(absname)
|
target = os.readlink(absname)
|
||||||
obj[IDX_TARGET] = target
|
obj[IDX_TARGET] = target
|
||||||
elif isfile:
|
elif isfile:
|
||||||
obj[IDX_SHA256] = hash_file(absname)
|
file_hash = hash_file(absname)
|
||||||
|
filename = file_hash[0:HASH_LENGTH] + ".bin"
|
||||||
|
existing = filename_to_hash.get(filename)
|
||||||
|
assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
|
||||||
|
filename_to_hash[filename] = file_hash
|
||||||
|
obj[IDX_FILENAME] = filename
|
||||||
|
|
||||||
while obj[-1] is None:
|
while obj[-1] is None:
|
||||||
obj.pop()
|
obj.pop()
|
||||||
|
@@ -206,6 +214,7 @@ def handle_dir(logger, path, exclude):
|
||||||
|
|
||||||
def handle_tar(logger, tar):
|
def handle_tar(logger, tar):
|
||||||
mainroot = []
|
mainroot = []
|
||||||
|
filename_to_hash = {}
|
||||||
total_size = 0
|
total_size = 0
|
||||||
|
|
||||||
for member in tar.getmembers():
|
for member in tar.getmembers():
|
||||||
|
@@ -230,7 +239,12 @@ def handle_tar(logger, tar):
|
||||||
if member.isfile() or member.islnk():
|
if member.isfile() or member.islnk():
|
||||||
obj[IDX_MODE] |= S_IFREG
|
obj[IDX_MODE] |= S_IFREG
|
||||||
f = tar.extractfile(member)
|
f = tar.extractfile(member)
|
||||||
obj[IDX_SHA256] = hash_fileobj(f)
|
file_hash = hash_fileobj(f)
|
||||||
|
filename = file_hash[0:HASH_LENGTH] + ".bin"
|
||||||
|
existing = filename_to_hash.get(filename)
|
||||||
|
assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
|
||||||
|
filename_to_hash[filename] = file_hash
|
||||||
|
obj[IDX_FILENAME] = filename
|
||||||
if member.islnk():
|
if member.islnk():
|
||||||
# fix size for hard links
|
# fix size for hard links
|
||||||
f.seek(0, os.SEEK_END)
|
f.seek(0, os.SEEK_END)
|
||||||
|
|
Loading…
Reference in a new issue