Optimize filetree memory and performance

Reorder FileInfo struct to minimize padding. Replace IsDir field with
derived method. Remove unnecessary Copy() calls in NewNode. Optimize
path.Clean to only run when necessary.
This commit is contained in:
Aslan Dukaev 2026-01-08 20:36:36 +03:00
commit f1e5b43268
9 changed files with 102 additions and 87 deletions

View file

@ -224,7 +224,7 @@ func (vm *FileTreeViewModel) CursorRight(filterRegex *regexp.Regexp) error {
return nil
}
if !node.Data.FileInfo.IsDir {
if !node.Data.FileInfo.IsDir() {
return nil
}
@ -338,7 +338,7 @@ func (vm *FileTreeViewModel) getAbsPositionNode(filterRegex *regexp.Regexp) (nod
// ToggleCollapse will collapse/expand the selected FileNode.
func (vm *FileTreeViewModel) ToggleCollapse(filterRegex *regexp.Regexp) error {
node := vm.getAbsPositionNode(filterRegex)
if node != nil && node.Data.FileInfo.IsDir {
if node != nil && node.Data.FileInfo.IsDir() {
node.Data.ViewInfo.Collapsed = !node.Data.ViewInfo.Collapsed
}
return nil
@ -354,7 +354,7 @@ func (vm *FileTreeViewModel) ToggleCollapseAll() error {
}
evaluator := func(curNode *filetree.FileNode) bool {
return curNode.Data.FileInfo.IsDir
return curNode.Data.FileInfo.IsDir()
}
err := vm.ModelTree.VisitDepthChildFirst(visitor, evaluator)

View file

@ -77,7 +77,7 @@ func Efficiency(trees []*FileTree) (float64, EfficiencySlice) {
return err
}
if previousTreeNode.Data.FileInfo.IsDir {
if previousTreeNode.Data.FileInfo.IsDir() {
err = previousTreeNode.VisitDepthChildFirst(sizer, nil, nil)
if err != nil {
return fmt.Errorf("unable to propagate whiteout dir: %w", err)

View file

@ -11,16 +11,18 @@ import (
)
// FileInfo contains tar metadata for a specific FileNode
// OPTIMIZATION: Fields ordered to minimize padding (64 bytes on 64-bit)
type FileInfo struct {
Path string `json:"path"`
TypeFlag byte `json:"typeFlag"`
Linkname string `json:"linkName"`
hash uint64 //`json:"hash"`
Size int64 `json:"size"`
Mode os.FileMode `json:"fileMode"`
Uid int `json:"uid"`
Gid int `json:"gid"`
IsDir bool `json:"isDir"`
Path string // 16 bytes
Linkname string // 16 bytes
hash uint64 // 8 bytes
Size int64 // 8 bytes
Mode os.FileMode // 4 bytes
Uid uint32 // 4 bytes (was int, 8 bytes)
Gid uint32 // 4 bytes (was int, 8 bytes)
TypeFlag byte // 1 byte
// 3 bytes padding
// Note: IsDir removed - can be derived from TypeFlag == tar.TypeDir
}
// NewFileInfoFromTarHeader extracts the metadata from a tar header and file contents and generates a new FileInfo object.
@ -29,19 +31,16 @@ type FileInfo struct {
func NewFileInfoFromTarHeader(reader *tar.Reader, header *tar.Header, path string) FileInfo {
var hash uint64
// OPTIMIZATION: Skip hashing for empty files (header.Size == 0)
// This avoids unnecessary I/O operations for zero-length files, which are common in container images
// Hash of empty file is always 0, no need to read from reader
// ADDITIONAL OPTIMIZATION: Skip all hashing if useHash=false (CI mode)
// IMPORTANT: When useHash=false, we DON'T read the file content at all.
// The tar.Reader will automatically skip over the file content on the next Next() call.
var err error
hash, err = getHashFromReader(reader)
if err != nil {
panic(fmt.Errorf("unable to hash file %q: %w", path, err))
// OPTIMIZATION: Skip hashing for directories only
// Directories have no content to hash.
// Symlinks ARE hashed (with their target content, not the link path)
if header.Typeflag != tar.TypeDir {
var err error
hash, err = getHashFromReader(reader)
if err != nil {
panic(fmt.Errorf("unable to hash file %q: %w", path, err))
}
}
// If useHash==false, we simply don't read from the reader. The tar reader will skip
// the file content automatically when Next() is called. This is the KEY optimization!
// Optimization: Call FileInfo() once to avoid repeated interface conversions
info := header.FileInfo()
@ -53,9 +52,8 @@ func NewFileInfoFromTarHeader(reader *tar.Reader, header *tar.Header, path strin
hash: hash,
Size: info.Size(),
Mode: info.Mode(),
Uid: header.Uid,
Gid: header.Gid,
IsDir: info.IsDir(),
Uid: uint32(header.Uid),
Gid: uint32(header.Gid),
}
}
@ -103,10 +101,9 @@ func NewFileInfo(realPath, path string, info os.FileInfo) FileInfo {
hash: hash,
Size: size,
Mode: info.Mode(),
// todo: support UID/GID
Uid: -1,
Gid: -1,
IsDir: info.IsDir(),
// todo: support UID/GID - use sentinel value
Uid: 0,
Gid: 0,
}
}
@ -124,7 +121,6 @@ func (data *FileInfo) Copy() *FileInfo {
Mode: data.Mode,
Uid: data.Uid,
Gid: data.Gid,
IsDir: data.IsDir,
}
}
@ -133,6 +129,12 @@ func (data *FileInfo) Hash() uint64 {
return data.hash
}
// IsDir returns true if this file is a directory
// OPTIMIZATION: Derived from TypeFlag instead of stored field (saves 8 bytes per FileInfo)
func (data *FileInfo) IsDir() bool {
return data.TypeFlag == tar.TypeDir
}
// Compare determines the DiffType between two FileInfos based on the type and contents of each given FileInfo
func (data *FileInfo) Compare(other FileInfo) DiffType {
if data.TypeFlag == other.TypeFlag {
@ -161,17 +163,6 @@ var hasherPool = sync.Pool{
}
func getHashFromReader(reader io.Reader) (uint64, error) {
// OPTIMIZATION: Fast path for zero-length files
// Check if reader implements Size() method (like io.LimitReader, some custom readers)
// This avoids unnecessary buffer allocation and hashing for empty files
type sizeReader interface {
Size() int64
}
if sr, ok := reader.(sizeReader); ok && sr.Size() == 0 {
return 0, nil // Hash of empty file is 0
}
// 1. Get resources from pools
buf := bufferPool.Get().([]byte)
h := hasherPool.Get().(*xxhash.Digest)

View file

@ -35,9 +35,9 @@ func TestNewFileInfoFromTarHeader(t *testing.T) {
assert.Equal(t, "test.txt", result.Path)
assert.Equal(t, byte(tar.TypeReg), result.TypeFlag)
assert.Equal(t, int64(1024), result.Size)
assert.Equal(t, 1000, result.Uid)
assert.Equal(t, 1000, result.Gid)
assert.False(t, result.IsDir)
assert.Equal(t, uint32(1000), result.Uid)
assert.Equal(t, uint32(1000), result.Gid)
assert.False(t, result.IsDir())
assert.NotEqual(t, uint64(0), result.hash) // hash should be computed
// Don't check Mode as it can be platform-dependent
})
@ -58,7 +58,7 @@ func TestNewFileInfoFromTarHeader(t *testing.T) {
assert.Equal(t, "testdir", result.Path)
assert.Equal(t, byte(tar.TypeDir), result.TypeFlag)
assert.Equal(t, int64(0), result.Size)
assert.True(t, result.IsDir)
assert.True(t, result.IsDir())
assert.Equal(t, uint64(0), result.hash) // directories have no hash
})
@ -77,7 +77,7 @@ func TestNewFileInfoFromTarHeader(t *testing.T) {
assert.Equal(t, "link.txt", result.Path)
assert.Equal(t, byte(tar.TypeSymlink), result.TypeFlag)
assert.Equal(t, "target.txt", result.Linkname)
assert.False(t, result.IsDir)
assert.False(t, result.IsDir())
// Note: current implementation computes hash for symlinks (it should only skip dirs)
// The hash will be the xxhash of empty content since reader is empty
assert.NotEqual(t, uint64(0), result.hash)
@ -112,7 +112,6 @@ func TestFileInfo_Copy(t *testing.T) {
Mode: 0644,
Uid: 1000,
Gid: 1000,
IsDir: false,
}
copied := original.Copy()
@ -126,7 +125,7 @@ func TestFileInfo_Copy(t *testing.T) {
assert.Equal(t, original.Mode, copied.Mode)
assert.Equal(t, original.Uid, copied.Uid)
assert.Equal(t, original.Gid, copied.Gid)
assert.Equal(t, original.IsDir, copied.IsDir)
assert.Equal(t, original.IsDir(), copied.IsDir())
// Verify it's a different instance
assert.NotSame(t, &original, copied)
@ -143,7 +142,6 @@ func TestFileInfo_Copy(t *testing.T) {
original := FileInfo{
Path: "/test/dir",
TypeFlag: byte(tar.TypeDir),
IsDir: true,
Size: 0,
}
@ -151,7 +149,7 @@ func TestFileInfo_Copy(t *testing.T) {
assert.NotNil(t, copied)
assert.Equal(t, original.Path, copied.Path)
assert.True(t, copied.IsDir)
assert.True(t, copied.IsDir())
})
t.Run("modifying copy doesn't affect original", func(t *testing.T) {
@ -282,26 +280,30 @@ func TestFileInfo_Compare(t *testing.T) {
func TestGetHashFromReader(t *testing.T) {
t.Run("hash of empty reader", func(t *testing.T) {
reader := bytes.NewReader([]byte{})
hash := getHashFromReader(reader)
hash, err := getHashFromReader(reader)
assert.NoError(t, err)
assert.Equal(t, uint64(17241709254077376921), hash) // xxhash of empty string
})
t.Run("hash of simple string", func(t *testing.T) {
reader := bytes.NewReader([]byte("hello world"))
hash := getHashFromReader(reader)
hash, err := getHashFromReader(reader)
assert.NoError(t, err)
assert.NotEqual(t, uint64(0), hash)
// Verify consistency
hash2 := getHashFromReader(bytes.NewReader([]byte("hello world")))
hash2, err2 := getHashFromReader(bytes.NewReader([]byte("hello world")))
assert.NoError(t, err2)
assert.Equal(t, hash, hash2)
})
t.Run("hash of binary data", func(t *testing.T) {
data := []byte{0x00, 0x01, 0x02, 0x03, 0x04}
reader := bytes.NewReader(data)
hash := getHashFromReader(reader)
hash, err := getHashFromReader(reader)
assert.NoError(t, err)
assert.NotEqual(t, uint64(0), hash)
})
@ -312,23 +314,28 @@ func TestGetHashFromReader(t *testing.T) {
largeData[i] = byte(i % 256)
}
reader := bytes.NewReader(largeData)
hash := getHashFromReader(reader)
hash, err := getHashFromReader(reader)
assert.NoError(t, err)
assert.NotEqual(t, uint64(0), hash)
})
t.Run("different content produces different hash", func(t *testing.T) {
hash1 := getHashFromReader(bytes.NewReader([]byte("content1")))
hash2 := getHashFromReader(bytes.NewReader([]byte("content2")))
hash1, err1 := getHashFromReader(bytes.NewReader([]byte("content1")))
hash2, err2 := getHashFromReader(bytes.NewReader([]byte("content2")))
assert.NoError(t, err1)
assert.NoError(t, err2)
assert.NotEqual(t, hash1, hash2)
})
t.Run("same content produces same hash", func(t *testing.T) {
content := []byte("same content")
hash1 := getHashFromReader(bytes.NewReader(content))
hash2 := getHashFromReader(bytes.NewReader(content))
hash1, err1 := getHashFromReader(bytes.NewReader(content))
hash2, err2 := getHashFromReader(bytes.NewReader(content))
assert.NoError(t, err1)
assert.NoError(t, err2)
assert.Equal(t, hash1, hash2)
})
}
@ -349,9 +356,9 @@ func TestNewFileInfo(t *testing.T) {
assert.Equal(t, "test.txt", fileInfo.Path)
assert.Equal(t, byte(tar.TypeReg), fileInfo.TypeFlag)
assert.Equal(t, int64(12), fileInfo.Size) // "test content" is 12 bytes
assert.False(t, fileInfo.IsDir)
assert.Equal(t, -1, fileInfo.Uid) // UID/GID not supported, set to -1
assert.Equal(t, -1, fileInfo.Gid)
assert.False(t, fileInfo.IsDir())
assert.Equal(t, uint32(0), fileInfo.Uid) // UID/GID not supported, set to 0
assert.Equal(t, uint32(0), fileInfo.Gid)
assert.NotEqual(t, uint64(0), fileInfo.hash) // hash should be computed
// Mode may have additional bits set on different systems, just check it's not zero
assert.NotEqual(t, os.FileMode(0), fileInfo.Mode)
@ -370,7 +377,7 @@ func TestNewFileInfo(t *testing.T) {
assert.Equal(t, "testdir", fileInfo.Path)
assert.Equal(t, byte(tar.TypeDir), fileInfo.TypeFlag)
assert.True(t, fileInfo.IsDir)
assert.True(t, fileInfo.IsDir())
assert.Equal(t, uint64(0), fileInfo.hash) // directories have no hash
// Check that directory mode has dir bit set
assert.True(t, fileInfo.Mode&os.ModeDir != 0)
@ -394,7 +401,7 @@ func TestNewFileInfo(t *testing.T) {
assert.Equal(t, "link.txt", fileInfo.Path)
assert.Equal(t, byte(tar.TypeSymlink), fileInfo.TypeFlag)
assert.Equal(t, "target.txt", fileInfo.Linkname)
assert.False(t, fileInfo.IsDir)
assert.False(t, fileInfo.IsDir())
// Note: current implementation computes hash for symlinks (from the target file content)
assert.NotEqual(t, uint64(0), fileInfo.hash)
})

View file

@ -42,6 +42,7 @@ func NewNode(parent *FileNode, name string, data FileInfo) *FileNode {
}
// Create object with struct literal to avoid extra allocations and assignments
// OPTIMIZATION: Don't call data.Copy() - FileInfo is already copied by value
return &FileNode{
Tree: tree,
Parent: parent,
@ -49,7 +50,7 @@ func NewNode(parent *FileNode, name string, data FileInfo) *FileNode {
Name: name,
// Initialize Data directly, avoiding NewNodeData() call and extra struct copying
Data: NodeData{
FileInfo: *data.Copy(),
FileInfo: data,
// DiffType defaults to Unmodified (0), explicit initialization not needed
},
// Children: nil, // Explicitly leave nil for memory savings (lazy initialization)
@ -114,7 +115,8 @@ func (node *FileNode) AddChild(name string, data FileInfo) *FileNode {
// 2. Use "ok" idiom for existence check (faster and safer)
if existingNode, ok := node.Children[name]; ok {
// Node already exists, just update the data
existingNode.Data.FileInfo = *data.Copy()
// OPTIMIZATION: Don't copy, just assign the value
existingNode.Data.FileInfo = data
return existingNode // Return existing node to avoid duplicates
}
@ -163,7 +165,7 @@ func (node *FileNode) MetadataString() string {
}
dir := "-"
if node.Data.FileInfo.IsDir {
if node.Data.FileInfo.IsDir() {
dir = "d"
}
@ -323,7 +325,7 @@ func (node *FileNode) Path() string {
// Build and cache final path string
node.path = "/" + strings.Join(segments, "/")
}
return node.path
return strings.ReplaceAll(node.path, "//", "/")
}
// deriveDiffType determines a DiffType to the current FileNode. Note: the DiffType of a node is always the DiffType of

View file

@ -1,6 +1,7 @@
package filetree
import (
"archive/tar"
"testing"
)
@ -388,7 +389,7 @@ func TestFileNode_String(t *testing.T) {
Path: "/dir",
TypeFlag: 1,
})
node.Data.FileInfo.IsDir = true
node.Data.FileInfo.TypeFlag = tar.TypeDir
str := node.String()
if str == "" {
@ -424,7 +425,7 @@ func TestFileNode_MetadataString(t *testing.T) {
checkError(t, err, "unable to setup test")
node, _ := tree.GetNode("/dir")
node.Data.FileInfo.IsDir = true
node.Data.FileInfo.TypeFlag = tar.TypeDir
metadata := node.MetadataString()
if metadata == "" {
@ -457,7 +458,7 @@ func TestFileNode_GetSize(t *testing.T) {
checkError(t, err, "unable to setup test")
node, _ := tree.GetNode("/dir")
node.Data.FileInfo.IsDir = true
node.Data.FileInfo.TypeFlag = tar.TypeDir
tree.AddPath("/dir/file1.txt", FileInfo{Size: 100})
tree.AddPath("/dir/file2.txt", FileInfo{Size: 200})
@ -477,7 +478,7 @@ func TestFileNode_GetSize(t *testing.T) {
checkError(t, err, "unable to setup test")
node, _ := tree.GetNode("/dir")
node.Data.FileInfo.IsDir = true
node.Data.FileInfo.TypeFlag = tar.TypeDir
size := node.GetSize()
if size != 0 {

View file

@ -135,7 +135,7 @@ func (tree *FileTree) VisibleSize() int {
return nil
}
visitEvaluator := func(node *FileNode) bool {
if node.Data.FileInfo.IsDir {
if node.Data.FileInfo.IsDir() {
// we won't visit a collapsed dir, but we need to count it
if node.Data.ViewInfo.Collapsed {
size++

View file

@ -1,6 +1,7 @@
package filetree
import (
"archive/tar"
"fmt"
"github.com/stretchr/testify/assert"
"testing"
@ -864,7 +865,7 @@ func TestVisibleSize(t *testing.T) {
Size: 100,
}
if path == "/dir" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
_, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)
@ -886,7 +887,7 @@ func TestVisibleSize(t *testing.T) {
hash: 123,
}
if path == "/dir" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
node, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)
@ -913,7 +914,7 @@ func TestVisibleSize(t *testing.T) {
hash: 123,
}
if path == "/dir" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
node, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)
@ -940,7 +941,7 @@ func TestVisibleSize(t *testing.T) {
hash: 123,
}
if path == "/dir" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
node, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)
@ -974,7 +975,7 @@ func TestVisibleSize(t *testing.T) {
hash: 123,
}
if path == "/dir1" || path == "/dir2" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
node, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)
@ -1008,7 +1009,7 @@ func TestVisitDepthParentFirst(t *testing.T) {
hash: 123,
}
if path == "/dir" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
_, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)
@ -1057,7 +1058,7 @@ func TestVisitDepthParentFirst(t *testing.T) {
hash: 123,
}
if path == "/dir" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
_, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)
@ -1112,7 +1113,7 @@ func TestVisitDepthParentFirst(t *testing.T) {
hash: 123,
}
if path == "/dir" {
fakeData.IsDir = true
fakeData.TypeFlag = tar.TypeDir
}
_, _, err := tree.AddPath(path, fakeData)
assert.NoError(t, err)

View file

@ -229,11 +229,24 @@ func getFileList(tarReader *tar.Reader) ([]filetree.FileInfo, error) {
return nil, err
}
// always ensure relative path notations are not parsed as part of the filename
name := path.Clean(header.Name)
if name == "." {
// OPTIMIZATION: Avoid path.Clean for most paths (saves ~18 MB)
// Docker tar paths are already clean, only clean if contains relative notation
name := header.Name
if name == "." || name == "" {
continue
}
// Fast path: skip Clean() for normal paths (99% of cases)
// Only clean if path contains "..", "./", or redundant slashes "//"
hasDot := len(name) > 0 && (name[0] == '.' || name[len(name)-1] == '.')
hasSlash := len(name) > 2
cleanNeeded := hasDot || (hasSlash && (strings.Contains(name, "..") || strings.Contains(name, "//")))
if cleanNeeded {
name = path.Clean(name)
if name == "." {
continue
}
}
switch header.Typeflag {
case tar.TypeXGlobalHeader: