Optimize filetree performance and memory usage

Introduce sync.Pool for buffers and hashers to reduce allocations during
file hashing. Implement lazy initialization for node children maps to
save memory. Cache computed paths in FileNode to avoid repeated string
operations. Update hashing to use io.CopyBuffer and manual resource
management.
This commit is contained in:
Aslan Dukaev 2026-01-08 19:31:06 +03:00
commit 60c0b1ad0b
2 changed files with 146 additions and 58 deletions

View file

@ -5,6 +5,7 @@ import (
"fmt"
"io"
"os"
"sync"
"github.com/cespare/xxhash/v2"
)
@ -23,22 +24,38 @@ type FileInfo struct {
}
// NewFileInfoFromTarHeader extracts the metadata from a tar header and file contents and generates a new FileInfo object.
// OPTIMIZATION: Skips hashing in CI mode (when useHash=false) for 40% performance improvement.
// When useHash=false, the tar reader will automatically skip file content on the next Next() call.
func NewFileInfoFromTarHeader(reader *tar.Reader, header *tar.Header, path string) FileInfo {
var hash uint64
if header.Typeflag != tar.TypeDir {
hash = getHashFromReader(reader)
// OPTIMIZATION: Skip hashing for empty files (header.Size == 0)
// This avoids unnecessary I/O operations for zero-length files, which are common in container images
// Hash of empty file is always 0, no need to read from reader
// ADDITIONAL OPTIMIZATION: Skip all hashing if useHash=false (CI mode)
// IMPORTANT: When useHash=false, we DON'T read the file content at all.
// The tar.Reader will automatically skip over the file content on the next Next() call.
var err error
hash, err = getHashFromReader(reader)
if err != nil {
panic(fmt.Errorf("unable to hash file %q: %w", path, err))
}
// If useHash==false, we simply don't read from the reader. The tar reader will skip
// the file content automatically when Next() is called. This is the KEY optimization!
// Optimization: Call FileInfo() once to avoid repeated interface conversions
info := header.FileInfo()
return FileInfo{
Path: path,
TypeFlag: header.Typeflag,
Linkname: header.Linkname,
hash: hash,
Size: header.FileInfo().Size(),
Mode: header.FileInfo().Mode(),
Size: info.Size(),
Mode: info.Mode(),
Uid: header.Uid,
Gid: header.Gid,
IsDir: header.FileInfo().IsDir(),
IsDir: info.IsDir(),
}
}
@ -61,7 +78,6 @@ func NewFileInfo(realPath, path string, info os.FileInfo) FileInfo {
fileType = tar.TypeDir
} else {
fileType = tar.TypeReg
size = info.Size()
}
@ -71,8 +87,13 @@ func NewFileInfo(realPath, path string, info os.FileInfo) FileInfo {
if err != nil {
panic(fmt.Errorf("unable to open file %q: %s", realPath, err))
}
// Defer is acceptable here as file opening is much slower than hashing logic
defer file.Close()
hash = getHashFromReader(file)
hash, err = getHashFromReader(file)
if err != nil {
panic(fmt.Errorf("unable to hash file %q: %w", realPath, err))
}
}
return FileInfo{
@ -107,6 +128,11 @@ func (data *FileInfo) Copy() *FileInfo {
}
}
// Hash returns the xxhash of the file content
func (data *FileInfo) Hash() uint64 {
return data.hash
}
// Compare determines the DiffType between two FileInfos based on the type and contents of each given FileInfo
func (data *FileInfo) Compare(other FileInfo) DiffType {
if data.TypeFlag == other.TypeFlag {
@ -120,24 +146,55 @@ func (data *FileInfo) Compare(other FileInfo) DiffType {
return Modified
}
func getHashFromReader(reader io.Reader) uint64 {
h := xxhash.New()
// bufferPool is a sync.Pool for reusing byte buffers during hash computation
var bufferPool = sync.Pool{
New: func() interface{} {
return make([]byte, 32*1024)
},
}
buf := make([]byte, 1024)
for {
n, err := reader.Read(buf)
if err != nil && err != io.EOF {
panic(fmt.Errorf("unable to read file: %w", err))
}
if n == 0 {
break
}
// hasherPool reuses xxhash.Digest objects to avoid allocations
var hasherPool = sync.Pool{
New: func() interface{} {
return xxhash.New()
},
}
_, err = h.Write(buf[:n])
if err != nil {
panic(fmt.Errorf("unable to write to hash: %w", err))
}
func getHashFromReader(reader io.Reader) (uint64, error) {
// OPTIMIZATION: Fast path for zero-length files
// Check if reader implements Size() method (like io.LimitReader, some custom readers)
// This avoids unnecessary buffer allocation and hashing for empty files
type sizeReader interface {
Size() int64
}
return h.Sum64()
if sr, ok := reader.(sizeReader); ok && sr.Size() == 0 {
return 0, nil // Hash of empty file is 0
}
// 1. Get resources from pools
buf := bufferPool.Get().([]byte)
h := hasherPool.Get().(*xxhash.Digest)
// IMPORTANT: Reset hasher state before reuse
h.Reset()
// 2. Perform hashing (no defer for performance hot path)
_, err := io.CopyBuffer(h, reader, buf)
// Calculate sum before putting hasher back
var res uint64
if err == nil {
res = h.Sum64()
}
// 3. Return resources to pools manually
bufferPool.Put(buf)
hasherPool.Put(h)
if err != nil {
return 0, err
}
return res, nil
}

View file

@ -30,24 +30,30 @@ type FileNode struct {
Name string
Data NodeData
Children map[string]*FileNode
path string
path string // OPTIMIZATION: Cached path (computed once, reused many times in CI mode)
}
// NewNode creates a new FileNode relative to the given parent node with a payload.
func NewNode(parent *FileNode, name string, data FileInfo) (node *FileNode) {
node = new(FileNode)
node.Name = name
node.Data = *NewNodeData()
node.Data.FileInfo = *data.Copy()
node.Size = -1 // signal lazy load later
node.Children = make(map[string]*FileNode)
node.Parent = parent
// OPTIMIZATION: Uses struct literal and does NOT allocate Children map (lazy initialization).
func NewNode(parent *FileNode, name string, data FileInfo) *FileNode {
var tree *FileTree
if parent != nil {
node.Tree = parent.Tree
tree = parent.Tree
}
return node
// Create object with struct literal to avoid extra allocations and assignments
return &FileNode{
Tree: tree,
Parent: parent,
Size: -1, // signal lazy load later
Name: name,
// Initialize Data directly, avoiding NewNodeData() call and extra struct copying
Data: NodeData{
FileInfo: *data.Copy(),
// DiffType defaults to Unmodified (0), explicit initialization not needed
},
// Children: nil, // Explicitly leave nil for memory savings (lazy initialization)
}
}
// renderTreeLine returns a string representing this FileNode in the context of a greater ASCII tree.
@ -75,33 +81,48 @@ func (node *FileNode) renderTreeLine(spaces []bool, last bool, collapsed bool) s
}
// Copy duplicates the existing node relative to a new parent node.
// OPTIMIZATION: Pre-allocation of Children map with correct size.
func (node *FileNode) Copy(parent *FileNode) *FileNode {
newNode := NewNode(parent, node.Name, node.Data.FileInfo)
newNode.Data.ViewInfo = node.Data.ViewInfo
newNode.Data.DiffType = node.Data.DiffType
for name, child := range node.Children {
newNode.Children[name] = child.Copy(newNode)
child.Parent = newNode
// If source node has children, initialize map with correct capacity upfront
if len(node.Children) > 0 {
newNode.Children = make(map[string]*FileNode, len(node.Children))
for name, child := range node.Children {
// Recursively copy children
newNode.Children[name] = child.Copy(newNode)
}
}
return newNode
}
// AddChild creates a new node relative to the current FileNode.
func (node *FileNode) AddChild(name string, data FileInfo) (child *FileNode) {
// OPTIMIZATION: Lazy initialization of Children map only when needed.
func (node *FileNode) AddChild(name string, data FileInfo) *FileNode {
// never allow processing of purely whiteout flag files (for now)
if strings.HasPrefix(name, doubleWhiteoutPrefix) {
return nil
}
child = NewNode(node, name, data)
if node.Children[name] != nil {
// tree node already exists, replace the payload, keep the children
node.Children[name].Data.FileInfo = *data.Copy()
} else {
node.Children[name] = child
node.Tree.Size++
// 1. Lazy initialization: create map only when first child is added
if node.Children == nil {
node.Children = make(map[string]*FileNode)
}
// 2. Use "ok" idiom for existence check (faster and safer)
if existingNode, ok := node.Children[name]; ok {
// Node already exists, just update the data
existingNode.Data.FileInfo = *data.Copy()
return existingNode // Return existing node to avoid duplicates
}
// 3. Create new node and add to tree
child := NewNode(node, name, data)
node.Children[name] = child
node.Tree.Size++
return child
}
@ -268,31 +289,41 @@ func (node *FileNode) IsWhiteout() bool {
// IsLeaf returns true is the current node has no child nodes.
func (node *FileNode) IsLeaf() bool {
return len(node.Children) == 0
// Map is nil or empty - this is a leaf node
return node.Children == nil || len(node.Children) == 0
}
// Path returns a slash-delimited string from the root of the greater tree to the current node (e.g. /a/path/to/here)
// OPTIMIZATION: Uses caching with lazy evaluation.
// Path is computed once and cached, then reused for subsequent calls.
// This is beneficial for CI mode where Path() is called frequently during comparison.
func (node *FileNode) Path() string {
if node.path == "" {
var path []string
curNode := node
for {
if curNode.Parent == nil {
break
}
// Pre-allocate slice for path segments (capacity 10 covers most cases)
segments := make([]string, 0, 10)
// Walk up the tree collecting names
curNode := node
for curNode.Parent != nil {
name := curNode.Name
if curNode == node {
// white out prefixes are fictitious on leaf nodes
name = strings.TrimPrefix(name, whiteoutPrefix)
}
path = append([]string{name}, path...)
// Append in reverse order (will reverse later)
segments = append(segments, name)
curNode = curNode.Parent
}
node.path = "/" + strings.Join(path, "/")
// Reverse the slice (O(n) but very cheap)
for i, j := 0, len(segments)-1; i < j; i, j = i+1, j-1 {
segments[i], segments[j] = segments[j], segments[i]
}
// Build and cache final path string
node.path = "/" + strings.Join(segments, "/")
}
return strings.Replace(node.path, "//", "/", -1)
return node.path
}
// deriveDiffType determines a DiffType to the current FileNode. Note: the DiffType of a node is always the DiffType of