dive/dive/filetree/efficiency.go
Alex Goodman d5e8a92968
Rework CI validation workflow and makefile (#460)
* rework CI validation workflow and makefile

* enable push

* fix job names

* fix license check

* fix snapshot builds

* fix acceptance tests

* fix linting

* disable pull request event

* rework windows runner caching

* disable release pipeline and add issue templates
2023-07-06 22:01:46 -04:00

134 lines
3.6 KiB
Go

package filetree
import (
"sort"
"github.com/sirupsen/logrus"
)
// EfficiencyData represents the storage and reference statistics for a given file tree path.
type EfficiencyData struct {
Path string
Nodes []*FileNode
CumulativeSize int64
minDiscoveredSize int64
}
// EfficiencySlice represents an ordered set of EfficiencyData data structures.
type EfficiencySlice []*EfficiencyData
// Len is required for sorting.
func (efs EfficiencySlice) Len() int {
return len(efs)
}
// Swap operation is required for sorting.
func (efs EfficiencySlice) Swap(i, j int) {
efs[i], efs[j] = efs[j], efs[i]
}
// Less comparison is required for sorting.
func (efs EfficiencySlice) Less(i, j int) bool {
return efs[i].CumulativeSize < efs[j].CumulativeSize
}
// Efficiency returns the score and file set of the given set of FileTrees (layers). This is loosely based on:
// 1. Files that are duplicated across layers discounts your score, weighted by file size
// 2. Files that are removed discounts your score, weighted by the original file size
func Efficiency(trees []*FileTree) (float64, EfficiencySlice) {
efficiencyMap := make(map[string]*EfficiencyData)
inefficientMatches := make(EfficiencySlice, 0)
currentTree := 0
visitor := func(node *FileNode) error {
path := node.Path()
if _, ok := efficiencyMap[path]; !ok {
efficiencyMap[path] = &EfficiencyData{
Path: path,
Nodes: make([]*FileNode, 0),
minDiscoveredSize: -1,
}
}
data := efficiencyMap[path]
// this node may have had children that were deleted, however, we won't explicitly list out every child, only
// the top-most parent with the cumulative size. These operations will need to be done on the full (stacked)
// tree.
// Note: whiteout files may also represent directories, so we need to find out if this was previously a file or dir.
var sizeBytes int64
if node.IsWhiteout() {
sizer := func(curNode *FileNode) error {
sizeBytes += curNode.Data.FileInfo.Size
return nil
}
stackedTree, failedPaths, err := StackTreeRange(trees, 0, currentTree-1)
if len(failedPaths) > 0 {
for _, path := range failedPaths {
logrus.Errorf(path.String())
}
}
if err != nil {
logrus.Errorf("unable to stack tree range: %+v", err)
return err
}
previousTreeNode, err := stackedTree.GetNode(node.Path())
if err != nil {
return err
}
if previousTreeNode.Data.FileInfo.IsDir {
err = previousTreeNode.VisitDepthChildFirst(sizer, nil)
if err != nil {
logrus.Errorf("unable to propagate whiteout dir: %+v", err)
return err
}
}
} else {
sizeBytes = node.Data.FileInfo.Size
}
data.CumulativeSize += sizeBytes
if data.minDiscoveredSize < 0 || sizeBytes < data.minDiscoveredSize {
data.minDiscoveredSize = sizeBytes
}
data.Nodes = append(data.Nodes, node)
if len(data.Nodes) == 2 {
inefficientMatches = append(inefficientMatches, data)
}
return nil
}
visitEvaluator := func(node *FileNode) bool {
return node.IsLeaf()
}
for idx, tree := range trees {
currentTree = idx
err := tree.VisitDepthChildFirst(visitor, visitEvaluator)
if err != nil {
logrus.Errorf("unable to propagate ref tree: %+v", err)
}
}
// calculate the score
var minimumPathSizes int64
var discoveredPathSizes int64
for _, value := range efficiencyMap {
minimumPathSizes += value.minDiscoveredSize
discoveredPathSizes += value.CumulativeSize
}
var score float64
if discoveredPathSizes == 0 {
score = 1.0
} else {
score = float64(minimumPathSizes) / float64(discoveredPathSizes)
}
sort.Sort(inefficientMatches)
return score, inefficientMatches
}