go-twig/zero_alloc_tokenizer.go
semihalev b081db6b80 Consolidate optimizations and cleanup codebase
- Merged buffer pooling optimizations into buffer_pool.go
- Integrated string interning and tag detection into zero_alloc_tokenizer.go
- Removed duplicate and superseded optimization implementations
- Added optimized expression parsing to expr.go
- Ensured all tests pass with consolidated implementation
- Maintained zero allocation implementation for tokenization

🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-03-12 11:30:44 +03:00

1297 lines
No EOL
35 KiB
Go

package twig
import (
"fmt"
"strings"
"sync"
"unsafe"
)
// Configuration and literal strings used by the string-interning cache.
// The literals are the values newGlobalStringCache pre-loads; several of
// them are also short-circuited by the fast path at the top of Intern.
const (
	// Common HTML/Twig strings to pre-cache

	// maxCacheableLength is the longest string, in bytes, that Intern will
	// store. Strings whose length exceeds it are returned without being
	// cached, to avoid memory bloat.
	maxCacheableLength = 64
	// Common HTML tags
	stringDiv   = "div"
	stringSpan  = "span"
	stringP     = "p"
	stringA     = "a"
	stringImg   = "img"
	stringHref  = "href"
	stringClass = "class"
	stringId    = "id"
	stringStyle = "style"
	// Common Twig syntax
	stringIf      = "if"
	stringFor     = "for"
	stringEnd     = "end"
	stringEndif   = "endif"
	stringEndfor  = "endfor"
	stringElse    = "else"
	stringBlock   = "block"
	stringSet     = "set"
	stringInclude = "include"
	stringExtends = "extends"
	stringMacro   = "macro"
	// Common operators
	stringEquals    = "=="
	stringNotEquals = "!="
	stringAnd       = "and"
	stringOr        = "or"
	stringNot       = "not"
	stringIn        = "in"
	stringIs        = "is"
)
// GlobalStringCache provides a centralized cache for string interning.
// Intern takes the embedded RWMutex (read lock for lookups, write lock for
// inserts) around every access to the strings map.
type GlobalStringCache struct {
	sync.RWMutex
	// strings maps each interned string to its canonical copy; key and
	// value hold the same content.
	strings map[string]string
}
var (
	// globalCache is the single package-wide cache used by Intern. It is
	// built once at package initialisation by newGlobalStringCache.
	globalCache = newGlobalStringCache()
)
// TagType represents the type of tag found by FindNextTag.
type TagType int

// Tag kinds, named after the opening delimiter FindNextTag matched.
const (
	TAG_NONE       TagType = iota // no tag found
	TAG_VAR                       // "{{"
	TAG_VAR_TRIM                  // "{{-"
	TAG_BLOCK                     // "{%"
	TAG_BLOCK_TRIM                // "{%-"
	TAG_COMMENT                   // "{#"
)
// TagLocation represents the location of a tag in a template.
// FindNextTag returns Type TAG_NONE with Position -1 when no tag exists.
type TagLocation struct {
	Type     TagType // Type of tag
	Position int     // Byte offset of the opening delimiter in the source, or -1
	Length   int     // Length in bytes of the opening delimiter (2 or 3)
}
// ZeroAllocTokenizer is a tokenizer designed to avoid per-token allocations.
// It appends tokens to a reusable buffer and hands out token values that are
// substrings of the source wherever possible. Instances are normally obtained
// from GetTokenizer and given back with ReleaseTokenizer.
type ZeroAllocTokenizer struct {
	tokenBuffer []Token  // Reusable buffer that AddToken appends to
	source      string   // Source string being tokenized
	position    int      // Current byte position in source
	line        int      // Current line (1-based)
	result      []Token  // Tokens produced by the last full tokenization
	tempStrings []string // Canonical strings consulted (and extended) by GetStringConstant
}
// commonStrings lists strings that occur frequently during tokenization.
// Every pooled tokenizer starts with its own copy of this slice in
// tempStrings, so GetStringConstant can hand back these canonical values.
var commonStrings = []string{
	// Common twig words and operators
	"if", "else", "elseif", "endif", "for", "endfor", "in",
	"block", "endblock", "extends", "include", "with", "set",
	"macro", "endmacro", "import", "from", "as", "do",
	// Common operators
	"+", "-", "*", "/", "=", "==", "!=", ">", "<", ">=", "<=",
	"and", "or", "not", "~", "%", "?", ":", "??",
	// Common punctuation
	"(", ")", "[", "]", "{", "}", ".", ",", "|", ";",
	// Common literals
	"true", "false", "null",
	// Empty string
	"",
}
// TokenizerPooled is the unit stored in tokenizerPool: a tokenizer plus a
// flag recording whether it is currently checked out.
//
// ReleaseTokenizer recovers the *TokenizerPooled from a *ZeroAllocTokenizer
// by subtracting the offset of the tokenizer field, so the tokenizer must
// stay embedded by value in this struct.
type TokenizerPooled struct {
	tokenizer ZeroAllocTokenizer
	used      bool // set by GetTokenizer, cleared by ReleaseTokenizer
}
// tokenizerPool is a pool of tokenizer resources shared by GetTokenizer and
// ReleaseTokenizer.
var tokenizerPool = sync.Pool{
	New: func() interface{} {
		// Create a pre-allocated tokenizer with reasonable defaults
		return &TokenizerPooled{
			tokenizer: ZeroAllocTokenizer{
				tokenBuffer: make([]Token, 0, 256), // Room for 256 tokens before the buffer has to grow
				// Copy commonStrings so appends made by GetStringConstant
				// never write into the shared package-level slice.
				tempStrings: append([]string{}, commonStrings...),
				result:      nil,
			},
			used: false,
		}
	},
}
// GetTokenizer takes a tokenizer from the pool and prepares it for source.
//
// capacityHint is the number of tokens the caller expects. When it is zero
// or negative the capacity is estimated as len(source)/10, with a floor of
// 32. The pooled token buffer is truncated and reused when its capacity is
// already sufficient; otherwise a new buffer of the wanted capacity replaces
// it. The returned tokenizer should be handed back with ReleaseTokenizer.
func GetTokenizer(source string, capacityHint int) *ZeroAllocTokenizer {
	entry := tokenizerPool.Get().(*TokenizerPooled)
	tok := &entry.tokenizer

	// Start from a clean scanning state.
	tok.source, tok.position, tok.line = source, 0, 1
	tok.result = nil

	want := capacityHint
	if want <= 0 {
		// No usable hint: guess from the size of the template.
		want = len(source) / 10
		if want < 32 {
			want = 32
		}
	}

	if cap(tok.tokenBuffer) >= want {
		tok.tokenBuffer = tok.tokenBuffer[:0]
	} else {
		tok.tokenBuffer = make([]Token, 0, want)
	}

	// Flag the entry as checked out so ReleaseTokenizer will accept it.
	entry.used = true
	return tok
}
// ReleaseTokenizer returns a tokenizer obtained from GetTokenizer to the pool.
//
// The tokenizer must be the one embedded in a TokenizerPooled (which is what
// GetTokenizer hands out); the enclosing struct is recovered from the field
// address. A nil tokenizer, or one that is not currently marked as used
// (for example a second release), is ignored.
//
// Before the entry goes back to the pool its source and result references
// are dropped and the string table is cut back to the shared commonStrings
// prefix. Without that trim every short identifier ever seen would stay in
// tempStrings forever, making GetStringConstant's linear scan slower with
// each reuse and keeping substrings of old template sources reachable.
func ReleaseTokenizer(tokenizer *ZeroAllocTokenizer) {
	if tokenizer == nil {
		return
	}

	// Get the parent pooled struct from the address of its tokenizer field.
	pooled := (*TokenizerPooled)(unsafe.Pointer(
		uintptr(unsafe.Pointer(tokenizer)) - unsafe.Offsetof(TokenizerPooled{}.tokenizer)))

	// Only entries that are checked out may be returned.
	if !pooled.used {
		return
	}
	pooled.used = false

	// Clear references that might prevent GC.
	tokenizer.source = ""
	tokenizer.result = nil
	if base := len(commonStrings); len(tokenizer.tempStrings) > base {
		extra := tokenizer.tempStrings[base:]
		for i := range extra {
			extra[i] = "" // drop the reference held by the backing array
		}
		tokenizer.tempStrings = tokenizer.tempStrings[:base]
	}

	tokenizerPool.Put(pooled)
}
// AddToken appends a token with the given type, value and line number to
// the tokenizer's buffer.
func (t *ZeroAllocTokenizer) AddToken(tokenType int, value string, line int) {
	t.tokenBuffer = append(t.tokenBuffer, Token{
		Type:  tokenType,
		Value: value,
		Line:  line,
	})
}
// GetStringConstant returns the tokenizer's canonical copy of s when one is
// already recorded in tempStrings. Otherwise it returns s itself and, when s
// is at most 20 bytes long, records it so later lookups can reuse it.
func (t *ZeroAllocTokenizer) GetStringConstant(s string) string {
	for i := range t.tempStrings {
		if t.tempStrings[i] == s {
			return t.tempStrings[i]
		}
	}

	// Longer strings are unlikely to repeat; do not remember them.
	if len(s) > 20 {
		return s
	}

	t.tempStrings = append(t.tempStrings, s)
	return s
}
// TokenizeExpression tokenizes expr and appends the resulting tokens to the
// tokenizer's buffer.
//
// The tokenizer's source, position and line are saved on entry and restored
// on exit, so the call can be made in the middle of tokenizing a template.
// Every token value is a substring of expr (or a canonical string from
// GetStringConstant); no string data is copied.
//
// Recognised, in this order: quoted strings (single or double, a quote
// preceded by a backslash does not terminate), operators (with the
// two-character forms ==, !=, >=, <=, &&, ||, ??), punctuation, whitespace,
// identifiers and numbers. Any other byte is skipped. The literals true,
// false and null are emitted as TOKEN_NAME like any other identifier. A
// string that is never closed produces no token.
//
// The returned slice is the tail of the internal token buffer that this call
// appended; it aliases that buffer.
func (t *ZeroAllocTokenizer) TokenizeExpression(expr string) []Token {
	// Save current position and set new source context
	savedSource, savedPosition, savedLine := t.source, t.position, t.line
	t.source = expr
	t.position = 0
	startTokenCount := len(t.tokenBuffer)

	var (
		inString        bool
		stringDelimiter byte
		stringStart     int
	)

	for t.position < len(t.source) {
		c := t.source[t.position]

		// Handle string literals. Only the delimiter that opened the string
		// can close it; the other quote character is ordinary content.
		if (c == '"' || c == '\'') && (t.position == 0 || t.source[t.position-1] != '\\') {
			if inString && c == stringDelimiter {
				t.AddToken(TOKEN_STRING, t.source[stringStart:t.position], t.line)
				inString = false
			} else if !inString {
				inString = true
				stringDelimiter = c
				stringStart = t.position + 1
			}
			t.position++
			continue
		}

		// Skip chars inside strings
		if inString {
			t.position++
			continue
		}

		switch {
		case isOperator(c):
			start := t.position
			t.position++
			// Extend to a two-character operator when the pair is known.
			if t.position < len(t.source) {
				next := t.source[t.position]
				if (next == '=' && (c == '=' || c == '!' || c == '>' || c == '<')) ||
					(next == c && (c == '&' || c == '|' || c == '?')) {
					t.position++
				}
			}
			// Slice the source instead of building a new string.
			op := t.GetStringConstant(t.source[start:t.position])
			t.AddToken(TOKEN_OPERATOR, op, t.line)

		case isPunctuation(c):
			punct := t.GetStringConstant(t.source[t.position : t.position+1])
			t.AddToken(TOKEN_PUNCTUATION, punct, t.line)
			t.position++

		case isWhitespace(c):
			t.position++
			if c == '\n' {
				t.line++
			}

		case (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_':
			// Identifier: a letter or underscore followed by letters,
			// digits and underscores.
			start := t.position
			t.position++
			for t.position < len(t.source) {
				ch := t.source[t.position]
				if !((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
					(ch >= '0' && ch <= '9') || ch == '_') {
					break
				}
				t.position++
			}
			identifier := t.GetStringConstant(t.source[start:t.position])
			t.AddToken(TOKEN_NAME, identifier, t.line)

		case (c >= '0' && c <= '9') ||
			(c == '-' && t.position+1 < len(t.source) &&
				t.source[t.position+1] >= '0' && t.source[t.position+1] <= '9'):
			start := t.position
			// Skip the negative sign if present
			if c == '-' {
				t.position++
			}
			// Consume integer digits
			for t.position < len(t.source) && t.source[t.position] >= '0' && t.source[t.position] <= '9' {
				t.position++
			}
			// A '.' belongs to the number only when a digit follows it;
			// otherwise it is attribute access ("items.0.name") or part of
			// another construct and must be left for the next iteration.
			if t.position+1 < len(t.source) && t.source[t.position] == '.' &&
				t.source[t.position+1] >= '0' && t.source[t.position+1] <= '9' {
				t.position++
				for t.position < len(t.source) && t.source[t.position] >= '0' && t.source[t.position] <= '9' {
					t.position++
				}
			}
			t.AddToken(TOKEN_NUMBER, t.source[start:t.position], t.line)

		default:
			// Unrecognized character
			t.position++
		}
	}

	// Only the tokens appended by this call are returned.
	tokens := t.tokenBuffer[startTokenCount:]

	// Restore original context
	t.source = savedSource
	t.position = savedPosition
	t.line = savedLine
	return tokens
}
// TokenizeHtmlPreserving performs full tokenization of the tokenizer's
// source, keeping everything outside Twig tags as TOKEN_TEXT.
//
// For each tag it emits a start token, the tokens for the tag body and an
// end token. Variable and block tags may close with a trim delimiter
// ("-}}" / "-%}"), reported as the corresponding *_END_TRIM token. Comment
// bodies are stored verbatim as TOKEN_TEXT. An opening delimiter directly
// preceded by a backslash is emitted as literal text without the backslash.
// A final TOKEN_EOF is always appended.
//
// The result is also stored in t.result and aliases the internal buffer.
// An error is returned when a tag is never closed.
func (t *ZeroAllocTokenizer) TokenizeHtmlPreserving() ([]Token, error) {
	// Reset position and line
	t.position = 0
	t.line = 1
	// Clear token buffer
	t.tokenBuffer = t.tokenBuffer[:0]

	for t.position < len(t.source) {
		// One forward scan locates the nearest opening delimiter; this
		// replaces a separate substring search per delimiter kind.
		loc := FindNextTag(t.source, t.position)

		// No more tags found - add the rest as TEXT
		if loc.Position == -1 {
			remainingText := t.source[t.position:]
			t.AddToken(TOKEN_TEXT, remainingText, t.line)
			t.line += countNewlines(remainingText)
			break
		}
		nextTagPos, tagLength := loc.Position, loc.Length

		// Check if the tag is escaped
		if nextTagPos > 0 && t.source[nextTagPos-1] == '\\' {
			// Add text up to the backslash
			if nextTagPos-1 > t.position {
				preText := t.source[t.position : nextTagPos-1]
				t.AddToken(TOKEN_TEXT, preText, t.line)
				t.line += countNewlines(preText)
			}
			// Add the delimiter itself as literal text (without the backslash)
			t.AddToken(TOKEN_TEXT, t.source[nextTagPos:nextTagPos+tagLength], t.line)
			t.position = nextTagPos + tagLength
			continue
		}

		// Translate the tag kind into token types and closing delimiter.
		var (
			tagType, endTagType, endTrimType int
			closer, unclosed                 string
			trimmable                        = true
		)
		switch loc.Type {
		case TAG_VAR:
			tagType, endTagType, endTrimType = TOKEN_VAR_START, TOKEN_VAR_END, TOKEN_VAR_END_TRIM
			closer, unclosed = "}}", "variable tag"
		case TAG_VAR_TRIM:
			tagType, endTagType, endTrimType = TOKEN_VAR_START_TRIM, TOKEN_VAR_END, TOKEN_VAR_END_TRIM
			closer, unclosed = "}}", "variable tag"
		case TAG_BLOCK:
			tagType, endTagType, endTrimType = TOKEN_BLOCK_START, TOKEN_BLOCK_END, TOKEN_BLOCK_END_TRIM
			closer, unclosed = "%}", "block tag"
		case TAG_BLOCK_TRIM:
			tagType, endTagType, endTrimType = TOKEN_BLOCK_START_TRIM, TOKEN_BLOCK_END, TOKEN_BLOCK_END_TRIM
			closer, unclosed = "%}", "block tag"
		case TAG_COMMENT:
			tagType, endTagType = TOKEN_COMMENT_START, TOKEN_COMMENT_END
			closer, unclosed, trimmable = "#}", "comment", false
		}
		isComment := loc.Type == TAG_COMMENT
		isBlock := loc.Type == TAG_BLOCK || loc.Type == TAG_BLOCK_TRIM

		// Add text before the tag
		if nextTagPos > t.position {
			textContent := t.source[t.position:nextTagPos]
			t.AddToken(TOKEN_TEXT, textContent, t.line)
			t.line += countNewlines(textContent)
		}

		// Add the tag start token and move past the opening delimiter
		t.AddToken(tagType, "", t.line)
		t.position = nextTagPos + tagLength

		// Find the closing delimiter once. A '-' immediately before it
		// makes it the trim form, which is one byte longer and starts one
		// byte earlier.
		rest := t.source[t.position:]
		closeIdx := strings.Index(rest, closer)
		if closeIdx == -1 {
			return nil, fmt.Errorf("unclosed %s at line %d", unclosed, t.line)
		}
		endPos, endTagLength := closeIdx, 2
		if trimmable && closeIdx > 0 && rest[closeIdx-1] == '-' {
			endPos, endTagType, endTagLength = closeIdx-1, endTrimType, 3
		}

		// Get content between tags
		tagContent := rest[:endPos]
		t.line += countNewlines(tagContent)

		// Process tag content based on type
		switch {
		case isComment:
			// Store comments as TEXT tokens
			if len(tagContent) > 0 {
				t.AddToken(TOKEN_TEXT, tagContent, t.line)
			}
		case isBlock:
			t.processBlockTag(strings.TrimSpace(tagContent))
		default:
			tagContent = strings.TrimSpace(tagContent)
			if len(tagContent) > 0 {
				if !strings.ContainsAny(tagContent, ".|[](){}\"',+-*/=!<>%&^~") {
					// Simple variable name
					identifier := t.GetStringConstant(tagContent)
					t.AddToken(TOKEN_NAME, identifier, t.line)
				} else {
					// Complex expression
					t.TokenizeExpression(tagContent)
				}
			}
		}

		// Add the end tag token and move past the closing delimiter
		t.AddToken(endTagType, "", t.line)
		t.position = t.position + endPos + endTagLength
	}

	// Add EOF token
	t.AddToken(TOKEN_EOF, "", t.line)
	// Save the token buffer to result
	t.result = t.tokenBuffer
	return t.result, nil
}
// processBlockTag tokenizes the inside of a block tag ({% ... %}).
//
// content is the tag body with surrounding whitespace already removed. The
// first whitespace-delimited word is the tag name and is always emitted as a
// TOKEN_NAME. The remainder is tokenized according to that name:
//
//   - if, elseif: as one expression
//   - for:        "<iterator>[, <iterator>] in <expression>"
//   - set:        "<name> = <expression>"
//   - do:         an assignment like set, or a bare expression
//   - include:    "<template> [with <context>]"
//   - extends:    a template path
//   - from:       "<template> import <name> [as <alias>], ..."
//   - import:     "<template> as <alias>"
//   - anything else: as one expression
//
// The keywords in, with, import and as are matched without regard to ASCII
// case and must be surrounded by single spaces.
func (t *ZeroAllocTokenizer) processBlockTag(content string) {
	// Split at the first whitespace of any kind, so a tag name followed by a
	// tab or a newline (multi-line tags) is still separated from its
	// arguments instead of absorbing the whitespace into the name.
	blockName, blockContent := content, ""
	if sep := strings.IndexAny(content, " \t\r\n"); sep != -1 {
		blockName = content[:sep]
		blockContent = strings.TrimSpace(content[sep+1:])
	}

	// Use canonical string for block name
	blockName = t.GetStringConstant(blockName)
	t.AddToken(TOKEN_NAME, blockName, t.line)

	// If there's no content, we're done
	if blockContent == "" {
		return
	}

	switch blockName {
	case "if", "elseif":
		// For conditional blocks, tokenize expression
		t.TokenizeExpression(blockContent)

	case "for":
		// Process for loop with iterator(s) and collection
		inPos := indexKeywordFold(blockContent, " in ")
		if inPos == -1 {
			// Fallback for malformed for loops
			t.AddToken(TOKEN_NAME, blockContent, t.line)
			return
		}
		iterators := strings.TrimSpace(blockContent[:inPos])
		collection := strings.TrimSpace(blockContent[inPos+len(" in "):])

		if keyPart, valuePart, found := strings.Cut(iterators, ","); found {
			// "key, value" iterator syntax
			keyVar := t.GetStringConstant(strings.TrimSpace(keyPart))
			valueVar := t.GetStringConstant(strings.TrimSpace(valuePart))
			t.AddToken(TOKEN_NAME, keyVar, t.line)
			t.AddToken(TOKEN_PUNCTUATION, ",", t.line)
			t.AddToken(TOKEN_NAME, valueVar, t.line)
		} else {
			// Single iterator
			t.AddToken(TOKEN_NAME, t.GetStringConstant(iterators), t.line)
		}
		// Add 'in' keyword
		t.AddToken(TOKEN_NAME, "in", t.line)
		// Process collection expression
		t.TokenizeExpression(collection)

	case "set":
		// Handle variable assignment; the first '=' separates name and value
		assignPos := strings.Index(blockContent, "=")
		if assignPos == -1 {
			// Simple set without assignment
			blockContent = t.GetStringConstant(blockContent)
			t.AddToken(TOKEN_NAME, blockContent, t.line)
			return
		}
		varName := strings.TrimSpace(blockContent[:assignPos])
		value := strings.TrimSpace(blockContent[assignPos+1:])
		t.AddToken(TOKEN_NAME, t.GetStringConstant(varName), t.line)
		t.AddToken(TOKEN_OPERATOR, "=", t.line)
		t.TokenizeExpression(value)

	case "do":
		// Handle variable assignment similar to set tag
		assignPos := strings.Index(blockContent, "=")
		if assignPos == -1 {
			// No assignment, just an expression to evaluate
			t.TokenizeExpression(blockContent)
			return
		}
		varName := strings.TrimSpace(blockContent[:assignPos])
		value := strings.TrimSpace(blockContent[assignPos+1:])

		// Variable names must start with a letter or underscore.
		if len(varName) > 0 && (isCharAlpha(varName[0]) || varName[0] == '_') {
			t.AddToken(TOKEN_NAME, t.GetStringConstant(varName), t.line)
			t.AddToken(TOKEN_OPERATOR, "=", t.line)
			if len(value) > 0 {
				t.TokenizeExpression(value)
			} else {
				// Empty value after '=' is invalid; emit a marker token
				// so the parser reports an error.
				t.AddToken(TOKEN_EOF, "ERROR_MISSING_VALUE", t.line)
			}
		} else {
			// Invalid variable name (like a number or operator): tokenize
			// both sides as expressions and let the parser reject it.
			t.TokenizeExpression(varName)
			t.AddToken(TOKEN_OPERATOR, "=", t.line)
			t.TokenizeExpression(value)
		}

	case "include":
		// Handle include with template path and optional context
		withPos := indexKeywordFold(blockContent, " with ")
		if withPos == -1 {
			// Just a template path
			t.tokenizeTemplatePath(blockContent)
			return
		}
		templatePath := strings.TrimSpace(blockContent[:withPos])
		contextExpr := strings.TrimSpace(blockContent[withPos+len(" with "):])

		t.tokenizeTemplatePath(templatePath)
		t.AddToken(TOKEN_NAME, "with", t.line)

		if strings.HasPrefix(contextExpr, "{") && strings.HasSuffix(contextExpr, "}") {
			// Context is an object literal
			t.AddToken(TOKEN_PUNCTUATION, "{", t.line)
			t.tokenizeObjectContents(contextExpr[1 : len(contextExpr)-1])
			t.AddToken(TOKEN_PUNCTUATION, "}", t.line)
		} else {
			// Context is a variable or expression
			t.TokenizeExpression(contextExpr)
		}

	case "extends":
		// Handle extends tag (similar to include template path)
		t.tokenizeTemplatePath(blockContent)

	case "from":
		// {% from "template.twig" import macro1, macro2 as alias %}
		importPos := indexKeywordFold(blockContent, " import ")
		if importPos == -1 {
			// Malformed from tag, just tokenize as expression
			t.TokenizeExpression(blockContent)
			return
		}
		templatePath := strings.TrimSpace(blockContent[:importPos])
		macrosStr := strings.TrimSpace(blockContent[importPos+len(" import "):])

		t.tokenizeTemplatePath(templatePath)
		t.AddToken(TOKEN_NAME, "import", t.line)

		macros := strings.Split(macrosStr, ",")
		for i, macro := range macros {
			macro = strings.TrimSpace(macro)
			if asPos := indexKeywordFold(macro, " as "); asPos != -1 {
				// "<name> as <alias>"
				macroName := strings.TrimSpace(macro[:asPos])
				alias := strings.TrimSpace(macro[asPos+len(" as "):])
				t.AddToken(TOKEN_NAME, t.GetStringConstant(macroName), t.line)
				t.AddToken(TOKEN_NAME, "as", t.line)
				t.AddToken(TOKEN_NAME, t.GetStringConstant(alias), t.line)
			} else {
				// Just the macro name
				t.AddToken(TOKEN_NAME, t.GetStringConstant(macro), t.line)
			}
			// Add comma if not the last macro
			if i < len(macros)-1 {
				t.AddToken(TOKEN_PUNCTUATION, ",", t.line)
			}
		}

	case "import":
		// {% import "template.twig" as alias %}
		asPos := indexKeywordFold(blockContent, " as ")
		if asPos == -1 {
			// Simple import without alias
			t.TokenizeExpression(blockContent)
			return
		}
		templatePath := strings.TrimSpace(blockContent[:asPos])
		alias := strings.TrimSpace(blockContent[asPos+len(" as "):])

		t.tokenizeTemplatePath(templatePath)
		t.AddToken(TOKEN_NAME, "as", t.line)
		t.AddToken(TOKEN_NAME, t.GetStringConstant(alias), t.line)

	default:
		// Other block types - tokenize as expression
		t.TokenizeExpression(blockContent)
	}
}

// indexKeywordFold returns the byte index of the first occurrence of kw in
// s, comparing ASCII letters case-insensitively, or -1 when kw is absent.
// kw must consist of lower-case ASCII bytes.
//
// Unlike searching in strings.ToLower(s), the result is always a valid
// offset into s itself: lower-casing can change the byte length of some
// non-ASCII characters, which would shift every later index.
func indexKeywordFold(s, kw string) int {
	for i := 0; i+len(kw) <= len(s); i++ {
		j := 0
		for j < len(kw) {
			c := s[i+j]
			if c >= 'A' && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c != kw[j] {
				break
			}
			j++
		}
		if j == len(kw) {
			return i
		}
	}
	return -1
}
// Helper methods for specialized tag tokenization

// tokenizeTemplatePath handles template paths in extends/include tags.
//
// After trimming surrounding whitespace, a path wrapped in a matching pair
// of double or single quotes is emitted as one TOKEN_STRING holding the text
// between the quotes. Anything else is tokenized as an expression. A path
// that is just a single quote character has no closing quote and therefore
// also takes the expression route instead of being sliced.
func (t *ZeroAllocTokenizer) tokenizeTemplatePath(path string) {
	path = strings.TrimSpace(path)

	// At least two bytes are needed for an opening and a closing quote.
	if n := len(path); n >= 2 {
		first, last := path[0], path[n-1]
		if first == last && (first == '"' || first == '\'') {
			t.AddToken(TOKEN_STRING, path[1:n-1], t.line)
			return
		}
	}

	// Otherwise tokenize as expression
	t.TokenizeExpression(path)
}
// isCharAlpha reports whether c is an ASCII letter (A-Z or a-z).
func isCharAlpha(c byte) bool {
	// Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z' and leaves lower-case
	// letters unchanged, so one range check covers both cases.
	lower := c | 0x20
	return 'a' <= lower && lower <= 'z'
}
// tokenizeObjectContents tokenizes the inside of an object literal, i.e.
// the text between the outer braces, as a list of "key: value" entries.
//
// Entries are separated by commas that lie outside strings, nested objects
// and nested arrays. Within an entry the first such top-level colon splits
// key from value. A quoted key becomes a TOKEN_STRING without its quotes, an
// unquoted key a TOKEN_NAME; the value is tokenized as an expression. A
// separator reached before any colon was seen emits nothing and does not
// start a new entry.
func (t *ZeroAllocTokenizer) tokenizeObjectContents(content string) {
	var (
		quoted    bool // inside a string literal
		quote     byte // delimiter that opened the current literal
		braces    int  // nesting depth of { }
		brackets  int  // nesting depth of [ ]
		pairStart int  // offset where the current entry begins
		colonAt   = -1 // offset of the entry's key/value colon, -1 if not seen
	)

	n := len(content)
	// The loop runs one step past the last byte so the final entry is
	// flushed exactly like one terminated by a comma.
	for i := 0; i <= n; i++ {
		end := i == n
		comma := !end && content[i] == ','

		if (comma || end) && braces == 0 && brackets == 0 && !quoted {
			if colonAt == -1 {
				continue
			}
			key := strings.TrimSpace(content[pairStart:colonAt])
			value := strings.TrimSpace(content[colonAt+1 : i])

			if len(key) >= 2 && key[0] == key[len(key)-1] && (key[0] == '"' || key[0] == '\'') {
				// Quoted key
				t.AddToken(TOKEN_STRING, key[1:len(key)-1], t.line)
			} else {
				// Unquoted key
				t.AddToken(TOKEN_NAME, t.GetStringConstant(key), t.line)
			}
			t.AddToken(TOKEN_PUNCTUATION, ":", t.line)
			t.TokenizeExpression(value)

			// A trailing comma at the very end of content is not emitted.
			if comma && i < n-1 {
				t.AddToken(TOKEN_PUNCTUATION, ",", t.line)
			}

			pairStart = i + 1
			colonAt = -1
			continue
		}

		if end {
			break
		}

		ch := content[i]

		// Quote characters not preceded by a backslash open or close a
		// string; only the opening delimiter can close it.
		if (ch == '"' || ch == '\'') && (i == 0 || content[i-1] != '\\') {
			if !quoted {
				quoted = true
				quote = ch
			} else if ch == quote {
				quoted = false
			}
			continue
		}
		if quoted {
			continue
		}

		switch ch {
		case '{':
			braces++
		case '}':
			braces--
		case '[':
			brackets++
		case ']':
			brackets--
		case ':':
			// Only the first top-level colon separates key and value.
			if braces == 0 && brackets == 0 && colonAt == -1 {
				colonAt = i
			}
		}
	}
}
// ApplyWhitespaceControl applies whitespace control to the tokenized result.
//
// It edits t.result in place: a trimming start token (variable or block)
// strips trailing whitespace from a TOKEN_TEXT immediately before it, and a
// trimming end token strips leading whitespace from a TOKEN_TEXT immediately
// after it. Neighbours that are not text tokens are left alone.
func (t *ZeroAllocTokenizer) ApplyWhitespaceControl() {
	toks := t.result
	last := len(toks) - 1

	for i := range toks {
		switch toks[i].Type {
		case TOKEN_VAR_START_TRIM, TOKEN_BLOCK_START_TRIM:
			// Trim what precedes the opening tag.
			if i > 0 && toks[i-1].Type == TOKEN_TEXT {
				toks[i-1].Value = trimTrailingWhitespace(toks[i-1].Value)
			}
		case TOKEN_VAR_END_TRIM, TOKEN_BLOCK_END_TRIM:
			// Trim what follows the closing tag.
			if i < last && toks[i+1].Type == TOKEN_TEXT {
				toks[i+1].Value = trimLeadingWhitespace(toks[i+1].Value)
			}
		}
	}
}
// The following functions implement string interning and tag detection from the optimized implementations

// newGlobalStringCache builds a string cache whose map already contains the
// common HTML, Twig and operator strings, plus the empty string, each mapped
// to itself.
func newGlobalStringCache() *GlobalStringCache {
	seed := [...]string{
		// HTML tags and attributes
		stringDiv, stringSpan, stringP, stringA, stringImg,
		stringHref, stringClass, stringId, stringStyle,
		// Twig keywords
		stringIf, stringFor, stringEnd, stringEndif, stringEndfor,
		stringElse, stringBlock, stringSet, stringInclude, stringExtends,
		stringMacro,
		// Operators
		stringEquals, stringNotEquals, stringAnd,
		stringOr, stringNot, stringIn, stringIs,
		// Empty string
		"",
	}

	// Capacity hint of 64 leaves room beyond the seed entries.
	table := make(map[string]string, 64)
	for i := range seed {
		table[seed[i]] = seed[i]
	}
	return &GlobalStringCache{strings: table}
}
// Intern returns the canonical copy of s from the global string cache,
// adding s to the cache when it is not there yet.
//
// Two kinds of input are returned as-is without touching the cache: a small
// set of very common strings (checked without locking) and strings longer
// than maxCacheableLength. For everything else the lookup runs under the
// read lock and, on a miss, is repeated under the write lock before
// inserting, because another goroutine may have added s in between.
func Intern(s string) string {
	// Lock-free fast path for the most frequent values.
	switch s {
	case "",
		stringDiv, stringSpan, stringP, stringA, stringImg,
		stringIf, stringFor, stringEnd, stringEndif, stringEndfor,
		stringElse:
		return s
	}

	// Don't intern strings that are too long
	if len(s) > maxCacheableLength {
		return s
	}

	globalCache.RLock()
	canonical, found := globalCache.strings[s]
	globalCache.RUnlock()
	if found {
		return canonical
	}

	globalCache.Lock()
	defer globalCache.Unlock()

	// Re-check now that the write lock is held.
	if canonical, found = globalCache.strings[s]; !found {
		globalCache.strings[s] = s
		canonical = s
	}
	return canonical
}
// FindNextTag finds the first Twig opening delimiter in source at or after
// byte offset startPos.
//
// It recognises "{{", "{{-", "{%", "{%-" and "{#". The returned TagLocation
// holds the tag kind, the absolute offset of the '{' and the delimiter
// length (2, or 3 for the trim forms). When startPos is out of range or no
// delimiter follows, the result has Type TAG_NONE and Position -1.
//
// The scan uses strings.IndexByte and ordinary bounds-checked indexing; it
// does not allocate and needs no unsafe pointer arithmetic.
func FindNextTag(source string, startPos int) TagLocation {
	none := TagLocation{Type: TAG_NONE, Position: -1, Length: 0}
	if startPos < 0 || startPos >= len(source) {
		return none
	}

	for i := startPos; ; i++ {
		// Jump to the next '{'.
		off := strings.IndexByte(source[i:], '{')
		if off == -1 {
			return none
		}
		i += off

		// A delimiter needs at least one more byte after the '{'.
		if i+1 >= len(source) {
			return none
		}

		// A '-' right after the two-byte delimiter selects the trim form.
		trim := i+2 < len(source) && source[i+2] == '-'

		switch source[i+1] {
		case '{': // variable tag
			if trim {
				return TagLocation{Type: TAG_VAR_TRIM, Position: i, Length: 3}
			}
			return TagLocation{Type: TAG_VAR, Position: i, Length: 2}
		case '%': // block tag
			if trim {
				return TagLocation{Type: TAG_BLOCK_TRIM, Position: i, Length: 3}
			}
			return TagLocation{Type: TAG_BLOCK, Position: i, Length: 2}
		case '#': // comment tag
			return TagLocation{Type: TAG_COMMENT, Position: i, Length: 2}
		}
		// A lone '{' that starts no tag: keep scanning after it.
	}
}
// FindTagEnd returns the byte offset in source of the closing delimiter that
// matches tagType, searching from startPos: "}}" for variable tags, "%}" for
// block tags and "#}" for comments. The offset is that of the delimiter's
// first byte; a '-' preceding it (trim form) is not included.
//
// It returns -1 when the delimiter is not found, when startPos lies outside
// source (negative or past the end), or when tagType has no closing
// delimiter (for example TAG_NONE).
func FindTagEnd(source string, startPos int, tagType TagType) int {
	if startPos < 0 || startPos >= len(source) {
		return -1
	}

	var closer string
	switch tagType {
	case TAG_VAR, TAG_VAR_TRIM:
		closer = "}}"
	case TAG_BLOCK, TAG_BLOCK_TRIM:
		closer = "%}"
	case TAG_COMMENT:
		closer = "#}"
	default:
		return -1
	}

	if rel := strings.Index(source[startPos:], closer); rel != -1 {
		return startPos + rel
	}
	return -1
}
// TokenizeOptimized tokenizes the tokenizer's source using FindNextTag and
// FindTagEnd for delimiter detection, keeping everything outside Twig tags
// as TOKEN_TEXT.
//
// For each tag it emits a start token, the tokens for the tag body and an
// end token. A variable or block tag whose closing delimiter is directly
// preceded by '-' ("-}}" / "-%}") ends with the corresponding *_END_TRIM
// token, whichever form the opening delimiter had; the '-' is not part of
// the body. Comment bodies are stored verbatim as TOKEN_TEXT. An opening
// delimiter directly preceded by a backslash is emitted as literal text
// without the backslash. Simple variable names are interned with Intern. A
// final TOKEN_EOF is always appended.
//
// The result is also stored in t.result and aliases the internal buffer.
// An error is returned when a tag is never closed.
func (t *ZeroAllocTokenizer) TokenizeOptimized() ([]Token, error) {
	// Reset position and line
	t.position = 0
	t.line = 1
	// Clear token buffer
	t.tokenBuffer = t.tokenBuffer[:0]

	pos := 0
	for pos < len(t.source) {
		tagLoc := FindNextTag(t.source, pos)

		// No more tags: the rest of the source is text.
		if tagLoc.Position == -1 {
			remainingText := t.source[pos:]
			t.AddToken(TOKEN_TEXT, remainingText, t.line)
			t.line += countNewlines(remainingText)
			break
		}

		// Check if the tag is escaped with a backslash
		if tagLoc.Position > 0 && t.source[tagLoc.Position-1] == '\\' {
			// Add text up to the backslash
			if tagLoc.Position-1 > pos {
				preText := t.source[pos : tagLoc.Position-1]
				t.AddToken(TOKEN_TEXT, preText, t.line)
				t.line += countNewlines(preText)
			}
			// Add the delimiter itself as literal text (without the backslash)
			t.AddToken(TOKEN_TEXT, t.source[tagLoc.Position:tagLoc.Position+tagLoc.Length], t.line)
			pos = tagLoc.Position + tagLoc.Length
			continue
		}

		// Translate the tag kind into token types.
		var (
			startTokenType, endTokenType, trimEndTokenType int
			unclosedType                                   string
			canTrim                                        = true
		)
		switch tagLoc.Type {
		case TAG_VAR:
			startTokenType, endTokenType, trimEndTokenType = TOKEN_VAR_START, TOKEN_VAR_END, TOKEN_VAR_END_TRIM
			unclosedType = "variable"
		case TAG_VAR_TRIM:
			startTokenType, endTokenType, trimEndTokenType = TOKEN_VAR_START_TRIM, TOKEN_VAR_END, TOKEN_VAR_END_TRIM
			unclosedType = "variable"
		case TAG_BLOCK:
			startTokenType, endTokenType, trimEndTokenType = TOKEN_BLOCK_START, TOKEN_BLOCK_END, TOKEN_BLOCK_END_TRIM
			unclosedType = "block"
		case TAG_BLOCK_TRIM:
			startTokenType, endTokenType, trimEndTokenType = TOKEN_BLOCK_START_TRIM, TOKEN_BLOCK_END, TOKEN_BLOCK_END_TRIM
			unclosedType = "block"
		case TAG_COMMENT:
			startTokenType, endTokenType = TOKEN_COMMENT_START, TOKEN_COMMENT_END
			unclosedType = "comment"
			canTrim = false
		}

		// Add text before the tag
		if tagLoc.Position > pos {
			textContent := t.source[pos:tagLoc.Position]
			t.AddToken(TOKEN_TEXT, textContent, t.line)
			t.line += countNewlines(textContent)
		}

		// Add the tag start token
		t.AddToken(startTokenType, "", t.line)

		// Locate the closing delimiter; tagEndPos is the offset of its
		// first byte ('}', '%' or '#').
		tagContentStart := tagLoc.Position + tagLoc.Length
		tagEndPos := FindTagEnd(t.source, tagContentStart, tagLoc.Type)
		if tagEndPos == -1 {
			return nil, fmt.Errorf("unclosed %s tag at line %d", unclosedType, t.line)
		}

		// A '-' directly before the delimiter is the right-trim marker. It
		// must lie inside the tag body (tagEndPos > tagContentStart), so
		// the '-' of an opening "{{-" in "{{-}}" is never mistaken for it.
		contentEnd := tagEndPos
		if canTrim && tagEndPos > tagContentStart && t.source[tagEndPos-1] == '-' {
			endTokenType = trimEndTokenType
			contentEnd--
		}

		// Get tag content
		tagContent := t.source[tagContentStart:contentEnd]
		t.line += countNewlines(tagContent)

		// Process tag content based on tag type
		switch tagLoc.Type {
		case TAG_COMMENT:
			// Store comments as TEXT tokens
			if len(tagContent) > 0 {
				t.AddToken(TOKEN_TEXT, tagContent, t.line)
			}
		case TAG_BLOCK, TAG_BLOCK_TRIM:
			if tagContent = strings.TrimSpace(tagContent); len(tagContent) > 0 {
				t.processBlockTag(tagContent)
			}
		default:
			if tagContent = strings.TrimSpace(tagContent); len(tagContent) > 0 {
				if !strings.ContainsAny(tagContent, ".|[](){}\"',+-*/=!<>%&^~") {
					// Simple variable name - use string interning
					t.AddToken(TOKEN_NAME, Intern(tagContent), t.line)
				} else {
					// Complex expression - tokenize fully
					t.TokenizeExpression(tagContent)
				}
			}
		}

		// Add end token
		t.AddToken(endTokenType, "", t.line)

		// Move past the closing delimiter. tagEndPos already points past
		// any trim '-', so the delimiter is always two bytes from here.
		pos = tagEndPos + 2
	}

	// Add EOF token
	t.AddToken(TOKEN_EOF, "", t.line)
	// Save and return result
	t.result = t.tokenBuffer
	return t.result, nil
}