package twig import ( "fmt" "strings" "sync" "unsafe" ) const ( // Common HTML/Twig strings to pre-cache maxCacheableLength = 64 // Only cache strings shorter than this to avoid memory bloat // Common HTML tags stringDiv = "div" stringSpan = "span" stringP = "p" stringA = "a" stringImg = "img" stringHref = "href" stringClass = "class" stringId = "id" stringStyle = "style" // Common Twig syntax stringIf = "if" stringFor = "for" stringEnd = "end" stringEndif = "endif" stringEndfor = "endfor" stringElse = "else" stringBlock = "block" stringSet = "set" stringInclude = "include" stringExtends = "extends" stringMacro = "macro" // Common operators stringEquals = "==" stringNotEquals = "!=" stringAnd = "and" stringOr = "or" stringNot = "not" stringIn = "in" stringIs = "is" ) // GlobalStringCache provides a centralized cache for string interning type GlobalStringCache struct { sync.RWMutex strings map[string]string } var ( // Singleton instance of the global string cache globalCache = newGlobalStringCache() ) // TagType represents the type of tag found type TagType int const ( TAG_NONE TagType = iota TAG_VAR TAG_VAR_TRIM TAG_BLOCK TAG_BLOCK_TRIM TAG_COMMENT ) // TagLocation represents the location of a tag in a template type TagLocation struct { Type TagType // Type of tag Position int // Position in source Length int // Length of opening tag } // ZeroAllocTokenizer is an allocation-free tokenizer // It uses a pre-allocated token buffer for all token operations type ZeroAllocTokenizer struct { tokenBuffer []Token // Pre-allocated buffer of tokens source string // Source string being tokenized position int // Current position in source line int // Current line result []Token // Slice of actually used tokens tempStrings []string // String constants that we can reuse } // This array contains commonly used strings in tokenization to avoid allocations var commonStrings = []string{ // Common twig words and operators "if", "else", "elseif", "endif", "for", "endfor", "in", "block", "endblock", "extends", "include", "with", "set", "macro", "endmacro", "import", "from", "as", "do", // Common operators "+", "-", "*", "/", "=", "==", "!=", ">", "<", ">=", "<=", "and", "or", "not", "~", "%", "?", ":", "??", // Common punctuation "(", ")", "[", "]", "{", "}", ".", ",", "|", ";", // Common literals "true", "false", "null", // Empty string "", } // TokenizerPooled holds a set of resources for zero-allocation tokenization type TokenizerPooled struct { tokenizer ZeroAllocTokenizer used bool } // TokenizerPool is a pool of tokenizer resources var tokenizerPool = sync.Pool{ New: func() interface{} { // Create a pre-allocated tokenizer with reasonable defaults return &TokenizerPooled{ tokenizer: ZeroAllocTokenizer{ tokenBuffer: make([]Token, 0, 256), // Buffer for tokens tempStrings: append([]string{}, commonStrings...), result: nil, }, used: false, } }, } // GetTokenizer gets a tokenizer from the pool func GetTokenizer(source string, capacityHint int) *ZeroAllocTokenizer { pooled := tokenizerPool.Get().(*TokenizerPooled) // Reset the tokenizer tokenizer := &pooled.tokenizer tokenizer.source = source tokenizer.position = 0 tokenizer.line = 1 // Ensure token buffer has enough capacity neededCapacity := capacityHint if neededCapacity <= 0 { // Estimate capacity based on source length neededCapacity = len(source) / 10 if neededCapacity < 32 { neededCapacity = 32 } } // Resize token buffer if needed if cap(tokenizer.tokenBuffer) < neededCapacity { tokenizer.tokenBuffer = make([]Token, 0, neededCapacity) } else { tokenizer.tokenBuffer = tokenizer.tokenBuffer[:0] } // Reset result tokenizer.result = nil // Mark as used pooled.used = true return tokenizer } // ReleaseTokenizer returns a tokenizer to the pool func ReleaseTokenizer(tokenizer *ZeroAllocTokenizer) { // Get the parent pooled struct pooled := (*TokenizerPooled)(unsafe.Pointer( uintptr(unsafe.Pointer(tokenizer)) - unsafe.Offsetof(TokenizerPooled{}.tokenizer))) // Only return to pool if it's used if pooled.used { // Mark as not used and clear references that might prevent GC pooled.used = false tokenizer.source = "" tokenizer.result = nil // Return to pool tokenizerPool.Put(pooled) } } // AddToken adds a token to the buffer func (t *ZeroAllocTokenizer) AddToken(tokenType int, value string, line int) { // Create a token var token Token token.Type = tokenType token.Value = value token.Line = line // Add to buffer t.tokenBuffer = append(t.tokenBuffer, token) } // GetStringConstant checks if a string exists in our constants and returns // the canonical version to avoid allocation func (t *ZeroAllocTokenizer) GetStringConstant(s string) string { // First check common strings for _, constant := range t.tempStrings { if constant == s { return constant } } // Add to temp strings if it's a short string that might be reused if len(s) <= 20 { t.tempStrings = append(t.tempStrings, s) } return s } // TokenizeExpression tokenizes an expression string with zero allocations func (t *ZeroAllocTokenizer) TokenizeExpression(expr string) []Token { // Save current position and set new source context savedSource := t.source savedPosition := t.position savedLine := t.line t.source = expr t.position = 0 startTokenCount := len(t.tokenBuffer) var inString bool var stringDelimiter byte var stringStart int for t.position < len(t.source) { c := t.source[t.position] // Handle string literals if (c == '"' || c == '\'') && (t.position == 0 || t.source[t.position-1] != '\\') { if inString && c == stringDelimiter { // End of string, add the string token value := t.source[stringStart:t.position] t.AddToken(TOKEN_STRING, value, t.line) inString = false } else if !inString { // Start of string inString = true stringDelimiter = c stringStart = t.position + 1 } t.position++ continue } // Skip chars inside strings if inString { t.position++ continue } // Handle operators (includes multi-char operators like ==, !=, etc.) if isOperator(c) { op := string(c) t.position++ // Check for two-character operators if t.position < len(t.source) { nextChar := t.source[t.position] twoCharOp := string([]byte{c, nextChar}) // Check common two-char operators if (c == '=' && nextChar == '=') || (c == '!' && nextChar == '=') || (c == '>' && nextChar == '=') || (c == '<' && nextChar == '=') || (c == '&' && nextChar == '&') || (c == '|' && nextChar == '|') || (c == '?' && nextChar == '?') { op = twoCharOp t.position++ } } // Use constant version of the operator string if possible op = t.GetStringConstant(op) t.AddToken(TOKEN_OPERATOR, op, t.line) continue } // Handle punctuation if isPunctuation(c) { // Use constant version of punctuation punct := t.GetStringConstant(string(c)) t.AddToken(TOKEN_PUNCTUATION, punct, t.line) t.position++ continue } // Skip whitespace if isWhitespace(c) { t.position++ if c == '\n' { t.line++ } continue } // Handle identifiers, literals, etc. if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' { // Start of an identifier start := t.position // Find the end t.position++ for t.position < len(t.source) && ((t.source[t.position] >= 'a' && t.source[t.position] <= 'z') || (t.source[t.position] >= 'A' && t.source[t.position] <= 'Z') || (t.source[t.position] >= '0' && t.source[t.position] <= '9') || t.source[t.position] == '_') { t.position++ } // Extract the identifier identifier := t.source[start:t.position] // Try to use a canonical string identifier = t.GetStringConstant(identifier) // Keywords/literals get special token types if identifier == "true" || identifier == "false" || identifier == "null" { t.AddToken(TOKEN_NAME, identifier, t.line) } else { t.AddToken(TOKEN_NAME, identifier, t.line) } continue } // Handle numbers if (c >= '0' && c <= '9') || (c == '-' && t.position+1 < len(t.source) && t.source[t.position+1] >= '0' && t.source[t.position+1] <= '9') { start := t.position // Skip the negative sign if present if c == '-' { t.position++ } // Consume digits for t.position < len(t.source) && t.source[t.position] >= '0' && t.source[t.position] <= '9' { t.position++ } // Handle decimal point if t.position < len(t.source) && t.source[t.position] == '.' { t.position++ // Consume fractional digits for t.position < len(t.source) && t.source[t.position] >= '0' && t.source[t.position] <= '9' { t.position++ } } // Add the number token t.AddToken(TOKEN_NUMBER, t.source[start:t.position], t.line) continue } // Unrecognized character t.position++ } // Create slice of tokens tokens := t.tokenBuffer[startTokenCount:] // Restore original context t.source = savedSource t.position = savedPosition t.line = savedLine return tokens } // TokenizeHtmlPreserving performs full tokenization of a template with HTML preservation func (t *ZeroAllocTokenizer) TokenizeHtmlPreserving() ([]Token, error) { // Reset position and line t.position = 0 t.line = 1 // Clear token buffer t.tokenBuffer = t.tokenBuffer[:0] tagPatterns := [5]string{"{{-", "{{", "{%-", "{%", "{#"} tagTypes := [5]int{TOKEN_VAR_START_TRIM, TOKEN_VAR_START, TOKEN_BLOCK_START_TRIM, TOKEN_BLOCK_START, TOKEN_COMMENT_START} tagLengths := [5]int{3, 2, 3, 2, 2} for t.position < len(t.source) { // Find the next tag nextTagPos := -1 tagType := -1 tagLength := 0 // Check for all possible tag patterns // This loop avoids allocations by manually checking prefixes remainingSource := t.source[t.position:] for i := 0; i < 5; i++ { pattern := tagPatterns[i] if len(remainingSource) >= len(pattern) && remainingSource[:len(pattern)] == pattern { // Tag found at current position nextTagPos = t.position tagType = tagTypes[i] tagLength = tagLengths[i] break } // If not found at current position, find it in the remainder patternPos := strings.Index(remainingSource, pattern) if patternPos != -1 { pos := t.position + patternPos if nextTagPos == -1 || pos < nextTagPos { nextTagPos = pos tagType = tagTypes[i] tagLength = tagLengths[i] } } } // Check if the tag is escaped if nextTagPos != -1 && nextTagPos > 0 && t.source[nextTagPos-1] == '\\' { // Add text up to the backslash if nextTagPos-1 > t.position { preText := t.source[t.position : nextTagPos-1] t.AddToken(TOKEN_TEXT, preText, t.line) t.line += countNewlines(preText) } // Add the tag as literal text (without the backslash) // Find which pattern was matched for i := 0; i < 5; i++ { if tagType == tagTypes[i] { t.AddToken(TOKEN_TEXT, tagPatterns[i], t.line) break } } // Move past this tag t.position = nextTagPos + tagLength continue } // No more tags found - add the rest as TEXT if nextTagPos == -1 { if t.position < len(t.source) { remainingText := t.source[t.position:] t.AddToken(TOKEN_TEXT, remainingText, t.line) t.line += countNewlines(remainingText) } break } // Add text before the tag if nextTagPos > t.position { textContent := t.source[t.position:nextTagPos] t.AddToken(TOKEN_TEXT, textContent, t.line) t.line += countNewlines(textContent) } // Add the tag start token t.AddToken(tagType, "", t.line) // Move past opening tag t.position = nextTagPos + tagLength // Find matching end tag var endTag string var endTagType int var endTagLength int if tagType == TOKEN_VAR_START || tagType == TOKEN_VAR_START_TRIM { // Look for "}}" or "-}}" endPos1 := strings.Index(t.source[t.position:], "}}") endPos2 := strings.Index(t.source[t.position:], "-}}") if endPos1 != -1 && (endPos2 == -1 || endPos1 < endPos2) { endTag = "}}" endTagType = TOKEN_VAR_END endTagLength = 2 } else if endPos2 != -1 { endTag = "-}}" endTagType = TOKEN_VAR_END_TRIM endTagLength = 3 } else { return nil, fmt.Errorf("unclosed variable tag at line %d", t.line) } } else if tagType == TOKEN_BLOCK_START || tagType == TOKEN_BLOCK_START_TRIM { // Look for "%}" or "-%}" endPos1 := strings.Index(t.source[t.position:], "%}") endPos2 := strings.Index(t.source[t.position:], "-%}") if endPos1 != -1 && (endPos2 == -1 || endPos1 < endPos2) { endTag = "%}" endTagType = TOKEN_BLOCK_END endTagLength = 2 } else if endPos2 != -1 { endTag = "-%}" endTagType = TOKEN_BLOCK_END_TRIM endTagLength = 3 } else { return nil, fmt.Errorf("unclosed block tag at line %d", t.line) } } else if tagType == TOKEN_COMMENT_START { // Look for "#}" endPos := strings.Index(t.source[t.position:], "#}") if endPos == -1 { return nil, fmt.Errorf("unclosed comment at line %d", t.line) } endTag = "#}" endTagType = TOKEN_COMMENT_END endTagLength = 2 } // Find position of the end tag endPos := strings.Index(t.source[t.position:], endTag) if endPos == -1 { return nil, fmt.Errorf("unclosed tag at line %d", t.line) } // Get content between tags tagContent := t.source[t.position : t.position+endPos] t.line += countNewlines(tagContent) // Process tag content based on type if tagType == TOKEN_COMMENT_START { // Store comments as TEXT tokens if len(tagContent) > 0 { t.AddToken(TOKEN_TEXT, tagContent, t.line) } } else { // For variable and block tags, tokenize the content tagContent = strings.TrimSpace(tagContent) if tagType == TOKEN_BLOCK_START || tagType == TOKEN_BLOCK_START_TRIM { // Process block tags with specialized tokenization t.processBlockTag(tagContent) } else { // Process variable tags with optimized tokenization if len(tagContent) > 0 { if !strings.ContainsAny(tagContent, ".|[](){}\"',+-*/=!<>%&^~") { // Simple variable name identifier := t.GetStringConstant(tagContent) t.AddToken(TOKEN_NAME, identifier, t.line) } else { // Complex expression t.TokenizeExpression(tagContent) } } } } // Add the end tag token t.AddToken(endTagType, "", t.line) // Move past the end tag t.position = t.position + endPos + endTagLength } // Add EOF token t.AddToken(TOKEN_EOF, "", t.line) // Save the token buffer to result t.result = t.tokenBuffer return t.result, nil } // processBlockTag handles specialized block tag tokenization func (t *ZeroAllocTokenizer) processBlockTag(content string) { // Extract the tag name spacePos := strings.IndexByte(content, ' ') var blockName string var blockContent string if spacePos == -1 { // No space found, the whole content is the tag name blockName = content blockContent = "" } else { blockName = content[:spacePos] blockContent = strings.TrimSpace(content[spacePos+1:]) } // Use canonical string for block name blockName = t.GetStringConstant(blockName) t.AddToken(TOKEN_NAME, blockName, t.line) // If there's no content, we're done if blockContent == "" { return } // Process based on block type switch blockName { case "if", "elseif": // For conditional blocks, tokenize expression t.TokenizeExpression(blockContent) case "for": // Process for loop with iterator(s) and collection inPos := strings.Index(strings.ToLower(blockContent), " in ") if inPos != -1 { iterators := strings.TrimSpace(blockContent[:inPos]) collection := strings.TrimSpace(blockContent[inPos+4:]) // Handle key, value iterator syntax if strings.Contains(iterators, ",") { iterParts := strings.SplitN(iterators, ",", 2) if len(iterParts) == 2 { // Process iterator variables keyVar := t.GetStringConstant(strings.TrimSpace(iterParts[0])) valueVar := t.GetStringConstant(strings.TrimSpace(iterParts[1])) t.AddToken(TOKEN_NAME, keyVar, t.line) t.AddToken(TOKEN_PUNCTUATION, ",", t.line) t.AddToken(TOKEN_NAME, valueVar, t.line) } } else { // Single iterator iterator := t.GetStringConstant(iterators) t.AddToken(TOKEN_NAME, iterator, t.line) } // Add 'in' keyword t.AddToken(TOKEN_NAME, "in", t.line) // Process collection expression t.TokenizeExpression(collection) } else { // Fallback for malformed for loops t.AddToken(TOKEN_NAME, blockContent, t.line) } case "set": // Handle variable assignment assignPos := strings.Index(blockContent, "=") if assignPos != -1 { varName := strings.TrimSpace(blockContent[:assignPos]) value := strings.TrimSpace(blockContent[assignPos+1:]) // Add the variable name token varName = t.GetStringConstant(varName) t.AddToken(TOKEN_NAME, varName, t.line) // Add the assignment operator t.AddToken(TOKEN_OPERATOR, "=", t.line) // Tokenize the value expression t.TokenizeExpression(value) } else { // Simple set without assignment blockContent = t.GetStringConstant(blockContent) t.AddToken(TOKEN_NAME, blockContent, t.line) } case "do": // Handle variable assignment similar to set tag assignPos := strings.Index(blockContent, "=") if assignPos != -1 { varName := strings.TrimSpace(blockContent[:assignPos]) value := strings.TrimSpace(blockContent[assignPos+1:]) // Check if varName is valid (should be a variable name) // In Twig, variable names must start with a letter or underscore if len(varName) > 0 && (isCharAlpha(varName[0]) || varName[0] == '_') { // Add the variable name token varName = t.GetStringConstant(varName) t.AddToken(TOKEN_NAME, varName, t.line) // Add the assignment operator t.AddToken(TOKEN_OPERATOR, "=", t.line) // Tokenize the value expression if len(value) > 0 { t.TokenizeExpression(value) } else { // Empty value after =, which is invalid // Add an error token to trigger proper parser error t.AddToken(TOKEN_EOF, "ERROR_MISSING_VALUE", t.line) } } else { // Invalid variable name (like a number or operator) // Just tokenize as expressions to produce an error in the parser t.TokenizeExpression(varName) t.AddToken(TOKEN_OPERATOR, "=", t.line) t.TokenizeExpression(value) } } else { // No assignment, just an expression to evaluate t.TokenizeExpression(blockContent) } case "include": // Handle include with template path and optional context withPos := strings.Index(strings.ToLower(blockContent), " with ") if withPos != -1 { templatePath := strings.TrimSpace(blockContent[:withPos]) contextExpr := strings.TrimSpace(blockContent[withPos+6:]) // Process template path t.tokenizeTemplatePath(templatePath) // Add 'with' keyword t.AddToken(TOKEN_NAME, "with", t.line) // Process context expression as object if strings.HasPrefix(contextExpr, "{") && strings.HasSuffix(contextExpr, "}") { // Context is an object literal t.AddToken(TOKEN_PUNCTUATION, "{", t.line) objectContent := contextExpr[1 : len(contextExpr)-1] t.tokenizeObjectContents(objectContent) t.AddToken(TOKEN_PUNCTUATION, "}", t.line) } else { // Context is a variable or expression t.TokenizeExpression(contextExpr) } } else { // Just a template path t.tokenizeTemplatePath(blockContent) } case "extends": // Handle extends tag (similar to include template path) t.tokenizeTemplatePath(blockContent) case "from": // Handle from tag which has a special format: // {% from "template.twig" import macro1, macro2 as alias %} importPos := strings.Index(strings.ToLower(blockContent), " import ") if importPos != -1 { // Extract template path and macros list templatePath := strings.TrimSpace(blockContent[:importPos]) macrosStr := strings.TrimSpace(blockContent[importPos+8:]) // 8 = len(" import ") // Process template path t.tokenizeTemplatePath(templatePath) // Add 'import' keyword t.AddToken(TOKEN_NAME, "import", t.line) // Process macro imports macros := strings.Split(macrosStr, ",") for i, macro := range macros { macro = strings.TrimSpace(macro) // Check for "as" alias asPos := strings.Index(strings.ToLower(macro), " as ") if asPos != -1 { // Extract macro name and alias macroName := strings.TrimSpace(macro[:asPos]) alias := strings.TrimSpace(macro[asPos+4:]) // Add macro name macroName = t.GetStringConstant(macroName) t.AddToken(TOKEN_NAME, macroName, t.line) // Add 'as' keyword t.AddToken(TOKEN_NAME, "as", t.line) // Add alias alias = t.GetStringConstant(alias) t.AddToken(TOKEN_NAME, alias, t.line) } else { // Just the macro name macro = t.GetStringConstant(macro) t.AddToken(TOKEN_NAME, macro, t.line) } // Add comma if not the last macro if i < len(macros)-1 { t.AddToken(TOKEN_PUNCTUATION, ",", t.line) } } } else { // Malformed from tag, just tokenize as expression t.TokenizeExpression(blockContent) } case "import": // Handle import tag which allows importing entire templates // {% import "template.twig" as alias %} asPos := strings.Index(strings.ToLower(blockContent), " as ") if asPos != -1 { // Extract template path and alias templatePath := strings.TrimSpace(blockContent[:asPos]) alias := strings.TrimSpace(blockContent[asPos+4:]) // Process template path t.tokenizeTemplatePath(templatePath) // Add 'as' keyword t.AddToken(TOKEN_NAME, "as", t.line) // Add alias alias = t.GetStringConstant(alias) t.AddToken(TOKEN_NAME, alias, t.line) } else { // Simple import without alias t.TokenizeExpression(blockContent) } default: // Other block types - tokenize as expression t.TokenizeExpression(blockContent) } } // Helper methods for specialized tag tokenization // tokenizeTemplatePath handles template paths in extends/include tags func (t *ZeroAllocTokenizer) tokenizeTemplatePath(path string) { path = strings.TrimSpace(path) // If it's a quoted string if (strings.HasPrefix(path, "\"") && strings.HasSuffix(path, "\"")) || (strings.HasPrefix(path, "'") && strings.HasSuffix(path, "'")) { // Extract content without quotes content := path[1 : len(path)-1] t.AddToken(TOKEN_STRING, content, t.line) } else { // Otherwise tokenize as expression t.TokenizeExpression(path) } } // isCharAlpha checks if a byte is an alphabetic character func isCharAlpha(c byte) bool { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') } // tokenizeObjectContents handles object literal contents func (t *ZeroAllocTokenizer) tokenizeObjectContents(content string) { // Track state for nested structures inString := false stringDelim := byte(0) inObject := 0 inArray := 0 start := 0 colonPos := -1 for i := 0; i <= len(content); i++ { // At end of string or at a comma at the top level atEnd := i == len(content) isComma := !atEnd && content[i] == ',' // Process key-value pair when we find a comma or reach the end if (isComma || atEnd) && inObject == 0 && inArray == 0 && !inString { if colonPos != -1 { // We have a key-value pair keyStr := strings.TrimSpace(content[start:colonPos]) valueStr := strings.TrimSpace(content[colonPos+1 : i]) // Process key if (len(keyStr) >= 2 && keyStr[0] == '"' && keyStr[len(keyStr)-1] == '"') || (len(keyStr) >= 2 && keyStr[0] == '\'' && keyStr[len(keyStr)-1] == '\'') { // Quoted key t.AddToken(TOKEN_STRING, keyStr[1:len(keyStr)-1], t.line) } else { // Unquoted key keyStr = t.GetStringConstant(keyStr) t.AddToken(TOKEN_NAME, keyStr, t.line) } // Add colon t.AddToken(TOKEN_PUNCTUATION, ":", t.line) // Process value t.TokenizeExpression(valueStr) // Add comma if needed if isComma && i < len(content)-1 { t.AddToken(TOKEN_PUNCTUATION, ",", t.line) } // Reset for next pair start = i + 1 colonPos = -1 } continue } // Skip end of string case if atEnd { continue } // Current character c := content[i] // Handle string literals if (c == '"' || c == '\'') && (i == 0 || content[i-1] != '\\') { if inString && c == stringDelim { inString = false } else if !inString { inString = true stringDelim = c } continue } // Skip processing inside strings if inString { continue } // Handle object and array nesting if c == '{' { inObject++ } else if c == '}' { inObject-- } else if c == '[' { inArray++ } else if c == ']' { inArray-- } // Track colon position for key-value separator if c == ':' && inObject == 0 && inArray == 0 && colonPos == -1 { colonPos = i } } } // ApplyWhitespaceControl applies whitespace control to the tokenized result func (t *ZeroAllocTokenizer) ApplyWhitespaceControl() { tokens := t.result for i := 0; i < len(tokens); i++ { token := tokens[i] // Handle opening tags that trim whitespace before them if token.Type == TOKEN_VAR_START_TRIM || token.Type == TOKEN_BLOCK_START_TRIM { // If there's a text token before this, trim its trailing whitespace if i > 0 && tokens[i-1].Type == TOKEN_TEXT { tokens[i-1].Value = trimTrailingWhitespace(tokens[i-1].Value) } } // Handle closing tags that trim whitespace after them if token.Type == TOKEN_VAR_END_TRIM || token.Type == TOKEN_BLOCK_END_TRIM { // If there's a text token after this, trim its leading whitespace if i+1 < len(tokens) && tokens[i+1].Type == TOKEN_TEXT { tokens[i+1].Value = trimLeadingWhitespace(tokens[i+1].Value) } } } } // The following functions implement string interning and tag detection from the optimized implementations // newGlobalStringCache creates a new global string cache with pre-populated common strings func newGlobalStringCache() *GlobalStringCache { cache := &GlobalStringCache{ strings: make(map[string]string, 64), // Pre-allocate capacity } // Pre-populate with common strings commonStrings := []string{ stringDiv, stringSpan, stringP, stringA, stringImg, stringHref, stringClass, stringId, stringStyle, stringIf, stringFor, stringEnd, stringEndif, stringEndfor, stringElse, stringBlock, stringSet, stringInclude, stringExtends, stringMacro, stringEquals, stringNotEquals, stringAnd, stringOr, stringNot, stringIn, stringIs, // Add empty string as well "", } for _, s := range commonStrings { cache.strings[s] = s } return cache } // Intern returns an interned version of the input string // For strings that are already in the cache, the cached version is returned // Otherwise, the input string is added to the cache and returned func Intern(s string) string { // Fast path for very common strings to avoid lock contention switch s { case stringDiv, stringSpan, stringP, stringA, stringImg, stringIf, stringFor, stringEnd, stringEndif, stringEndfor, stringElse, "": return s } // Don't intern strings that are too long if len(s) > maxCacheableLength { return s } // Use read lock for lookup first (less contention) globalCache.RLock() cached, exists := globalCache.strings[s] globalCache.RUnlock() if exists { return cached } // Not found with read lock, acquire write lock to add globalCache.Lock() defer globalCache.Unlock() // Check again after acquiring write lock (double-checked locking) if cached, exists := globalCache.strings[s]; exists { return cached } // Add to cache and return globalCache.strings[s] = s return s } // FindNextTag finds the next twig tag in a template string using // optimized detection methods to reduce allocations and string operations. func FindNextTag(source string, startPos int) TagLocation { // Quick check for empty source or position at end if len(source) == 0 || startPos >= len(source) || startPos < 0 { return TagLocation{TAG_NONE, -1, 0} } // Define the remaining source to search remainingSource := source[startPos:] remainingLen := len(remainingSource) // Fast paths for common cases if remainingLen < 2 { return TagLocation{TAG_NONE, -1, 0} } // Direct byte comparison for opening characters // This avoids string allocations and uses pointer arithmetic srcPtr := unsafe.Pointer(unsafe.StringData(remainingSource)) // Quick check for potential tag start with { character for i := 0; i < remainingLen-1; i++ { if *(*byte)(unsafe.Add(srcPtr, i)) != '{' { continue } // We found a '{', check next character secondChar := *(*byte)(unsafe.Add(srcPtr, i+1)) // Check for start of blocks tagPosition := startPos + i // Check for possible tag patterns switch secondChar { case '{': // Potential variable tag {{ if i+2 < remainingLen && *(*byte)(unsafe.Add(srcPtr, i+2)) == '-' { return TagLocation{TAG_VAR_TRIM, tagPosition, 3} } return TagLocation{TAG_VAR, tagPosition, 2} case '%': // Potential block tag {% if i+2 < remainingLen && *(*byte)(unsafe.Add(srcPtr, i+2)) == '-' { return TagLocation{TAG_BLOCK_TRIM, tagPosition, 3} } return TagLocation{TAG_BLOCK, tagPosition, 2} case '#': // Comment tag {# return TagLocation{TAG_COMMENT, tagPosition, 2} } } // No tags found return TagLocation{TAG_NONE, -1, 0} } // FindTagEnd finds the end of a tag based on the type func FindTagEnd(source string, startPos int, tagType TagType) int { if startPos >= len(source) { return -1 } switch tagType { case TAG_VAR, TAG_VAR_TRIM: // Find "}}" sequence for i := startPos; i < len(source)-1; i++ { if source[i] == '}' && source[i+1] == '}' { return i } } case TAG_BLOCK, TAG_BLOCK_TRIM: // Find "%}" sequence for i := startPos; i < len(source)-1; i++ { if source[i] == '%' && source[i+1] == '}' { return i } } case TAG_COMMENT: // Find "#}" sequence for i := startPos; i < len(source)-1; i++ { if source[i] == '#' && source[i+1] == '}' { return i } } } return -1 } // TokenizeOptimized uses the enhanced tag detection for faster tokenization // This is a hybrid approach that combines direct tag detection with // full HTML-preserving tokenization for maximum performance func (t *ZeroAllocTokenizer) TokenizeOptimized() ([]Token, error) { // Reset position and line t.position = 0 t.line = 1 // Clear token buffer t.tokenBuffer = t.tokenBuffer[:0] // Process the template content pos := 0 for pos < len(t.source) { // Find the next tag using optimized detection tagLoc := FindNextTag(t.source, pos) // Check if no more tags found if tagLoc.Position == -1 { // Add remaining text as TOKEN_TEXT if pos < len(t.source) { remainingText := t.source[pos:] t.AddToken(TOKEN_TEXT, remainingText, t.line) t.line += countNewlines(remainingText) } break } // Check if the tag is escaped with a backslash if tagLoc.Position > 0 && t.source[tagLoc.Position-1] == '\\' { // Add text up to the backslash if tagLoc.Position-1 > pos { preText := t.source[pos : tagLoc.Position-1] t.AddToken(TOKEN_TEXT, preText, t.line) t.line += countNewlines(preText) } // Add the tag as literal text (without the backslash) var tagText string switch tagLoc.Type { case TAG_VAR: tagText = "{{" case TAG_VAR_TRIM: tagText = "{{-" case TAG_BLOCK: tagText = "{%" case TAG_BLOCK_TRIM: tagText = "{%-" case TAG_COMMENT: tagText = "{#" } t.AddToken(TOKEN_TEXT, tagText, t.line) // Move past this tag pos = tagLoc.Position + tagLoc.Length continue } // Add text before the tag if tagLoc.Position > pos { textContent := t.source[pos:tagLoc.Position] t.AddToken(TOKEN_TEXT, textContent, t.line) t.line += countNewlines(textContent) } // Add the tag start token var startTokenType int switch tagLoc.Type { case TAG_VAR: startTokenType = TOKEN_VAR_START case TAG_VAR_TRIM: startTokenType = TOKEN_VAR_START_TRIM case TAG_BLOCK: startTokenType = TOKEN_BLOCK_START case TAG_BLOCK_TRIM: startTokenType = TOKEN_BLOCK_START_TRIM case TAG_COMMENT: startTokenType = TOKEN_COMMENT_START } t.AddToken(startTokenType, "", t.line) // Move past the tag's opening characters tagContentStart := tagLoc.Position + tagLoc.Length // Find the end of the tag tagEndPos := FindTagEnd(t.source, tagContentStart, tagLoc.Type) if tagEndPos == -1 { var unclosedType string switch tagLoc.Type { case TAG_VAR, TAG_VAR_TRIM: unclosedType = "variable" case TAG_BLOCK, TAG_BLOCK_TRIM: unclosedType = "block" case TAG_COMMENT: unclosedType = "comment" } return nil, fmt.Errorf("unclosed %s tag at line %d", unclosedType, t.line) } // Get tag content tagContent := t.source[tagContentStart:tagEndPos] t.line += countNewlines(tagContent) // Determine the end token type and length var endTokenType int var endLength int switch tagLoc.Type { case TAG_VAR: endTokenType = TOKEN_VAR_END endLength = 2 // }} case TAG_VAR_TRIM: // Check if it ends with -}} if tagEndPos > 0 && t.source[tagEndPos-1] == '-' { endTokenType = TOKEN_VAR_END_TRIM endLength = 3 // -}} // Adjust tag content to remove the trailing dash tagContent = tagContent[:len(tagContent)-1] } else { endTokenType = TOKEN_VAR_END endLength = 2 // }} } case TAG_BLOCK: endTokenType = TOKEN_BLOCK_END endLength = 2 // %} case TAG_BLOCK_TRIM: // Check if it ends with -%} if tagEndPos > 0 && t.source[tagEndPos-1] == '-' { endTokenType = TOKEN_BLOCK_END_TRIM endLength = 3 // -%} // Adjust tag content to remove the trailing dash tagContent = tagContent[:len(tagContent)-1] } else { endTokenType = TOKEN_BLOCK_END endLength = 2 // %} } case TAG_COMMENT: endTokenType = TOKEN_COMMENT_END endLength = 2 // #} } // Process tag content based on tag type if tagLoc.Type == TAG_COMMENT { // Store comments as TEXT tokens if len(tagContent) > 0 { t.AddToken(TOKEN_TEXT, tagContent, t.line) } } else { // For variable and block tags, tokenize the content tagContent = strings.TrimSpace(tagContent) if tagLoc.Type == TAG_BLOCK || tagLoc.Type == TAG_BLOCK_TRIM { // Process block tags using specialized tokenization if len(tagContent) > 0 { t.processBlockTag(tagContent) } } else { // Process variable tags using optimized tokenization if len(tagContent) > 0 { // Check if it's a simple variable or a complex expression if !strings.ContainsAny(tagContent, ".|[](){}\"',+-*/=!<>%&^~") { // Simple variable name - use string interning for efficiency identifier := Intern(tagContent) t.AddToken(TOKEN_NAME, identifier, t.line) } else { // Complex expression - tokenize fully t.TokenizeExpression(tagContent) } } } } // Add end token t.AddToken(endTokenType, "", t.line) // Move past the end tag pos = tagEndPos + endLength } // Add EOF token t.AddToken(TOKEN_EOF, "", t.line) // Save and return result t.result = t.tokenBuffer return t.result, nil }