// PrefixByteRunLength returns the number of consecutive occurrences of the
// byte b at the very start of s. It returns 0 when s is empty or does not
// begin with b.
func PrefixByteRunLength(s string, b byte) int {
	count := 0
	for count < len(s) && s[count] == b {
		count++
	}
	return count
}

// CollapseSpaces replaces every run of two or more spaces (\x20) in s with a
// single space. Only the space byte is considered; other bytes (including
// tabs and newlines) are copied through unchanged. When s contains no run of
// two or more spaces it is returned as-is without allocating.
func CollapseSpaces(s string) string {
	// The needle must be exactly two spaces: the index arithmetic below
	// (idx+1, idx+2) relies on s[idx] and s[idx+1] both being spaces. It is
	// spelled with escapes so that whitespace-normalising tools cannot
	// silently shrink it to a single space.
	const doubleSpace = "\x20\x20"

	idx := strings.Index(s, doubleSpace)
	if idx < 0 {
		return s
	}
	var buf strings.Builder
	// The result is never longer than the input.
	buf.Grow(len(s))
	for idx >= 0 {
		// Keep everything up to and including the first space of the run.
		buf.WriteString(s[:idx+1])
		// The run has at least two spaces; count any further ones so the
		// whole run is skipped in one step.
		runLen := PrefixByteRunLength(s[idx+2:], ' ') + 2
		s = s[idx+runLen:]
		idx = strings.Index(s, doubleSpace)
	}
	// Append the tail that contains no further runs.
	buf.WriteString(s)
	return buf.String()
}