From f9fc0e045bc1f72ba61fdf4a79b10a75a240394e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Sun, 5 Feb 2023 15:14:30 +0100 Subject: [PATCH] Fix slow HTML elements collector for the pre case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` name old time/op new time/op delta ElementsCollectorWriterPre-10 25.2µs ± 1% 3.4µs ± 0% -86.54% (p=0.029 n=4+4) name old alloc/op new alloc/op delta ElementsCollectorWriterPre-10 624B ± 0% 142B ± 0% -77.18% (p=0.029 n=4+4) name old allocs/op new allocs/op delta ElementsCollectorWriterPre-10 16.0 ± 0% 6.0 ± 0% -62.50% (p=0.029 n=4+4) ``` Fixes #10698 --- publisher/htmlElementsCollector.go | 73 ++++++++++++++++++++++--- publisher/htmlElementsCollector_test.go | 28 ++++++++++ 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index ca6e2d940..91e1237a9 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -36,7 +36,6 @@ var ( skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) - endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`) exceptionList = map[string]bool{ "thead": true, @@ -312,11 +311,7 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc if w.r != '>' { return false } - m := endTagRe.FindSubmatch(w.buff.Bytes()) - if m == nil { - return false - } - return bytes.EqualFold(m[1], tagNameCopy) + return isClosedByTag(w.buff.Bytes(), tagNameCopy) }, htmlLexStart, )) @@ -428,8 +423,9 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) { } // Variants of s -// -//
+// +//	<body class="b a"> +//	<div>
func parseStartTag(s string) string { spaceIndex := strings.IndexFunc(s, func(r rune) bool { return unicode.IsSpace(r) @@ -441,3 +437,64 @@ func parseStartTag(s string) string { return s[1:spaceIndex] } + +// isClosedByTag reports whether b ends with a closing tag for tagName. +func isClosedByTag(b, tagName []byte) bool { + if len(b) == 0 { + return false + } + + if b[len(b)-1] != '>' { + return false + } + + var ( + lo int + hi int + + state int + inWord bool + ) + +LOOP: + for i := len(b) - 2; i >= 0; i-- { + switch { + case b[i] == '<': + if state != 1 { + return false + } + state = 2 + break LOOP + case b[i] == '/': + if state != 0 { + return false + } + state++ + if inWord { + lo = i + 1 + inWord = false + } + case isSpace(b[i]): + if inWord { + lo = i + 1 + inWord = false + } + default: + if !inWord { + hi = i + 1 + inWord = true + } + } + } + + if state != 2 { + return false + } + + return bytes.EqualFold(tagName, b[lo:hi]) + +} + +func isSpace(b byte) bool { + return b == ' ' || b == '\t' || b == '\n' +} diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go index 8be8c46ac..11590e0a3 100644 --- a/publisher/htmlElementsCollector_test.go +++ b/publisher/htmlElementsCollector_test.go @@ -155,6 +155,34 @@ func TestClassCollector(t *testing.T) { } +func TestEndsWithTag(t *testing.T) { + c := qt.New((t)) + + for _, test := range []struct { + name string + s string + tagName string + expect bool + }{ + {"empty", "", "div", false}, + {"no match", "foo", "div", false}, + {"no close", "foo
", "div", false}, + {"no close 2", "foo/div>", "div", false}, + {"no close 2", "foo//div>", "div", false}, + {"no tag", "foo", "div", false}, + {"match", "foo
", "div", true}, + {"match space", "foo< / div>", "div", true}, + {"match space 2", "foo< / div \n>", "div", true}, + {"match case", "foo
", "div", true}, + } { + c.Run(test.name, func(c *qt.C) { + got := isClosedByTag([]byte(test.s), []byte(test.tagName)) + c.Assert(got, qt.Equals, test.expect) + }) + } + +} + func BenchmarkElementsCollectorWriter(b *testing.B) { const benchHTML = `