From f9fc0e045bc1f72ba61fdf4a79b10a75a240394e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?=
 <bjorn.erik.pedersen@gmail.com>
Date: Sun, 5 Feb 2023 15:14:30 +0100
Subject: [PATCH] Fix slow HTML elements collector for the pre case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

```
name                           old time/op    new time/op    delta
ElementsCollectorWriterPre-10    25.2µs ± 1%     3.4µs ± 0%  -86.54%  (p=0.029 n=4+4)

name                           old alloc/op   new alloc/op   delta
ElementsCollectorWriterPre-10      624B ± 0%      142B ± 0%  -77.18%  (p=0.029 n=4+4)

name                           old allocs/op  new allocs/op  delta
ElementsCollectorWriterPre-10      16.0 ± 0%       6.0 ± 0%  -62.50%  (p=0.029 n=4+4)
```

Fixes #10698
---
 publisher/htmlElementsCollector.go      | 73 ++++++++++++++++++++++---
 publisher/htmlElementsCollector_test.go | 28 ++++++++++
 2 files changed, 93 insertions(+), 8 deletions(-)
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index ca6e2d940..91e1237a9 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -36,7 +36,6 @@ var (
 
 	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
 	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
-	endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
 
 	exceptionList = map[string]bool{
 		"thead": true,
@@ -312,11 +311,7 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc
 						if w.r != '>' {
 							return false
 						}
-						m := endTagRe.FindSubmatch(w.buff.Bytes())
-						if m == nil {
-							return false
-						}
-						return bytes.EqualFold(m[1], tagNameCopy)
+						return isClosedByTag(w.buff.Bytes(), tagNameCopy)
 					},
 					htmlLexStart,
 				))
@@ -428,8 +423,9 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
 }
 
 // Variants of s
-//    <body class="b a">
-//    <div>
+//
+//	<body class="b a">
+//	<div>
 func parseStartTag(s string) string {
 	spaceIndex := strings.IndexFunc(s, func(r rune) bool {
 		return unicode.IsSpace(r)
@@ -441,3 +437,64 @@ func parseStartTag(s string) string {
 
 	return s[1:spaceIndex]
 }
+
+// isClosedByTag reports whether b ends with a closing tag for tagName, such as
+// "</pre>" or "<  / PRE \n>". The name is compared case-insensitively.
+// b is scanned backwards from its last byte, stopping at the nearest '<'.
+func isClosedByTag(b, tagName []byte) bool {
+	if len(b) == 0 || b[len(b)-1] != '>' {
+		return false
+	}
+
+	// b[lo:hi] is the candidate tag name. state counts the delimiters seen so
+	// far: 0 = none, 1 = '/', 2 = '/' followed by '<'.
+	var (
+		lo, hi int
+		state  int
+		inWord bool
+	)
+
+LOOP:
+	for i := len(b) - 2; i >= 0; i-- {
+		switch {
+		case b[i] == '<':
+			if state != 1 {
+				return false
+			}
+			state = 2
+			break LOOP
+		case b[i] == '/':
+			if state != 0 {
+				return false
+			}
+			state++
+			if inWord {
+				lo = i + 1
+				inWord = false
+			}
+		case isSpace(b[i]):
+			if inWord {
+				lo = i + 1
+				inWord = false
+			}
+		default:
+			if !inWord {
+				hi = i + 1
+				inWord = true
+			}
+		}
+	}
+
+	// lo >= hi means no tag name directly follows the '/': it is either
+	// missing ("</>") or text sits between '<' and '/' ("<x/a>"), in which
+	// case slicing b[lo:hi] would panic.
+	if state != 2 || lo >= hi {
+		return false
+	}
+
+	return bytes.EqualFold(tagName, b[lo:hi])
+}
+
+// isSpace reports whether b is one of the bytes matched by \s in Go regexps
+// (space, \t, \n, \f, \r), the same set HTML treats as whitespace inside tags.
+func isSpace(b byte) bool { return b == ' ' || b == '\t' || b == '\n' || b == '\f' || b == '\r' }
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 8be8c46ac..11590e0a3 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -155,6 +155,34 @@ func TestClassCollector(t *testing.T) {
 
 }
 
+// TestEndsWithTag checks isClosedByTag against matching and non-matching input.
+func TestEndsWithTag(t *testing.T) {
+	c := qt.New(t)
+
+	for _, test := range []struct {
+		name    string
+		s       string
+		tagName string
+		expect  bool
+	}{
+		{"empty", "", "div", false},
+		{"no match", "foo", "div", false},
+		{"no close", "foo<div>", "div", false},
+		{"no close 2", "foo/div>", "div", false},
+		{"no close 3", "foo//div>", "div", false},
+		{"no tag", "foo</>", "div", false},
+		{"match", "foo</div>", "div", true},
+		{"match space", "foo<  / div>", "div", true},
+		{"match space 2", "foo<  / div   \n>", "div", true},
+		{"match case", "foo</DIV>", "div", true},
+	} {
+		c.Run(test.name, func(c *qt.C) {
+			got := isClosedByTag([]byte(test.s), []byte(test.tagName))
+			c.Assert(got, qt.Equals, test.expect)
+		})
+	}
+}
+
 func BenchmarkElementsCollectorWriter(b *testing.B) {
 	const benchHTML = `
 <!DOCTYPE html>