Fix slow HTML elements collector for the pre case

```
name                           old time/op    new time/op    delta
ElementsCollectorWriterPre-10    25.2µs ± 1%     3.4µs ± 0%  -86.54%  (p=0.029 n=4+4)

name                           old alloc/op   new alloc/op   delta
ElementsCollectorWriterPre-10      624B ± 0%      142B ± 0%  -77.18%  (p=0.029 n=4+4)

name                           old allocs/op  new allocs/op  delta
ElementsCollectorWriterPre-10      16.0 ± 0%       6.0 ± 0%  -62.50%  (p=0.029 n=4+4)
```

Fixes #10698
This commit is contained in:
Bjørn Erik Pedersen 2023-02-05 15:14:30 +01:00
parent 4f4a1c00bf
commit f9fc0e045b
2 changed files with 93 additions and 8 deletions

View file

@ -36,7 +36,6 @@ var (
skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)
endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
exceptionList = map[string]bool{
"thead": true,
@ -312,11 +311,7 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc
if w.r != '>' {
return false
}
m := endTagRe.FindSubmatch(w.buff.Bytes())
if m == nil {
return false
}
return bytes.EqualFold(m[1], tagNameCopy)
return isClosedByTag(w.buff.Bytes(), tagNameCopy)
},
htmlLexStart,
))
@ -428,8 +423,9 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
}
// Variants of s
// <body class="b a">
// <div>
//
// <body class="b a">
// <div>
func parseStartTag(s string) string {
spaceIndex := strings.IndexFunc(s, func(r rune) bool {
return unicode.IsSpace(r)
@ -441,3 +437,64 @@ func parseStartTag(s string) string {
return s[1:spaceIndex]
}
// isClosedByTag reports whether b ends with a closing tag for tagName,
// for example "</pre>". The comparison is case-insensitive and whitespace is
// tolerated around the slash and the name ("< / pre \n>").
//
// b is scanned backwards from its final '>' until the nearest '<', so the
// cost depends on the length of the trailing tag only, not on len(b).
// Anything that looks like a start or self-closing tag ("<pre>", "<br/>",
// `<img src="x"/>`) is rejected.
func isClosedByTag(b, tagName []byte) bool {
	if len(b) == 0 || b[len(b)-1] != '>' {
		return false
	}

	var (
		// b[lo:hi] is the leftmost word found between "</" and ">".
		lo, hi int

		seenSlash bool // a '/' was found to the right of the current position
		closed    bool // the opening '<' was reached after a '/'
		inWord    bool // the current position is inside a run of word bytes
	)

LOOP:
	for i := len(b) - 2; i >= 0; i-- {
		c := b[i]
		switch {
		case c == '<':
			if !seenSlash {
				// A tag without a slash is a start tag.
				return false
			}
			closed = true
			break LOOP
		case c == '/':
			if seenSlash {
				return false
			}
			seenSlash = true
			if inWord {
				lo = i + 1
				inWord = false
			}
		case isSpace(c):
			if inWord {
				lo = i + 1
				inWord = false
			}
		default:
			if seenSlash {
				// Only whitespace may sit between '<' and '/'. A word here
				// means the slash trails the name ("<br/>", "<a b/>"), which
				// is not a closing tag. Rejecting it also guarantees that lo
				// and hi always describe the same word.
				return false
			}
			if !inWord {
				hi = i + 1
				inWord = true
			}
		}
	}

	// lo >= hi means no tag name was found ("</>"); it also guards the slice
	// expression below against inverted bounds.
	if !closed || lo >= hi {
		return false
	}

	return bytes.EqualFold(tagName, b[lo:hi])
}

// isSpace reports whether b is one of the ASCII whitespace bytes that may
// separate tokens inside an HTML tag: space, tab, line feed, form feed or
// carriage return.
func isSpace(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\f', '\r':
		return true
	}
	return false
}

View file

@ -155,6 +155,34 @@ func TestClassCollector(t *testing.T) {
}
// TestEndsWithTag checks isClosedByTag against a table of buffers that do and
// do not end with a closing tag for the given tag name.
func TestEndsWithTag(t *testing.T) {
	c := qt.New(t)

	for _, test := range []struct {
		name    string
		s       string
		tagName string
		expect  bool
	}{
		{"empty", "", "div", false},
		{"no match", "foo", "div", false},
		{"no close", "foo<div>", "div", false},
		{"no close 2", "foo/div>", "div", false},
		// Subtest names must be unique to be individually selectable.
		{"no close 3", "foo//div>", "div", false},
		{"no tag", "foo</>", "div", false},
		{"other tag", "foo</span>", "div", false},
		{"match", "foo</div>", "div", true},
		{"match space", "foo< / div>", "div", true},
		{"match space 2", "foo< / div \n>", "div", true},
		{"match case", "foo</DIV>", "div", true},
	} {
		c.Run(test.name, func(c *qt.C) {
			got := isClosedByTag([]byte(test.s), []byte(test.tagName))
			c.Assert(got, qt.Equals, test.expect)
		})
	}
}
func BenchmarkElementsCollectorWriter(b *testing.B) {
const benchHTML = `
<!DOCTYPE html>