Fix canonifyurl vs schemaless links

And looks even faster:

Compared to previous attempt:

benchmark              old ns/op     new ns/op     delta
BenchmarkAbsUrl        30902         27206         -11.96%
BenchmarkXmlAbsUrl     15389         14216         -7.62%

benchmark              old allocs     new allocs     delta
BenchmarkAbsUrl        33             28             -15.15%
BenchmarkXmlAbsUrl     16             14             -12.50%

benchmark              old bytes     new bytes     delta
BenchmarkAbsUrl        4167          3504          -15.91%
BenchmarkXmlAbsUrl     2057          2048          -0.44%

Compared to before I started all of this:

benchmark              old ns/op     new ns/op     delta
BenchmarkAbsUrl        36219         27206         -24.88%
BenchmarkXmlAbsUrl     23903         14216         -40.53%

benchmark              old allocs     new allocs     delta
BenchmarkAbsUrl        60             28             -53.33%
BenchmarkXmlAbsUrl     30             14             -53.33%

benchmark              old bytes     new bytes     delta
BenchmarkAbsUrl        5842          3504          -40.02%
BenchmarkXmlAbsUrl     3754          2048          -45.44%

Fixes #816
This commit is contained in:
bep 2015-02-17 04:33:44 +01:00
parent 6b28e38cea
commit c2e2913872
2 changed files with 46 additions and 74 deletions

View file

@ -37,11 +37,6 @@ const (
tHrefdq
tSrcsq
tHrefsq
// guards
tGrcdq
tGhrefdq
tGsrcsq
tGhrefsq
)
type contentlexer struct {
@ -130,24 +125,6 @@ var itemSlicePool = &sync.Pool{
},
}
func replace(content []byte, matchers []absurlMatcher) *contentlexer {
var items []item
if x := itemSlicePool.Get(); x != nil {
items = x.([]item)[:0]
defer itemSlicePool.Put(items)
} else {
items = make([]item, 0, 8)
}
lexer := &contentlexer{content: content,
items: items,
prefixLookup: &prefixes{pr: mainPrefixRunes},
matchers: matchers}
lexer.runReplacer()
return lexer
}
func (l *contentlexer) runReplacer() {
for l.state = lexReplacements; l.state != nil; {
l.state = l.state(l)
@ -156,11 +133,8 @@ func (l *contentlexer) runReplacer() {
type absurlMatcher struct {
replaceType itemType
guardType itemType
match []byte
guard []byte
replacement []byte
guarded bool
}
func (a absurlMatcher) isSourceType() bool {
@ -207,24 +181,21 @@ func checkCandidate(l *contentlexer) {
isSource := l.prefixLookup.first == 's'
for _, m := range l.matchers {
if m.guarded {
continue
}
if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
continue
}
s := l.content[l.pos:]
if bytes.HasPrefix(s, m.guard) {
if l.pos > l.start {
l.emit(tText)
}
l.pos += pos(len(m.guard))
l.emit(m.guardType)
m.guarded = true
if bytes.HasPrefix(l.content[l.pos:], m.match) {
// check for schemaless urls
posAfter := pos(int(l.pos) + len(m.match))
if int(posAfter) >= len(l.content) {
return
} else if bytes.HasPrefix(s, m.match) {
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
if l.pos > l.start {
l.emit(tText)
}
@ -240,31 +211,30 @@ func doReplace(content []byte, matchers []absurlMatcher) []byte {
b := bp.GetBuffer()
defer bp.PutBuffer(b)
guards := make([]bool, len(matchers))
replaced := replace(content, matchers)
var items []item
if x := itemSlicePool.Get(); x != nil {
items = x.([]item)[:0]
defer itemSlicePool.Put(items)
} else {
items = make([]item, 0, 8)
}
// first pass: check guards
for _, item := range replaced.items {
if item.typ != tText {
for i, e := range matchers {
if item.typ == e.guardType {
guards[i] = true
break
}
}
}
}
// second pass: do replacements for non-guarded tokens
for _, token := range replaced.items {
lexer := &contentlexer{content: content,
items: items,
prefixLookup: &prefixes{pr: mainPrefixRunes},
matchers: matchers}
lexer.runReplacer()
for _, token := range lexer.items {
switch token.typ {
case tText:
b.Write(token.val)
default:
for i, e := range matchers {
if token.typ == e.replaceType && !guards[i] {
for _, e := range matchers {
if token.typ == e.replaceType {
b.Write(e.replacement)
} else if token.typ == e.replaceType || token.typ == e.guardType {
b.Write(token.val)
break
}
}
}
@ -286,16 +256,10 @@ func newAbsurlReplacer(baseUrl string) *absurlReplacer {
dqHtmlMatch := []byte("\"/")
sqHtmlMatch := []byte("'/")
dqGuard := []byte("\"//")
sqGuard := []byte("'//")
// XML
dqXmlMatch := []byte(""/")
sqXmlMatch := []byte("'/")
dqXmlGuard := []byte(""//")
sqXmlGuard := []byte("'//")
dqHtml := []byte("\"" + base + "/")
sqHtml := []byte("'" + base + "/")
@ -303,15 +267,15 @@ func newAbsurlReplacer(baseUrl string) *absurlReplacer {
sqXml := []byte("'" + base + "/")
return &absurlReplacer{htmlMatchers: []absurlMatcher{
{tSrcdq, tGrcdq, dqHtmlMatch, dqGuard, dqHtml, false},
{tSrcsq, tGsrcsq, sqHtmlMatch, sqGuard, sqHtml, false},
{tHrefdq, tGhrefdq, dqHtmlMatch, dqGuard, dqHtml, false},
{tHrefsq, tGhrefsq, sqHtmlMatch, sqGuard, sqHtml, false}},
{tSrcdq, dqHtmlMatch, dqHtml},
{tSrcsq, sqHtmlMatch, sqHtml},
{tHrefdq, dqHtmlMatch, dqHtml},
{tHrefsq, sqHtmlMatch, sqHtml}},
xmlMatchers: []absurlMatcher{
{tSrcdq, tGrcdq, dqXmlMatch, dqXmlGuard, dqXml, false},
{tSrcsq, tGsrcsq, sqXmlMatch, sqXmlGuard, sqXml, false},
{tHrefdq, tGhrefdq, dqXmlMatch, dqXmlGuard, dqXml, false},
{tHrefsq, tGhrefsq, sqXmlMatch, sqXmlGuard, sqXml, false},
{tSrcdq, dqXmlMatch, dqXml},
{tSrcsq, sqXmlMatch, sqXml},
{tHrefdq, dqXmlMatch, dqXml},
{tHrefsq, sqXmlMatch, sqXml},
}}
}

View file

@ -21,6 +21,12 @@ const H5_XML_CONTENT_GUARDED = "<?xml version=\"1.0\" encoding=\"utf-8\" standal
const REPLACE_1 = "No replacements."
const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᚱᚪ ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
// Issue: 816, schemaless links combined with others
const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal' <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
const REPLACE_SCHEMALESS_XML = `Pre. src=&#34;//schemaless&#34; src=&#34;/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#34;//schemaless&#34; src=&#34;http://base/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
var abs_url_bench_tests = []test{
{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
{H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
@ -34,8 +40,10 @@ var xml_abs_url_bench_tests = []test{
}
var sanity_tests = []test{{REPLACE_1, REPLACE_1}, {REPLACE_2, REPLACE_2}}
var abs_url_tests = append(abs_url_bench_tests, sanity_tests...)
var xml_abs_url_tests = append(xml_abs_url_bench_tests, sanity_tests...)
var extra_tests_html = []test{{REPLACE_SCHEMALESS_HTML, REPLACE_SCHEMALESS_HTML_CORRECT}}
var abs_url_tests = append(abs_url_bench_tests, append(sanity_tests, extra_tests_html...)...)
var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}
var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)
func TestChainZeroTransformers(t *testing.T) {
tr := NewChain()