diff --git a/transform/absurlreplacer.go b/transform/absurlreplacer.go
index 7b6f72379..f558fcaae 100644
--- a/transform/absurlreplacer.go
+++ b/transform/absurlreplacer.go
@@ -37,11 +37,6 @@ const (
 	tHrefdq
 	tSrcsq
 	tHrefsq
-	// guards
-	tGrcdq
-	tGhrefdq
-	tGsrcsq
-	tGhrefsq
 )
 
 type contentlexer struct {
@@ -130,24 +125,6 @@ var itemSlicePool = &sync.Pool{
 	},
 }
 
-func replace(content []byte, matchers []absurlMatcher) *contentlexer {
-	var items []item
-	if x := itemSlicePool.Get(); x != nil {
-		items = x.([]item)[:0]
-		defer itemSlicePool.Put(items)
-	} else {
-		items = make([]item, 0, 8)
-	}
-
-	lexer := &contentlexer{content: content,
-		items:        items,
-		prefixLookup: &prefixes{pr: mainPrefixRunes},
-		matchers:     matchers}
-
-	lexer.runReplacer()
-	return lexer
-}
-
 func (l *contentlexer) runReplacer() {
 	for l.state = lexReplacements; l.state != nil; {
 		l.state = l.state(l)
@@ -156,11 +133,8 @@ func (l *contentlexer) runReplacer() {
 
 type absurlMatcher struct {
 	replaceType itemType
-	guardType   itemType
 	match       []byte
-	guard       []byte
 	replacement []byte
-	guarded     bool
 }
 
 func (a absurlMatcher) isSourceType() bool {
@@ -207,24 +181,21 @@ func checkCandidate(l *contentlexer) {
 	isSource := l.prefixLookup.first == 's'
 	for _, m := range l.matchers {
 
-		if m.guarded {
-			continue
-		}
-
 		if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
 			continue
 		}
 
-		s := l.content[l.pos:]
-		if bytes.HasPrefix(s, m.guard) {
-			if l.pos > l.start {
-				l.emit(tText)
+		if bytes.HasPrefix(l.content[l.pos:], m.match) {
+			// check for schemaless urls
+			posAfter := pos(int(l.pos) + len(m.match))
+			if int(posAfter) >= len(l.content) {
+				return
+			}
+			r, _ := utf8.DecodeRune(l.content[posAfter:])
+			if r == '/' {
+				// schemaless: skip
+				return
 			}
-			l.pos += pos(len(m.guard))
-			l.emit(m.guardType)
-			m.guarded = true
-			return
-		} else if bytes.HasPrefix(s, m.match) {
 			if l.pos > l.start {
 				l.emit(tText)
 			}
@@ -240,31 +211,30 @@ func doReplace(content []byte, matchers []absurlMatcher) []byte {
 	b := bp.GetBuffer()
 	defer bp.PutBuffer(b)
 
-	guards := make([]bool, len(matchers))
-	replaced := replace(content, matchers)
-
-	// first pass: check guards
-	for _, item := range replaced.items {
-		if item.typ != tText {
-			for i, e := range matchers {
-				if item.typ == e.guardType {
-					guards[i] = true
-					break
-				}
-			}
-		}
+	var items []item
+	if x := itemSlicePool.Get(); x != nil {
+		items = x.([]item)[:0]
+		defer itemSlicePool.Put(items)
+	} else {
+		items = make([]item, 0, 8)
 	}
-
-	// second pass: do replacements for non-guarded tokens
-	for _, token := range replaced.items {
+
+	lexer := &contentlexer{content: content,
+		items:        items,
+		prefixLookup: &prefixes{pr: mainPrefixRunes},
+		matchers:     matchers}
+
+	lexer.runReplacer()
+
+	for _, token := range lexer.items {
 		switch token.typ {
 		case tText:
 			b.Write(token.val)
 		default:
-			for i, e := range matchers {
-				if token.typ == e.replaceType && !guards[i] {
+			for _, e := range matchers {
+				if token.typ == e.replaceType {
 					b.Write(e.replacement)
-				} else if token.typ == e.replaceType || token.typ == e.guardType {
-					b.Write(token.val)
+					break
 				}
 			}
 		}
@@ -286,16 +256,10 @@ func newAbsurlReplacer(baseUrl string) *absurlReplacer {
 	dqHtmlMatch := []byte("\"/")
 	sqHtmlMatch := []byte("'/")
 
-	dqGuard := []byte("\"//")
-	sqGuard := []byte("'//")
-
 	// XML
 	dqXmlMatch := []byte("&quot;/")
 	sqXmlMatch := []byte("&#39;/")
 
-	dqXmlGuard := []byte("&quot;//")
-	sqXmlGuard := []byte("&#39;//")
-
 	dqHtml := []byte("\"" + base + "/")
 	sqHtml := []byte("'" + base + "/")
 
@@ -303,15 +267,15 @@ func newAbsurlReplacer(baseUrl string) *absurlReplacer {
 	sqXml := []byte("&#39;" + base + "/")
 
 	return &absurlReplacer{htmlMatchers: []absurlMatcher{
-		{tSrcdq, tGrcdq, dqHtmlMatch, dqGuard, dqHtml, false},
-		{tSrcsq, tGsrcsq, sqHtmlMatch, sqGuard, sqHtml, false},
-		{tHrefdq, tGhrefdq, dqHtmlMatch, dqGuard, dqHtml, false},
-		{tHrefsq, tGhrefsq, sqHtmlMatch, sqGuard, sqHtml, false}},
+		{tSrcdq, dqHtmlMatch, dqHtml},
+		{tSrcsq, sqHtmlMatch, sqHtml},
+		{tHrefdq, dqHtmlMatch, dqHtml},
+		{tHrefsq, sqHtmlMatch, sqHtml}},
 		xmlMatchers: []absurlMatcher{
-			{tSrcdq, tGrcdq, dqXmlMatch, dqXmlGuard, dqXml, false},
-			{tSrcsq, tGsrcsq, sqXmlMatch, sqXmlGuard, sqXml, false},
-			{tHrefdq, tGhrefdq, dqXmlMatch, dqXmlGuard, dqXml, false},
-			{tHrefsq, tGhrefsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+			{tSrcdq, dqXmlMatch, dqXml},
+			{tSrcsq, sqXmlMatch, sqXml},
+			{tHrefdq, dqXmlMatch, dqXml},
+			{tHrefsq, sqXmlMatch, sqXml},
 		}}
 }
diff --git a/transform/chain_test.go b/transform/chain_test.go
index a88d84533..cd7749ec7 100644
--- a/transform/chain_test.go
+++ b/transform/chain_test.go
@@ -21,6 +21,12 @@ const H5_XML_CONTENT_GUARDED = "
 
+const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal'  <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
+const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal'  <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML = `Pre. src=&quot;//schemaless&quot; src=&quot;/normal&quot;  <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&quot;//schemaless&quot; src=&quot;http://base/normal&quot;  <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
+
 var abs_url_bench_tests = []test{
 	{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
 	{H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
@@ -34,8 +40,10 @@ var xml_abs_url_bench_tests = []test{
 }
 
 var sanity_tests = []test{{REPLACE_1, REPLACE_1}, {REPLACE_2, REPLACE_2}}
-var abs_url_tests = append(abs_url_bench_tests, sanity_tests...)
-var xml_abs_url_tests = append(xml_abs_url_bench_tests, sanity_tests...)
+var extra_tests_html = []test{{REPLACE_SCHEMALESS_HTML, REPLACE_SCHEMALESS_HTML_CORRECT}}
+var abs_url_tests = append(abs_url_bench_tests, append(sanity_tests, extra_tests_html...)...)
+var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}
+var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)
 
 func TestChainZeroTransformers(t *testing.T) {
 	tr := NewChain()
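
The behavioral change is easiest to see in isolation. The sketch below is not code from this patch: it reduces the new `checkCandidate` logic to a standalone function, assuming a single hard-coded `"/` matcher and a helper name (`absolutize`) invented for the demo. A quoted root-relative URL gets the base prepended; a protocol-relative (schemaless) URL starting with `//` is left untouched, which previously required the guard tokens and the two-pass `doReplace`.

```go
package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// absolutize rewrites a quoted root-relative URL prefix `"/` to `"<base>/`,
// but leaves protocol-relative (schemaless) URLs like `"//host/...` alone.
func absolutize(content, base string) string {
	const match = `"/`
	var b strings.Builder
	for i := 0; i < len(content); {
		if strings.HasPrefix(content[i:], match) {
			after := i + len(match)
			// As in the patch: peek at the rune after the match; a second
			// '/' marks a schemaless URL, which must not be rewritten.
			if after < len(content) {
				if r, _ := utf8.DecodeRuneInString(content[after:]); r != '/' {
					b.WriteString(`"` + base + `/`)
					i = after
					continue
				}
			}
		}
		// No match (or schemaless): copy the input through unchanged.
		b.WriteByte(content[i])
		i++
	}
	return b.String()
}

func main() {
	in := `src="/normal" src="//schemaless"`
	fmt.Println(absolutize(in, "http://base"))
	// Output: src="http://base/normal" src="//schemaless"
}
```

With the schemaless check done inline at match time, the guard token types and the first guard-collecting pass become unnecessary, so `doReplace` can emit replacements in a single scan over the lexed items.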