diff --git a/.gitignore b/.gitignore
index 3ea8aedd6..41162a757 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@ hugo
 docs/public*
 hugo.exe
 *.test
+*.prof
+nohup.out
 cover.out
 *.swp
 *.swo
diff --git a/transform/absurl.go b/transform/absurl.go
index 0a0cd7239..0efe624ac 100644
--- a/transform/absurl.go
+++ b/transform/absurl.go
@@ -1,64 +1,33 @@
 package transform
 
 import (
-	"bytes"
-	"net/url"
-	"strings"
+	"sync"
 )
 
+var absUrlInit sync.Once
+var ar *absurlReplacer
+
+// for performance reasons, we reuse the first baseUrl given
+func initAbsurlReplacer(baseURL string) {
+	absUrlInit.Do(func() {
+		ar = newAbsurlReplacer(baseURL)
+	})
+}
+
 func AbsURL(absURL string) (trs []link, err error) {
-	var baseURL *url.URL
+	initAbsurlReplacer(absURL)
 
-	if baseURL, err = url.Parse(absURL); err != nil {
-		return
-	}
-
-	base := strings.TrimRight(baseURL.String(), "/")
-
-	var (
-		srcdq  = []byte(" src=\"" + base + "/")
-		hrefdq = []byte(" href=\"" + base + "/")
-		srcsq  = []byte(" src='" + base + "/")
-		hrefsq = []byte(" href='" + base + "/")
-	)
 	trs = append(trs, func(content []byte) []byte {
-		content = guardReplace(content, []byte(" src=\"//"), []byte(" src=\"/"), srcdq)
-		content = guardReplace(content, []byte(" src='//"), []byte(" src='/"), srcsq)
-		content = guardReplace(content, []byte(" href=\"//"), []byte(" href=\"/"), hrefdq)
-		content = guardReplace(content, []byte(" href='//"), []byte(" href='/"), hrefsq)
-		return content
+		return ar.replaceInHtml(content)
 	})
 	return
 }
 
 func AbsURLInXML(absURL string) (trs []link, err error) {
-	var baseURL *url.URL
+	initAbsurlReplacer(absURL)
 
-	if baseURL, err = url.Parse(absURL); err != nil {
-		return
-	}
-
-	base := strings.TrimRight(baseURL.String(), "/")
-
-	var (
-		srcedq  = []byte(" src=&#34;" + base + "/")
-		hrefedq = []byte(" href=&#34;" + base + "/")
-		srcesq  = []byte(" src=&#39;" + base + "/")
-		hrefesq = []byte(" href=&#39;" + base + "/")
-	)
 	trs = append(trs, func(content []byte) []byte {
-		content = guardReplace(content, []byte(" src=&#34;//"), []byte(" src=&#34;/"), srcedq)
-		content = guardReplace(content, []byte(" src=&#39;//"), []byte(" src=&#39;/"), srcesq)
-		content = guardReplace(content, []byte(" href=&#34;//"), []byte(" href=&#34;/"), hrefedq)
-		content = guardReplace(content, []byte(" href=&#39;//"), []byte(" href=&#39;/"), hrefesq)
-		return content
+		return ar.replaceInXml(content)
 	})
 	return
 }
-
-func guardReplace(content, guard, match, replace []byte) []byte {
-	if !bytes.Contains(content, guard) {
-		content = bytes.Replace(content, match, replace, -1)
-	}
-	return content
-}
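A note on the lazy initialization introduced in absurl.go above: the replacer is built exactly once via sync.Once, and the first baseUrl passed in wins; every later transformer chain silently reuses it. A minimal standalone sketch of that pattern (initReplacer and the string stand-in are illustrative, not part of this patch):

    package main

    import (
        "fmt"
        "sync"
    )

    var once sync.Once
    var base string // stands in for the cached *absurlReplacer

    // initReplacer mirrors initAbsurlReplacer: only the first
    // baseURL ever passed has any effect; later calls are no-ops.
    func initReplacer(baseURL string) {
        once.Do(func() { base = baseURL })
    }

    func main() {
        initReplacer("http://base")
        initReplacer("http://other") // ignored: the Once has already fired
        fmt.Println(base)            // prints http://base
    }

The trade-off, which the patch comment calls out, is that a single process can only ever serve one base URL; that is accepted here as a deliberate performance choice.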
diff --git a/transform/absurlreplacer.go b/transform/absurlreplacer.go
new file mode 100644
index 000000000..7b6f72379
--- /dev/null
+++ b/transform/absurlreplacer.go
@@ -0,0 +1,325 @@
+package transform
+
+import (
+	"bytes"
+	bp "github.com/spf13/hugo/bufferpool"
+	"net/url"
+	"strings"
+	"sync"
+	"unicode/utf8"
+)
+
+// position (in bytes)
+type pos int
+
+type matchState int
+
+const (
+	matchStateNone matchState = iota
+	matchStateWhitespace
+	matchStatePartial
+	matchStateFull
+)
+
+type item struct {
+	typ itemType
+	pos pos
+	val []byte
+}
+
+type itemType int
+
+const (
+	tText itemType = iota
+
+	// matches
+	tSrcdq
+	tHrefdq
+	tSrcsq
+	tHrefsq
+	// guards
+	tGrcdq
+	tGhrefdq
+	tGsrcsq
+	tGhrefsq
+)
+
+type contentlexer struct {
+	content []byte
+
+	pos   pos // input position
+	start pos // item start position
+	width pos // width of last element
+
+	matchers     []absurlMatcher
+	state        stateFunc
+	prefixLookup *prefixes
+
+	// items delivered to client
+	items []item
+}
+
+type stateFunc func(*contentlexer) stateFunc
+
+type prefixRunes []rune
+
+type prefixes struct {
+	pr   []prefixRunes
+	curr prefixRunes // current prefix lookup table
+	i    int         // current index
+
+	// first rune in potential match
+	first rune
+
+	// match-state:
+	// none, whitespace, partial, full
+	ms matchState
+}
+
+// match returns partial and full match for the prefix in play
+// - it's a full match if all prefix runes have checked out in a row
+// - it's a partial match if it's on its way towards a full match
+func (l *contentlexer) match(r rune) {
+	p := l.prefixLookup
+	if p.curr == nil {
+		// assumes prefixes all start off on a different rune
+		// works in this special case: href, src
+		p.i = 0
+		for _, pr := range p.pr {
+			if pr[p.i] == r {
+				fullMatch := len(p.pr) == 1
+				p.first = r
+				if !fullMatch {
+					p.curr = pr
+					l.prefixLookup.ms = matchStatePartial
+				} else {
+					l.prefixLookup.ms = matchStateFull
+				}
+				return
+			}
+		}
+	} else {
+		p.i++
+		if p.curr[p.i] == r {
+			fullMatch := len(p.curr) == p.i+1
+			if fullMatch {
+				p.curr = nil
+				l.prefixLookup.ms = matchStateFull
+			} else {
+				l.prefixLookup.ms = matchStatePartial
+			}
+			return
+		}
+
+		p.curr = nil
+	}
+
+	l.prefixLookup.ms = matchStateNone
+}
+
+func (l *contentlexer) emit(t itemType) {
+	l.items = append(l.items, item{t, l.start, l.content[l.start:l.pos]})
+	l.start = l.pos
+}
+
+var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
+
+var itemSlicePool = &sync.Pool{
+	New: func() interface{} {
+		return make([]item, 0, 8)
+	},
+}
+
+func replace(content []byte, matchers []absurlMatcher) *contentlexer {
+	var items []item
+	if x := itemSlicePool.Get(); x != nil {
+		items = x.([]item)[:0]
+		defer itemSlicePool.Put(items)
+	} else {
+		items = make([]item, 0, 8)
+	}
+
+	lexer := &contentlexer{content: content,
+		items:        items,
+		prefixLookup: &prefixes{pr: mainPrefixRunes},
+		matchers:     matchers}
+
+	lexer.runReplacer()
+	return lexer
+}
+
+func (l *contentlexer) runReplacer() {
+	for l.state = lexReplacements; l.state != nil; {
+		l.state = l.state(l)
+	}
+}
+
+type absurlMatcher struct {
+	replaceType itemType
+	guardType   itemType
+	match       []byte
+	guard       []byte
+	replacement []byte
+	guarded     bool
+}
+
+func (a absurlMatcher) isSourceType() bool {
+	return a.replaceType == tSrcdq || a.replaceType == tSrcsq
+}
+
+func lexReplacements(l *contentlexer) stateFunc {
+	contentLength := len(l.content)
+	var r rune
+
+	for {
+		if int(l.pos) >= contentLength {
+			l.width = 0
+			break
+		}
+
+		var width int = 1
+		r = rune(l.content[l.pos])
+		if r >= utf8.RuneSelf {
+			r, width = utf8.DecodeRune(l.content[l.pos:])
+		}
+		l.width = pos(width)
+		l.pos += l.width
+
+		if r == ' ' {
+			l.prefixLookup.ms = matchStateWhitespace
+		} else if l.prefixLookup.ms != matchStateNone {
+			l.match(r)
+			if l.prefixLookup.ms == matchStateFull {
+				checkCandidate(l)
+			}
+		}
+
+	}
+
+	// Done!
+	if l.pos > l.start {
+		l.emit(tText)
+	}
+	return nil
+}
+
+func checkCandidate(l *contentlexer) {
+	isSource := l.prefixLookup.first == 's'
+	for _, m := range l.matchers {
+
+		if m.guarded {
+			continue
+		}
+
+		if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
+			continue
+		}
+
+		s := l.content[l.pos:]
+		if bytes.HasPrefix(s, m.guard) {
+			if l.pos > l.start {
+				l.emit(tText)
+			}
+			l.pos += pos(len(m.guard))
+			l.emit(m.guardType)
+			m.guarded = true
+			return
+		} else if bytes.HasPrefix(s, m.match) {
+			if l.pos > l.start {
+				l.emit(tText)
+			}
+			l.pos += pos(len(m.match))
+			l.emit(m.replaceType)
+			return
+
+		}
+	}
+}
+
+func doReplace(content []byte, matchers []absurlMatcher) []byte {
+	b := bp.GetBuffer()
+	defer bp.PutBuffer(b)
+
+	guards := make([]bool, len(matchers))
+	replaced := replace(content, matchers)
+
+	// first pass: check guards
+	for _, item := range replaced.items {
+		if item.typ != tText {
+			for i, e := range matchers {
+				if item.typ == e.guardType {
+					guards[i] = true
+					break
+				}
+			}
+		}
+	}
+	// second pass: do replacements for non-guarded tokens
+	for _, token := range replaced.items {
+		switch token.typ {
+		case tText:
+			b.Write(token.val)
+		default:
+			for i, e := range matchers {
+				if token.typ == e.replaceType && !guards[i] {
+					b.Write(e.replacement)
+				} else if token.typ == e.replaceType || token.typ == e.guardType {
+					b.Write(token.val)
+				}
+			}
+		}
+	}
+
+	return b.Bytes()
+}
+
+type absurlReplacer struct {
+	htmlMatchers []absurlMatcher
+	xmlMatchers  []absurlMatcher
+}
+
+func newAbsurlReplacer(baseUrl string) *absurlReplacer {
+	u, _ := url.Parse(baseUrl)
+	base := strings.TrimRight(u.String(), "/")
+
+	// HTML
+	dqHtmlMatch := []byte("\"/")
+	sqHtmlMatch := []byte("'/")
+
+	dqGuard := []byte("\"//")
+	sqGuard := []byte("'//")
+
+	// XML
+	dqXmlMatch := []byte("&#34;/")
+	sqXmlMatch := []byte("&#39;/")
+
+	dqXmlGuard := []byte("&#34;//")
+	sqXmlGuard := []byte("&#39;//")
+
+	dqHtml := []byte("\"" + base + "/")
+	sqHtml := []byte("'" + base + "/")
+
+	dqXml := []byte("&#34;" + base + "/")
+	sqXml := []byte("&#39;" + base + "/")
+
+	return &absurlReplacer{htmlMatchers: []absurlMatcher{
+		{tSrcdq, tGrcdq, dqHtmlMatch, dqGuard, dqHtml, false},
+		{tSrcsq, tGsrcsq, sqHtmlMatch, sqGuard, sqHtml, false},
+		{tHrefdq, tGhrefdq, dqHtmlMatch, dqGuard, dqHtml, false},
+		{tHrefsq, tGhrefsq, sqHtmlMatch, sqGuard, sqHtml, false}},
+		xmlMatchers: []absurlMatcher{
+			{tSrcdq, tGrcdq, dqXmlMatch, dqXmlGuard, dqXml, false},
+			{tSrcsq, tGsrcsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+			{tHrefdq, tGhrefdq, dqXmlMatch, dqXmlGuard, dqXml, false},
+			{tHrefsq, tGhrefsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+		}}
+
+}
+
+func (au *absurlReplacer) replaceInHtml(content []byte) []byte {
+	return doReplace(content, au.htmlMatchers)
+}
+
+func (au *absurlReplacer) replaceInXml(content []byte) []byte {
+	return doReplace(content, au.xmlMatchers)
+}
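One detail worth calling out in lexReplacements above: the scanner walks the content rune by rune, but only pays for utf8.DecodeRune on multi-byte sequences; any byte below utf8.RuneSelf is taken as a one-byte rune. A reduced, self-contained sketch of that scanning idiom (countRunes is a hypothetical helper, not part of the patch):

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    // countRunes walks b the same way lexReplacements does:
    // bytes below utf8.RuneSelf count as single-byte runes,
    // everything else is decoded properly.
    func countRunes(b []byte) int {
        n := 0
        for i := 0; i < len(b); {
            width := 1
            if b[i] >= utf8.RuneSelf {
                _, width = utf8.DecodeRune(b[i:])
            }
            i += width
            n++
        }
        return n
    }

    func main() {
        fmt.Println(countRunes([]byte("src=€"))) // 5 runes, 7 bytes
    }

On mostly-ASCII HTML this keeps the single pass over the content cheap, since almost no decode calls are made.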
diff --git a/transform/chain_test.go b/transform/chain_test.go
index 71037d455..a88d84533 100644
--- a/transform/chain_test.go
+++ b/transform/chain_test.go
@@ -14,21 +14,29 @@ const CORRECT_OUTPUT_SRC_HREF_DQ = "
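The test constants above are truncated here, but the behaviour they pin down is the guard logic: a protocol-relative URL (src="//...") must pass through untouched, while root-relative URLs are made absolute. A sketch of that expectation, assuming a base URL of http://base (this is an illustrative test, not one of the actual fixtures):

    package transform

    import (
        "bytes"
        "testing"
    )

    // TestAbsurlGuardSketch is illustrative only; the real cases live in
    // chain_test.go. It builds the replacer directly instead of going
    // through AbsURL, to avoid the package-level sync.Once state.
    func TestAbsurlGuardSketch(t *testing.T) {
        r := newAbsurlReplacer("http://base")
        in := []byte(`<a href="/docs/"><img src="//cdn.example.com/x.png">`)
        // href is rewritten; the protocol-relative src is guarded and kept
        want := []byte(`<a href="http://base/docs/"><img src="//cdn.example.com/x.png">`)
        if got := r.replaceInHtml(in); !bytes.Equal(got, want) {
            t.Errorf("got %q, want %q", got, want)
        }
    }

Note that a guard hit disables that matcher's replacement for the whole document, mirroring the all-or-nothing behaviour of the old guardReplace.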