From bcd434794a28ff75a6e6504c6c3bada554ba88ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Tue, 16 Aug 2016 22:50:15 +0200 Subject: [PATCH] Avoid splitting words for summary For people using autogenerated summaries, this is one of the hot spots in the memory department. We don't need to split all the content into words to do proper summary truncation. This is obviously more effective: ``` BenchmarkTestTruncateWordsToWholeSentence-4 300000 4720 ns/op 0 B/op 0 allocs/op BenchmarkTestTruncateWordsToWholeSentenceOld-4 100000 17699 ns/op 3072 B/op 3 allocs/op ``` --- helpers/content.go | 54 ++++++++++++++++++++++++++++++++++++++--- helpers/content_test.go | 21 ++++++++++++++-- hugolib/page.go | 17 ++++++++++--- 3 files changed, 83 insertions(+), 9 deletions(-) diff --git a/helpers/content.go b/helpers/content.go index 53176de64..bb7819175 100644 --- a/helpers/content.go +++ b/helpers/content.go @@ -21,6 +21,7 @@ import ( "bytes" "html/template" "os/exec" + "unicode" "unicode/utf8" "github.com/miekg/mmark" @@ -424,10 +425,55 @@ func TruncateWordsByRune(words []string, max int) (string, bool) { return strings.Join(words, " "), false } -// TruncateWordsToWholeSentence takes content and an int -// and returns entire sentences from content, delimited by the int -// and whether it's truncated or not. -func TruncateWordsToWholeSentence(words []string, max int) (string, bool) { +// TruncateWordsToWholeSentence takes content and truncates to whole sentence +// limited by max number of words. It also returns whether it is truncated. 
+func TruncateWordsToWholeSentence(s string, max int) (string, bool) { + + var ( + wordCount = 0 + lastWordIndex = -1 + ) + + for i, r := range s { + if unicode.IsSpace(r) { + wordCount++ + lastWordIndex = i + + if wordCount >= max { + break + } + + } + } + + if lastWordIndex == -1 { + return s, false + } + + endIndex := -1 + + for j, r := range s[lastWordIndex:] { + if isEndOfSentence(r) { + endIndex = j + lastWordIndex + utf8.RuneLen(r) + break + } + } + + if endIndex == -1 { + return s, false + } + + return strings.TrimSpace(s[:endIndex]), endIndex < len(s) +} + +func isEndOfSentence(r rune) bool { + return r == '.' || r == '?' || r == '!' || r == '"' || r == '\n' +} + +// Kept only for benchmark. +func truncateWordsToWholeSentenceOld(content string, max int) (string, bool) { + words := strings.Fields(content) + if max >= len(words) { return strings.Join(words, " "), false } diff --git a/helpers/content_test.go b/helpers/content_test.go index 3a038ea12..5165a7a26 100644 --- a/helpers/content_test.go +++ b/helpers/content_test.go @@ -64,6 +64,22 @@ func TestBytesToHTML(t *testing.T) { assert.Equal(t, template.HTML("dobedobedo"), BytesToHTML([]byte("dobedobedo"))) } +var benchmarkTruncateString = strings.Repeat("This is a sentence about nothing.", 20) + +func BenchmarkTestTruncateWordsToWholeSentence(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + TruncateWordsToWholeSentence(benchmarkTruncateString, SummaryLength) + } +} + +func BenchmarkTestTruncateWordsToWholeSentenceOld(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + truncateWordsToWholeSentenceOld(benchmarkTruncateString, SummaryLength) + } +} + func TestTruncateWordsToWholeSentence(t *testing.T) { type test struct { input, expected string @@ -77,10 +93,11 @@ func TestTruncateWordsToWholeSentence(t *testing.T) { {"This is a sentence.", "This is a sentence.", 5, false}, {"This is also a sentence!", "This is also a sentence!", 1, false}, {"To be. Or not to be. 
That's the question.", "To be.", 1, true}, - {" \nThis is not a sentence\n ", "This is not a", 4, true}, + {" \nThis is not a sentence\nAnd this is another", "This is not a sentence", 4, true}, + {"", "", 10, false}, } for i, d := range data { - output, truncated := TruncateWordsToWholeSentence(strings.Fields(d.input), d.max) + output, truncated := TruncateWordsToWholeSentence(d.input, d.max) if d.expected != output { t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output) } diff --git a/hugolib/page.go b/hugolib/page.go index bc54aac91..0784f5bf8 100644 --- a/hugolib/page.go +++ b/hugolib/page.go @@ -89,6 +89,7 @@ type Page struct { plain string // TODO should be []byte plainWords []string plainInit sync.Once + plainWordsInit sync.Once renderingConfig *helpers.Blackfriday renderingConfigInit sync.Once pageMenus PageMenus @@ -147,14 +148,20 @@ func (p *Page) Plain() string { } func (p *Page) PlainWords() []string { - p.initPlain() + p.initPlainWords() return p.plainWords } func (p *Page) initPlain() { p.plainInit.Do(func() { p.plain = helpers.StripHTML(string(p.Content)) - p.plainWords = strings.Fields(p.plain) + return + }) +} + +func (p *Page) initPlainWords() { + p.plainWordsInit.Do(func() { + p.plainWords = strings.Fields(p.Plain()) return }) } @@ -335,7 +342,7 @@ func (p *Page) setAutoSummary() error { if p.isCJKLanguage { summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength) } else { - summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength) + summary, truncated = helpers.TruncateWordsToWholeSentence(p.Plain(), helpers.SummaryLength) } p.Summary = template.HTML(summary) p.Truncated = truncated @@ -479,6 +486,10 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) { } func (p *Page) analyzePage() { + // TODO(bep) + if true { + return + } if p.isCJKLanguage { p.WordCount = 0 for _, word := range p.PlainWords() {