From dd45e6d7e5406991d8df3a2f9ba4c7e5ae039c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Wed, 17 Aug 2016 13:41:48 +0200 Subject: [PATCH] Lazy calculate WordCount, ReadingTime and FuzzyWordCount This avoids having to execute these expensive operations for sites not using these values. This commit sums up a set of wordcounting and autosummary related performance improvements. The effect of these depends somewhat on which features your site uses, but a benchmark from 4 Hugo sites in the wild shows promise: ``` benchmark old ns/op new ns/op delta BenchmarkHugo-4 21293005843 20032857342 -5.92% benchmark old allocs new allocs delta BenchmarkHugo-4 65290922 65186032 -0.16% benchmark old bytes new bytes delta BenchmarkHugo-4 9771213416 9681866464 -0.91% ``` Closes #2378 --- helpers/content.go | 21 +++++++++---- helpers/content_test.go | 13 +++++++- hugolib/page.go | 61 +++++++++++++++++++++++++------------- hugolib/pageSort_test.go | 10 +++---- hugolib/page_test.go | 40 ++++++++++++------------- hugolib/pagination_test.go | 10 +++---- hugolib/site_test.go | 5 ++++ 7 files changed, 103 insertions(+), 57 deletions(-) diff --git a/helpers/content.go b/helpers/content.go index 9d35675f7..f3d8bd94f 100644 --- a/helpers/content.go +++ b/helpers/content.go @@ -138,19 +138,28 @@ func StripHTML(s string) string { // Walk through the string removing all tags b := bp.GetBuffer() defer bp.PutBuffer(b) - - inTag := false + var inTag, isSpace, wasSpace bool for _, r := range s { - switch r { - case '<': + if !inTag { + isSpace = false + } + + switch { + case r == '<': inTag = true - case '>': + case r == '>': inTag = false + case unicode.IsSpace(r): + isSpace = true + fallthrough default: - if !inTag { + if !inTag && (!isSpace || (isSpace && !wasSpace)) { b.WriteRune(r) } } + + wasSpace = isSpace + } return b.String() } diff --git a/helpers/content_test.go b/helpers/content_test.go index 82af70f8f..22c81005f 100644 --- a/helpers/content_test.go +++ 
b/helpers/content_test.go @@ -34,11 +34,22 @@ func TestStripHTML(t *testing.T) { } data := []test{ {"

strip h1 tag

", "strip h1 tag "}, - {"

strip p tag

", " strip p tag \n"}, + {"

strip p tag

", " strip p tag "}, {"
strip br
", " strip br\n"}, {"
strip br2
", " strip br2\n"}, {"This is a\nnewline", "This is a newline"}, {"No Tags", "No Tags"}, + {`

Summary Next Line. +

+ + + + +
+. +More text here.

+ +

Some more text

`, "Summary Next Line. . More text here.\nSome more text\n"}, } for i, d := range data { output := StripHTML(d.input) diff --git a/hugolib/page.go b/hugolib/page.go index 66d099bc0..a76c157be 100644 --- a/hugolib/page.go +++ b/hugolib/page.go @@ -107,9 +107,10 @@ type Source struct { source.File } type PageMeta struct { - WordCount int - FuzzyWordCount int - ReadingTime int + wordCount int + fuzzyWordCount int + readingTime int + pageMetaInit sync.Once Weight int } @@ -485,28 +486,48 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) { return int64(len(p.rawContent)), nil } +func (p *Page) WordCount() int { + p.analyzePage() + return p.wordCount +} + +func (p *Page) ReadingTime() int { + p.analyzePage() + return p.readingTime +} + +func (p *Page) FuzzyWordCount() int { + p.analyzePage() + return p.fuzzyWordCount +} + func (p *Page) analyzePage() { - if p.isCJKLanguage { - p.WordCount = 0 - for _, word := range p.PlainWords() { - runeCount := utf8.RuneCountInString(word) - if len(word) == runeCount { - p.WordCount++ - } else { - p.WordCount += runeCount + p.pageMetaInit.Do(func() { + if p.isCJKLanguage { + p.wordCount = 0 + for _, word := range p.PlainWords() { + runeCount := utf8.RuneCountInString(word) + if len(word) == runeCount { + p.wordCount++ + } else { + p.wordCount += runeCount + } } + } else { + p.wordCount = helpers.TotalWords(p.Plain()) } - } else { - p.WordCount = len(p.PlainWords()) - } - p.FuzzyWordCount = (p.WordCount + 100) / 100 * 100 + // TODO(bep) is set in a test. Fix that. 
+ if p.fuzzyWordCount == 0 { + p.fuzzyWordCount = (p.wordCount + 100) / 100 * 100 + } - if p.isCJKLanguage { - p.ReadingTime = (p.WordCount + 500) / 501 - } else { - p.ReadingTime = (p.WordCount + 212) / 213 - } + if p.isCJKLanguage { + p.readingTime = (p.wordCount + 500) / 501 + } else { + p.readingTime = (p.wordCount + 212) / 213 + } + }) } func (p *Page) permalink() (*url.URL, error) { diff --git a/hugolib/pageSort_test.go b/hugolib/pageSort_test.go index 1ed99f318..23a3fd07c 100644 --- a/hugolib/pageSort_test.go +++ b/hugolib/pageSort_test.go @@ -95,11 +95,11 @@ func TestLimit(t *testing.T) { func TestPageSortReverse(t *testing.T) { p1 := createSortTestPages(10) - assert.Equal(t, 0, p1[0].FuzzyWordCount) - assert.Equal(t, 9, p1[9].FuzzyWordCount) + assert.Equal(t, 0, p1[0].fuzzyWordCount) + assert.Equal(t, 9, p1[9].fuzzyWordCount) p2 := p1.Reverse() - assert.Equal(t, 9, p2[0].FuzzyWordCount) - assert.Equal(t, 0, p2[9].FuzzyWordCount) + assert.Equal(t, 9, p2[0].fuzzyWordCount) + assert.Equal(t, 0, p2[9].fuzzyWordCount) // cached assert.True(t, probablyEqualPages(p2, p1.Reverse())) } @@ -149,7 +149,7 @@ func createSortTestPages(num int) Pages { if i%2 == 0 { w = 10 } - pages[i].FuzzyWordCount = i + pages[i].fuzzyWordCount = i pages[i].Weight = w pages[i].Description = "initial" } diff --git a/hugolib/page_test.go b/hugolib/page_test.go index 641e421b3..7cdc55898 100644 --- a/hugolib/page_test.go +++ b/hugolib/page_test.go @@ -504,10 +504,13 @@ func checkPageContent(t *testing.T, page *Page, content string, msg ...interface } func normalizeContent(c string) string { - norm := strings.Replace(c, "\n", "", -1) + norm := c + norm = strings.Replace(norm, "\n", " ", -1) norm = strings.Replace(norm, " ", " ", -1) norm = strings.Replace(norm, " ", " ", -1) norm = strings.Replace(norm, " ", " ", -1) + norm = strings.Replace(norm, "p> ", "p>", -1) + norm = strings.Replace(norm, "> <", "> <", -1) return strings.TrimSpace(norm) } @@ -710,8 +713,8 @@ func 
TestPageWithShortCodeInSummary(t *testing.T) { assertFunc := func(t *testing.T, ext string, p *Page) { checkPageTitle(t, p, "Simple") - checkPageContent(t, p, normalizeExpected(ext, "

Summary Next Line.

.\nMore text here.

Some more text

"), ext) - checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text", ext) + checkPageContent(t, p, normalizeExpected(ext, "

Summary Next Line. \n

\n \n \n \n \n
\n.\nMore text here.

\n\n

Some more text

\n")) + checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text") checkPageType(t, p, "page") checkPageLayout(t, p, "page/single.html", "_default/single.html", "theme/page/single.html", "theme/_default/single.html") } @@ -793,8 +796,8 @@ func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) { testCommonResetState() assertFunc := func(t *testing.T, ext string, p *Page) { - if p.WordCount != 8 { - t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount) + if p.WordCount() != 8 { + t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount()) } } @@ -806,11 +809,10 @@ func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) { viper.Set("HasCJKLanguage", true) assertFunc := func(t *testing.T, ext string, p *Page) { - if p.WordCount != 15 { - t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount) + if p.WordCount() != 15 { + t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount()) } } - testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithAllCJKRunes) } @@ -820,15 +822,14 @@ func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) { viper.Set("HasCJKLanguage", true) assertFunc := func(t *testing.T, ext string, p *Page) { - if p.WordCount != 74 { - t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount) + if p.WordCount() != 74 { + t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount()) } if p.Summary != simplePageWithMainEnglishWithCJKRunesSummary { t.Fatalf("[%s] incorrect Summary for content '%s'. 
expected %v, got %v", ext, p.plain, simplePageWithMainEnglishWithCJKRunesSummary, p.Summary) } - } testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithMainEnglishWithCJKRunes) @@ -839,15 +840,14 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) { viper.Set("HasCJKLanguage", true) assertFunc := func(t *testing.T, ext string, p *Page) { - if p.WordCount != 75 { - t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount) + if p.WordCount() != 75 { + t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount()) } if p.Summary != simplePageWithIsCJKLanguageFalseSummary { t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain, simplePageWithIsCJKLanguageFalseSummary, p.Summary) } - } testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithIsCJKLanguageFalse) @@ -857,16 +857,16 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) { func TestWordCount(t *testing.T) { assertFunc := func(t *testing.T, ext string, p *Page) { - if p.WordCount != 483 { - t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount) + if p.WordCount() != 483 { + t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount()) } - if p.FuzzyWordCount != 500 { - t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount) + if p.FuzzyWordCount() != 500 { + t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount()) } - if p.ReadingTime != 3 { - t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime) + if p.ReadingTime() != 3 { + t.Fatalf("[%s] incorrect min read. 
expected %v, got %v", ext, 3, p.ReadingTime()) } checkTruncation(t, p, true, "long page") diff --git a/hugolib/pagination_test.go b/hugolib/pagination_test.go index 6f85e91d3..786650469 100644 --- a/hugolib/pagination_test.go +++ b/hugolib/pagination_test.go @@ -55,7 +55,7 @@ func TestSplitPageGroups(t *testing.T) { // first group 10 in weight assert.Equal(t, 10, pg.Key) for _, p := range pg.Pages { - assert.True(t, p.FuzzyWordCount%2 == 0) // magic test + assert.True(t, p.fuzzyWordCount%2 == 0) // magic test } } } else { @@ -70,7 +70,7 @@ func TestSplitPageGroups(t *testing.T) { // last should have 5 in weight assert.Equal(t, 5, pg.Key) for _, p := range pg.Pages { - assert.True(t, p.FuzzyWordCount%2 != 0) // magic test + assert.True(t, p.fuzzyWordCount%2 != 0) // magic test } } } else { @@ -443,10 +443,10 @@ func TestPage(t *testing.T) { page21, _ := f2.page(1) page2Nil, _ := f2.page(3) - assert.Equal(t, 1, page11.FuzzyWordCount) + assert.Equal(t, 3, page11.fuzzyWordCount) assert.Nil(t, page1Nil) - assert.Equal(t, 1, page21.FuzzyWordCount) + assert.Equal(t, 3, page21.fuzzyWordCount) assert.Nil(t, page2Nil) } @@ -468,7 +468,7 @@ func createTestPages(num int) Pages { if i%2 == 0 { w = 10 } - pages[i].FuzzyWordCount = i + pages[i].fuzzyWordCount = i + 2 pages[i].Weight = w } diff --git a/hugolib/site_test.go b/hugolib/site_test.go index 8360d7b94..b278456fc 100644 --- a/hugolib/site_test.go +++ b/hugolib/site_test.go @@ -33,6 +33,11 @@ import ( "github.com/stretchr/testify/require" ) +func init() { + //There are expected ERROR logging in tests that produces a lot of noise. + jww.SetStdoutThreshold(jww.LevelCritical) +} + const ( pageSimpleTitle = `--- title: simple template