From 4abaec5c045e92ae5f8b3a2dc66606b080ef6ea5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Wed, 17 Aug 2016 06:37:19 +0200 Subject: [PATCH] Improve TotalWords counter func It is obviously more efficient when we do not care about the actual words. ``` BenchmarkTotalWords-4 100000 18795 ns/op 0 B/op 0 allocs/op BenchmarkTotalWordsOld-4 30000 46751 ns/op 6400 B/op 1 allocs/op ``` --- helpers/content.go | 19 +++++++++++++++++- helpers/content_test.go | 43 ++++++++++++++++++++++++++++++++++++----- hugolib/page.go | 4 ---- 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/helpers/content.go b/helpers/content.go index bb7819175..9d35675f7 100644 --- a/helpers/content.go +++ b/helpers/content.go @@ -384,8 +384,25 @@ func RenderBytes(ctx *RenderingContext) []byte { } } -// TotalWords returns an int of the total number of words in a given content. +// TotalWords counts the words in s, where words are delimited by one or more +// consecutive white space characters, as defined by unicode.IsSpace. +// This is a cheaper way of word counting than the obvious len(strings.Fields(s)). func TotalWords(s string) int { + n := 0 + inWord := false + for _, r := range s { + wasInWord := inWord + inWord = !unicode.IsSpace(r) + if inWord && !wasInWord { + n++ + } + } + return n +} + +// Old implementation only kept for benchmark comparison. +// TODO(bep) remove +func totalWordsOld(s string) int { return len(strings.Fields(s)) } diff --git a/helpers/content_test.go b/helpers/content_test.go index 5165a7a26..82af70f8f 100644 --- a/helpers/content_test.go +++ b/helpers/content_test.go @@ -408,12 +408,45 @@ func TestExtractNoTOC(t *testing.T) { } } -func TestTotalWords(t *testing.T) { - testString := "Two, Words!" 
- actualWordCount := TotalWords(testString) +var totalWordsBenchmarkString = strings.Repeat("Hugo Rocks ", 200) - if actualWordCount != 2 { - t.Errorf("Actual word count (%d) for test string (%s) did not match 2.", actualWordCount, testString) +func TestTotalWords(t *testing.T) { + + for i, this := range []struct { + s string + words int + }{ + {"Two, Words!", 2}, + {"Word", 1}, + {"", 0}, + {"One, Two, Three", 3}, + {totalWordsBenchmarkString, 400}, + } { + actualWordCount := TotalWords(this.s) + + if actualWordCount != this.words { + t.Errorf("[%d] Actual word count (%d) for test string (%s) did not match %d", i, actualWordCount, this.s, this.words) + } + } +} + +func BenchmarkTotalWords(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + wordCount := TotalWords(totalWordsBenchmarkString) + if wordCount != 400 { + b.Fatal("Wordcount error") + } + } +} + +func BenchmarkTotalWordsOld(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + wordCount := totalWordsOld(totalWordsBenchmarkString) + if wordCount != 400 { + b.Fatal("Wordcount error") + } } } diff --git a/hugolib/page.go b/hugolib/page.go index 0784f5bf8..66d099bc0 100644 --- a/hugolib/page.go +++ b/hugolib/page.go @@ -486,10 +486,6 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) { } func (p *Page) analyzePage() { - // TODO(bep) - if true { - return - } if p.isCJKLanguage { p.WordCount = 0 for _, word := range p.PlainWords() {