From 823334875d396bdc15770c335c2029a01a7ef2ce Mon Sep 17 00:00:00 2001 From: coderzh Date: Thu, 3 Sep 2015 18:22:20 +0800 Subject: [PATCH] WordCount and Summary support CJK Language * Add a global `hasCJKLanguage` flag; if true, CJK content is auto-detected for each page * Add an `isCJKLanguage` front matter variable to explicitly specify whether a page is in a CJK language * For .Summary: if isCJKLanguage is true, use runes as the basis for truncation; otherwise keep the current behavior. * For WordCount: if isCJKLanguage is true, use runes as the basis for calculation; otherwise keep the current behavior. * Unexport RuneCount Fixes #1377 --- commands/hugo.go | 1 + helpers/content.go | 91 +++++++++++++--------------- helpers/content_test.go | 38 +++++++++++- hugolib/page.go | 81 ++++++++++++++----------- hugolib/page_test.go | 129 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 247 insertions(+), 93 deletions(-) diff --git a/commands/hugo.go b/commands/hugo.go index b0e4964c4..718a3e6f6 100644 --- a/commands/hugo.go +++ b/commands/hugo.go @@ -168,6 +168,7 @@ func LoadDefaultSettings() { viper.SetDefault("RSSUri", "index.xml") viper.SetDefault("SectionPagesMenu", "") viper.SetDefault("DisablePathToLower", false) + viper.SetDefault("HasCJKLanguage", false) } // InitializeConfig initializes a config file with sensible default configuration flags. 
diff --git a/helpers/content.go b/helpers/content.go index 8c5c9cc7b..847d4dcbc 100644 --- a/helpers/content.go +++ b/helpers/content.go @@ -19,9 +19,9 @@ package helpers import ( "bytes" - "unicode/utf8" "html/template" "os/exec" + "unicode/utf8" "github.com/miekg/mmark" "github.com/russross/blackfriday" @@ -178,7 +178,6 @@ func GetHTMLRenderer(defaultFlags int, ctx *RenderingContext) blackfriday.Render } } - func getMarkdownExtensions(ctx *RenderingContext) int { flags := 0 | blackfriday.EXTENSION_NO_INTRA_EMPHASIS | blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE | @@ -385,61 +384,51 @@ func TruncateWords(s string, max int) string { return strings.Join(words[:max], " ") } +func TruncateWordsByRune(words []string, max int) (string, bool) { + count := 0 + for index, word := range words { + if count >= max { + return strings.Join(words[:index], " "), true + } + runeCount := utf8.RuneCountInString(word) + if len(word) == runeCount { + count++ + } else if count+runeCount < max { + count += runeCount + } else { + for ri, _ := range word { + if count >= max { + truncatedWords := append(words[:index], word[:ri]) + return strings.Join(truncatedWords, " "), true + } else { + count++ + } + } + } + } + + return strings.Join(words, " "), false +} + // TruncateWordsToWholeSentence takes content and an int // and returns entire sentences from content, delimited by the int // and whether it's truncated or not. 
func TruncateWordsToWholeSentence(words []string, max int) (string, bool) { - count := 0 - index, word := 0, "" - truncated := false - - for index, word = range words { - runeCount := utf8.RuneCountInString(word) - if len(word) == runeCount { - count++; - } else { - if count + runeCount <= max { - count += runeCount - } else { - offset := 0 - for count < max { - _, width := utf8.DecodeRuneInString(word[offset:]) - offset += width - count++ - } - words[index] = word[:offset] - truncated = true - } - } - - if count >= max { - if index < len(words) - 1 { - truncated = true - } - break + if max >= len(words) { + return strings.Join(words, " "), false + } + + for counter, word := range words[max:] { + if strings.HasSuffix(word, ".") || + strings.HasSuffix(word, "?") || + strings.HasSuffix(word, ".\"") || + strings.HasSuffix(word, "!") { + upper := max + counter + 1 + return strings.Join(words[:upper], " "), (upper < len(words)) } } - - index += 1 - - if index < len(words) { - for counter, word := range words[index:] { - if len(word) != utf8.RuneCountInString(word) { - break - } - if strings.HasSuffix(word, ".") || - strings.HasSuffix(word, "?") || - strings.HasSuffix(word, ".\"") || - strings.HasSuffix(word, "!") { - upper := index + counter + 1 - return strings.Join(words[:upper], " "), (upper < len(words)) - } - } - } else if index > len(words) { - return strings.Join(words, " "), truncated - } - - return strings.Join(words[:index], " "), truncated + + return strings.Join(words[:max], " "), true } // GetAsciidocContent calls asciidoctor or asciidoc as an external helper diff --git a/helpers/content_test.go b/helpers/content_test.go index f614011c0..f0d76b6ce 100644 --- a/helpers/content_test.go +++ b/helpers/content_test.go @@ -1,10 +1,11 @@ package helpers import ( - "github.com/stretchr/testify/assert" "html/template" "strings" "testing" + + "github.com/stretchr/testify/assert" ) const tstHTMLContent = "
content foobar. Follow up

This is some text.
And some more.

" @@ -54,8 +55,6 @@ func TestTruncateWordsToWholeSentence(t *testing.T) { {"a b c", "a b c", 12, false}, {"a b c", "a b c", 3, false}, {"a", "a", 1, false}, - {"Hello 中国", "Hello 中", 2, true}, - {"Hello 中国", "Hello 中国", 3, false}, {"This is a sentence.", "This is a sentence.", 5, false}, {"This is also a sentence!", "This is also a sentence!", 1, false}, {"To be. Or not to be. That's the question.", "To be.", 1, true}, @@ -72,3 +71,36 @@ func TestTruncateWordsToWholeSentence(t *testing.T) { } } } + +func TestTruncateWordsByRune(t *testing.T) { + type test struct { + input, expected string + max int + truncated bool + } + data := []test{ + {"", "", 1, false}, + {"a b c", "a b c", 12, false}, + {"a b c", "a b c", 3, false}, + {"a", "a", 1, false}, + {"Hello 中国", "", 0, true}, + {"这是中文,全中文。", "这是中文,", 5, true}, + {"Hello 中国", "Hello 中", 2, true}, + {"Hello 中国", "Hello 中国", 3, false}, + {"Hello中国 Good 好的", "Hello中国 Good 好", 9, true}, + {"This is a sentence.", "This is", 2, true}, + {"This is also a sentence!", "This", 1, true}, + {"To be. Or not to be. That's the question.", "To be. Or not", 4, true}, + {" \nThis is not a sentence\n ", "This is not", 3, true}, + } + for i, d := range data { + output, truncated := TruncateWordsByRune(strings.Fields(d.input), d.max) + if d.expected != output { + t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output) + } + + if d.truncated != truncated { + t.Errorf("Test %d failed. 
Expected truncated=%t got %t", i, d.truncated, truncated) + } + } +} diff --git a/hugolib/page.go b/hugolib/page.go index c50e2da18..e08e764af 100644 --- a/hugolib/page.go +++ b/hugolib/page.go @@ -28,6 +28,7 @@ import ( "net/url" "path" "path/filepath" + "regexp" "strings" "sync" "time" @@ -42,6 +43,10 @@ import ( "github.com/spf13/viper" ) +var ( + cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`) +) + type Page struct { Params map[string]interface{} Content template.HTML @@ -67,7 +72,6 @@ type Page struct { contentShortCodes map[string]string plain string // TODO should be []byte plainWords []string - plainRuneCount int plainInit sync.Once plainSecondaryInit sync.Once renderingConfig *helpers.Blackfriday @@ -78,6 +82,7 @@ type Page struct { Node pageMenus PageMenus pageMenusInit sync.Once + isCJKLanguage bool } type Source struct { @@ -111,12 +116,6 @@ func (p *Page) PlainWords() []string { return p.plainWords } -// RuneCount returns the rune count, excluding any whitespace, of the plain content. 
-func (p *Page) RuneCount() int { - p.initPlainSecondary() - return p.plainRuneCount -} - func (p *Page) initPlain() { p.plainInit.Do(func() { p.plain = helpers.StripHTML(string(p.Content)) @@ -125,20 +124,6 @@ func (p *Page) initPlain() { }) } -func (p *Page) initPlainSecondary() { - p.plainSecondaryInit.Do(func() { - p.initPlain() - runeCount := 0 - for _, r := range p.plain { - if !helpers.IsWhitespace(r) { - runeCount++ - } - } - p.plainRuneCount = runeCount - return - }) -} - func (p *Page) IsNode() bool { return false } @@ -218,7 +203,13 @@ func (p *Page) setSummary() { } else { // If hugo defines split: // render, strip html, then split - summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength) + var summary string + var truncated bool + if p.isCJKLanguage { + summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength) + } else { + summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength) + } p.Summary = template.HTML(summary) p.Truncated = truncated @@ -363,18 +354,27 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) { } func (p *Page) analyzePage() { - p.WordCount = 0 - for _, word := range p.PlainWords() { - runeCount := utf8.RuneCountInString(word) - if len(word) == runeCount { - p.WordCount++ - } else { - p.WordCount += runeCount + if p.isCJKLanguage { + p.WordCount = 0 + for _, word := range p.PlainWords() { + runeCount := utf8.RuneCountInString(word) + if len(word) == runeCount { + p.WordCount++ + } else { + p.WordCount += runeCount + } } + } else { + p.WordCount = len(p.PlainWords()) } - + p.FuzzyWordCount = int((p.WordCount+100)/100) * 100 - p.ReadingTime = int((p.WordCount + 212) / 213) + + if p.isCJKLanguage { + p.ReadingTime = int((p.WordCount + 500) / 501) + } else { + p.ReadingTime = int((p.WordCount + 212) / 213) + } } func (p *Page) permalink() (*url.URL, error) { @@ -481,7 +481,7 @@ func (p *Page) update(f interface{}) error { } 
m := f.(map[string]interface{}) var err error - var draft, published *bool + var draft, published, isCJKLanguage *bool for k, v := range m { loki := strings.ToLower(k) switch loki { @@ -542,6 +542,9 @@ func (p *Page) update(f interface{}) error { p.Status = cast.ToString(v) case "sitemap": p.Sitemap = parseSitemap(cast.ToStringMap(v)) + case "iscjklanguage": + isCJKLanguage = new(bool) + *isCJKLanguage = cast.ToBool(v) default: // If not one of the explicit values, store in Params switch vv := v.(type) { @@ -596,6 +599,16 @@ func (p *Page) update(f interface{}) error { p.Lastmod = p.Date } + if isCJKLanguage != nil { + p.isCJKLanguage = *isCJKLanguage + } else if viper.GetBool("HasCJKLanguage") { + if cjk.Match(p.rawContent) { + p.isCJKLanguage = true + } else { + p.isCJKLanguage = false + } + } + return nil } @@ -766,6 +779,8 @@ func (p *Page) parse(reader io.Reader) error { p.renderable = psr.IsRenderable() p.frontmatter = psr.FrontMatter() + p.rawContent = psr.Content() + meta, err := psr.Metadata() if meta != nil { if err != nil { @@ -778,8 +793,6 @@ func (p *Page) parse(reader io.Reader) error { } } - p.rawContent = psr.Content() - return nil } diff --git a/hugolib/page_test.go b/hugolib/page_test.go index c3506d48d..9134ba6c6 100644 --- a/hugolib/page_test.go +++ b/hugolib/page_test.go @@ -146,16 +146,67 @@ Summary Same Line Some more text ` - SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES = `--- + SIMPLE_PAGE_WITH_ALL_CJK_RUNES = `--- title: Simple --- € € € € € +你好 +도형이 +カテゴリー ` + SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES = `--- +title: Simple +--- + + +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +More then 70 words. 
+ + +` + SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY = "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good." + + SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE = `--- +title: Simple +isCJKLanguage: false +--- + +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough. +More then 70 words. + + +` + SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY = "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough." 
+ SIMPLE_PAGE_WITH_LONG_CONTENT = `--- title: Simple --- @@ -584,18 +635,86 @@ func TestPageWithDate(t *testing.T) { checkPageDate(t, p, d) } -func TestRuneCount(t *testing.T) { +func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) { + viper.Reset() + p, _ := NewPage("simple.md") - _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES)) + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES)) p.Convert() p.analyzePage() if err != nil { t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) } - if p.RuneCount() != 5 { - t.Fatalf("incorrect rune count for content '%s'. expected %v, got %v", p.plain, 5, p.RuneCount()) + if p.WordCount != 8 { + t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 8, p.WordCount) + } +} +func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) { + viper.Reset() + defer viper.Reset() + + viper.Set("HasCJKLanguage", true) + + p, _ := NewPage("simple.md") + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES)) + p.Convert() + p.analyzePage() + if err != nil { + t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) + } + + if p.WordCount != 15 { + t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 15, p.WordCount) + } +} + +func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) { + viper.Reset() + defer viper.Reset() + + viper.Set("HasCJKLanguage", true) + + p, _ := NewPage("simple.md") + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES)) + p.Convert() + p.analyzePage() + if err != nil { + t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) + } + + if p.WordCount != 74 { + t.Fatalf("incorrect word count for content '%s'. 
expected %v, got %v", p.plain, 74, p.WordCount) + } + + if p.Summary != SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY { + t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain, + SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY, p.Summary) + } +} + +func TestWordCountWithIsCJKLanguageFalse(t *testing.T) { + viper.Reset() + defer viper.Reset() + + viper.Set("HasCJKLanguage", true) + + p, _ := NewPage("simple.md") + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE)) + p.Convert() + p.analyzePage() + if err != nil { + t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) + } + + if p.WordCount != 75 { + t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 75, p.WordCount) + } + + if p.Summary != SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY { + t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain, + SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY, p.Summary) } }