WordCount and Summary support CJK Language

* Add global `hasCJKLanguage` flag; if true, turn on auto-detection of CJK languages
* Add `isCJKLanguage` front matter field to explicitly specify whether a page is in a CJK language
* For .Summary: if isCJKLanguage is true, use runes as the basis for truncation; otherwise keep the current behavior.
* For WordCount: if isCJKLanguage is true, use runes as the basis for calculation; otherwise keep the current behavior.
* Unexport RuneCount

Fixes #1377
This commit is contained in:
coderzh 2015-09-03 18:22:20 +08:00 committed by Bjørn Erik Pedersen
parent 2c045ac449
commit 823334875d
5 changed files with 247 additions and 93 deletions

View file

@ -168,6 +168,7 @@ func LoadDefaultSettings() {
viper.SetDefault("RSSUri", "index.xml") viper.SetDefault("RSSUri", "index.xml")
viper.SetDefault("SectionPagesMenu", "") viper.SetDefault("SectionPagesMenu", "")
viper.SetDefault("DisablePathToLower", false) viper.SetDefault("DisablePathToLower", false)
viper.SetDefault("HasCJKLanguage", false)
} }
// InitializeConfig initializes a config file with sensible default configuration flags. // InitializeConfig initializes a config file with sensible default configuration flags.

View file

@ -19,9 +19,9 @@ package helpers
import ( import (
"bytes" "bytes"
"unicode/utf8"
"html/template" "html/template"
"os/exec" "os/exec"
"unicode/utf8"
"github.com/miekg/mmark" "github.com/miekg/mmark"
"github.com/russross/blackfriday" "github.com/russross/blackfriday"
@ -178,7 +178,6 @@ func GetHTMLRenderer(defaultFlags int, ctx *RenderingContext) blackfriday.Render
} }
} }
func getMarkdownExtensions(ctx *RenderingContext) int { func getMarkdownExtensions(ctx *RenderingContext) int {
flags := 0 | blackfriday.EXTENSION_NO_INTRA_EMPHASIS | flags := 0 | blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE | blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE |
@ -385,61 +384,51 @@ func TruncateWords(s string, max int) string {
return strings.Join(words[:max], " ") return strings.Join(words[:max], " ")
} }
// TruncateWordsByRune truncates words to at most max counted units and
// returns the result joined with single spaces, together with a flag telling
// whether anything was cut off.
//
// A word consisting only of single-byte characters counts as one unit. A word
// containing any multi-byte rune counts one unit per rune and may be cut in
// the middle, always on a rune boundary. The words slice is never modified.
func TruncateWordsByRune(words []string, max int) (string, bool) {
	count := 0
	for index, word := range words {
		if count >= max {
			return strings.Join(words[:index], " "), true
		}
		runeCount := utf8.RuneCountInString(word)
		switch {
		case len(word) == runeCount:
			// Pure single-byte word: counts as one unit.
			count++
		case count+runeCount < max:
			// Multi-byte word that fits entirely below the limit.
			count += runeCount
		default:
			// Multi-byte word that reaches the limit: consume it rune by rune.
			for ri := range word {
				if count >= max {
					// Build the result in a fresh slice. Appending directly to
					// words[:index] would overwrite words[index] in the
					// caller's backing array.
					truncatedWords := make([]string, 0, index+1)
					truncatedWords = append(truncatedWords, words[:index]...)
					truncatedWords = append(truncatedWords, word[:ri])
					return strings.Join(truncatedWords, " "), true
				}
				count++
			}
		}
	}
	return strings.Join(words, " "), false
}
// TruncateWordsToWholeSentence takes content and an int // TruncateWordsToWholeSentence takes content and an int
// and returns entire sentences from content, delimited by the int // and returns entire sentences from content, delimited by the int
// and whether it's truncated or not. // and whether it's truncated or not.
func TruncateWordsToWholeSentence(words []string, max int) (string, bool) { func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
count := 0 if max >= len(words) {
index, word := 0, "" return strings.Join(words, " "), false
truncated := false }
for index, word = range words { for counter, word := range words[max:] {
runeCount := utf8.RuneCountInString(word) if strings.HasSuffix(word, ".") ||
if len(word) == runeCount { strings.HasSuffix(word, "?") ||
count++; strings.HasSuffix(word, ".\"") ||
} else { strings.HasSuffix(word, "!") {
if count + runeCount <= max { upper := max + counter + 1
count += runeCount return strings.Join(words[:upper], " "), (upper < len(words))
} else {
offset := 0
for count < max {
_, width := utf8.DecodeRuneInString(word[offset:])
offset += width
count++
}
words[index] = word[:offset]
truncated = true
}
}
if count >= max {
if index < len(words) - 1 {
truncated = true
}
break
} }
} }
index += 1 return strings.Join(words[:max], " "), true
if index < len(words) {
for counter, word := range words[index:] {
if len(word) != utf8.RuneCountInString(word) {
break
}
if strings.HasSuffix(word, ".") ||
strings.HasSuffix(word, "?") ||
strings.HasSuffix(word, ".\"") ||
strings.HasSuffix(word, "!") {
upper := index + counter + 1
return strings.Join(words[:upper], " "), (upper < len(words))
}
}
} else if index > len(words) {
return strings.Join(words, " "), truncated
}
return strings.Join(words[:index], " "), truncated
} }
// GetAsciidocContent calls asciidoctor or asciidoc as an external helper // GetAsciidocContent calls asciidoctor or asciidoc as an external helper

View file

@ -1,10 +1,11 @@
package helpers package helpers
import ( import (
"github.com/stretchr/testify/assert"
"html/template" "html/template"
"strings" "strings"
"testing" "testing"
"github.com/stretchr/testify/assert"
) )
const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>" const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>"
@ -54,8 +55,6 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
{"a b c", "a b c", 12, false}, {"a b c", "a b c", 12, false},
{"a b c", "a b c", 3, false}, {"a b c", "a b c", 3, false},
{"a", "a", 1, false}, {"a", "a", 1, false},
{"Hello 中国", "Hello 中", 2, true},
{"Hello 中国", "Hello 中国", 3, false},
{"This is a sentence.", "This is a sentence.", 5, false}, {"This is a sentence.", "This is a sentence.", 5, false},
{"This is also a sentence!", "This is also a sentence!", 1, false}, {"This is also a sentence!", "This is also a sentence!", 1, false},
{"To be. Or not to be. That's the question.", "To be.", 1, true}, {"To be. Or not to be. That's the question.", "To be.", 1, true},
@ -72,3 +71,36 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
} }
} }
} }
// TestTruncateWordsByRune feeds a table of inputs, split with strings.Fields,
// through TruncateWordsByRune and checks both the returned text and the
// reported truncation flag.
func TestTruncateWordsByRune(t *testing.T) {
	cases := []struct {
		input, expected string
		max             int
		truncated       bool
	}{
		{"", "", 1, false},
		{"a b c", "a b c", 12, false},
		{"a b c", "a b c", 3, false},
		{"a", "a", 1, false},
		{"Hello 中国", "", 0, true},
		{"这是中文,全中文。", "这是中文,", 5, true},
		{"Hello 中国", "Hello 中", 2, true},
		{"Hello 中国", "Hello 中国", 3, false},
		{"Hello中国 Good 好的", "Hello中国 Good 好", 9, true},
		{"This is a sentence.", "This is", 2, true},
		{"This is also a sentence!", "This", 1, true},
		{"To be. Or not to be. That's the question.", "To be. Or not", 4, true},
		{" \nThis is not a sentence\n ", "This is not", 3, true},
	}

	for i := range cases {
		tc := cases[i]
		fields := strings.Fields(tc.input)
		got, wasTruncated := TruncateWordsByRune(fields, tc.max)
		if got != tc.expected {
			t.Errorf("Test %d failed. Expected %q got %q", i, tc.expected, got)
		}
		if wasTruncated != tc.truncated {
			t.Errorf("Test %d failed. Expected truncated=%t got %t", i, tc.truncated, wasTruncated)
		}
	}
}

View file

@ -28,6 +28,7 @@ import (
"net/url" "net/url"
"path" "path"
"path/filepath" "path/filepath"
"regexp"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -42,6 +43,10 @@ import (
"github.com/spf13/viper" "github.com/spf13/viper"
) )
var (
cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
)
type Page struct { type Page struct {
Params map[string]interface{} Params map[string]interface{}
Content template.HTML Content template.HTML
@ -67,7 +72,6 @@ type Page struct {
contentShortCodes map[string]string contentShortCodes map[string]string
plain string // TODO should be []byte plain string // TODO should be []byte
plainWords []string plainWords []string
plainRuneCount int
plainInit sync.Once plainInit sync.Once
plainSecondaryInit sync.Once plainSecondaryInit sync.Once
renderingConfig *helpers.Blackfriday renderingConfig *helpers.Blackfriday
@ -78,6 +82,7 @@ type Page struct {
Node Node
pageMenus PageMenus pageMenus PageMenus
pageMenusInit sync.Once pageMenusInit sync.Once
isCJKLanguage bool
} }
type Source struct { type Source struct {
@ -111,12 +116,6 @@ func (p *Page) PlainWords() []string {
return p.plainWords return p.plainWords
} }
// RuneCount returns the rune count, excluding any whitespace, of the plain content.
func (p *Page) RuneCount() int {
p.initPlainSecondary()
return p.plainRuneCount
}
func (p *Page) initPlain() { func (p *Page) initPlain() {
p.plainInit.Do(func() { p.plainInit.Do(func() {
p.plain = helpers.StripHTML(string(p.Content)) p.plain = helpers.StripHTML(string(p.Content))
@ -125,20 +124,6 @@ func (p *Page) initPlain() {
}) })
} }
func (p *Page) initPlainSecondary() {
p.plainSecondaryInit.Do(func() {
p.initPlain()
runeCount := 0
for _, r := range p.plain {
if !helpers.IsWhitespace(r) {
runeCount++
}
}
p.plainRuneCount = runeCount
return
})
}
func (p *Page) IsNode() bool { func (p *Page) IsNode() bool {
return false return false
} }
@ -218,7 +203,13 @@ func (p *Page) setSummary() {
} else { } else {
// If hugo defines split: // If hugo defines split:
// render, strip html, then split // render, strip html, then split
summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength) var summary string
var truncated bool
if p.isCJKLanguage {
summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
} else {
summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
}
p.Summary = template.HTML(summary) p.Summary = template.HTML(summary)
p.Truncated = truncated p.Truncated = truncated
@ -363,18 +354,27 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
} }
func (p *Page) analyzePage() { func (p *Page) analyzePage() {
p.WordCount = 0 if p.isCJKLanguage {
for _, word := range p.PlainWords() { p.WordCount = 0
runeCount := utf8.RuneCountInString(word) for _, word := range p.PlainWords() {
if len(word) == runeCount { runeCount := utf8.RuneCountInString(word)
p.WordCount++ if len(word) == runeCount {
} else { p.WordCount++
p.WordCount += runeCount } else {
p.WordCount += runeCount
}
} }
} else {
p.WordCount = len(p.PlainWords())
} }
p.FuzzyWordCount = int((p.WordCount+100)/100) * 100 p.FuzzyWordCount = int((p.WordCount+100)/100) * 100
p.ReadingTime = int((p.WordCount + 212) / 213)
if p.isCJKLanguage {
p.ReadingTime = int((p.WordCount + 500) / 501)
} else {
p.ReadingTime = int((p.WordCount + 212) / 213)
}
} }
func (p *Page) permalink() (*url.URL, error) { func (p *Page) permalink() (*url.URL, error) {
@ -481,7 +481,7 @@ func (p *Page) update(f interface{}) error {
} }
m := f.(map[string]interface{}) m := f.(map[string]interface{})
var err error var err error
var draft, published *bool var draft, published, isCJKLanguage *bool
for k, v := range m { for k, v := range m {
loki := strings.ToLower(k) loki := strings.ToLower(k)
switch loki { switch loki {
@ -542,6 +542,9 @@ func (p *Page) update(f interface{}) error {
p.Status = cast.ToString(v) p.Status = cast.ToString(v)
case "sitemap": case "sitemap":
p.Sitemap = parseSitemap(cast.ToStringMap(v)) p.Sitemap = parseSitemap(cast.ToStringMap(v))
case "iscjklanguage":
isCJKLanguage = new(bool)
*isCJKLanguage = cast.ToBool(v)
default: default:
// If not one of the explicit values, store in Params // If not one of the explicit values, store in Params
switch vv := v.(type) { switch vv := v.(type) {
@ -596,6 +599,16 @@ func (p *Page) update(f interface{}) error {
p.Lastmod = p.Date p.Lastmod = p.Date
} }
if isCJKLanguage != nil {
p.isCJKLanguage = *isCJKLanguage
} else if viper.GetBool("HasCJKLanguage") {
if cjk.Match(p.rawContent) {
p.isCJKLanguage = true
} else {
p.isCJKLanguage = false
}
}
return nil return nil
} }
@ -766,6 +779,8 @@ func (p *Page) parse(reader io.Reader) error {
p.renderable = psr.IsRenderable() p.renderable = psr.IsRenderable()
p.frontmatter = psr.FrontMatter() p.frontmatter = psr.FrontMatter()
p.rawContent = psr.Content()
meta, err := psr.Metadata() meta, err := psr.Metadata()
if meta != nil { if meta != nil {
if err != nil { if err != nil {
@ -778,8 +793,6 @@ func (p *Page) parse(reader io.Reader) error {
} }
} }
p.rawContent = psr.Content()
return nil return nil
} }

View file

@ -146,16 +146,67 @@ Summary Same Line<!--more-->
Some more text Some more text
` `
SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES = `--- SIMPLE_PAGE_WITH_ALL_CJK_RUNES = `---
title: Simple title: Simple
--- ---
你好
도형이
カテゴリー
` `
SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES = `---
title: Simple
---
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
In Chinese, means good. In Chinese, means good.
More then 70 words.
`
SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY = "In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good. " +
"In Chinese, 好 means good. In Chinese, 好 means good."
SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE = `---
title: Simple
isCJKLanguage: false
---
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough.
More then 70 words.
`
SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY = "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
"In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough."
SIMPLE_PAGE_WITH_LONG_CONTENT = `--- SIMPLE_PAGE_WITH_LONG_CONTENT = `---
title: Simple title: Simple
--- ---
@ -584,18 +635,86 @@ func TestPageWithDate(t *testing.T) {
checkPageDate(t, p, d) checkPageDate(t, p, d)
} }
func TestRuneCount(t *testing.T) { func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
viper.Reset()
p, _ := NewPage("simple.md") p, _ := NewPage("simple.md")
_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES)) _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
p.Convert() p.Convert()
p.analyzePage() p.analyzePage()
if err != nil { if err != nil {
t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
} }
if p.RuneCount() != 5 { if p.WordCount != 8 {
t.Fatalf("incorrect rune count for content '%s'. expected %v, got %v", p.plain, 5, p.RuneCount()) t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 8, p.WordCount)
}
}
// TestWordCountWithAllCJKRunesHasCJKLanguage enables the global
// HasCJKLanguage setting, loads SIMPLE_PAGE_WITH_ALL_CJK_RUNES and expects the
// analyzed page to report a WordCount of 15.
func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
	viper.Reset()
	defer viper.Reset()

	viper.Set("HasCJKLanguage", true)

	p, _ := NewPage("simple.md")
	// Check the read error before converting or analyzing the page, so a
	// failed read is reported as such instead of as a wrong word count.
	if _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES)); err != nil {
		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
	}
	p.Convert()
	p.analyzePage()

	if p.WordCount != 15 {
		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 15, p.WordCount)
	}
}
// TestWordCountWithMainEnglishWithCJKRunes enables the global HasCJKLanguage
// setting, loads SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES and expects a
// WordCount of 74 and a Summary equal to
// SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY.
func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
	viper.Reset()
	defer viper.Reset()

	viper.Set("HasCJKLanguage", true)

	p, _ := NewPage("simple.md")
	// Check the read error before converting or analyzing the page, so a
	// failed read is reported as such instead of as a wrong count or summary.
	if _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES)); err != nil {
		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
	}
	p.Convert()
	p.analyzePage()

	if p.WordCount != 74 {
		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 74, p.WordCount)
	}

	if p.Summary != SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY {
		t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
			SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY, p.Summary)
	}
}
func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
viper.Reset()
defer viper.Reset()
viper.Set("HasCJKLanguage", true)
p, _ := NewPage("simple.md")
_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE))
p.Convert()
p.analyzePage()
if err != nil {
t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
}
if p.WordCount != 75 {
t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 75, p.WordCount)
}
if p.Summary != SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY {
t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY, p.Summary)
} }
} }