Skip to content

Commit

Permalink
WordCount and Summary support CJK Language
Browse files Browse the repository at this point in the history
 * Add a global `hasCJKLanguage` flag; if true, turn on auto-detection of CJK languages
 * Add an `isCJKLanguage` front matter setting to explicitly specify whether the content is in a CJK language
 * For .Summary: If isCJKLanguage is true, use the runes as basis for truncation, else keep as today.
 * For WordCount: If isCJKLanguage is true, use the runes as basis for calculation, else keep as today.
 * Unexport RuneCount

Fixes gohugoio#1377
  • Loading branch information
coderzh authored and bep committed Oct 7, 2015
1 parent 2c045ac commit 8233348
Show file tree
Hide file tree
Showing 5 changed files with 242 additions and 88 deletions.
1 change: 1 addition & 0 deletions commands/hugo.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ func LoadDefaultSettings() {
viper.SetDefault("RSSUri", "index.xml")
viper.SetDefault("SectionPagesMenu", "")
viper.SetDefault("DisablePathToLower", false)
viper.SetDefault("HasCJKLanguage", false)
}

// InitializeConfig initializes a config file with sensible default configuration flags.
Expand Down
81 changes: 35 additions & 46 deletions helpers/content.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ package helpers

import (
"bytes"
"unicode/utf8"
"html/template"
"os/exec"
"unicode/utf8"

"github.com/miekg/mmark"
"github.com/russross/blackfriday"
Expand Down Expand Up @@ -178,7 +178,6 @@ func GetHTMLRenderer(defaultFlags int, ctx *RenderingContext) blackfriday.Render
}
}


func getMarkdownExtensions(ctx *RenderingContext) int {
flags := 0 | blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE |
Expand Down Expand Up @@ -385,61 +384,51 @@ func TruncateWords(s string, max int) string {
return strings.Join(words[:max], " ")
}

// TruncateWordsByRune joins words with single spaces, cutting the result off
// once max counting units have been consumed, and reports whether anything
// was cut.
//
// A word made up only of single-byte runes counts as one unit. Any other word
// counts one unit per rune and may be cut in the middle, on a rune boundary,
// when the budget runs out inside it.
//
// The words slice is never modified.
func TruncateWordsByRune(words []string, max int) (string, bool) {
	count := 0
	for index, word := range words {
		if count >= max {
			return strings.Join(words[:index], " "), true
		}

		runeCount := utf8.RuneCountInString(word)
		switch {
		case len(word) == runeCount:
			// Every rune is a single byte: treat the word as one unit.
			count++
		case count+runeCount < max:
			// The whole multi-byte word fits with room to spare.
			count += runeCount
		default:
			// The budget is reached somewhere inside (or exactly at the end
			// of) this word; walk it rune by rune to find the cut point.
			for ri := range word {
				if count >= max {
					// The full slice expression caps the capacity at index so
					// that append allocates a new backing array instead of
					// overwriting words[index] in the caller's slice.
					truncatedWords := append(words[:index:index], word[:ri])
					return strings.Join(truncatedWords, " "), true
				}
				count++
			}
		}
	}

	return strings.Join(words, " "), false
}

// TruncateWordsToWholeSentence takes content and an int
// and returns entire sentences from content, delimited by the int
// and whether it's truncated or not.
//
// If words holds at most max entries, everything is returned untruncated.
// Otherwise the result runs through the first word at index max or later
// that ends a sentence (suffix ".", "?", ".\"" or "!"). When no such word
// exists, only the first max words are returned. A negative max is treated
// as 0.
func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
	if max < 0 {
		// Guard against slicing with a negative index below.
		max = 0
	}

	if max >= len(words) {
		return strings.Join(words, " "), false
	}

	for counter, word := range words[max:] {
		if strings.HasSuffix(word, ".") ||
			strings.HasSuffix(word, "?") ||
			strings.HasSuffix(word, ".\"") ||
			strings.HasSuffix(word, "!") {
			upper := max + counter + 1
			// Only report truncation when words remain after the sentence end.
			return strings.Join(words[:upper], " "), (upper < len(words))
		}
	}

	return strings.Join(words[:max], " "), true
}

// GetAsciidocContent calls asciidoctor or asciidoc as an external helper
Expand Down
38 changes: 35 additions & 3 deletions helpers/content_test.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package helpers

import (
"github.com/stretchr/testify/assert"
"html/template"
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>"
Expand Down Expand Up @@ -54,8 +55,6 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
{"a b c", "a b c", 12, false},
{"a b c", "a b c", 3, false},
{"a", "a", 1, false},
{"Hello 中国", "Hello 中", 2, true},
{"Hello 中国", "Hello 中国", 3, false},
{"This is a sentence.", "This is a sentence.", 5, false},
{"This is also a sentence!", "This is also a sentence!", 1, false},
{"To be. Or not to be. That's the question.", "To be.", 1, true},
Expand All @@ -72,3 +71,36 @@ func TestTruncateWordsToWholeSentence(t *testing.T) {
}
}
}

// TestTruncateWordsByRune splits each input on whitespace, runs it through
// TruncateWordsByRune with the given limit, and checks both the returned text
// and the truncated flag against the table.
func TestTruncateWordsByRune(t *testing.T) {
	cases := []struct {
		input, expected string
		max             int
		truncated       bool
	}{
		{"", "", 1, false},
		{"a b c", "a b c", 12, false},
		{"a b c", "a b c", 3, false},
		{"a", "a", 1, false},
		{"Hello 中国", "", 0, true},
		{"这是中文,全中文。", "这是中文,", 5, true},
		{"Hello 中国", "Hello 中", 2, true},
		{"Hello 中国", "Hello 中国", 3, false},
		{"Hello中国 Good 好的", "Hello中国 Good 好", 9, true},
		{"This is a sentence.", "This is", 2, true},
		{"This is also a sentence!", "This", 1, true},
		{"To be. Or not to be. That's the question.", "To be. Or not", 4, true},
		{" \nThis is not a sentence\n ", "This is not", 3, true},
	}

	for idx := range cases {
		tc := cases[idx]
		got, wasTruncated := TruncateWordsByRune(strings.Fields(tc.input), tc.max)

		if got != tc.expected {
			t.Errorf("Test %d failed. Expected %q got %q", idx, tc.expected, got)
		}

		if wasTruncated != tc.truncated {
			t.Errorf("Test %d failed. Expected truncated=%t got %t", idx, tc.truncated, wasTruncated)
		}
	}
}
81 changes: 47 additions & 34 deletions hugolib/page.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"net/url"
"path"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
Expand All @@ -42,6 +43,10 @@ import (
"github.com/spf13/viper"
)

var (
// cjk matches any single character belonging to the Han, Hangul, Hiragana
// or Katakana Unicode scripts.
cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
)

type Page struct {
Params map[string]interface{}
Content template.HTML
Expand All @@ -67,7 +72,6 @@ type Page struct {
contentShortCodes map[string]string
plain string // TODO should be []byte
plainWords []string
plainRuneCount int
plainInit sync.Once
plainSecondaryInit sync.Once
renderingConfig *helpers.Blackfriday
Expand All @@ -78,6 +82,7 @@ type Page struct {
Node
pageMenus PageMenus
pageMenusInit sync.Once
isCJKLanguage bool
}

type Source struct {
Expand Down Expand Up @@ -111,12 +116,6 @@ func (p *Page) PlainWords() []string {
return p.plainWords
}

// RuneCount returns the rune count, excluding any whitespace, of the plain content.
func (p *Page) RuneCount() int {
p.initPlainSecondary()
return p.plainRuneCount
}

func (p *Page) initPlain() {
p.plainInit.Do(func() {
p.plain = helpers.StripHTML(string(p.Content))
Expand All @@ -125,20 +124,6 @@ func (p *Page) initPlain() {
})
}

func (p *Page) initPlainSecondary() {
p.plainSecondaryInit.Do(func() {
p.initPlain()
runeCount := 0
for _, r := range p.plain {
if !helpers.IsWhitespace(r) {
runeCount++
}
}
p.plainRuneCount = runeCount
return
})
}

func (p *Page) IsNode() bool {
return false
}
Expand Down Expand Up @@ -218,7 +203,13 @@ func (p *Page) setSummary() {
} else {
// If hugo defines split:
// render, strip html, then split
summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
var summary string
var truncated bool
if p.isCJKLanguage {
summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
} else {
summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
}
p.Summary = template.HTML(summary)
p.Truncated = truncated

Expand Down Expand Up @@ -363,18 +354,27 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
}

func (p *Page) analyzePage() {
p.WordCount = 0
for _, word := range p.PlainWords() {
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
p.WordCount++
} else {
p.WordCount += runeCount
if p.isCJKLanguage {
p.WordCount = 0
for _, word := range p.PlainWords() {
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
p.WordCount++
} else {
p.WordCount += runeCount
}
}
} else {
p.WordCount = len(p.PlainWords())
}

p.FuzzyWordCount = int((p.WordCount+100)/100) * 100
p.ReadingTime = int((p.WordCount + 212) / 213)

if p.isCJKLanguage {
p.ReadingTime = int((p.WordCount + 500) / 501)
} else {
p.ReadingTime = int((p.WordCount + 212) / 213)
}
}

func (p *Page) permalink() (*url.URL, error) {
Expand Down Expand Up @@ -481,7 +481,7 @@ func (p *Page) update(f interface{}) error {
}
m := f.(map[string]interface{})
var err error
var draft, published *bool
var draft, published, isCJKLanguage *bool
for k, v := range m {
loki := strings.ToLower(k)
switch loki {
Expand Down Expand Up @@ -542,6 +542,9 @@ func (p *Page) update(f interface{}) error {
p.Status = cast.ToString(v)
case "sitemap":
p.Sitemap = parseSitemap(cast.ToStringMap(v))
case "iscjklanguage":
isCJKLanguage = new(bool)
*isCJKLanguage = cast.ToBool(v)
default:
// If not one of the explicit values, store in Params
switch vv := v.(type) {
Expand Down Expand Up @@ -596,6 +599,16 @@ func (p *Page) update(f interface{}) error {
p.Lastmod = p.Date
}

if isCJKLanguage != nil {
p.isCJKLanguage = *isCJKLanguage
} else if viper.GetBool("HasCJKLanguage") {
if cjk.Match(p.rawContent) {
p.isCJKLanguage = true
} else {
p.isCJKLanguage = false
}
}

return nil

}
Expand Down Expand Up @@ -766,6 +779,8 @@ func (p *Page) parse(reader io.Reader) error {

p.renderable = psr.IsRenderable()
p.frontmatter = psr.FrontMatter()
p.rawContent = psr.Content()

meta, err := psr.Metadata()
if meta != nil {
if err != nil {
Expand All @@ -778,8 +793,6 @@ func (p *Page) parse(reader io.Reader) error {
}
}

p.rawContent = psr.Content()

return nil
}

Expand Down
Loading

0 comments on commit 8233348

Please sign in to comment.