hugo/hugolib/page__content.go
curegit 02a7f59c28 hugolib: Fix regression in summary generation behavior since v0.123
Fix regression in content summarization so that we can use empty
summary by using the manual summary divider. Since v0.123, there
has been the regression that causes Hugo to use automatic summary
generation when the manual summary results in an empty string,
even if there is a `<!--more-->` summary divider.
2024-03-29 12:57:31 +09:00

807 lines
20 KiB
Go

// Copyright 2019 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hugolib
import (
"bytes"
"context"
"errors"
"fmt"
"html/template"
"io"
"strconv"
"strings"
"unicode/utf8"
"github.com/bep/logg"
"github.com/gohugoio/hugo/common/hcontext"
"github.com/gohugoio/hugo/common/herrors"
"github.com/gohugoio/hugo/common/hugio"
"github.com/gohugoio/hugo/helpers"
"github.com/gohugoio/hugo/identity"
"github.com/gohugoio/hugo/markup/converter"
"github.com/gohugoio/hugo/markup/tableofcontents"
"github.com/gohugoio/hugo/parser/metadecoders"
"github.com/gohugoio/hugo/parser/pageparser"
"github.com/gohugoio/hugo/resources"
"github.com/gohugoio/hugo/resources/resource"
"github.com/gohugoio/hugo/tpl"
)
const (
internalSummaryDividerBase = "HUGOMORE42"
)
var (
internalSummaryDividerBaseBytes = []byte(internalSummaryDividerBase)
internalSummaryDividerPre = []byte("\n\n" + internalSummaryDividerBase + "\n\n")
)
type pageContentReplacement struct {
val []byte
source pageparser.Item
}
func (m *pageMeta) parseFrontMatter(h *HugoSites, pid uint64, sourceKey string) (*contentParseInfo, error) {
var openSource hugio.OpenReadSeekCloser
if m.f != nil {
meta := m.f.FileInfo().Meta()
openSource = func() (hugio.ReadSeekCloser, error) {
r, err := meta.Open()
if err != nil {
return nil, fmt.Errorf("failed to open file %q: %w", meta.Filename, err)
}
return r, nil
}
}
if sourceKey == "" {
sourceKey = strconv.Itoa(int(pid))
}
pi := &contentParseInfo{
h: h,
pid: pid,
sourceKey: sourceKey,
openSource: openSource,
}
source, err := pi.contentSource(m)
if err != nil {
return nil, err
}
items, err := pageparser.ParseBytes(
source,
pageparser.Config{},
)
if err != nil {
return nil, err
}
pi.itemsStep1 = items
if err := pi.mapFrontMatter(source); err != nil {
return nil, err
}
return pi, nil
}
func (m *pageMeta) newCachedContent(h *HugoSites, pi *contentParseInfo) (*cachedContent, error) {
var filename string
if m.f != nil {
filename = m.f.Filename()
}
c := &cachedContent{
pm: m.s.pageMap,
StaleInfo: m,
shortcodeState: newShortcodeHandler(filename, m.s),
pi: pi,
enableEmoji: m.s.conf.EnableEmoji,
}
source, err := c.pi.contentSource(m)
if err != nil {
return nil, err
}
if err := c.parseContentFile(source); err != nil {
return nil, err
}
return c, nil
}
type cachedContent struct {
pm *pageMap
resource.StaleInfo
shortcodeState *shortcodeHandler
// Parsed content.
pi *contentParseInfo
enableEmoji bool
}
type contentParseInfo struct {
h *HugoSites
pid uint64
sourceKey string
// The source bytes.
openSource hugio.OpenReadSeekCloser
frontMatter map[string]any
// Whether the parsed content contains a summary separator.
hasSummaryDivider bool
// Whether there are more content after the summary divider.
summaryTruncated bool
// Returns the position in bytes after any front matter.
posMainContent int
// Indicates whether we must do placeholder replacements.
hasNonMarkdownShortcode bool
// Items from the page parser.
// These maps directly to the source
itemsStep1 pageparser.Items
// *shortcode, pageContentReplacement or pageparser.Item
itemsStep2 []any
}
func (p *contentParseInfo) AddBytes(item pageparser.Item) {
p.itemsStep2 = append(p.itemsStep2, item)
}
func (p *contentParseInfo) AddReplacement(val []byte, source pageparser.Item) {
p.itemsStep2 = append(p.itemsStep2, pageContentReplacement{val: val, source: source})
}
func (p *contentParseInfo) AddShortcode(s *shortcode) {
p.itemsStep2 = append(p.itemsStep2, s)
if s.insertPlaceholder() {
p.hasNonMarkdownShortcode = true
}
}
// contentToRenderForItems returns the content to be processed by Goldmark or similar.
func (pi *contentParseInfo) contentToRender(ctx context.Context, source []byte, renderedShortcodes map[string]shortcodeRenderer) ([]byte, bool, error) {
var hasVariants bool
c := make([]byte, 0, len(source)+(len(source)/10))
for _, it := range pi.itemsStep2 {
switch v := it.(type) {
case pageparser.Item:
c = append(c, source[v.Pos():v.Pos()+len(v.Val(source))]...)
case pageContentReplacement:
c = append(c, v.val...)
case *shortcode:
if !v.insertPlaceholder() {
// Insert the rendered shortcode.
renderedShortcode, found := renderedShortcodes[v.placeholder]
if !found {
// This should never happen.
panic(fmt.Sprintf("rendered shortcode %q not found", v.placeholder))
}
b, more, err := renderedShortcode.renderShortcode(ctx)
if err != nil {
return nil, false, fmt.Errorf("failed to render shortcode: %w", err)
}
hasVariants = hasVariants || more
c = append(c, []byte(b)...)
} else {
// Insert the placeholder so we can insert the content after
// markdown processing.
c = append(c, []byte(v.placeholder)...)
}
default:
panic(fmt.Sprintf("unknown item type %T", it))
}
}
return c, hasVariants, nil
}
func (c *cachedContent) IsZero() bool {
return len(c.pi.itemsStep2) == 0
}
func (c *cachedContent) parseContentFile(source []byte) error {
if source == nil || c.pi.openSource == nil {
return nil
}
return c.pi.mapItemsAfterFrontMatter(source, c.shortcodeState)
}
func (c *contentParseInfo) parseFrontMatter(it pageparser.Item, iter *pageparser.Iterator, source []byte) error {
if c.frontMatter != nil {
return nil
}
f := pageparser.FormatFromFrontMatterType(it.Type)
var err error
c.frontMatter, err = metadecoders.Default.UnmarshalToMap(it.Val(source), f)
if err != nil {
if fe, ok := err.(herrors.FileError); ok {
pos := fe.Position()
// Offset the starting position of front matter.
offset := iter.LineNumber(source) - 1
if f == metadecoders.YAML {
offset -= 1
}
pos.LineNumber += offset
fe.UpdatePosition(pos)
fe.SetFilename("") // It will be set later.
return fe
} else {
return err
}
}
return nil
}
func (rn *contentParseInfo) failMap(source []byte, err error, i pageparser.Item) error {
if fe, ok := err.(herrors.FileError); ok {
return fe
}
pos := posFromInput("", source, i.Pos())
return herrors.NewFileErrorFromPos(err, pos)
}
func (rn *contentParseInfo) mapFrontMatter(source []byte) error {
if len(rn.itemsStep1) == 0 {
return nil
}
iter := pageparser.NewIterator(rn.itemsStep1)
Loop:
for {
it := iter.Next()
switch {
case it.IsFrontMatter():
if err := rn.parseFrontMatter(it, iter, source); err != nil {
return err
}
next := iter.Peek()
if !next.IsDone() {
rn.posMainContent = next.Pos()
}
// Done.
break Loop
case it.IsEOF():
break Loop
case it.IsError():
return rn.failMap(source, it.Err, it)
default:
}
}
return nil
}
func (rn *contentParseInfo) mapItemsAfterFrontMatter(
source []byte,
s *shortcodeHandler,
) error {
if len(rn.itemsStep1) == 0 {
return nil
}
fail := func(err error, i pageparser.Item) error {
if fe, ok := err.(herrors.FileError); ok {
return fe
}
pos := posFromInput("", source, i.Pos())
return herrors.NewFileErrorFromPos(err, pos)
}
iter := pageparser.NewIterator(rn.itemsStep1)
// the parser is guaranteed to return items in proper order or fail, so …
// … it's safe to keep some "global" state
var ordinal int
Loop:
for {
it := iter.Next()
switch {
case it.Type == pageparser.TypeIgnore:
case it.IsFrontMatter():
// Ignore.
case it.Type == pageparser.TypeLeadSummaryDivider:
posBody := -1
f := func(item pageparser.Item) bool {
if posBody == -1 && !item.IsDone() {
posBody = item.Pos()
}
if item.IsNonWhitespace(source) {
rn.summaryTruncated = true
// Done
return false
}
return true
}
iter.PeekWalk(f)
rn.hasSummaryDivider = true
// The content may be rendered by Goldmark or similar,
// and we need to track the summary.
rn.AddReplacement(internalSummaryDividerPre, it)
// Handle shortcode
case it.IsLeftShortcodeDelim():
// let extractShortcode handle left delim (will do so recursively)
iter.Backup()
currShortcode, err := s.extractShortcode(ordinal, 0, source, iter)
if err != nil {
return fail(err, it)
}
currShortcode.pos = it.Pos()
currShortcode.length = iter.Current().Pos() - it.Pos()
if currShortcode.placeholder == "" {
currShortcode.placeholder = createShortcodePlaceholder("s", rn.pid, currShortcode.ordinal)
}
if currShortcode.name != "" {
s.addName(currShortcode.name)
}
if currShortcode.params == nil {
var s []string
currShortcode.params = s
}
currShortcode.placeholder = createShortcodePlaceholder("s", rn.pid, ordinal)
ordinal++
s.shortcodes = append(s.shortcodes, currShortcode)
rn.AddShortcode(currShortcode)
case it.IsEOF():
break Loop
case it.IsError():
return fail(it.Err, it)
default:
rn.AddBytes(it)
}
}
return nil
}
func (c *cachedContent) mustSource() []byte {
source, err := c.pi.contentSource(c)
if err != nil {
panic(err)
}
return source
}
func (c *contentParseInfo) contentSource(s resource.StaleInfo) ([]byte, error) {
key := c.sourceKey
v, err := c.h.cacheContentSource.GetOrCreate(key, func(string) (*resources.StaleValue[[]byte], error) {
b, err := c.readSourceAll()
if err != nil {
return nil, err
}
return &resources.StaleValue[[]byte]{
Value: b,
IsStaleFunc: func() bool {
return s.IsStale()
},
}, nil
})
if err != nil {
return nil, err
}
return v.Value, nil
}
func (c *contentParseInfo) readSourceAll() ([]byte, error) {
if c.openSource == nil {
return []byte{}, nil
}
r, err := c.openSource()
if err != nil {
return nil, err
}
defer r.Close()
return io.ReadAll(r)
}
type contentTableOfContents struct {
// For Goldmark we split Parse and Render.
astDoc any
tableOfContents *tableofcontents.Fragments
tableOfContentsHTML template.HTML
// Temporary storage of placeholders mapped to their content.
// These are shortcodes etc. Some of these will need to be replaced
// after any markup is rendered, so they share a common prefix.
contentPlaceholders map[string]shortcodeRenderer
contentToRender []byte
}
type contentSummary struct {
content template.HTML
summary template.HTML
summaryTruncated bool
}
type contentPlainPlainWords struct {
plain string
plainWords []string
summary template.HTML
summaryTruncated bool
wordCount int
fuzzyWordCount int
readingTime int
}
func (c *cachedContent) contentRendered(ctx context.Context, cp *pageContentOutput) (contentSummary, error) {
ctx = tpl.Context.DependencyScope.Set(ctx, pageDependencyScopeGlobal)
key := c.pi.sourceKey + "/" + cp.po.f.Name
versionv := cp.contentRenderedVersion
v, err := c.pm.cacheContentRendered.GetOrCreate(key, func(string) (*resources.StaleValue[contentSummary], error) {
cp.po.p.s.Log.Trace(logg.StringFunc(func() string {
return fmt.Sprintln("contentRendered", key)
}))
cp.po.p.s.h.contentRenderCounter.Add(1)
cp.contentRendered = true
po := cp.po
ct, err := c.contentToC(ctx, cp)
if err != nil {
return nil, err
}
rs := &resources.StaleValue[contentSummary]{
IsStaleFunc: func() bool {
return c.IsStale() || cp.contentRenderedVersion != versionv
},
}
if len(c.pi.itemsStep2) == 0 {
// Nothing to do.
return rs, nil
}
var b []byte
if ct.astDoc != nil {
// The content is parsed, but not rendered.
r, ok, err := po.contentRenderer.RenderContent(ctx, ct.contentToRender, ct.astDoc)
if err != nil {
return nil, err
}
if !ok {
return nil, errors.New("invalid state: astDoc is set but RenderContent returned false")
}
b = r.Bytes()
} else {
// Copy the content to be rendered.
b = make([]byte, len(ct.contentToRender))
copy(b, ct.contentToRender)
}
// There are one or more replacement tokens to be replaced.
var hasShortcodeVariants bool
tokenHandler := func(ctx context.Context, token string) ([]byte, error) {
if token == tocShortcodePlaceholder {
return []byte(ct.tableOfContentsHTML), nil
}
renderer, found := ct.contentPlaceholders[token]
if found {
repl, more, err := renderer.renderShortcode(ctx)
if err != nil {
return nil, err
}
hasShortcodeVariants = hasShortcodeVariants || more
return repl, nil
}
// This should never happen.
panic(fmt.Errorf("unknown shortcode token %q (number of tokens: %d)", token, len(ct.contentPlaceholders)))
}
b, err = expandShortcodeTokens(ctx, b, tokenHandler)
if err != nil {
return nil, err
}
if hasShortcodeVariants {
cp.po.p.pageOutputTemplateVariationsState.Add(1)
}
var result contentSummary // hasVariants bool
if c.pi.hasSummaryDivider {
isHTML := cp.po.p.m.pageConfig.Markup == "html"
if isHTML {
// Use the summary sections as provided by the user.
i := bytes.Index(b, internalSummaryDividerPre)
result.summary = helpers.BytesToHTML(b[:i])
b = b[i+len(internalSummaryDividerPre):]
} else {
summary, content, err := splitUserDefinedSummaryAndContent(cp.po.p.m.pageConfig.Markup, b)
if err != nil {
cp.po.p.s.Log.Errorf("Failed to set user defined summary for page %q: %s", cp.po.p.pathOrTitle(), err)
} else {
b = content
result.summary = helpers.BytesToHTML(summary)
}
}
result.summaryTruncated = c.pi.summaryTruncated
}
result.content = helpers.BytesToHTML(b)
rs.Value = result
return rs, nil
})
if err != nil {
return contentSummary{}, cp.po.p.wrapError(err)
}
return v.Value, nil
}
func (c *cachedContent) mustContentToC(ctx context.Context, cp *pageContentOutput) contentTableOfContents {
ct, err := c.contentToC(ctx, cp)
if err != nil {
panic(err)
}
return ct
}
var setGetContentCallbackInContext = hcontext.NewContextDispatcher[func(*pageContentOutput, contentTableOfContents)]("contentCallback")
func (c *cachedContent) contentToC(ctx context.Context, cp *pageContentOutput) (contentTableOfContents, error) {
key := c.pi.sourceKey + "/" + cp.po.f.Name
versionv := cp.contentRenderedVersion
v, err := c.pm.contentTableOfContents.GetOrCreate(key, func(string) (*resources.StaleValue[contentTableOfContents], error) {
source, err := c.pi.contentSource(c)
if err != nil {
return nil, err
}
var ct contentTableOfContents
if err := cp.initRenderHooks(); err != nil {
return nil, err
}
f := cp.po.f
po := cp.po
p := po.p
ct.contentPlaceholders, err = c.shortcodeState.prepareShortcodesForPage(ctx, p, f, false)
if err != nil {
return nil, err
}
// Callback called from above (e.g. in .RenderString)
ctxCallback := func(cp2 *pageContentOutput, ct2 contentTableOfContents) {
// Merge content placeholders
for k, v := range ct2.contentPlaceholders {
ct.contentPlaceholders[k] = v
}
if p.s.conf.Internal.Watch {
for _, s := range cp2.po.p.m.content.shortcodeState.shortcodes {
for _, templ := range s.templs {
cp.trackDependency(templ.(identity.IdentityProvider))
}
}
}
// Transfer shortcode names so HasShortcode works for shortcodes from included pages.
cp.po.p.m.content.shortcodeState.transferNames(cp2.po.p.m.content.shortcodeState)
if cp2.po.p.pageOutputTemplateVariationsState.Load() > 0 {
cp.po.p.pageOutputTemplateVariationsState.Add(1)
}
}
ctx = setGetContentCallbackInContext.Set(ctx, ctxCallback)
var hasVariants bool
ct.contentToRender, hasVariants, err = c.pi.contentToRender(ctx, source, ct.contentPlaceholders)
if err != nil {
return nil, err
}
if hasVariants {
p.pageOutputTemplateVariationsState.Add(1)
}
isHTML := cp.po.p.m.pageConfig.Markup == "html"
if !isHTML {
createAndSetToC := func(tocProvider converter.TableOfContentsProvider) {
cfg := p.s.ContentSpec.Converters.GetMarkupConfig()
ct.tableOfContents = tocProvider.TableOfContents()
ct.tableOfContentsHTML = template.HTML(
ct.tableOfContents.ToHTML(
cfg.TableOfContents.StartLevel,
cfg.TableOfContents.EndLevel,
cfg.TableOfContents.Ordered,
),
)
}
// If the converter supports doing the parsing separately, we do that.
parseResult, ok, err := po.contentRenderer.ParseContent(ctx, ct.contentToRender)
if err != nil {
return nil, err
}
if ok {
// This is Goldmark.
// Store away the parse result for later use.
createAndSetToC(parseResult)
ct.astDoc = parseResult.Doc()
} else {
// This is Asciidoctor etc.
r, err := po.contentRenderer.ParseAndRenderContent(ctx, ct.contentToRender, true)
if err != nil {
return nil, err
}
ct.contentToRender = r.Bytes()
if tocProvider, ok := r.(converter.TableOfContentsProvider); ok {
createAndSetToC(tocProvider)
} else {
tmpContent, tmpTableOfContents := helpers.ExtractTOC(ct.contentToRender)
ct.tableOfContentsHTML = helpers.BytesToHTML(tmpTableOfContents)
ct.tableOfContents = tableofcontents.Empty
ct.contentToRender = tmpContent
}
}
}
return &resources.StaleValue[contentTableOfContents]{
Value: ct,
IsStaleFunc: func() bool {
return c.IsStale() || cp.contentRenderedVersion != versionv
},
}, nil
})
if err != nil {
return contentTableOfContents{}, err
}
return v.Value, nil
}
func (c *cachedContent) contentPlain(ctx context.Context, cp *pageContentOutput) (contentPlainPlainWords, error) {
key := c.pi.sourceKey + "/" + cp.po.f.Name
versionv := cp.contentRenderedVersion
v, err := c.pm.cacheContentPlain.GetOrCreateWitTimeout(key, cp.po.p.s.Conf.Timeout(), func(string) (*resources.StaleValue[contentPlainPlainWords], error) {
var result contentPlainPlainWords
rs := &resources.StaleValue[contentPlainPlainWords]{
IsStaleFunc: func() bool {
return c.IsStale() || cp.contentRenderedVersion != versionv
},
}
rendered, err := c.contentRendered(ctx, cp)
if err != nil {
return nil, err
}
result.plain = tpl.StripHTML(string(rendered.content))
result.plainWords = strings.Fields(result.plain)
isCJKLanguage := cp.po.p.m.pageConfig.IsCJKLanguage
if isCJKLanguage {
result.wordCount = 0
for _, word := range result.plainWords {
runeCount := utf8.RuneCountInString(word)
if len(word) == runeCount {
result.wordCount++
} else {
result.wordCount += runeCount
}
}
} else {
result.wordCount = helpers.TotalWords(result.plain)
}
// TODO(bep) is set in a test. Fix that.
if result.fuzzyWordCount == 0 {
result.fuzzyWordCount = (result.wordCount + 100) / 100 * 100
}
if isCJKLanguage {
result.readingTime = (result.wordCount + 500) / 501
} else {
result.readingTime = (result.wordCount + 212) / 213
}
if c.pi.hasSummaryDivider || rendered.summary != "" {
result.summary = rendered.summary
result.summaryTruncated = rendered.summaryTruncated
} else if cp.po.p.m.pageConfig.Summary != "" {
b, err := cp.po.contentRenderer.ParseAndRenderContent(ctx, []byte(cp.po.p.m.pageConfig.Summary), false)
if err != nil {
return nil, err
}
html := cp.po.p.s.ContentSpec.TrimShortHTML(b.Bytes())
result.summary = helpers.BytesToHTML(html)
} else {
var summary string
var truncated bool
if isCJKLanguage {
summary, truncated = cp.po.p.s.ContentSpec.TruncateWordsByRune(result.plainWords)
} else {
summary, truncated = cp.po.p.s.ContentSpec.TruncateWordsToWholeSentence(result.plain)
}
result.summary = template.HTML(summary)
result.summaryTruncated = truncated
}
rs.Value = result
return rs, nil
})
if err != nil {
if herrors.IsTimeoutError(err) {
err = fmt.Errorf("timed out rendering the page content. You may have a circular loop in a shortcode, or your site may have resources that take longer to build than the `timeout` limit in your Hugo config file: %w", err)
}
return contentPlainPlainWords{}, err
}
return v.Value, nil
}