From 2fdc4a24d5450a98cf38a4456e8e0e8e97a3343d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Wed, 17 Oct 2018 13:48:55 +0200 Subject: [PATCH] parser/pageparser: Add front matter etc. support See #5324 --- parser/pageparser/item.go | 18 +- parser/pageparser/pagelexer.go | 248 ++++++++++++++++-- parser/pageparser/pageparser_intro_test.go | 103 ++++++++ ...r_test.go => pageparser_shortcode_test.go} | 44 +--- 4 files changed, 345 insertions(+), 68 deletions(-) create mode 100644 parser/pageparser/pageparser_intro_test.go rename parser/pageparser/{pageparser_test.go => pageparser_shortcode_test.go} (92%) diff --git a/parser/pageparser/item.go b/parser/pageparser/item.go index ae2f6cbc9..f7495c90e 100644 --- a/parser/pageparser/item.go +++ b/parser/pageparser/item.go @@ -73,10 +73,10 @@ func (i Item) String() string { return i.Val case i.typ > tKeywordMarker: return fmt.Sprintf("<%s>", i.Val) - case len(i.Val) > 20: - return fmt.Sprintf("%.20q...", i.Val) + case len(i.Val) > 50: + return fmt.Sprintf("%v:%.20q...", i.typ, i.Val) } - return fmt.Sprintf("[%s]", i.Val) + return fmt.Sprintf("%v:[%s]", i.typ, i.Val) } type itemType int @@ -85,6 +85,15 @@ const ( tError itemType = iota tEOF + // page items + tHTMLLead // < + tSummaryDivider // <!--more--> + tSummaryDividerOrg // # more + tFrontMatterYAML + tFrontMatterTOML + tFrontMatterJSON + tFrontMatterORG + // shortcode items tLeftDelimScNoMarkup tRightDelimScNoMarkup @@ -95,8 +104,7 @@ const ( tScParam tScParamVal - //itemIdentifier - tText // plain text, used for everything outside the shortcodes + tText // plain text // preserved for later - keywords come after this tKeywordMarker diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go index 5267c5634..0c97becde 100644 --- a/parser/pageparser/pagelexer.go +++ b/parser/pageparser/pagelexer.go @@ -44,13 +44,15 @@ type lexerShortcodeState struct { } type pageLexer struct { - name string - input string - state stateFunc - pos pos // 
input position - start pos // item start position - width pos // width of last element - lastPos pos // position of the last item returned by nextItem + input string + stateStart stateFunc + state stateFunc + pos pos // input position + start pos // item start position + width pos // width of last element + lastPos pos // position of the last item returned by nextItem + + contentSections int lexerShortcodeState @@ -63,18 +65,18 @@ func Parse(s string) *Tokens { } func ParseFrom(s string, from int) *Tokens { - lexer := newPageLexer("default", s, pos(from)) + lexer := newPageLexer(s, pos(from), lexMainSection) // TODO(bep) 2errors lexer.run() return &Tokens{lexer: lexer} } // note: the input position here is normally 0 (start), but // can be set if position of first shortcode is known -func newPageLexer(name, input string, inputPosition pos) *pageLexer { +func newPageLexer(input string, inputPosition pos, stateStart stateFunc) *pageLexer { lexer := &pageLexer{ - name: name, - input: input, - pos: inputPosition, + input: input, + pos: inputPosition, + stateStart: stateStart, lexerShortcodeState: lexerShortcodeState{ currLeftDelimItem: tLeftDelimScNoMarkup, currRightDelimItem: tRightDelimScNoMarkup, @@ -88,14 +90,13 @@ func newPageLexer(name, input string, inputPosition pos) *pageLexer { // main loop func (l *pageLexer) run() *pageLexer { - for l.state = lexTextOutsideShortcodes; l.state != nil; { + for l.state = l.stateStart; l.state != nil; { l.state = l.state(l) } return l } -// state functions - +// Shortcode syntax const ( leftDelimScNoMarkup = "{{<" rightDelimScNoMarkup = ">}}" @@ -105,6 +106,12 @@ const ( rightComment = "*/" ) +// Page syntax +const ( + summaryDivider = "<!--more-->" + summaryDividerOrg = "# more" +) + func (l *pageLexer) next() rune { if int(l.pos) >= len(l.input) { l.width = 0 @@ -178,11 +185,21 @@ func (l *pageLexer) nextItem() Item { return item } -// scans until an opening shortcode opening bracket. 
-// if no shortcodes, it will keep on scanning until EOF -func lexTextOutsideShortcodes(l *pageLexer) stateFunc { +func (l *pageLexer) consumeCRLF() bool { + var consumed bool + for _, r := range crLf { + if l.next() != r { + l.backup() + } else { + consumed = true + } + } + return consumed +} + +func lexMainSection(l *pageLexer) stateFunc { for { - if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { + if l.isShortCodeStart() { if l.pos > l.start { l.emit(tText) } @@ -194,12 +211,79 @@ func lexTextOutsideShortcodes(l *pageLexer) stateFunc { l.currRightDelimItem = tRightDelimScNoMarkup } return lexShortcodeLeftDelim - } - if l.next() == eof { + + if l.contentSections <= 1 { + if strings.HasPrefix(l.input[l.pos:], summaryDivider) { + if l.pos > l.start { + l.emit(tText) + } + l.contentSections++ + l.pos += pos(len(summaryDivider)) + l.emit(tSummaryDivider) + } else if strings.HasPrefix(l.input[l.pos:], summaryDividerOrg) { + if l.pos > l.start { + l.emit(tText) + } + l.contentSections++ + l.pos += pos(len(summaryDividerOrg)) + l.emit(tSummaryDividerOrg) + } + } + + r := l.next() + if r == eof { break } + } + + return lexDone + +} + +func (l *pageLexer) isShortCodeStart() bool { + return strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) +} + +func lexIntroSection(l *pageLexer) stateFunc { +LOOP: + for { + r := l.next() + if r == eof { + break + } + + switch { + case r == '+': + return l.lexFrontMatterSection(tFrontMatterTOML, r, "TOML", "+++") + case r == '-': + return l.lexFrontMatterSection(tFrontMatterYAML, r, "YAML", "---") + case r == '{': + return lexFrontMatterJSON + case r == '#': + return lexFrontMatterOrgMode + case !isSpace(r) && !isEndOfLine(r): + if r == '<' { + l.emit(tHTMLLead) + // Not need to look further. Hugo treats this as plain HTML, + // no front matter, no shortcodes, no nothing. 
+ l.pos = pos(len(l.input)) + l.emit(tText) + break LOOP + } + return l.errorf("failed to detect front matter type; got unknown identifier %q", r) + } + } + + l.contentSections = 1 + + // Now move on to the shortcodes. + return lexMainSection +} + +func lexDone(l *pageLexer) stateFunc { + // Done! if l.pos > l.start { l.emit(tText) @@ -208,6 +292,122 @@ func lexTextOutsideShortcodes(l *pageLexer) stateFunc { return nil } +func lexFrontMatterJSON(l *pageLexer) stateFunc { + // Include the left delimiter + l.backup() + + var ( + inQuote bool + level int + ) + + for { + + r := l.next() + + switch { + case r == eof: + return l.errorf("unexpected EOF parsing JSON front matter") + case r == '{': + if !inQuote { + level++ + } + case r == '}': + if !inQuote { + level-- + } + case r == '"': + inQuote = !inQuote + case r == '\\': + // This may be an escaped quote. Make sure it's not marked as a + // real one. + l.next() + } + + if level == 0 { + break + } + } + + l.consumeCRLF() + l.emit(tFrontMatterJSON) + + return lexMainSection +} + +func lexFrontMatterOrgMode(l *pageLexer) stateFunc { + /* + #+TITLE: Test File For chaseadamsio/goorgeous + #+AUTHOR: Chase Adams + #+DESCRIPTION: Just another golang parser for org content! + */ + + const prefix = "#+" + + l.backup() + + if !strings.HasPrefix(l.input[l.pos:], prefix) { + // TODO(bep) consider error + return lexMainSection + } + + // Read lines until we no longer see a #+ prefix +LOOP: + for { + + r := l.next() + + switch { + case r == '\n': + if !strings.HasPrefix(l.input[l.pos:], prefix) { + break LOOP + } + case r == eof: + break LOOP + + } + } + + l.emit(tFrontMatterORG) + + return lexMainSection + +} + +// Handle YAML or TOML front matter. 
+func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name, delim string) stateFunc { + for i := 0; i < 2; i++ { + if r := l.next(); r != delimr { + return l.errorf("invalid %s delimiter", name) + } + } + + if !l.consumeCRLF() { + return l.errorf("invalid %s delimiter", name) + } + + // We don't care about the delimiters. + l.ignore() + + for { + r := l.next() + if r == eof { + return l.errorf("EOF looking for end %s front matter delimiter", name) + } + if isEndOfLine(r) { + if strings.HasPrefix(l.input[l.pos:], delim) { + l.emit(tp) + l.pos += 3 + l.consumeCRLF() + l.ignore() + break + } + } + } + + return lexMainSection +} + func lexShortcodeLeftDelim(l *pageLexer) stateFunc { l.pos += pos(len(l.currentLeftShortcodeDelim())) if strings.HasPrefix(l.input[l.pos:], leftComment) { @@ -234,14 +434,14 @@ func lexShortcodeComment(l *pageLexer) stateFunc { l.ignore() l.pos += pos(len(l.currentRightShortcodeDelim())) l.emit(tText) - return lexTextOutsideShortcodes + return lexMainSection } func lexShortcodeRightDelim(l *pageLexer) stateFunc { l.closingState = 0 l.pos += pos(len(l.currentRightShortcodeDelim())) l.emit(l.currentRightShortcodeDelimItem()) - return lexTextOutsideShortcodes + return lexMainSection } // either: @@ -485,6 +685,8 @@ func isAlphaNumericOrHyphen(r rune) bool { return isAlphaNumeric(r) || r == '-' } +var crLf = []rune{'\r', '\n'} + func isEndOfLine(r rune) bool { return r == '\r' || r == '\n' } diff --git a/parser/pageparser/pageparser_intro_test.go b/parser/pageparser/pageparser_intro_test.go new file mode 100644 index 000000000..3dc08c776 --- /dev/null +++ b/parser/pageparser/pageparser_intro_test.go @@ -0,0 +1,103 @@ +// Copyright 2018 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pageparser + +import ( + "fmt" + "strings" + "testing" +) + +type lexerTest struct { + name string + input string + items []Item +} + +var ( + tstJSON = `{ "a": { "b": "\"Hugo\"}" } }` + tstHTMLLead = Item{tHTMLLead, 0, " <"} + tstFrontMatterTOML = Item{tFrontMatterTOML, 0, "foo = \"bar\"\n"} + tstFrontMatterYAML = Item{tFrontMatterYAML, 0, "foo: \"bar\"\n"} + tstFrontMatterYAMLCRLF = Item{tFrontMatterYAML, 0, "foo: \"bar\"\r\n"} + tstFrontMatterJSON = Item{tFrontMatterJSON, 0, tstJSON + "\r\n"} + tstSomeText = Item{tText, 0, "\nSome text.\n"} + tstSummaryDivider = Item{tSummaryDivider, 0, "<!--more-->"} + tstSummaryDividerOrg = Item{tSummaryDividerOrg, 0, "# more"} + + tstORG = ` +#+TITLE: T1 +#+AUTHOR: A1 +#+DESCRIPTION: D1 +` + tstFrontMatterORG = Item{tFrontMatterORG, 0, tstORG} +) + +var crLfReplacer = strings.NewReplacer("\r", "#", "\n", "$") + +// TODO(bep) a way to toggle ORG mode vs the rest. 
+var frontMatterTests = []lexerTest{ + {"empty", "", []Item{tstEOF}}, + {"HTML Document", ` <html> `, []Item{tstHTMLLead, Item{tText, 0, "html> "}, tstEOF}}, + {"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []Item{tstFrontMatterYAML, tstSomeText, tstEOF}}, + // Note that we keep all bytes as they are, but we need to handle CRLF + {"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []Item{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}}, + {"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstEOF}}, + {"JSON front matter", tstJSON + "\r\n\nSome text.\n", []Item{tstFrontMatterJSON, tstSomeText, tstEOF}}, + {"ORG front matter", tstORG + "\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, tstEOF}}, + {"Summary divider ORG", tstORG + "\nSome text.\n# more\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, tstSummaryDividerOrg, tstSomeText, tstEOF}}, + {"Summary divider", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstSummaryDivider, tstSomeText, tstEOF}}, +} + +func TestFrontMatter(t *testing.T) { + t.Parallel() + for i, test := range frontMatterTests { + items := collect(test.name, test.input, false, lexIntroSection) + if !equal(items, test.items) { + got := crLfReplacer.Replace(fmt.Sprint(items)) + expected := crLfReplacer.Replace(fmt.Sprint(test.items)) + t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, got, expected) + } + } +} + +func collect(name, input string, skipFrontMatter bool, stateStart stateFunc) (items []Item) { + l := newPageLexer(input, 0, stateStart) + l.run() + + for { + item := l.nextItem() + items = append(items, item) + if item.typ == tEOF || item.typ == tError { + break + } + } + return +} + +// no positional checking, for now ... 
+func equal(i1, i2 []Item) bool { + if len(i1) != len(i2) { + return false + } + for k := range i1 { + if i1[k].typ != i2[k].typ { + return false + } + if i1[k].Val != i2[k].Val { + return false + } + } + return true +} diff --git a/parser/pageparser/pageparser_test.go b/parser/pageparser/pageparser_shortcode_test.go similarity index 92% rename from parser/pageparser/pageparser_test.go rename to parser/pageparser/pageparser_shortcode_test.go index ceb439a65..525c7452f 100644 --- a/parser/pageparser/pageparser_test.go +++ b/parser/pageparser/pageparser_shortcode_test.go @@ -13,15 +13,7 @@ package pageparser -import ( - "testing" -) - -type shortCodeLexerTest struct { - name string - input string - items []Item -} +import "testing" var ( tstEOF = Item{tEOF, 0, ""} @@ -39,7 +31,7 @@ var ( tstVal = Item{tScParamVal, 0, "Hello World"} ) -var shortCodeLexerTests = []shortCodeLexerTest{ +var shortCodeLexerTests = []lexerTest{ {"empty", "", []Item{tstEOF}}, {"spaces", " \t\n", []Item{{tText, 0, " \t\n"}, tstEOF}}, {"text", `to be or not`, []Item{{tText, 0, "to be or not"}, tstEOF}}, @@ -159,7 +151,7 @@ var shortCodeLexerTests = []shortCodeLexerTest{ func TestShortcodeLexer(t *testing.T) { t.Parallel() for i, test := range shortCodeLexerTests { - items := collect(&test) + items := collect(test.name, test.input, true, lexMainSection) if !equal(items, test.items) { t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items) } @@ -170,38 +162,10 @@ func BenchmarkShortcodeLexer(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { for _, test := range shortCodeLexerTests { - items := collect(&test) + items := collect(test.name, test.input, true, lexMainSection) if !equal(items, test.items) { b.Errorf("%s: got\n\t%v\nexpected\n\t%v", test.name, items, test.items) } } } } - -func collect(t *shortCodeLexerTest) (items []Item) { - l := newPageLexer(t.name, t.input, 0).run() - for { - item := l.nextItem() - items = append(items, item) - if item.typ == tEOF || 
item.typ == tError { - break - } - } - return -} - -// no positional checking, for now ... -func equal(i1, i2 []Item) bool { - if len(i1) != len(i2) { - return false - } - for k := range i1 { - if i1[k].typ != i2[k].typ { - return false - } - if i1[k].Val != i2[k].Val { - return false - } - } - return true -}