// Copyright 2016n The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package parser import ( "bufio" "bytes" "fmt" "io" "regexp" "strings" "unicode" "github.com/chaseadamsio/goorgeous" ) const ( // TODO(bep) Do we really have to export these? // HTMLLead identifies the start of HTML documents. HTMLLead = "<" // YAMLLead identifies the start of YAML frontmatter. YAMLLead = "-" // YAMLDelimUnix identifies the end of YAML front matter on Unix. YAMLDelimUnix = "---\n" // YAMLDelimDOS identifies the end of YAML front matter on Windows. YAMLDelimDOS = "---\r\n" // YAMLDelim identifies the YAML front matter delimiter. YAMLDelim = "---" // TOMLLead identifies the start of TOML front matter. TOMLLead = "+" // TOMLDelimUnix identifies the end of TOML front matter on Unix. TOMLDelimUnix = "+++\n" // TOMLDelimDOS identifies the end of TOML front matter on Windows. TOMLDelimDOS = "+++\r\n" // TOMLDelim identifies the TOML front matter delimiter. TOMLDelim = "+++" // JSONLead identifies the start of JSON frontmatter. JSONLead = "{" // HTMLCommentStart identifies the start of HTML comment. HTMLCommentStart = "" // BOM Unicode byte order marker BOM = '\ufeff' ) var ( delims = regexp.MustCompile( "^(" + regexp.QuoteMeta(YAMLDelim) + `\s*\n|` + regexp.QuoteMeta(TOMLDelim) + `\s*\n|` + regexp.QuoteMeta(JSONLead) + ")", ) ) // Page represents a parsed content page. type Page interface { // FrontMatter contains the raw frontmatter with relevant delimiters. FrontMatter() []byte // Content contains the raw page content. Content() []byte // IsRenderable denotes that the page should be rendered. IsRenderable() bool // Metadata returns the unmarshalled frontmatter data. Metadata() (map[string]interface{}, error) } // page implements the Page interface. type page struct { render bool frontmatter []byte content []byte } // Content returns the raw page content. func (p *page) Content() []byte { return p.content } // FrontMatter contains the raw frontmatter with relevant delimiters. func (p *page) FrontMatter() []byte { return p.frontmatter } // IsRenderable denotes that the page should be rendered. func (p *page) IsRenderable() bool { return p.render } // Metadata returns the unmarshalled frontmatter data. func (p *page) Metadata() (meta map[string]interface{}, err error) { frontmatter := p.FrontMatter() if len(frontmatter) != 0 { fm := DetectFrontMatter(rune(frontmatter[0])) if fm != nil { meta, err = fm.Parse(frontmatter) } } return } // ReadFrom reads the content from an io.Reader and constructs a page. func ReadFrom(r io.Reader) (p Page, err error) { reader := bufio.NewReader(r) // chomp BOM and assume UTF-8 if err = chompBOM(reader); err != nil && err != io.EOF { return } if err = chompWhitespace(reader); err != nil && err != io.EOF { return } if err = chompFrontmatterStartComment(reader); err != nil && err != io.EOF { return } firstLine, err := peekLine(reader) if err != nil && err != io.EOF { return } newp := new(page) newp.render = shouldRender(firstLine) if newp.render && isFrontMatterDelim(firstLine) { left, right := determineDelims(firstLine) fm, err := extractFrontMatterDelims(reader, left, right) if err != nil { return nil, err } newp.frontmatter = fm } else if newp.render && goorgeous.IsKeyword(firstLine) { fm, err := goorgeous.ExtractOrgHeaders(reader) if err != nil { return nil, err } newp.frontmatter = fm } content, err := extractContent(reader) if err != nil { return nil, err } newp.content = content return newp, nil } // chompBOM scans any leading Unicode Byte Order Markers from r. func chompBOM(r io.RuneScanner) (err error) { for { c, _, err := r.ReadRune() if err != nil { return err } if c != BOM { r.UnreadRune() return nil } } } // chompWhitespace scans any leading Unicode whitespace from r. func chompWhitespace(r io.RuneScanner) (err error) { for { c, _, err := r.ReadRune() if err != nil { return err } if !unicode.IsSpace(c) { r.UnreadRune() return nil } } } // chompFrontmatterStartComment checks r for a leading HTML comment. If a // comment is found, it is read from r and then whitespace is trimmed from the // beginning of r. func chompFrontmatterStartComment(r *bufio.Reader) (err error) { candidate, err := r.Peek(32) if err != nil { return err } str := string(candidate) if strings.HasPrefix(str, HTMLCommentStart) { lineEnd := strings.IndexAny(str, "\n") if lineEnd == -1 { //TODO: if we can't find it, Peek more? return nil } testStr := strings.TrimSuffix(str[0:lineEnd], "\r") if strings.Contains(testStr, HTMLCommentEnd) { return nil } buf := make([]byte, lineEnd) if _, err = r.Read(buf); err != nil { return } if err = chompWhitespace(r); err != nil { return err } } return nil } // chompFrontmatterEndComment checks r for a trailing HTML comment. func chompFrontmatterEndComment(r *bufio.Reader) (err error) { candidate, err := r.Peek(32) if err != nil { return err } str := string(candidate) lineEnd := strings.IndexAny(str, "\n") if lineEnd == -1 { return nil } testStr := strings.TrimSuffix(str[0:lineEnd], "\r") if strings.Contains(testStr, HTMLCommentStart) { return nil } //TODO: if we can't find it, Peek more? if strings.HasSuffix(testStr, HTMLCommentEnd) { buf := make([]byte, lineEnd) if _, err = r.Read(buf); err != nil { return } if err = chompWhitespace(r); err != nil { return err } } return nil } func peekLine(r *bufio.Reader) (line []byte, err error) { firstFive, err := r.Peek(5) if err != nil { return } idx := bytes.IndexByte(firstFive, '\n') if idx == -1 { return firstFive, nil } idx++ // include newline. return firstFive[:idx], nil } func shouldRender(lead []byte) (frontmatter bool) { if len(lead) <= 0 { return } if bytes.Equal(lead[:1], []byte(HTMLLead)) { return } return true } func isFrontMatterDelim(data []byte) bool { return delims.Match(data) } func determineDelims(firstLine []byte) (left, right []byte) { switch firstLine[0] { case YAMLLead[0]: return []byte(YAMLDelim), []byte(YAMLDelim) case TOMLLead[0]: return []byte(TOMLDelim), []byte(TOMLDelim) case JSONLead[0]: return []byte(JSONLead), []byte("}") default: panic(fmt.Sprintf("Unable to determine delims from %q", firstLine)) } } // extractFrontMatterDelims takes a frontmatter from the content bufio.Reader. // Beginning white spaces of the bufio.Reader must be trimmed before call this // function. func extractFrontMatterDelims(r *bufio.Reader, left, right []byte) (fm []byte, err error) { var ( c byte buf bytes.Buffer level int sameDelim = bytes.Equal(left, right) inQuote bool escapeState int ) // Frontmatter must start with a delimiter. To check it first, // pre-reads beginning delimiter length - 1 bytes from Reader for i := 0; i < len(left)-1; i++ { if c, err = r.ReadByte(); err != nil { return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err) } if err = buf.WriteByte(c); err != nil { return nil, err } } // Reads a character from Reader one by one and checks it matches the // last character of one of delimiters to find the last character of // frontmatter. If it matches, makes sure it contains the delimiter // and if so, also checks it is followed by CR+LF or LF when YAML, // TOML case. In JSON case, nested delimiters must be parsed and it // is expected that the delimiter only contains one character. for { if c, err = r.ReadByte(); err != nil { return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err) } if err = buf.WriteByte(c); err != nil { return nil, err } switch c { case '"': if escapeState != 1 { inQuote = !inQuote } case '\\': escapeState++ case left[len(left)-1]: if sameDelim { // YAML, TOML case if bytes.HasSuffix(buf.Bytes(), left) && (buf.Len() == len(left) || buf.Bytes()[buf.Len()-len(left)-1] == '\n') { nextByte: c, err = r.ReadByte() if err != nil { // It is ok that the end delimiter ends with EOF if err != io.EOF || level != 1 { return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err) } } else { switch c { case '\n': // ok case ' ': // Consume this byte and try to match again goto nextByte case '\r': if err = buf.WriteByte(c); err != nil { return nil, err } if c, err = r.ReadByte(); err != nil { return nil, fmt.Errorf("unable to read frontmatter at filepos %d: %s", buf.Len(), err) } if c != '\n' { return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len()) } default: return nil, fmt.Errorf("frontmatter delimiter must be followed by CR+LF or LF but those can't be found at filepos %d", buf.Len()) } if err = buf.WriteByte(c); err != nil { return nil, err } } if level == 0 { level = 1 } else { level = 0 } } } else { // JSON case if !inQuote { level++ } } case right[len(right)-1]: // JSON case only reaches here if !inQuote { level-- } } if level == 0 { // Consumes white spaces immediately behind frontmatter if err = chompWhitespace(r); err != nil && err != io.EOF { return nil, err } if err = chompFrontmatterEndComment(r); err != nil && err != io.EOF { return nil, err } return buf.Bytes(), nil } if c != '\\' { escapeState = 0 } } } func extractContent(r io.Reader) (content []byte, err error) { wr := new(bytes.Buffer) if _, err = wr.ReadFrom(r); err != nil { return } return wr.Bytes(), nil }