// Copyright 2020 The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package publisher import ( "bytes" "regexp" "sort" "strings" "sync" "golang.org/x/net/html" "github.com/gohugoio/hugo/helpers" ) // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` Classes []string `json:"classes"` IDs []string `json:"ids"` } func (h *HTMLElements) Merge(other HTMLElements) { h.Tags = append(h.Tags, other.Tags...) h.Classes = append(h.Classes, other.Classes...) h.IDs = append(h.IDs, other.IDs...) h.Tags = helpers.UniqueStringsReuse(h.Tags) h.Classes = helpers.UniqueStringsReuse(h.Classes) h.IDs = helpers.UniqueStringsReuse(h.IDs) } func (h *HTMLElements) Sort() { sort.Strings(h.Tags) sort.Strings(h.Classes) sort.Strings(h.IDs) } type htmlElementsCollector struct { // Contains the raw HTML string. We will get the same element // several times, and want to avoid costly reparsing when this // is used for aggregated data only. elementSet map[string]bool elements []htmlElement mu sync.RWMutex } func newHTMLElementsCollector() *htmlElementsCollector { return &htmlElementsCollector{ elementSet: make(map[string]bool), } } func (c *htmlElementsCollector) getHTMLElements() HTMLElements { var ( classes []string ids []string tags []string ) for _, el := range c.elements { classes = append(classes, el.Classes...) ids = append(ids, el.IDs...) tags = append(tags, el.Tag) } classes = helpers.UniqueStringsSorted(classes) ids = helpers.UniqueStringsSorted(ids) tags = helpers.UniqueStringsSorted(tags) els := HTMLElements{ Classes: classes, IDs: ids, Tags: tags, } return els } type htmlElementsCollectorWriter struct { collector *htmlElementsCollector buff bytes.Buffer isCollecting bool inPreTag string inQuote bool quoteValue byte } func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { return &htmlElementsCollectorWriter{ collector: collector, } } // Write splits the incoming stream into single html element. func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) i := 0 for i < len(p) { // If we are not collecting, cycle through byte stream until start bracket "<" is found. if !w.isCollecting { for ; i < len(p); i++ { b := p[i] if b == '<' { w.startCollecting() break } } } if w.isCollecting { // If we are collecting, cycle through byte stream until end bracket ">" is found, // disregard any ">" if within a quote, // write bytes until found to buffer. for ; i < len(p); i++ { b := p[i] w.toggleIfQuote(b) w.buff.WriteByte(b) if !w.inQuote && b == '>' { w.endCollecting() break } } } // If no end bracket ">" is found while collecting, but the stream ended // this could mean we received chunks of a stream from e.g. the minify functionality // next if loop will be skipped. // At this point we have collected an element line between angle brackets "<" and ">". if !w.isCollecting { if w.buff.Len() == 0 { continue } if w.inPreTag != "" { // within preformatted code block s := w.buff.String() w.buff.Reset() if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName { w.inPreTag = "" } continue } // First check if we have processed this element before. w.collector.mu.RLock() // Work with the bytes slice as long as it's practical, // to save memory allocations. b := w.buff.Bytes() // See https://github.com/dominikh/go-tools/issues/723 //lint:ignore S1030 This construct avoids memory allocation for the string. seen := w.collector.elementSet[string(b)] w.collector.mu.RUnlock() if seen { w.buff.Reset() continue } // Filter out unwanted tags // if within preformatted code blocks
,