// Package html minifies HTML5 following the specifications at http://www.w3.org/TR/html5/syntax.html.
package html
import (
"bytes"
"io"
"github.com/tdewolff/minify/v2"
"github.com/tdewolff/parse/v2"
"github.com/tdewolff/parse/v2/buffer"
"github.com/tdewolff/parse/v2/html"
)
// Shared byte-slice constants used by the minifier. They are declared once at
// package level so that writing or comparing them does not allocate on every
// token.
var (
	gtBytes    = []byte(">")
	isBytes    = []byte("=")
	spaceBytes = []byte(" ")
	// doctypeBytes is written for every doctype token. It must hold the short
	// HTML5 doctype; an empty value would silently drop the doctype from the
	// output and push browsers into quirks mode.
	doctypeBytes    = []byte("<!doctype html>")
	jsMimeBytes     = []byte("application/javascript")
	cssMimeBytes    = []byte("text/css")
	htmlMimeBytes   = []byte("text/html")
	svgMimeBytes    = []byte("image/svg+xml")
	formMimeBytes   = []byte("application/x-www-form-urlencoded")
	mathMimeBytes   = []byte("application/mathml+xml")
	dataSchemeBytes = []byte("data:")
	jsSchemeBytes   = []byte("javascript:")
	httpBytes       = []byte("http")
	radioBytes      = []byte("radio")
	onBytes         = []byte("on")
	textBytes       = []byte("text")
	noneBytes       = []byte("none")
	submitBytes     = []byte("submit")
	allBytes        = []byte("all")
	rectBytes       = []byte("rect")
	dataBytes       = []byte("data")
	getBytes        = []byte("get")
	autoBytes       = []byte("auto")
	oneBytes        = []byte("one")
	inlineParams    = map[string]string{"inline": "1"}
)
////////////////////////////////////////////////////////////////
// Minifier is an HTML minifier. Its zero value is ready to use and applies
// every minification; each Keep* option switches one of them off.
type Minifier struct {
// KeepComments writes comment tokens to the output unchanged instead of
// dropping them.
KeepComments bool
// KeepConditionalComments preserves comments whose text starts with "[if "
// or ends with "[endif]" / "[endif]--" (IE conditional comments). It is only
// consulted when KeepComments is false.
KeepConditionalComments bool
// KeepDefaultAttrVals: use is not visible in this part of the file;
// presumably keeps attributes whose value equals the default — TODO confirm.
KeepDefaultAttrVals bool
// KeepDocumentTags: use is not visible in this part of the file;
// presumably keeps the html, head and body tags — TODO confirm.
KeepDocumentTags bool
// KeepEndTags: use is not visible in this part of the file; presumably
// keeps end tags that could otherwise be omitted — TODO confirm.
KeepEndTags bool
// KeepQuotes: use is not visible in this part of the file; presumably keeps
// quotes around attribute values — TODO confirm.
KeepQuotes bool
// KeepWhitespace stops the trailing-whitespace lookahead at the next start
// or end tag, so a trailing space in text is not removed in front of a
// block tag.
KeepWhitespace bool
}
// Minify minifies HTML data with a zero-value Minifier, i.e. with every Keep*
// option disabled. It reads the input from r and writes the result to w; m and
// params are handed to (*Minifier).Minify as they are.
func Minify(m *minify.M, w io.Writer, r io.Reader, params map[string]string) error {
	defaults := new(Minifier)
	return defaults.Minify(m, w, r, params)
}
// Minify minifies HTML data, it reads from r and writes to w.
func (o *Minifier) Minify(m *minify.M, w io.Writer, r io.Reader, _ map[string]string) error {
var rawTagHash Hash
var rawTagMediatype []byte
omitSpace := true // if true the next leading space is omitted
inPre := false
attrMinifyBuffer := buffer.NewWriter(make([]byte, 0, 64))
attrByteBuffer := make([]byte, 0, 64)
z := parse.NewInput(r)
defer z.Restore()
l := html.NewLexer(z)
tb := NewTokenBuffer(z, l)
for {
t := *tb.Shift()
switch t.TokenType {
case html.ErrorToken:
if _, err := w.Write(nil); err != nil {
return err
}
if l.Err() == io.EOF {
return nil
}
return l.Err()
case html.DoctypeToken:
w.Write(doctypeBytes)
case html.CommentToken:
if o.KeepComments {
w.Write(t.Data)
} else if o.KeepConditionalComments && 6 < len(t.Text) && (bytes.HasPrefix(t.Text, []byte("[if ")) || bytes.HasSuffix(t.Text, []byte("[endif]")) || bytes.HasSuffix(t.Text, []byte("[endif]--"))) {
// [if ...] is always 7 or more characters, [endif] is only encountered for downlevel-revealed
// see https://msdn.microsoft.com/en-us/library/ms537512(v=vs.85).aspx#syntax
if bytes.HasPrefix(t.Data, []byte("")) { // downlevel-hidden
begin := bytes.IndexByte(t.Data, '>') + 1
end := len(t.Data) - len("")
w.Write(t.Data[:begin])
if err := o.Minify(m, w, buffer.NewReader(t.Data[begin:end]), nil); err != nil {
return minify.UpdateErrorPosition(err, z, t.Offset)
}
w.Write(t.Data[end:])
} else {
w.Write(t.Data) // downlevel-revealed or short downlevel-hidden
}
} else if 1 < len(t.Text) && t.Text[0] == '#' {
// SSI tags
w.Write(t.Data)
}
case html.SvgToken:
if err := m.MinifyMimetype(svgMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
if err != minify.ErrNotExist {
return minify.UpdateErrorPosition(err, z, t.Offset)
}
w.Write(t.Data)
}
case html.MathToken:
if err := m.MinifyMimetype(mathMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
if err != minify.ErrNotExist {
return minify.UpdateErrorPosition(err, z, t.Offset)
}
w.Write(t.Data)
}
case html.TextToken:
// CSS and JS minifiers for inline code
if rawTagHash != 0 {
if rawTagHash == Style || rawTagHash == Script || rawTagHash == Iframe {
var mimetype []byte
var params map[string]string
if rawTagHash == Iframe {
mimetype = htmlMimeBytes
} else if 0 < len(rawTagMediatype) {
mimetype, params = parse.Mediatype(rawTagMediatype)
} else if rawTagHash == Script {
mimetype = jsMimeBytes
} else if rawTagHash == Style {
mimetype = cssMimeBytes
}
if err := m.MinifyMimetype(mimetype, w, buffer.NewReader(t.Data), params); err != nil {
if err != minify.ErrNotExist {
return minify.UpdateErrorPosition(err, z, t.Offset)
}
w.Write(t.Data)
}
} else {
w.Write(t.Data)
}
} else if inPre {
w.Write(t.Data)
} else {
t.Data = parse.ReplaceMultipleWhitespaceAndEntities(t.Data, EntitiesMap, TextRevEntitiesMap)
// whitespace removal; trim left
if omitSpace && parse.IsWhitespace(t.Data[0]) {
t.Data = t.Data[1:]
}
// whitespace removal; trim right
omitSpace = false
if len(t.Data) == 0 {
omitSpace = true
} else if parse.IsWhitespace(t.Data[len(t.Data)-1]) {
omitSpace = true
i := 0
for {
next := tb.Peek(i)
// trim if EOF, text token with leading whitespace or block token
if next.TokenType == html.ErrorToken {
t.Data = t.Data[:len(t.Data)-1]
omitSpace = false
break
} else if next.TokenType == html.TextToken && !parse.IsAllWhitespace(next.Data) {
// stop looking when text encountered
break
} else if next.TokenType == html.StartTagToken || next.TokenType == html.EndTagToken {
if o.KeepWhitespace {
break
}
// remove when followed by a block tag
if next.Traits&blockTag != 0 {
t.Data = t.Data[:len(t.Data)-1]
omitSpace = false
break
} else if next.TokenType == html.StartTagToken {
break
}
}
i++
}
}
w.Write(t.Data)
}
case html.StartTagToken, html.EndTagToken:
rawTagHash = 0
hasAttributes := false
if t.TokenType == html.StartTagToken {
if next := tb.Peek(0); next.TokenType == html.AttributeToken {
hasAttributes = true
}
if t.Traits&rawTag != 0 {
// ignore empty script and style tags
if !hasAttributes && (t.Hash == Script || t.Hash == Style) {
if next := tb.Peek(1); next.TokenType == html.EndTagToken {
tb.Shift()
tb.Shift()
break
}
}
rawTagHash = t.Hash
rawTagMediatype = nil
// do not minify content of