mirror of
https://github.com/meteor/meteor.git
synced 2026-05-02 03:01:46 -04:00
533 lines
16 KiB
JavaScript
533 lines
16 KiB
JavaScript
// Token types:
|
|
//
|
|
// { t: 'Doctype',
|
|
// v: String (entire Doctype declaration from the source),
|
|
// name: String,
|
|
// systemId: String (optional),
|
|
// publicId: String (optional)
|
|
// }
|
|
//
|
|
// { t: 'Comment',
|
|
// v: String (not including "<!--" and "-->")
|
|
// }
|
|
//
|
|
// { t: 'Chars',
|
|
// v: String (pure text like you might pass to document.createTextNode,
|
|
// no character references)
|
|
// }
|
|
//
|
|
// { t: 'Tag',
|
|
// isEnd: Boolean (optional),
|
|
// isSelfClosing: Boolean (optional),
|
|
// n: String (tag name, in lowercase or camel case),
|
|
// attrs: { String: [zero or more 'Chars' or 'CharRef' objects] }
|
|
// (only for start tags; required)
|
|
// }
|
|
//
|
|
// { t: 'CharRef',
|
|
// v: String (entire character reference from the source, e.g. "&amp;"),
|
|
// cp: [Integer] (array of Unicode code point numbers it expands to)
|
|
// }
|
|
//
|
|
// We keep around both the original form of the character reference and its
|
|
// expansion so that subsequent processing steps have the option to
|
|
// re-emit it (if they are generating HTML) or interpret it. Named and
|
|
// numerical code points may be more than 16 bits, in which case they
|
|
// need to be passed through codePointToString to make a JavaScript string.
|
|
// Most named entities and all numeric character references are one codepoint
|
|
// (e.g. "&amp;" is [38]), but a few are two codepoints.
|
|
//
|
|
// { t: 'Special',
|
|
// v: { ... anything ... }
|
|
// }
|
|
|
|
// The HTML tokenization spec says to preprocess the input stream to replace
|
|
// CR(LF)? with LF. However, preprocessing `scanner` would complicate things
|
|
// by making indexes not match the input (e.g. for error messages), so we just
|
|
// keep in mind as we go along that an LF might be represented by CRLF or CR.
|
|
// In most cases, it doesn't actually matter what combination of whitespace
|
|
// characters are present (e.g. inside tags).
|
|
// Matches when the FIRST character of the tested string is a form feed, LF,
// CR, tab or space. It is anchored and unflagged, so it is safe to reuse with
// `.test()` on single characters (an empty string never matches).
var HTML_SPACE = /^[\f\n\r\t ]/;
|
|
|
|
// Normalize line endings in `str`: every CRLF pair and every lone CR is
// replaced by a single LF. Existing LFs are left alone.
//
// str: String
// Returns a new String; the argument is not modified.
var convertCRLF = function (str) {
  return str.replace(/\r\n?/g, '\n');
};
|
|
|
|
// Parse an HTML comment at the scanner's current position.
//
// Returns `null` (consuming nothing) unless the input starts with `<!--`.
// Otherwise returns `{ t: 'Comment', v: String }`, where `v` is the text
// between `<!--` and `-->` with CR/CRLF converted to LF, and leaves
// `scanner.pos` just past the closing `-->`.
//
// Malformed comments are reported through `scanner.fatal`.
getComment = HTMLTools.Parse.getComment = function (scanner) {
  if (scanner.rest().slice(0, 4) !== '<!--') {
    return null;
  }
  scanner.pos += 4;

  // Valid comments are easy to parse; they end at the first `--`!
  // Our main job is throwing errors.

  var rest = scanner.rest();
  if (rest.charAt(0) === '>' || rest.slice(0, 2) === '->') {
    scanner.fatal("HTML comment can't start with > or ->");
  }

  var closePos = rest.indexOf('-->');
  if (closePos < 0) {
    scanner.fatal("Unclosed HTML comment");
  }

  var commentContents = rest.slice(0, closePos);
  // A trailing `-` means the source had `--->`, i.e. the first `--` was not
  // immediately followed by `>`.
  if (commentContents.slice(-1) === '-') {
    scanner.fatal("HTML comment must end at first `--`");
  }
  if (commentContents.indexOf("--") >= 0) {
    scanner.fatal("HTML comment cannot contain `--` anywhere");
  }
  if (commentContents.indexOf('\u0000') >= 0) {
    scanner.fatal("HTML comment cannot contain NULL");
  }

  // Skip the contents plus the three characters of `-->`.
  scanner.pos += closePos + 3;

  return { t: 'Comment',
           v: convertCRLF(commentContents) };
};
|
|
|
|
// Advance `scanner.pos` past any run of HTML whitespace (as defined by
// HTML_SPACE) starting at the current position. Does nothing if the next
// character is not whitespace.
var skipSpaces = function (scanner) {
  while (HTML_SPACE.test(scanner.peek())) {
    scanner.pos++;
  }
};
|
|
|
|
// Like `skipSpaces`, but at least one whitespace character must be present:
// if the next character is not HTML whitespace, "Expected space" is reported
// through `scanner.fatal` before any skipping happens.
var requireSpaces = function (scanner) {
  if (! HTML_SPACE.test(scanner.peek())) {
    scanner.fatal("Expected space");
  }
  skipSpaces(scanner);
};
|
|
|
|
// Parse a quoted identifier inside a DOCTYPE (the public or system id).
//
// The scanner must be looking at `"` or `'`. Returns the text between that
// quote and the next matching quote, and leaves `scanner.pos` just past the
// closing quote.
//
// Reported through `scanner.fatal`:
//  - no opening quote,
//  - an empty string (`""` or `''`),
//  - EOF, NULL or `>` before the closing quote.
var getDoctypeQuotedString = function (scanner) {
  var quote = scanner.peek();
  if (! (quote === '"' || quote === "'")) {
    scanner.fatal("Expected single or double quote in DOCTYPE");
  }
  scanner.pos++;

  if (scanner.peek() === quote) {
    // prevent a falsy return value (empty string)
    scanner.fatal("Malformed DOCTYPE");
  }

  var str = '';
  for (var ch = scanner.peek(); ch !== quote; ch = scanner.peek()) {
    // `! ch` is how end-of-input shows up here.
    if ((! ch) || (ch === '\u0000') || (ch === '>')) {
      scanner.fatal("Malformed DOCTYPE");
    }
    str += ch;
    scanner.pos++;
  }

  // step over the closing quote
  scanner.pos++;

  return str;
};
|
|
|
|
// See http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#the-doctype.
//
// If `getDoctype` sees "<!DOCTYPE" (case-insensitive), it will match or fail fatally.
//
// Returns `null` (consuming nothing) when the input does not start with
// `<!doctype`. Otherwise returns
//   { t: 'Doctype', v: <entire declaration>, name: <lowercased name> }
// plus `systemId` and/or `publicId` when the declaration supplies them, and
// leaves `scanner.pos` just past the closing `>`.
getDoctype = HTMLTools.Parse.getDoctype = function (scanner) {
  if (HTMLTools.asciiLowerCase(scanner.rest().slice(0, 9)) !== '<!doctype') {
    return null;
  }
  var start = scanner.pos;
  scanner.pos += 9;

  requireSpaces(scanner);

  // First character of the name: anything except EOF, `>` and NULL.
  var ch = scanner.peek();
  if ((! ch) || (ch === '>') || (ch === '\u0000')) {
    scanner.fatal('Malformed DOCTYPE');
  }
  var name = ch;
  scanner.pos++;

  // Rest of the name runs up to the next space or `>`.
  for (ch = scanner.peek();
       ! (HTML_SPACE.test(ch) || ch === '>');
       ch = scanner.peek()) {
    if ((! ch) || (ch === '\u0000')) {
      scanner.fatal('Malformed DOCTYPE');
    }
    name += ch;
    scanner.pos++;
  }
  name = HTMLTools.asciiLowerCase(name);

  // Now we're looking at a space or a `>`.
  skipSpaces(scanner);

  var systemId = null;
  var publicId = null;

  if (scanner.peek() !== '>') {
    // Now we're essentially in the "After DOCTYPE name state" of the tokenizer,
    // but we're not looking at space or `>`.

    // this should be "public" or "system".
    var publicOrSystem = HTMLTools.asciiLowerCase(scanner.rest().slice(0, 6));

    if (publicOrSystem === 'system') {
      scanner.pos += 6;
      requireSpaces(scanner);
      systemId = getDoctypeQuotedString(scanner);
      skipSpaces(scanner);
      if (scanner.peek() !== '>') {
        scanner.fatal("Malformed DOCTYPE");
      }
    } else if (publicOrSystem === 'public') {
      scanner.pos += 6;
      requireSpaces(scanner);
      publicId = getDoctypeQuotedString(scanner);
      if (scanner.peek() !== '>') {
        // Something follows the public id: it must be separated by spaces,
        // and if it isn't just `>` it has to be a quoted system id.
        requireSpaces(scanner);
        if (scanner.peek() !== '>') {
          systemId = getDoctypeQuotedString(scanner);
          skipSpaces(scanner);
          if (scanner.peek() !== '>') {
            scanner.fatal("Malformed DOCTYPE");
          }
        }
      }
    } else {
      scanner.fatal("Expected PUBLIC or SYSTEM in DOCTYPE");
    }
  }

  // looking at `>`
  scanner.pos++;
  var result = { t: 'Doctype',
                 v: scanner.input.slice(start, scanner.pos),
                 name: name };

  if (systemId) {
    result.systemId = systemId;
  }
  if (publicId) {
    result.publicId = publicId;
  }

  return result;
};
|
|
|
|
// The special character `{` is only allowed as the first character
|
|
// of a Chars, so that we have a chance to detect template tags.
|
|
// Pattern: one character that is not `&`, `<` or NULL, followed by any number
// of characters that are additionally not `{`. `getHTMLToken` treats a truthy
// result as the matched text of a 'Chars' token.
var getChars = makeRegexMatcher(/^[^&<\u0000][^&<\u0000{]*/);
|
|
|
|
// Returns the next HTML token, or `null` if we reach EOF.
//
// Note that if we have a `getSpecialTag` function that sometimes
// consumes characters and emits nothing (e.g. in the case of template
// comments), we may go from not-at-EOF to at-EOF and return `null`,
// while otherwise we always find some token to return.
//
// scanner:  the scanner to read from; may carry an optional
//           `getSpecialTag(scanner, position)` hook.
// dataMode: falsy for normal content, or 'rcdata' / 'rawtext'. In any truthy
//           mode `<` is emitted as plain Chars instead of starting a tag;
//           in 'rawtext' character references are not interpreted either.
getHTMLToken = HTMLTools.Parse.getHTMLToken = function (scanner, dataMode) {
  var result = null;
  if (scanner.getSpecialTag) {
    // Try to parse a "special tag" by calling out to the provided
    // `getSpecialTag` function. If the function returns `null` but
    // consumes characters, it must have parsed a comment, so we return null
    // and allow the lexer to continue. If it ever returns `null` without
    // consuming anything, that means it didn't see anything interesting
    // so we look for a normal token. If it returns a truthy value,
    // the value must be an object. We wrap it in a Special token.
    var lastPos = scanner.pos;
    result = scanner.getSpecialTag(
      scanner,
      (dataMode === 'rcdata' ? TEMPLATE_TAG_POSITION.IN_RCDATA :
       (dataMode === 'rawtext' ? TEMPLATE_TAG_POSITION.IN_RAWTEXT :
        TEMPLATE_TAG_POSITION.ELEMENT)));

    if (result) {
      return { t: 'Special', v: result };
    } else if (scanner.pos > lastPos) {
      return null;
    }
  }

  var chars = getChars(scanner);
  if (chars) {
    return { t: 'Chars',
             v: convertCRLF(chars) };
  }

  var ch = scanner.peek();
  if (! ch) {
    return null; // EOF
  }

  if (ch === '\u0000') {
    scanner.fatal("Illegal NULL character");
  }

  if (ch === '&') {
    if (dataMode !== 'rawtext') {
      var charRef = getCharacterReference(scanner);
      if (charRef) {
        return charRef;
      }
    }

    // Not a character reference (or rawtext): a literal ampersand.
    scanner.pos++;
    return { t: 'Chars',
             v: '&' };
  }

  // If we're here, we're looking at `<`.

  if (ch === '<' && dataMode) {
    // don't interpret tags
    scanner.pos++;
    return { t: 'Chars',
             v: '<' };
  }

  // `getTag` will claim anything starting with `<` not followed by `!`.
  // `getComment` takes `<!--` and getDoctype takes `<!doctype`.
  result = (getTagToken(scanner) || getComment(scanner) || getDoctype(scanner));

  if (result) {
    return result;
  }

  scanner.fatal("Unexpected `<!` directive.");
};
|
|
|
|
// Matchers used while tokenizing a tag. Each is built by `makeRegexMatcher`
// from an anchored regex and is called with the scanner; callers in this file
// treat a truthy result as "matched" (and, for names, as the matched text).

// A tag name: an ASCII letter followed by anything up to whitespace, `/`,
// `>` or `{`.
var getTagName = makeRegexMatcher(/^[a-zA-Z][^\f\n\r\t />{]*/);
// The closing angle bracket of a tag.
var getClangle = makeRegexMatcher(/^>/);
// The `/` of a self-closing tag.
var getSlash = makeRegexMatcher(/^\//);
// An attribute name. Note the first character may not be `=`, `/` or `>`,
// and no character may be whitespace, a quote, `<` or NULL.
var getAttributeName = makeRegexMatcher(/^[^>/\u0000"'<=\f\n\r\t ][^\f\n\r\t /=>"'<\u0000]*/);
|
|
|
|
// Try to parse `>` or `/>`, mutating `tag` to be self-closing in the latter
// case (and failing fatally if `/` isn't followed by `>`).
// Return tag if successful.
//
// Returns `null`, consuming nothing, when the scanner is looking at neither
// `>` nor `/`.
var handleEndOfTag = function (scanner, tag) {
  if (getClangle(scanner)) {
    return tag;
  }

  if (getSlash(scanner)) {
    if (! getClangle(scanner)) {
      scanner.fatal("Expected `>` after `/`");
    }
    tag.isSelfClosing = true;
    return tag;
  }

  return null;
};
|
|
|
|
// Parse a quoted attribute value.
//
// scanner: positioned at the opening quote.
// quote:   the quote character to expect, `"` or `'`.
//
// Returns `null` (consuming nothing) if the next character is not `quote`.
// Otherwise returns an array of tokens -- 'Chars', 'CharRef' and, when the
// scanner has a `getSpecialTag` hook, 'Special' -- and leaves `scanner.pos`
// just past the closing quote. Consecutive plain characters are merged into
// one 'Chars' token; CR and CRLF inside the value become LF.
//
// EOF before the closing quote and NULL characters are reported through
// `scanner.fatal`.
var getQuotedAttributeValue = function (scanner, quote) {
  if (scanner.peek() !== quote) {
    return null;
  }
  scanner.pos++;

  var tokens = [];
  // The 'Chars' token currently being appended to, or null if the next plain
  // character has to start a new one.
  var charsTokenToExtend = null;

  var charRef;
  while (true) {
    var ch = scanner.peek();
    var special;
    var curPos = scanner.pos;
    if (ch === quote) {
      scanner.pos++;
      return tokens;
    } else if (! ch) {
      scanner.fatal("Unclosed quoted attribute in tag");
    } else if (ch === '\u0000') {
      scanner.fatal("Unexpected NULL character in attribute value");
    } else if (ch === '&' && (charRef = getCharacterReference(scanner, true, quote))) {
      tokens.push(charRef);
      charsTokenToExtend = null;
    } else if (scanner.getSpecialTag &&
               ((special = scanner.getSpecialTag(scanner,
                                                 TEMPLATE_TAG_POSITION.IN_ATTRIBUTE)) ||
                scanner.pos > curPos /* `{{! comment}}` */)) {
      // note: this code is messy because it turns out to be annoying for getSpecialTag
      // to return `null` when it scans a comment. Also, this code should be de-duped
      // with getUnquotedAttributeValue
      if (special) {
        tokens.push({t: 'Special', v: special});
        charsTokenToExtend = null;
      }
    } else {
      if (! charsTokenToExtend) {
        charsTokenToExtend = { t: 'Chars', v: '' };
        tokens.push(charsTokenToExtend);
      }
      // Normalize CR and CRLF to LF as we go.
      charsTokenToExtend.v += (ch === '\r' ? '\n' : ch);
      scanner.pos++;
      if (ch === '\r' && scanner.peek() === '\n') {
        scanner.pos++;
      }
    }
  }
};
|
|
|
|
// Parse an unquoted attribute value starting at the scanner's position.
//
// Returns an array of tokens -- 'Chars', 'CharRef' and, when the scanner has
// a `getSpecialTag` hook, 'Special'. Stops at (without consuming) the first
// HTML whitespace character or `>`, so the array is empty if the scanner is
// already looking at one of those.
//
// EOF, and any of NULL, `"`, `'`, `<`, `=` or a backtick inside the value,
// are reported through `scanner.fatal`.
var getUnquotedAttributeValue = function (scanner) {
  var tokens = [];
  // The 'Chars' token currently being appended to, or null if the next plain
  // character has to start a new one.
  var charsTokenToExtend = null;

  var charRef;
  while (true) {
    var ch = scanner.peek();
    var special;
    var curPos = scanner.pos;
    if (HTML_SPACE.test(ch) || ch === '>') {
      return tokens;
    } else if (! ch) {
      scanner.fatal("Unclosed attribute in tag");
    } else if ('\u0000"\'<=`'.indexOf(ch) >= 0) {
      scanner.fatal("Unexpected character in attribute value");
    } else if (ch === '&' && (charRef = getCharacterReference(scanner, true, '>'))) {
      tokens.push(charRef);
      charsTokenToExtend = null;
    } else if (scanner.getSpecialTag &&
               ((special = scanner.getSpecialTag(scanner,
                                                 TEMPLATE_TAG_POSITION.IN_ATTRIBUTE)) ||
                scanner.pos > curPos /* `{{! comment}}` */)) {
      // A null result that still consumed input was a template comment:
      // nothing to emit.
      if (special) {
        tokens.push({t: 'Special', v: special});
        charsTokenToExtend = null;
      }
    } else {
      if (! charsTokenToExtend) {
        charsTokenToExtend = { t: 'Chars', v: '' };
        tokens.push(charsTokenToExtend);
      }
      charsTokenToExtend.v += ch;
      scanner.pos++;
    }
  }
};
|
|
|
|
// Parse a start or end tag at the scanner's current position.
//
// Returns `null` (consuming nothing) unless the scanner is looking at `<`
// followed by something other than `!`. Otherwise returns a 'Tag' token:
//   { t: 'Tag', n: <proper-cased name>, isEnd?, isSelfClosing?, attrs? }
// `attrs` is only created once the tag turns out to have something between
// its name and its end. It maps each proper-cased attribute name to an array
// of value tokens (empty when the attribute has no `=value`); template tags
// found directly in the start tag are collected under `attrs.$specials`.
//
// Syntax errors are reported through `scanner.fatal`.
getTagToken = HTMLTools.Parse.getTagToken = function (scanner) {
  if (! (scanner.peek() === '<' && scanner.rest().charAt(1) !== '!')) {
    return null;
  }
  scanner.pos++;

  var tag = { t: 'Tag' };

  // now looking at the character after `<`, which is not a `!`
  if (scanner.peek() === '/') {
    tag.isEnd = true;
    scanner.pos++;
  }

  var tagName = getTagName(scanner);
  if (! tagName) {
    scanner.fatal("Expected tag name after `<`");
  }
  tag.n = HTMLTools.properCaseTagName(tagName);

  if (scanner.peek() === '/' && tag.isEnd) {
    scanner.fatal("End tag can't have trailing slash");
  }
  if (handleEndOfTag(scanner, tag)) {
    return tag;
  }

  if (scanner.isEOF()) {
    scanner.fatal("Unclosed `<`");
  }

  if (! HTML_SPACE.test(scanner.peek())) {
    // e.g. `<a{{b}}>`
    scanner.fatal("Expected space after tag name");
  }

  // we're now in "Before attribute name state" of the tokenizer
  skipSpaces(scanner);

  if (scanner.peek() === '/' && tag.isEnd) {
    scanner.fatal("End tag can't have trailing slash");
  }
  if (handleEndOfTag(scanner, tag)) {
    return tag;
  }

  if (tag.isEnd) {
    scanner.fatal("End tag can't have attributes");
  }

  tag.attrs = {};

  while (true) {
    // Note: at the top of this loop, we've already skipped any spaces.

    // This will be set to true if after parsing the attribute, we should
    // require spaces (or else an end of tag, i.e. `>` or `/>`).
    var spacesRequiredAfter = false;

    // first, try for a special tag.
    var curPos = scanner.pos;
    var special = (scanner.getSpecialTag &&
                   scanner.getSpecialTag(scanner,
                                         TEMPLATE_TAG_POSITION.IN_START_TAG));
    if (special || (scanner.pos > curPos)) {
      if (special) {
        tag.attrs.$specials = (tag.attrs.$specials || []);
        tag.attrs.$specials.push({ t: 'Special', v: special });
      } // else, must have scanned a `{{! comment}}`

      spacesRequiredAfter = true;
    } else {

      var attributeName = getAttributeName(scanner);
      if (! attributeName) {
        scanner.fatal("Expected attribute name in tag");
      }
      // Throw error on `{` in attribute name. This provides *some* error message
      // if someone writes `<a x{{y}}>` or `<a x{{y}}=z>`. The HTML tokenization
      // spec doesn't say that `{` is invalid, but the DOM API (setAttribute) won't
      // allow it, so who cares.
      if (attributeName.indexOf('{') >= 0) {
        scanner.fatal("Unexpected `{` in attribute name.");
      }
      attributeName = HTMLTools.properCaseAttributeName(attributeName);

      // Go through Object.prototype rather than `tag.attrs.hasOwnProperty`:
      // `tag.attrs` is keyed by names taken from the input, so a method looked
      // up on it could be shadowed by an attribute of the same name.
      if (Object.prototype.hasOwnProperty.call(tag.attrs, attributeName)) {
        scanner.fatal("Duplicate attribute in tag: " + attributeName);
      }

      tag.attrs[attributeName] = [];

      skipSpaces(scanner);

      if (handleEndOfTag(scanner, tag)) {
        return tag;
      }

      var ch = scanner.peek();
      if (! ch) {
        scanner.fatal("Unclosed <");
      }
      if ('\u0000"\'<'.indexOf(ch) >= 0) {
        scanner.fatal("Unexpected character after attribute name in tag");
      }

      if (ch === '=') {
        scanner.pos++;

        skipSpaces(scanner);

        ch = scanner.peek();
        if (! ch) {
          scanner.fatal("Unclosed <");
        }
        if ('\u0000><=`'.indexOf(ch) >= 0) {
          scanner.fatal("Unexpected character after = in tag");
        }

        if ((ch === '"') || (ch === "'")) {
          tag.attrs[attributeName] = getQuotedAttributeValue(scanner, ch);
        } else {
          tag.attrs[attributeName] = getUnquotedAttributeValue(scanner);
        }

        spacesRequiredAfter = true;
      }
    }
    // now we are in the "post-attribute" position, whether it was a special attribute
    // (like `{{x}}`) or a normal one (like `x` or `x=y`).

    if (handleEndOfTag(scanner, tag)) {
      return tag;
    }

    if (scanner.isEOF()) {
      scanner.fatal("Unclosed `<`");
    }

    if (spacesRequiredAfter) {
      requireSpaces(scanner);
    } else {
      skipSpaces(scanner);
    }

    if (handleEndOfTag(scanner, tag)) {
      return tag;
    }
  }
};
|
|
|
|
// Values passed as the second argument to `scanner.getSpecialTag`, telling it
// where in the document the tokenizer currently is:
//   ELEMENT      - normal content (getHTMLToken with no data mode)
//   IN_START_TAG - between the attributes of a start tag (getTagToken)
//   IN_ATTRIBUTE - inside a quoted or unquoted attribute value
//   IN_RCDATA    - getHTMLToken with dataMode 'rcdata'
//   IN_RAWTEXT   - getHTMLToken with dataMode 'rawtext'
TEMPLATE_TAG_POSITION = HTMLTools.TEMPLATE_TAG_POSITION = {
  ELEMENT: 1,
  IN_START_TAG: 2,
  IN_ATTRIBUTE: 3,
  IN_RCDATA: 4,
  IN_RAWTEXT: 5
};
|
|
|
|
// Report whether the scanner is positioned at a complete end tag for
// `tagName`, i.e. `</name`, optional HTML whitespace, then `>`. Only looks
// ahead; `scanner.pos` is not changed.
//
// tagName must be proper case
//
// Note that the name is matched as ASCII letters only, so a `tagName`
// containing any other character can never match.
isLookingAtEndTag = function (scanner, tagName) {
  var rest = scanner.rest();
  var pos = 0; // into rest
  var firstPart = /^<\/([a-zA-Z]+)/.exec(rest);
  if (firstPart &&
      HTMLTools.properCaseTagName(firstPart[1]) === tagName) {
    // we've seen `</foo`, now see if the end tag continues
    pos += firstPart[0].length;
    while (pos < rest.length && HTML_SPACE.test(rest.charAt(pos))) {
      pos++;
    }
    if (pos < rest.length && rest.charAt(pos) === '>') {
      return true;
    }
  }
  return false;
};
|