// Token types:
//
// { t: 'Doctype',
//   v: String (entire Doctype declaration from the source),
//   name: String,
//   systemId: String (optional),
//   publicId: String (optional)
// }
//
// { t: 'Comment',
//   v: String (not including "<!--" and "-->")
// }
//
// { t: 'Chars',
//   v: String (pure text like you might pass to document.createTextNode,
//              no character references)
// }
//
// { t: 'Tag',
//   isEnd: Boolean (optional),
//   isSelfClosing: Boolean (optional),
//   n: String (tag name, in lowercase or camel case),
//   attrs: dictionary of { String: [tokens] }
//          OR [{ String: [tokens] }, TemplateTag tokens...]
//     (only for start tags; required)
// }
//
// { t: 'CharRef',
//   v: String (entire character reference from the source, e.g. "&amp;"),
//   cp: [Integer] (array of Unicode code point numbers it expands to)
// }
//
// We keep around both the original form of the character reference and its
// expansion so that subsequent processing steps have the option to
// re-emit it (if they are generating HTML) or interpret it.  Named and
// numerical code points may be more than 16 bits, in which case they
// need to be passed through codePointToString to make a JavaScript string.
// Most named entities and all numeric character references are one codepoint
// (e.g. "&amp;" is [38]), but a few are two codepoints.
//
// { t: 'TemplateTag',
//   v: HTMLTools.TemplateTag
// }

// The HTML tokenization spec says to preprocess the input stream to replace
// CR(LF)? with LF.  However, preprocessing `scanner` would complicate things
// by making indexes not match the input (e.g. for error messages), so we just
// keep in mind as we go along that an LF might be represented by CRLF or CR.
// In most cases, it doesn't actually matter what combination of whitespace
// characters are present (e.g. inside tags).
// Matches one HTML whitespace character (form feed, LF, CR, tab or space)
// at the start of a string.
var HTML_SPACE = /^[\f\n\r\t ]/;

// Returns `str` with every CRLF pair or lone CR replaced by a single LF.
var convertCRLF = function (str) {
  return str.replace(/\r\n?/g, '\n');
};

// Parses an HTML comment at the scanner's current position.
//
// Returns `null` (consuming nothing) if the input does not start with
// "<!--".  Otherwise returns `{ t: 'Comment', v: String }`, where `v` is the
// text between "<!--" and "-->" with newlines normalized to LF, and advances
// `scanner.pos` past the closing "-->".  Malformed comments are reported
// through `scanner.fatal`.
getComment = HTMLTools.Parse.getComment = function (scanner) {
  // NOTE(review): the opening check, the "can't start with" check and the
  // `rest` / `closePos` declarations were restored from context (they are
  // required by the code below); confirm the exact error wording.
  if (scanner.rest().slice(0, 4) !== '<!--')
    return null;
  scanner.pos += 4;

  // Valid comments are easy to parse: they end at the first "-->".
  // Most of the work here is rejecting invalid ones.
  var rest = scanner.rest();
  if (rest.charAt(0) === '>' || rest.slice(0, 2) === '->')
    scanner.fatal("HTML comment can't start with > or ->");

  var closePos = rest.indexOf('-->');
  if (closePos < 0)
    scanner.fatal("Unclosed HTML comment");

  var commentContents = rest.slice(0, closePos);
  if (commentContents.slice(-1) === '-')
    scanner.fatal("HTML comment must end at first `--`");
  if (commentContents.indexOf("--") >= 0)
    scanner.fatal("HTML comment cannot contain `--` anywhere");
  if (commentContents.indexOf('\u0000') >= 0)
    scanner.fatal("HTML comment cannot contain NULL");

  // Skip the contents plus the three characters of "-->".
  scanner.pos += closePos + 3;

  return { t: 'Comment',
           v: convertCRLF(commentContents) };
};

// Advances the scanner past any run of HTML whitespace (possibly empty).
var skipSpaces = function (scanner) {
  while (HTML_SPACE.test(scanner.peek()))
    scanner.pos++;
};

// Like `skipSpaces`, but reports a fatal error unless at least one
// whitespace character is present.
var requireSpaces = function (scanner) {
  if (! HTML_SPACE.test(scanner.peek()))
    scanner.fatal("Expected space");
  skipSpaces(scanner);
};

// Parses a single- or double-quoted string inside a DOCTYPE and returns its
// contents (without the quotes), leaving the scanner just past the closing
// quote.  Empty strings, EOF, NULL and `>` inside the string are all
// reported as fatal errors.
var getDoctypeQuotedString = function (scanner) {
  var quote = scanner.peek();
  if (! (quote === '"' || quote === "'"))
    scanner.fatal("Expected single or double quote in DOCTYPE");
  scanner.pos++;

  if (scanner.peek() === quote)
    // prevent a falsy return value (empty string)
    scanner.fatal("Malformed DOCTYPE");

  var str = '';
  var ch;
  while ((ch = scanner.peek()), ch !== quote) {
    if ((! ch) || (ch === '\u0000') || (ch === '>'))
      scanner.fatal("Malformed DOCTYPE");
    str += ch;
    scanner.pos++;
  }

  // Consume the closing quote.
  scanner.pos++;

  return str;
};

// See http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#the-doctype.
//
// If `getDoctype` sees "<!DOCTYPE" (case-insensitive), it will match or fail
// fatally; otherwise it returns `null` without consuming anything.
//
// On success returns `{ t: 'Doctype', v: String, name: String }` plus
// optional `systemId` / `publicId` strings, where `v` is the entire
// declaration as it appears in the input and `name` is ASCII-lowercased.
getDoctype = HTMLTools.Parse.getDoctype = function (scanner) {
  // NOTE(review): this prologue (the "<!doctype" check and the `start` /
  // `ch` declarations) was restored from context -- `start` and `ch` are
  // required by the code below; confirm against the canonical source.
  if (HTMLTools.asciiLowerCase(scanner.rest().slice(0, 9)) !== '<!doctype')
    return null;
  var start = scanner.pos;
  scanner.pos += 9;

  requireSpaces(scanner);

  // The name must contain at least one character.
  var ch = scanner.peek();
  if ((! ch) || (ch === '>') || (ch === '\u0000'))
    scanner.fatal('Malformed DOCTYPE');
  var name = ch;
  scanner.pos++;

  // The rest of the name runs up to the next space or `>`.
  while ((ch = scanner.peek()), ! (HTML_SPACE.test(ch) || ch === '>')) {
    if ((! ch) || (ch === '\u0000'))
      scanner.fatal('Malformed DOCTYPE');
    name += ch;
    scanner.pos++;
  }
  name = HTMLTools.asciiLowerCase(name);

  // Now we're looking at a space or a `>`.
  skipSpaces(scanner);

  var systemId = null;
  var publicId = null;

  if (scanner.peek() !== '>') {
    // Now we're essentially in the "After DOCTYPE name state" of the
    // tokenizer, but we're not looking at space or `>`.

    // this should be "public" or "system".
    var publicOrSystem = HTMLTools.asciiLowerCase(scanner.rest().slice(0, 6));

    if (publicOrSystem === 'system') {
      scanner.pos += 6;
      requireSpaces(scanner);
      systemId = getDoctypeQuotedString(scanner);
      skipSpaces(scanner);
      if (scanner.peek() !== '>')
        scanner.fatal("Malformed DOCTYPE");
    } else if (publicOrSystem === 'public') {
      scanner.pos += 6;
      requireSpaces(scanner);
      publicId = getDoctypeQuotedString(scanner);
      // An optional system identifier may follow the public identifier,
      // separated from it by at least one space.
      if (scanner.peek() !== '>') {
        requireSpaces(scanner);
        if (scanner.peek() !== '>') {
          systemId = getDoctypeQuotedString(scanner);
          skipSpaces(scanner);
          if (scanner.peek() !== '>')
            scanner.fatal("Malformed DOCTYPE");
        }
      }
    } else {
      scanner.fatal("Expected PUBLIC or SYSTEM in DOCTYPE");
    }
  }

  // looking at `>`
  scanner.pos++;
  var result = { t: 'Doctype',
                 v: scanner.input.slice(start, scanner.pos),
                 name: name };

  if (systemId)
    result.systemId = systemId;
  if (publicId)
    result.publicId = publicId;

  return result;
};

// The special character `{` is only allowed as the first character
// of a Chars, so that we have a chance to detect template tags.
var getChars = makeRegexMatcher(/^[^&<\u0000][^&<\u0000{]*/);

// Returns `x` unchanged if it is an instance of HTMLTools.TemplateTag;
// otherwise throws an Error.
var assertIsTemplateTag = function (x) {
  if (! (x instanceof HTMLTools.TemplateTag))
    throw new Error("Expected an instance of HTMLTools.TemplateTag");
  return x;
};

// Returns the next HTML token, or `null` if we reach EOF.
//
// Note that if we have a `getTemplateTag` function that sometimes
// consumes characters and emits nothing (e.g. in the case of template
// comments), we may go from not-at-EOF to at-EOF and return `null`,
// while otherwise we always find some token to return.
getHTMLToken = HTMLTools.Parse.getHTMLToken = function (scanner, dataMode) { var result = null; if (scanner.getTemplateTag) { // Try to parse a template tag by calling out to the provided // `getTemplateTag` function. If the function returns `null` but // consumes characters, it must have parsed a comment or something, // so we loop and try it again. If it ever returns `null` without // consuming anything, that means it didn't see anything interesting // so we look for a normal token. If it returns a truthy value, // the value must be instanceof HTMLTools.TemplateTag. We wrap it // in a Special token. var lastPos = scanner.pos; result = scanner.getTemplateTag( scanner, (dataMode === 'rcdata' ? TEMPLATE_TAG_POSITION.IN_RCDATA : (dataMode === 'rawtext' ? TEMPLATE_TAG_POSITION.IN_RAWTEXT : TEMPLATE_TAG_POSITION.ELEMENT))); if (result) return { t: 'TemplateTag', v: assertIsTemplateTag(result) }; else if (scanner.pos > lastPos) return null; } var chars = getChars(scanner); if (chars) return { t: 'Chars', v: convertCRLF(chars) }; var ch = scanner.peek(); if (! ch) return null; // EOF if (ch === '\u0000') scanner.fatal("Illegal NULL character"); if (ch === '&') { if (dataMode !== 'rawtext') { var charRef = getCharacterReference(scanner); if (charRef) return charRef; } scanner.pos++; return { t: 'Chars', v: '&' }; } // If we're here, we're looking at `<`. if (scanner.peek() === '<' && dataMode) { // don't interpret tags scanner.pos++; return { t: 'Chars', v: '<' }; } // `getTag` will claim anything starting with `<` not followed by `!`. // `getComment` takes `