parse HTML comments

This commit is contained in:
David Greenspan
2013-10-20 16:02:03 -07:00
parent bcbf3c1e56
commit dfb9c6d55d
5 changed files with 97 additions and 4 deletions

View File

@@ -2341,7 +2341,7 @@ var isLegalCodepoint = function (cp) {
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Matches a character reference if possible, including the initial `&`.
// Fails fatally in error cases, like a disallowed codepoint
// Fails fatally in error cases (assuming an initial `&` is matched), like a disallowed codepoint
// number or a bad named character reference.
//
// `inAttribute` is truthy if we are in an attribute value.

View File

@@ -2,6 +2,7 @@ HTML = {
_$: {
// stuff exposed for testing
Scanner: Scanner,
getCharacterReference: getCharacterReference
getCharacterReference: getCharacterReference,
getComment: getComment
}
};

View File

@@ -6,12 +6,12 @@ Package.describe({
// Handler passed to Package.on_use: exports the `HTML` symbol and registers
// the package's source files through `api`.
Package.on_use(function (api) {
api.export('HTML');
// NOTE(review): the two add_files calls below look like the pre-change and
// post-change versions of one line from the diff hunk (the second adds
// 'tokenize.js' ahead of 'exports.js'); confirm only the second is meant
// to remain.
api.add_files(['scanner.js', 'charref.js', 'exports.js']);
api.add_files(['scanner.js', 'charref.js', 'tokenize.js', 'exports.js']);
});
// Handler passed to Package.on_test: declares the packages the tests use
// ('tinytest', 'html', 'underscore') and registers the test files.
Package.on_test(function (api) {
api.use('tinytest');
api.use('html');
api.use('underscore');
// NOTE(review): the two add_files calls below look like the pre-change and
// post-change versions of one line from the diff hunk (the second adds
// 'tokenize_tests.js'); confirm only the second is meant to remain.
api.add_files('charref_tests.js');
api.add_files(['charref_tests.js', 'tokenize_tests.js']);
});

32
packages/html/tokenize.js Normal file
View File

@@ -0,0 +1,32 @@
// Try to read an HTML comment at the scanner's current position.
//
// Uses three members of `scanner`: `rest()` (the unread input as a string),
// `pos` (advanced as input is consumed), and `fatal(message)` (called for
// every malformed comment; presumably throws -- verify against Scanner).
//
// Returns `null`, leaving `scanner.pos` untouched, when the input does not
// begin with `<!--`. Otherwise returns `{ t: 'Comment', v: <text> }` with
// `scanner.pos` moved just past the closing `-->`.
//
// Note: `scanner.pos` has already moved past the `<!--` opener by the time
// any of the `fatal` calls below is made.
//
// Assigned without `var` on purpose, so the name is not local to this file
// (exports.js references it).
getComment = function (scanner) {
  var OPENER = '<!--';
  var CLOSER = '-->';

  if (scanner.rest().substring(0, OPENER.length) !== OPENER) {
    return null;
  }
  scanner.pos += OPENER.length;

  // A well-formed comment simply runs up to the first `--`, so locating the
  // end is trivial; everything else here is about rejecting bad input.
  var remaining = scanner.rest();

  // Matches a leading `>` or a leading `->`.
  if (/^-?>/.test(remaining)) {
    scanner.fatal("HTML comment can't start with > or ->");
  }

  var endIndex = remaining.indexOf(CLOSER);
  if (endIndex < 0) {
    scanner.fatal("Unclosed HTML comment");
  }

  var text = remaining.slice(0, endIndex);

  // A trailing `-` means the closer was really `--->`.
  if (text.charAt(text.length - 1) === '-') {
    scanner.fatal("HTML comment must end at first `--`");
  }
  if (text.indexOf('--') !== -1) {
    scanner.fatal("HTML comment cannot contain `--` anywhere");
  }
  if (text.indexOf('\u0000') !== -1) {
    scanner.fatal("HTML comment cannot contain NULL");
  }

  scanner.pos += endIndex + CLOSER.length;

  var token = { t: 'Comment', v: text };
  return token;
};

View File

@@ -0,0 +1,60 @@
// Tests for `getComment`, reached through the testing hooks on `HTML._$`.
var Scanner = HTML._$.Scanner;
var getComment = HTML._$.getComment;

Tinytest.add("html - comments", function (test) {
  // Combined length of the `<!--` opener (4) and the `-->` closer (3).
  var DELIMITERS_LENGTH = 7;

  // `input` must begin with a well-formed comment whose text is `content`.
  // The scanner must stop right after the closing `-->`, even when more
  // input follows.
  var succeed = function (input, content) {
    var scanner = new Scanner(input);
    var result = getComment(scanner);
    test.isTrue(result);
    test.equal(scanner.pos, content.length + DELIMITERS_LENGTH);
    test.equal(result, {
      t: 'Comment',
      v: content
    });
  };

  // `input` does not start a comment: falsy result, scanner not advanced.
  var ignore = function (input) {
    var scanner = new Scanner(input);
    var result = getComment(scanner);
    test.isFalse(result);
    test.equal(scanner.pos, 0);
  };

  // `input` is a malformed comment: getComment must throw an error whose
  // message contains `messageContains`.
  var fatal = function (input, messageContains) {
    var scanner = new Scanner(input);
    var error;
    try {
      getComment(scanner);
    } catch (e) {
      error = e;
    }
    test.isTrue(error);
    if (error)
      test.isTrue(messageContains && error.message.indexOf(messageContains) >= 0, error.message);
  };

  ignore("<!DOCTYPE>");
  ignore("<!-a");
  ignore("<--");
  ignore("<!");
  ignore("abc");
  ignore("<a");

  fatal('<!--', 'Unclosed');
  fatal('<!---', 'Unclosed');
  fatal('<!----', 'Unclosed');
  fatal('<!-- -', 'Unclosed');
  fatal('<!-- --', 'Unclosed');
  fatal('<!-- -- abcd', 'Unclosed');
  fatal('<!-- ->', 'Unclosed');
  // A comment may not open with `>` or `->`.
  fatal('<!-->', "can't start");
  fatal('<!--->', "can't start");
  fatal('<!-->abc-->', "can't start");
  fatal('<!-- a--b -->', 'cannot contain');
  fatal('<!--x--->', 'must end at first');
  fatal('<!-- a\u0000b -->', 'cannot contain');
  fatal('<!--\u0000 x-->', 'cannot contain');

  succeed('<!---->', '');
  succeed('<!---x-->', '-x');
  succeed('<!--x-->', 'x');
  succeed('<!-- hello - - world -->', ' hello - - world ');
  // Scanning stops at the first `-->`; trailing input is left unread.
  succeed('<!--x-->abc', 'x');
  succeed('<!-- a -->b-->', ' a ');
});