/**
 * Rewrite a single opening tag so that its attributes are in a uniform,
 * comparable form: library expando attributes dropped, values double-quoted,
 * attributes sorted alphabetically and separated by single spaces.
 *
 * Intended to be used as the replacer callback of
 * `String.prototype.replace(/<(\w+)\s+(.*?)\s*>/g, ...)`.
 *
 * NOTE(review): the per-attribute loop in the original file was lost to
 * extraction damage; the loop below is a reconstruction based on the
 * surrounding comments and should be compared against the upstream source.
 *
 * @param {string} m - The full matched tag text (unused).
 * @param {string} tagName - The tag name captured by the regex.
 * @param {string} attrs - The raw attribute text captured by the regex.
 * @returns {string} The rebuilt opening tag.
 */
function canonicalizeTag(m, tagName, attrs) {
  // Drop expando property used by Sizzle (part of jQuery) which leaks into
  // attributes in IE8. Note that its value always contains spaces.
  attrs = attrs.replace(/sizcache[0-9]+="[^"]*"/g, ' ');
  // Similarly for expando properties used by jQuery to track data.
  attrs = attrs.replace(/jQuery[0-9]+="[0-9]+"/g, ' ');
  // Similarly for expando properties used by DOMBackend to keep
  // track of callbacks to fire when an element is removed.
  attrs = attrs.replace(/\$blaze_teardown_callbacks="[^"]*"/g, ' ');
  // And by DOMRange to keep track of the element's DOMRange.
  attrs = attrs.replace(/\$blaze_range="[^"]*"/g, ' ');

  // Normalize spacing: none around `=`, none at the ends, single spaces
  // everywhere else.
  attrs = attrs.replace(/\s*=\s*/g, '=');
  attrs = attrs.replace(/^\s+/g, '');
  attrs = attrs.replace(/\s+$/g, '');
  attrs = attrs.replace(/\s+/g, ' ');

  // Quote unquoted attribute values, as in `type=checkbox`. This
  // will do the wrong thing if there's an `=` in an attribute value.
  attrs = attrs.replace(/(\w)=([^'" >/]+)/g, '$1="$2"');

  // For the purpose of splitting attributes in a string like 'a="b"
  // c="d"', assume they are separated by a single space and values
  // are double- or single-quoted, but allow for spaces inside the
  // quotes. Split on the space following a closing quote by first turning
  // that space into a NUL separator.
  var attrList = attrs.replace(/(\w)='([^']*)' /g, "$1='$2'\u0000");
  attrList = attrList.replace(/(\w)="([^"]*)" /g, '$1="$2"\u0000');
  attrList = attrList.split("\u0000");

  // Put attributes in alphabetical order.
  attrList.sort();

  var tagContents = [tagName];
  for (var i = 0; i < attrList.length; i++) {
    var entry = attrList[i];
    // When every attribute was an expando, trimming leaves a single empty
    // string; emitting it would produce a bogus `=""` attribute.
    if (entry === '') {
      continue;
    }

    // Split on the FIRST `=` only, so a quoted value that itself contains
    // `=` is kept intact. An attribute with no `=` gets an empty value.
    var eq = entry.indexOf('=');
    var key = eq === -1 ? entry : entry.slice(0, eq);
    var value = eq === -1 ? '' : entry.slice(eq + 1);

    // Drop another expando attribute used by Sizzle.
    if (key === 'sizset') {
      continue;
    }

    // Strip one layer of matching quotes, then re-wrap in double quotes with
    // any embedded double quote encoded so the output stays well-formed.
    var first = value.charAt(0);
    var isQuoted = value.length >= 2 &&
      (first === '"' || first === "'") &&
      value.charAt(value.length - 1) === first;
    if (isQuoted) {
      value = value.slice(1, -1);
    }
    value = value.replace(/"/g, '&quot;');

    tagContents.push(key + '="' + value + '"');
  }

  return '<' + tagContents.join(' ') + '>';
}

/**
 * Normalize an HTML string so that two serializations of the same markup
 * compare equal as plain text.
 *
 * Steps, in order: remove/neutralize comments, lowercase tag names, collapse
 * whitespace runs to one space, trim, remove whitespace adjacent to tags,
 * make attributes uniform (see `canonicalizeTag`), and finally remove any
 * remaining comments.
 *
 * @param {string} html - The HTML text to canonicalize.
 * @returns {string} The canonical form of `html`.
 */
function canonicalizeHtml(html) {
  var h = html;
  // Kill IE-specific comments inserted by DomRange.
  h = h.replace(/<!--IE-->/g, '');
  h = h.replace(/<!---->/g, '');
  // Ignore exact text of comments.
  h = h.replace(/<!--.*?-->/g, '<!---->');
  // Make all tag names lowercase.
  h = h.replace(/<\/?(\w+)/g, function (m) {
    return m.toLowerCase();
  });
  // Replace whitespace sequences with single spaces.
  h = h.replace(/\s+/g, ' ');
  // Trim leading and trailing whitespace.
  h = h.replace(/^\s+|\s+$/g, '');
  // Remove whitespace before and after tags.
  h = h.replace(/\s*(<\/?\w.*?>)\s*/gm, function (m, tag) {
    return tag;
  });
  // Make tag attributes uniform.
  h = h.replace(/<(\w+)\s+(.*?)\s*>/g, canonicalizeTag);
  // Finally drop whatever comments are left (including the `<!---->`
  // placeholders introduced above).
  h = h.replace(/<!--[^>]*-->/g, "");
  return h;
}

/**
 * Canonicalize `html` and break it into roughly one tag per line so the
 * output is convenient to diff: a newline is inserted before every opening
 * tag and after every closing tag.
 *
 * @param {string} html - The HTML text to format.
 * @returns {string} The canonicalized, line-broken HTML.
 */
function formatForDiff(html) {
  var out = canonicalizeHtml(html)
    .replace(/(<\w)/g, "\n$1")
    .replace(/(<\/\w*>)/g, "$1\n");
  // NOTE(review): the token in front of `\n<\/p>` was lost from the original
  // file; `<p>` is assumed here. As reconstructed, this pattern looks like it
  // can never match the output of the two replaces above — confirm against
  // the upstream source.
  return out.replace(/<p>\n<\/p>/gm, "");
}

/**
 * Command-line entry point: read the file named by the first argument and
 * print its canonicalized, line-broken HTML to stdout.
 *
 * @param {string[]} argv - Process arguments (`process.argv` layout).
 */
function main(argv) {
  var inputPath = argv[2];
  if (inputPath === undefined) {
    // Without this check the failure surfaces as an opaque TypeError from
    // fs.readFileSync(undefined).
    console.error("usage: node <script> <file.html>");
    process.exitCode = 1;
    return;
  }
  var fs = require("fs");
  var o = fs.readFileSync(inputPath, {encoding: "utf8"});
  console.log(formatForDiff(o));
}

// Run the CLI only when this file is executed directly, so that loading it
// from another module (or an environment without CommonJS) has no side
// effects.
if (typeof require !== "undefined" && typeof module !== "undefined" &&
    require.main === module) {
  main(process.argv);
}