jquery.htmlclean.js 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. /*
  2. HTML Clean for jQuery
  3. Anthony Johnston
  4. http://www.antix.co.uk
  5. version 1.3.1
  6. $Revision$
  7. requires jQuery http://jquery.com
  8. Use and distribution http://www.opensource.org/licenses/bsd-license.php
  9. 2010-04-02 allowedTags/removeTags added (white/black list) thanks to David Wartian (Dwartian)
  10. 2010-06-30 replaceStyles added for replacement of bold, italic, super and sub styles on a tag
  11. 2012-04-30 allowedAttributes added, an array of attributes allowed on the elements
  12. 2013-02-25 now will push non-inline elements up the stack if nested in an inline element
  13. 2013-02-25 comment element support added, removed by default, see AllowComments in options
  14. */
  15. (function ($) {
  /**
   * jQuery plugin entry point: cleans the markup held by every matched element.
   *
   * Elements with a truthy `value` have that value cleaned in place; all
   * other elements have their `innerHTML` cleaned in place.
   *
   * @param {Object} [options] passed straight through to $.htmlClean
   * @returns {jQuery} the original set, for chaining
   */
  $.fn.htmlClean = function (options) {
    // iterate and html clean each matched element
    return this.each(function () {
      if (this.value) {
        this.value = $.htmlClean(this.value, options);
      } else {
        this.innerHTML = $.htmlClean(this.innerHTML, options);
      }
    });
  };
  /**
   * Clean the passed html.
   *
   * The markup is tokenised with a regular expression, rebuilt as a tree of
   * Element objects (applying the replace / replaceStyles / nesting rules)
   * and finally rendered back to a trimmed string.
   *
   * @param {string} html markup to clean
   * @param {Object} [options] overrides merged over $.htmlClean.defaults
   * @returns {string} the cleaned markup
   */
  $.htmlClean = function (html, options) {
    options = $.extend({}, $.htmlClean.defaults, options);

    // capture groups: 2 = closing slash, 3 = namespace prefix, 4 = tag name,
    // 5 = raw attributes, 6 = comment body including the trailing "--"
    // ([\s\S] rather than "." so that comments may span several lines)
    var tagsRE = /(<(\/)?(\w+:)?([\w]+)([^>]*)>)|<!--([\s\S]*?--)>/gi;
    var attrsRE = /([\w\-\:]+)=(".*?"|'.*?'|[^\s>]*)/gi;
    var sentinel = "<xxx>";
    var tagMatch;
    var root = new Element();
    var stack = [root];
    var container = root;

    if (options.bodyOnly) {
      // check for body tag; [\s\S] also matches "\r", which (\n|.) did not
      if (tagMatch = /<body[^>]*>([\s\S]*)<\/body>/i.exec(html)) {
        html = tagMatch[1];
      }
    }
    html = html.concat(sentinel); // ensure last element/text is found

    var lastIndex = 0;
    parsing:
    while (tagMatch = tagsRE.exec(html)) {
      var tag = tagMatch[6]
        ? new Tag("--", null, tagMatch[6], options)
        : new Tag(tagMatch[4], tagMatch[2], tagMatch[5], options);

      // add the text found between the previous tag and this one
      var text = html.substring(lastIndex, tagMatch.index);
      if (text.length > 0) {
        var last = container.children.length - 1;
        if (last >= 0 && isText(container.children[last])) {
          // merge text
          container.children[last] = container.children[last].concat(text);
        } else {
          container.children.push(text);
        }
      }
      lastIndex = tagsRE.lastIndex;

      if (tag.isClosing) {
        // find matching container
        if (popToTagName(stack, [tag.name])) {
          // never pop the synthetic root: a stray </root> would otherwise
          // empty the stack and leave no container to add to
          if (stack.length > 1) stack.pop();
          container = stack[stack.length - 1];
        }
      } else {
        // create a new element
        var element = new Element(tag);

        // comments carry their text in rawAttributes; it must not be parsed
        // for attributes or style replacements
        if (!tag.isComment) {
          // add attributes
          var attrMatch;
          while (attrMatch = attrsRE.exec(tag.rawAttributes)) {
            // check style attribute and do replacements
            if (attrMatch[1].toLowerCase() == "style"
              && options.replaceStyles) {
              var renderParent = !tag.isInline;
              for (var i = 0; i < options.replaceStyles.length; i++) {
                if (options.replaceStyles[i][0].test(attrMatch[2])) {
                  if (!renderParent) {
                    tag.render = false;
                    renderParent = true;
                  }
                  container.children.push(element); // assumes not replaced
                  stack.push(element);
                  container = element; // assumes replacement is a container
                  // create new tag and element
                  tag = new Tag(options.replaceStyles[i][1], "", "", options);
                  element = new Element(tag);
                }
              }
            }
            if (tag.allowedAttributes != null
              && (tag.allowedAttributes.length == 0
                || $.inArray(attrMatch[1], tag.allowedAttributes) > -1)) {
              element.attributes.push(new Attribute(attrMatch[1], attrMatch[2]));
            }
          }
        }

        // add required empty ones
        $.each(tag.requiredAttributes, function () {
          var name = this.toString();
          if (!element.hasAttribute(name)) element.attributes.push(new Attribute(name, ""));
        });

        // check for replacements (first matching rule wins)
        if (!tag.isComment) {
          var renamed = false;
          for (var repIndex = 0; !renamed && repIndex < options.replace.length; repIndex++) {
            var candidates = options.replace[repIndex][0];
            for (var tagIndex = 0; tagIndex < candidates.length; tagIndex++) {
              var candidate = candidates[tagIndex];
              var byName = typeof candidate == "string";
              // regex rules are tested against the matched tag text, not
              // against the whole match array
              if (byName ? candidate == tag.name : candidate.test(tagMatch[0])) {
                // set the name to the replacement
                tag.rename(options.replace[repIndex][1]);
                renamed = true;
                break;
              }
            }
          }
        }

        // check container rules
        var add = true;
        if (!container.isRoot) {
          if (container.tag.isInline && !tag.isInline && !tag.isComment) {
            // a non-inline element nested in an inline one is pushed up the stack
            if (add = popToContainer(stack)) {
              container = stack[stack.length - 1];
            }
          } else if (container.tag.disallowNest && tag.disallowNest
            && !tag.requiredParent) {
            add = false;
          } else if (tag.requiredParent) {
            if (add = popToTagName(stack, tag.requiredParent)) {
              container = stack[stack.length - 1];
            }
          }
        }

        if (add) {
          container.children.push(element);
          if (tag.toProtect) {
            // skip to closing tag, keeping everything in between verbatim
            var closeMatch;
            var closed = false;
            while (closeMatch = tagsRE.exec(html)) {
              if (closeMatch[6]) continue; // a comment is never the closing tag
              if (closeMatch[2] && closeMatch[4].toLowerCase() == tag.nameOriginal) {
                element.children.push(html.substring(lastIndex, closeMatch.index));
                lastIndex = tagsRE.lastIndex;
                closed = true;
                break;
              }
            }
            if (!closed) {
              // unterminated protected tag: keep the remainder (minus the
              // sentinel) and stop, because exec() returning null has reset
              // tagsRE.lastIndex and the outer loop would start over
              element.children.push(html.substring(lastIndex, html.length - sentinel.length));
              break parsing;
            }
          } else {
            // set as current container element
            if (!tag.isSelfClosing && !tag.isNonClosing) {
              stack.push(element);
              container = element;
            }
          }
        }
      }
    }

    // render doc
    return $.htmlClean.trim(render(root, options).join(""));
  };
  // Default options; anything passed to $.htmlClean is merged over these.
  $.htmlClean.defaults = {
    // when true, only the content of the body tag is cleaned (if one is found)
    bodyOnly: true,

    // white list: only tags named here are rendered, contents are still rendered
    allowedTags: [],

    // black list: tags named here are not rendered, contents are still rendered
    removeTags: [
      "basefont", "center", "dir", "font", "frame", "frameset", "iframe",
      "isindex", "menu", "noframes", "s", "strike", "u"
    ],

    // array of [attributeName], [optional array of elements it is allowed on]
    // e.g. [["id"], ["style", ["p", "dl"]]] allows id on every element and style on 'p' and 'dl'
    allowedAttributes: [],

    // attribute names to remove from all elements, in addition to those not
    // in tagAttributes, e.g. ["width", "height"]
    removeAttrs: [],

    // array of [className], [optional array of elements it is allowed on]
    // e.g. [["aClass"], ["anotherClass", ["p", "dl"]]]
    allowedClasses: [],

    // format the result
    format: false,

    // format indent to start on
    formatIndent: 0,

    // tags to replace and what to replace them with; each source entry is a
    // tag name or a regex matched against the tag and its attributes
    replace: [
      [["b", "big"], "strong"],
      [["i"], "em"]
    ],

    // styles to replace with tags; multiple style matches are supported,
    // inline tags are replaced by the first match, blocks are retained
    replaceStyles: [
      [/font-weight:\s*bold/i, "strong"],
      [/font-style:\s*italic/i, "em"],
      [/vertical-align:\s*super/i, "sup"],
      [/vertical-align:\s*sub/i, "sub"]
    ],

    // comments are dropped unless this is true
    allowComments: false
  };
  /**
   * Append a line break followed by `indent` tab characters to `output`,
   * but only for non-inline elements and only when `output` already holds
   * something (so the result never starts with a blank line).
   *
   * @param {Object} element element about to be written; only `tag.isInline` is read
   * @param {Object} options unused, kept for signature compatibility
   * @param {Array} output output fragments, modified in place
   * @param {number} indent number of tabs to emit after the line break
   */
  function applyFormat(element, options, output, indent) {
    if (!element.tag.isInline && output.length > 0) {
      output.push("\n");
      // the counter is declared locally; it used to be an implicit global
      for (var i = 0; i < indent; i++) output.push("\t");
    }
  }
  /**
   * Render an element and, recursively, its children to an array of string
   * fragments.
   *
   * Tags filtered out by allowedTags / removeTags (or flagged render=false)
   * are not written, but their children still are. Elements that end up
   * with no attributes and no content are dropped unless the tag allows
   * being empty.
   *
   * @param {Element} element element to render
   * @param {Object} options effective options; `formatIndent` is used as a
   *   running depth counter while rendering
   * @returns {Array} output fragments (empty when the element is dropped)
   */
  function render(element, options) {
    var output = [];
    var empty = element.attributes.length == 0;
    var indent;

    if (element.tag.isComment) {
      if (options.allowComments) {
        output.push("<!--");
        output.push(element.tag.rawAttributes); // comment body, ends with "--"
        output.push(">");
        // line break only, no tabs
        if (options.format) applyFormat(element, options, output, 0);
      }
      return output;
    }

    // don't render if not in allowedTags or in removeTags
    var renderTag
      = element.tag.render
      && (options.allowedTags.length == 0 || $.inArray(element.tag.name, options.allowedTags) > -1)
      && (options.removeTags.length == 0 || $.inArray(element.tag.name, options.removeTags) == -1);

    if (!element.isRoot && renderTag) {
      // render opening tag
      output.push("<");
      output.push(element.tag.name);
      $.each(element.attributes, function () {
        if ($.inArray(this.name, options.removeAttrs) == -1) {
          // split the raw value into its opening quote (if any) and content
          var m = /^(['"]?)(.*?)['"]?$/.exec(this.value);
          var value = m[2];
          var valueQuote = m[1] || "'";

          // check for classes allowed
          if (this.name == "class" && options.allowedClasses.length > 0) {
            value =
              $.grep(value.split(" "), function (c) {
                return $.grep(options.allowedClasses, function (a) {
                  return a == c
                    || (a[0] == c && (a.length == 1 || $.inArray(element.tag.name, a[1]) > -1));
                }).length > 0;
              })
              .join(" ");
          }

          if (value != null && (value.length > 0 || $.inArray(this.name, element.tag.requiredAttributes) > -1)) {
            output.push(" ");
            output.push(this.name);
            output.push("=");
            output.push(valueQuote);
            output.push(value);
            output.push(valueQuote);
          }
        }
      });
    }

    if (element.tag.isSelfClosing) {
      // self closing
      if (renderTag) output.push(" />");
      empty = false;
    } else if (element.tag.isNonClosing) {
      empty = false;
    } else {
      if (!element.isRoot && renderTag) {
        // close
        output.push(">");
      }

      indent = options.formatIndent++;

      // render children
      if (element.tag.toProtect) {
        var protectedText = $.htmlClean.trim(element.children.join("")).replace(/<br>/ig, "\n");
        output.push(protectedText);
        empty = protectedText.length == 0;
      } else {
        var outputChildren = [];
        for (var i = 0; i < element.children.length; i++) {
          var child = element.children[i];
          var text = $.htmlClean.trim(textClean(isText(child) ? child : child.childrenToString()));
          if (isInline(child)) {
            // keep a single space between inline items that were separated by white space
            if (i > 0 && text.length > 0
              && (startsWithWhitespace(child) || endsWithWhitespace(element.children[i - 1]))) {
              outputChildren.push(" ");
            }
          }
          if (isText(child)) {
            if (text.length > 0) {
              outputChildren.push(text);
            }
          } else {
            // don't allow a break to be the last child
            if (i != element.children.length - 1 || child.tag.name != "br") {
              if (options.format) applyFormat(child, options, outputChildren, indent);
              outputChildren = outputChildren.concat(render(child, options));
            }
          }
        }

        if (outputChildren.length > 0) {
          if (options.format && outputChildren[0] != "\n") applyFormat(element, options, output, indent);
          output = output.concat(outputChildren);
          empty = false;
        }
      }

      // restore the depth for both branches; protected tags used to leave
      // it incremented
      options.formatIndent--;

      if (!element.isRoot && renderTag) {
        // render the closing tag
        if (options.format) applyFormat(element, options, output, indent - 1);
        output.push("</");
        output.push(element.tag.name);
        output.push(">");
      }
    }

    // check for empty tags
    if (!element.tag.allowEmpty && empty) { return []; }

    return output;
  }
  /**
   * Find the nearest element on the stack whose original tag name is one of
   * `tagNameArray` and pop everything above it; if there is none the stack
   * is left untouched.
   *
   * @param {Array} stack open elements, innermost last
   * @param {Array} tagNameArray acceptable tag names
   * @returns {boolean} true when a matching element was found
   */
  function popToTagName(stack, tagNameArray) {
    var hasWantedName = function (element) {
      return $.inArray(element.tag.nameOriginal, tagNameArray) > -1
    };
    return pop(stack, hasWantedName);
  }
  /**
   * Pop the stack back to the nearest element that is either the root or
   * has a non-inline tag.
   *
   * @param {Array} stack open elements, innermost last
   * @returns {boolean} true when such an element was found
   */
  function popToContainer(stack) {
    function canContainBlocks(element) {
      return element.isRoot || !element.tag.isInline;
    }
    return pop(stack, canContainBlocks);
  }
  /**
   * Walk the stack from the top looking for the first element that passes
   * `test`. When one is found, every element above it is popped and true is
   * returned; the matching element itself stays on the stack. When none is
   * found the stack is not modified and false is returned.
   *
   * @param {Array} stack open elements, innermost last
   * @param {Function} test predicate called with each candidate element
   * @param {number} [index] 1-based distance from the top to start at (default 1)
   * @returns {boolean} whether a matching element was found
   */
  function pop(stack, test, index) {
    var start = index || 1;
    var size = stack.length;
    for (var depth = start; ; depth++) {
      if (test(stack[size - depth])) {
        // discard everything that sat above the match
        for (var remaining = depth - start; remaining > 0; remaining--) {
          stack.pop();
        }
        return true;
      }
      if (size - depth <= 0) {
        return false;
      }
    }
  }
  /**
   * Element Object: a node of the parsed document.
   *
   * Constructed without a tag it becomes the root element (with a synthetic
   * "root" tag); otherwise it wraps the given tag. `attributes` holds
   * Attribute objects; `children` holds the items pushed by the parser.
   *
   * @param {Tag} [tag] tag this element was created from
   */
  function Element(tag) {
    this.isRoot = !tag;
    this.tag = this.isRoot ? new Tag("root") : tag;
    this.attributes = [];
    this.children = [];

    // true when an attribute with exactly this name is present
    this.hasAttribute = function (name) {
      var found = false;
      for (var i = 0; !found && i < this.attributes.length; i++) {
        found = this.attributes[i].name == name;
      }
      return found;
    };

    // the children concatenated with no separator
    this.childrenToString = function () {
      return this.children.join("");
    };

    return this;
  }
  /**
   * Attribute Object: a name/value pair attached to an Element.
   *
   * @param {string} name attribute name as found in the markup
   * @param {string} value raw attribute value, stored unchanged
   */
  function Attribute(name, value) {
    var self = this;
    self.value = value;
    self.name = name;
    return self;
  }
  /**
   * Tag object: the name of a parsed tag plus the flags looked up for that
   * name in the tag tables.
   *
   * @param {string} name tag name (lower-cased on construction); "--" marks a comment
   * @param {string} [close] the closing-slash capture; non-empty means a closing tag
   * @param {string} [rawAttributes] unparsed attribute text (comment body for comments)
   * @param {Object} [options] effective options; when given, the allowed
   *   attributes for each tag name are computed once and cached on
   *   `options.tagAttributesCache`
   */
  function Tag(name, close, rawAttributes, options) {
    this.name = name.toLowerCase();
    this.nameOriginal = this.name;
    this.render = true;

    // (re)compute every flag that depends on the current name
    this.init = function () {
      var tagName = this.name;

      // membership test against one of the flat name lists
      var listed = function (list) {
        return $.inArray(tagName, list) > -1;
      };
      // tables are laid out [default, name, value, name, value, ...]; a miss
      // gives index -1 + 1 = 0, i.e. the default entry
      var lookup = function (table) {
        return table[$.inArray(tagName, table) + 1];
      };

      this.isComment = tagName == "--";
      if (this.isComment) {
        this.isSelfClosing = true;
      } else {
        this.isSelfClosing = listed(tagSelfClosing);
        this.isNonClosing = listed(tagNonClosing);
        this.isClosing = (close != undefined && close.length > 0);
        this.isInline = listed(tagInline);
        this.disallowNest = listed(tagDisallowNest);
        this.requiredParent = lookup(tagRequiredParent);
        this.allowEmpty = listed(tagAllowEmpty);
        this.toProtect = listed(tagProtect);
      }

      this.rawAttributes = rawAttributes;
      this.requiredAttributes = lookup(tagAttributesRequired);

      if (!options) {
        return;
      }

      if (!options.tagAttributesCache) options.tagAttributesCache = [];
      var cache = options.tagAttributesCache;

      if ($.inArray(tagName, cache) == -1) {
        // start from a copy of the built-in list for this tag
        var allowed = lookup(tagAttributes).slice(0);

        // add extra ones from options
        for (var i = 0; i < options.allowedAttributes.length; i++) {
          var extra = options.allowedAttributes[i];
          var attrName = extra[0];
          var appliesToTag = extra.length == 1 || $.inArray(tagName, extra[1]) > -1;
          if (appliesToTag && $.inArray(attrName, allowed) == -1) {
            allowed.push(attrName);
          }
        }

        cache.push(tagName);
        cache.push(allowed);
      }

      this.allowedAttributes = cache[$.inArray(tagName, cache) + 1];
    };

    this.init();

    // change the tag name and refresh the flags; nameOriginal is kept
    this.rename = function (newName) {
      this.name = newName;
      this.init();
    };

    return this;
  }
  /**
   * Does the item begin with white space? For an element, the first child
   * is followed down until something without children is reached; anything
   * other than text at that point gives false.
   *
   * @param {string|Element} item text or element to inspect
   * @returns {boolean}
   */
  function startsWithWhitespace(item) {
    var node = item;
    while (isElement(node) && node.children.length > 0) {
      node = node.children[0];
    }
    if (!isText(node)) {
      return false;
    }
    var cleaned = textClean(node);
    if (cleaned.length == 0) {
      return false;
    }
    return $.htmlClean.isWhitespace(cleaned.charAt(0));
  }
  /**
   * Does the item finish with white space? For an element, the last child
   * is followed down until something without children is reached; anything
   * other than text at that point gives false.
   *
   * @param {string|Element} item text or element to inspect
   * @returns {boolean}
   */
  function endsWithWhitespace(item) {
    var node = item;
    while (isElement(node) && node.children.length > 0) {
      node = node.children[node.children.length - 1];
    }
    if (!isText(node)) {
      return false;
    }
    var cleaned = textClean(node);
    if (cleaned.length == 0) {
      return false;
    }
    return $.htmlClean.isWhitespace(cleaned.charAt(cleaned.length - 1));
  }
  /**
   * True when the item was built by String, i.e. it is a text node.
   *
   * @param {*} item value to check (must not be null or undefined)
   * @returns {boolean}
   */
  function isText(item) {
    var builtBy = item.constructor;
    return builtBy == String;
  }
  /**
   * Text is always inline; an element is inline when its tag is flagged so.
   *
   * @param {string|Element} item text or element to check
   * @returns {*} true for text, otherwise the element's `tag.isInline` value
   */
  function isInline(item) {
    if (isText(item)) {
      return true;
    }
    return item.tag.isInline;
  }
  /**
   * True when the item was built by the Element constructor.
   *
   * @param {*} item value to check (must not be null or undefined)
   * @returns {boolean}
   */
  function isElement(item) {
    var builtBy = item.constructor;
    return builtBy == Element;
  }
  /**
   * Normalise text for output: every "&nbsp;" entity and every line feed
   * becomes a space, then each run of two or more white space characters
   * is collapsed to a single space.
   *
   * @param {string} text text to normalise
   * @returns {string} the normalised text
   */
  function textClean(text) {
    var spaced = text.replace(/&nbsp;|\n/g, " ");
    var collapsed = spaced.replace(/\s\s+/g, " ");
    return collapsed;
  }
  // trim off white space, doesn't use regex

  /**
   * Remove leading and trailing white space (as defined by
   * $.htmlClean.isWhitespace).
   *
   * @param {string} text
   * @returns {string}
   */
  $.htmlClean.trim = function (text) {
    return $.htmlClean.trimStart($.htmlClean.trimEnd(text));
  };

  /**
   * Remove leading white space.
   *
   * @param {string} text
   * @returns {string}
   */
  $.htmlClean.trimStart = function (text) {
    return text.substring($.htmlClean.trimStartIndex(text));
  };

  /**
   * Index of the first non white space character, or text.length when the
   * text is empty or entirely white space.
   *
   * @param {string} text
   * @returns {number}
   */
  $.htmlClean.trimStartIndex = function (text) {
    var start = 0;
    // scan the whole string; stopping at length - 1 used to leave the last
    // character of an all-white-space string untrimmed
    while (start < text.length && $.htmlClean.isWhitespace(text.charAt(start))) {
      start++;
    }
    return start;
  };

  /**
   * Remove trailing white space.
   *
   * @param {string} text
   * @returns {string}
   */
  $.htmlClean.trimEnd = function (text) {
    return text.substring(0, $.htmlClean.trimEndIndex(text));
  };

  /**
   * Index just past the last non white space character, or 0 when the text
   * is empty or entirely white space.
   *
   * @param {string} text
   * @returns {number}
   */
  $.htmlClean.trimEndIndex = function (text) {
    var end = text.length - 1;
    while (end >= 0 && $.htmlClean.isWhitespace(text.charAt(end))) {
      end--;
    }
    return end + 1;
  };
  /**
   * Checks whether a char is white space, i.e. listed in the `whitespace` table.
   *
   * @param {string} c single character to check
   * @returns {boolean}
   */
  $.htmlClean.isWhitespace = function (c) {
    var position = $.inArray(c, whitespace);
    return position != -1;
  };
  // tags treated as inline
  var tagInline = [
    "a", "abbr", "acronym", "address",
    "b", "big", "br", "button",
    "caption", "cite", "code",
    "del", "em", "font", "hr",
    "i", "input", "img", "ins",
    "label", "legend", "map", "q",
    "s", "samp", "select", "option", "param", "small", "span",
    "strike", "strong", "sub", "sup",
    "tt", "u", "var"
  ];

  // tags that may not be nested directly inside another tag from this list
  var tagDisallowNest = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "th", "td", "object"
  ];

  // tags kept even when they end up empty
  var tagAllowEmpty = ["th", "td"];

  // tag name followed by the parents it must sit in; entry 0 is the value
  // returned for tags that are not listed
  var tagRequiredParent = [
    null,
    "li", ["ul", "ol"],
    "dt", ["dl"],
    "dd", ["dl"],
    "td", ["tr"],
    "th", ["tr"],
    "tr", ["table", "thead", "tbody", "tfoot"],
    "thead", ["table"],
    "tbody", ["table"],
    "tfoot", ["table"],
    "param", ["object"]
  ];

  // tags whose content is kept as found rather than parsed
  var tagProtect = ["script", "style", "pre", "code"];

  // tags which self close e.g. <br />
  var tagSelfClosing = [
    "area", "base", "br", "col", "command", "embed", "hr", "img",
    "input", "keygen", "link", "meta", "param", "source", "track", "wbr"
  ];

  // tags which do not close
  var tagNonClosing = ["!doctype", "?xml"];

  // attributes allowed on tags: tag name followed by its attribute list;
  // entry 0 is the default, for all tags not mentioned
  var tagAttributes = [
    ["class"],
    "?xml", [],
    "!doctype", [],
    "a", ["accesskey", "class", "href", "name", "title", "rel", "rev", "type", "tabindex"],
    "abbr", ["class", "title"],
    "acronym", ["class", "title"],
    "blockquote", ["cite", "class"],
    "button", ["class", "disabled", "name", "type", "value"],
    "del", ["cite", "class", "datetime"],
    "form", ["accept", "action", "class", "enctype", "method", "name"],
    "input", [
      "accept", "accesskey", "alt", "checked", "class", "disabled", "ismap", "maxlength",
      "name", "size", "readonly", "src", "tabindex", "type", "usemap", "value"
    ],
    "img", ["alt", "class", "height", "src", "width"],
    "ins", ["cite", "class", "datetime"],
    "label", ["accesskey", "class", "for"],
    "legend", ["accesskey", "class"],
    "link", ["href", "rel", "type"],
    "meta", ["content", "http-equiv", "name", "scheme", "charset"],
    "map", ["name"],
    "optgroup", ["class", "disabled", "label"],
    "option", ["class", "disabled", "label", "selected", "value"],
    "q", ["class", "cite"],
    "script", ["src", "type"],
    "select", ["class", "disabled", "multiple", "name", "size", "tabindex"],
    "style", ["type"],
    "table", ["class", "summary"],
    "th", ["class", "colspan", "rowspan"],
    "td", ["class", "colspan", "rowspan"],
    "textarea", ["accesskey", "class", "cols", "disabled", "name", "readonly", "rows", "tabindex"],
    "param", ["name", "value"],
    "embed", ["height", "src", "type", "width"]
  ];

  // attributes added (empty) when missing: same layout, entry 0 is the default
  var tagAttributesRequired = [
    [],
    "img", ["alt"]
  ];
  // white space chars
  // NOTE(review): the second entry reads as a plain space in this copy of the
  // file; it is presumably a non-breaking space, so it is written as an
  // escape that survives re-encoding - confirm against the upstream source
  var whitespace = [" ", "\u00a0", "\t", "\n", "\r", "\f"];
  510. })(jQuery);