decodeHtml.js 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. "use strict";
  // CommonJS/ES-module interop helper: reuse an already-installed helper when
  // `this` provides one, otherwise wrap plain CommonJS exports so that callers
  // can uniformly read `.default`.
  var __importDefault = (this && this.__importDefault) || function (imported) {
      if (imported && imported.__esModule) {
          // Already shaped like an ES module namespace; hand it back untouched.
          return imported;
      }
      return { "default": imported };
  };
  5. Object.defineProperty(exports, "__esModule", { value: true });
  6. exports.decodeHtml = void 0;
  7. const namedChars_json_1 = __importDefault(require("./namedChars.json"));
  8. // lazy compute this to make this file tree-shakable for browser
  9. let maxCRNameLength;
  /**
   * Decode HTML character references (`&name;`, `&#123;`, `&#x7b;`) in a string.
   *
   * Text outside of references is copied through unchanged. References that
   * cannot be resolved are emitted literally.
   *
   * @param {string} rawText - Text that may contain character references.
   * @param {boolean} [asAttr] - When truthy, a named reference that lacks its
   *   trailing `;` and is immediately followed by `=` or an ASCII alphanumeric
   *   is left undecoded.
   * @returns {string} The decoded text.
   */
  const decodeHtml = (rawText, asAttr) => {
      // Work on a local cursor string instead of reassigning the parameter.
      let rest = rawText;
      let decodedText = '';
      const advance = (length) => {
          rest = rest.slice(length);
      };
      while (rest.length > 0) {
          const head = /&(?:#x?)?/i.exec(rest);
          if (!head) {
              // No further references: flush whatever is left.
              decodedText += rest;
              break;
          }
          // Copy everything up to the "&" and move the cursor onto it.
          decodedText += rest.slice(0, head.index);
          advance(head.index);
          if (head[0] === '&') {
              // Named character reference.
              let name = '';
              let value;
              // `|| ''` keeps a trailing "&" from testing the string "undefined".
              if (/[0-9a-z]/i.test(rest[1] || '')) {
                  const namedChars = namedChars_json_1.default;
                  if (!maxCRNameLength) {
                      maxCRNameLength = Object.keys(namedChars).reduce((max, key) => Math.max(max, key.length), 0);
                  }
                  // Longest match first, shrinking one character at a time.
                  for (let length = maxCRNameLength; !value && length > 0; --length) {
                      name = rest.slice(1, 1 + length);
                      // Own-property check so inherited members such as
                      // "constructor" or "toString" are never treated as entities.
                      if (Object.prototype.hasOwnProperty.call(namedChars, name)) {
                          value = namedChars[name];
                      }
                  }
                  const nextChar = rest[name.length + 1] || '';
                  const keepLiteral = !value ||
                      (asAttr && !name.endsWith(';') && /[=a-z0-9]/i.test(nextChar));
                  decodedText += keepLiteral ? `&${name}` : value;
                  advance(1 + name.length);
              }
              else {
                  decodedText += '&';
                  advance(1);
              }
          }
          else {
              // Numeric character reference. The head regex is case-insensitive,
              // so "&#X" must be recognised as hexadecimal as well as "&#x".
              const hex = head[0].toLowerCase() === '&#x';
              const pattern = hex ? /^&#x([0-9a-f]+);?/i : /^&#([0-9]+);?/;
              const body = pattern.exec(rest);
              if (!body) {
                  // "&#" / "&#x" without digits: emit it literally.
                  decodedText += head[0];
                  advance(head[0].length);
              }
              else {
                  // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
                  let cp = Number.parseInt(body[1], hex ? 16 : 10);
                  const isSurrogate = cp >= 0xd800 && cp <= 0xdfff;
                  const isControl = (cp >= 0x01 && cp <= 0x08) ||
                      cp === 0x0b ||
                      (cp >= 0x0d && cp <= 0x1f) ||
                      (cp >= 0x7f && cp <= 0x9f);
                  if (cp === 0 || cp > 0x10ffff || isSurrogate) {
                      // NUL, out-of-range and surrogate values become U+FFFD.
                      cp = 0xfffd;
                  }
                  else if (isControl) {
                      // Control code points are remapped when the table has an
                      // entry for them; all others pass through unchanged.
                      cp = CCR_REPLACEMENTS[cp] || cp;
                  }
                  // Noncharacters (U+FDD0..U+FDEF, U+xFFFE/U+xFFFF) are kept as-is.
                  decodedText += String.fromCodePoint(cp);
                  advance(body[0].length);
              }
          }
      }
      return decodedText;
  };
  101. exports.decodeHtml = decodeHtml;
  // Replacement table for numeric character references in the 0x80..0x9F range:
  // each key is the referenced code point, each value the code point emitted
  // in its place. Code points without an entry are left unchanged by the lookup.
  // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
  const CCR_REPLACEMENTS = {
      0x80: 0x20AC, // EURO SIGN
      0x82: 0x201A, // SINGLE LOW-9 QUOTATION MARK
      0x83: 0x0192, // LATIN SMALL LETTER F WITH HOOK
      0x84: 0x201E, // DOUBLE LOW-9 QUOTATION MARK
      0x85: 0x2026, // HORIZONTAL ELLIPSIS
      0x86: 0x2020, // DAGGER
      0x87: 0x2021, // DOUBLE DAGGER
      0x88: 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT
      0x89: 0x2030, // PER MILLE SIGN
      0x8A: 0x0160, // LATIN CAPITAL LETTER S WITH CARON
      0x8B: 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      0x8C: 0x0152, // LATIN CAPITAL LIGATURE OE
      0x8E: 0x017D, // LATIN CAPITAL LETTER Z WITH CARON
      0x91: 0x2018, // LEFT SINGLE QUOTATION MARK
      0x92: 0x2019, // RIGHT SINGLE QUOTATION MARK
      0x93: 0x201C, // LEFT DOUBLE QUOTATION MARK
      0x94: 0x201D, // RIGHT DOUBLE QUOTATION MARK
      0x95: 0x2022, // BULLET
      0x96: 0x2013, // EN DASH
      0x97: 0x2014, // EM DASH
      0x98: 0x02DC, // SMALL TILDE
      0x99: 0x2122, // TRADE MARK SIGN
      0x9A: 0x0161, // LATIN SMALL LETTER S WITH CARON
      0x9B: 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      0x9C: 0x0153, // LATIN SMALL LIGATURE OE
      0x9E: 0x017E, // LATIN SMALL LETTER Z WITH CARON
      0x9F: 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS
  };