Skip to content

Commit

Permalink
feat(html): add escape and unescape functions for HTML entities (#3335)
Browse files Browse the repository at this point in the history
  • Loading branch information
lionel-rowe authored May 30, 2023
1 parent 5199824 commit 6ab64b1
Show file tree
Hide file tree
Showing 5 changed files with 2,474 additions and 0 deletions.
17 changes: 17 additions & 0 deletions html/_tools/generate_data.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.

// JSON version of the full canonical list of named HTML entities
// https://html.spec.whatwg.org/multipage/named-characters.html
import entityList from "https://html.spec.whatwg.org/entities.json" assert {
type: "json",
};

// Reduce every entry of the spec's JSON to a `name -> characters` pair.
const namedEntityPairs = Object.entries(entityList).map(
  ([entityName, entityInfo]) => [entityName, entityInfo.characters],
);
const data = Object.fromEntries(namedEntityPairs);

// The output path is resolved relative to this module, not the working directory.
const outputUrl = new URL(import.meta.resolve("../named_entity_list.json"));
// Pretty-printed with a 2-space indent and a trailing newline.
const serialized = `${JSON.stringify(data, null, 2)}\n`;

await Deno.writeTextFile(outputUrl, serialized);
104 changes: 104 additions & 0 deletions html/entities.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
// This module is browser compatible.

/** Lookup table from an entity string (such as `&amp;`) to the text it decodes to. */
export type EntityList = Record<string, string>;

// Each pair is [raw character, entity that replaces it when escaping].
const rawToEntityEntries = [
  ["&", "&amp;"],
  ["<", "&lt;"],
  [">", "&gt;"],
  ['"', "&quot;"],
  ["'", "&#39;"],
] as const;

// Escaping direction: raw character -> entity.
const rawToEntity = new Map<string, string>(rawToEntityEntries);

// Character class that matches any single raw character listed in `rawToEntity`.
const rawRe = new RegExp(
  "[" + Array.from(rawToEntity.keys()).join("") + "]",
  "g",
);

// Unescaping direction: the pairs above inverted (entity -> raw character),
// followed by two additional named entities that are decoded but never emitted.
const defaultEntityList: EntityList = {};
for (const [raw, entity] of rawToEntityEntries) {
  defaultEntityList[entity] = raw;
}
defaultEntityList["&apos;"] = "'";
defaultEntityList["&nbsp;"] = "\xa0";

/**
 * Replaces the characters `&`, `<`, `>`, `"` and `'` with HTML entities so the
 * result can be interpolated into HTML text content and quoted attributes.
 * Every other character is returned unchanged.
 *
 * @example
 * ```ts
 * import { escape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * assertEquals(escape("<>'&AA"), "&lt;&gt;&#39;&amp;AA");
 *
 * // characters that don't need to be escaped will be left alone,
 * // even if named HTML entities exist for them
 * assertEquals(escape("þð"), "þð");
 * ```
 *
 * @param str The text to escape.
 * @returns `str` with each escapable character replaced by its entity.
 */
export function escape(str: string) {
  // `rawRe` only matches characters that are keys of `rawToEntity`,
  // so the lookup cannot miss.
  const toEntity = (rawChar: string) => rawToEntity.get(rawChar)!;

  return str.replace(rawRe, toEntity);
}

/** Options accepted by `unescape`. */
export type UnescapeOptions = { entityList: EntityList };

// Highest code point a numeric entity may decode to; anything larger is
// turned into the replacement character instead.
const MAX_CODE_POINT = 0x10ffff;

// Decimal numeric entity such as `&#65;`; the digits are captured.
const RX_DEC_ENTITY = /&#([0-9]+);/g;
// Hexadecimal numeric entity such as `&#x41;` (lower-case `x` only); the hex
// digits are captured.
const RX_HEX_ENTITY = /&#x(\p{AHex}+);/gu;

// One compiled regex per entity list object, so `unescape` does not have to
// rebuild it on every call. Weakly keyed, so a list can still be collected.
const entityListRegexCache = new WeakMap<EntityList, RegExp>();

// Values used for any option the caller leaves out.
const defaultUnescapeOptions: UnescapeOptions = { entityList: defaultEntityList };

/**
 * Unescapes HTML entities in text
 *
 * Named entities are looked up in `options.entityList`; decimal (`&#65;`) and
 * hexadecimal (`&#x41;`) numeric entities are always decoded. All replacements
 * happen in a single pass over the input, so text produced by decoding one
 * entity is never decoded a second time (`&amp;#65;` becomes `&#65;`, not
 * `A`), which keeps `unescape(escape(s)) === s`.
 *
 * @example
 * ```ts
 * import { unescape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * // default options (only handles &<>'" and numeric entities)
 * assertEquals(unescape("&lt;&gt;&apos;&amp;&#65;&#x41;"), "<>'&AA");
 * assertEquals(unescape("&thorn;&eth;"), "&thorn;&eth;");
 *
 * // using the full named entity list from the HTML spec (~47K unminified)
 * import entityList from "https://deno.land/std@$STD_VERSION/html/named_entity_list.json" assert { type: "json" };
 * assertEquals(unescape("&thorn;&eth;", { entityList }), "þð");
 * ```
 *
 * @param str The text containing entities to decode.
 * @param options Optional settings. `entityList` maps each named entity
 * (written exactly as it appears in the text, e.g. `&amp;`) to its
 * replacement. The compiled pattern is cached per list object, so mutating a
 * list after its first use has no effect on which entities are recognised.
 * @returns `str` with every recognised entity replaced by its character(s).
 */
export function unescape(
  str: string,
  options: Partial<UnescapeOptions> = {},
): string {
  // `??` rather than object spread, so an explicit `entityList: undefined`
  // still falls back to the default list instead of throwing below.
  const entityList = options.entityList ?? defaultUnescapeOptions.entityList;

  let entityRe = entityListRegexCache.get(entityList);

  if (!entityRe) {
    const namedPatterns = Object.keys(entityList)
      // an empty key would match at every position of the input
      .filter((entity) => entity.length > 0)
      // longest first, so `&amp;` wins over a listed truncated form `&amp`
      .sort((a, b) => b.length - a.length)
      // keys are literal text, not regex syntax
      .map((entity) => entity.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));

    // Named alternatives carry no capture groups, so group 1 is always the
    // decimal digits and group 2 the hex digits. The `u` flag is required by
    // `\p{AHex}` in the hex pattern.
    entityRe = new RegExp(
      [...namedPatterns, RX_DEC_ENTITY.source, RX_HEX_ENTITY.source].join("|"),
      "gu",
    );

    entityListRegexCache.set(entityList, entityRe);
  }

  return str.replace(
    entityRe,
    (match: string, dec?: string, hex?: string) => {
      if (dec !== undefined) return codePointStrToChar(dec, 10);
      if (hex !== undefined) return codePointStrToChar(hex, 16);
      // Only keys of `entityList` can reach this point; fall back to the
      // matched text rather than ever emitting "undefined".
      return entityList[match] ?? match;
    },
  );
}

/**
 * Converts the digits of a numeric character reference into the character
 * they denote.
 *
 * Following the HTML standard's handling of numeric character references, the
 * replacement character U+FFFD is returned when the number is zero, lies in
 * the surrogate range U+D800–U+DFFF, or exceeds `MAX_CODE_POINT`. It is also
 * returned if the digits cannot be parsed at all.
 *
 * @param codePointStr The digits of the reference, without `&#`, `x` or `;`.
 * @param radix The base the digits are written in (10 or 16).
 * @returns A string holding the decoded character, or U+FFFD.
 */
function codePointStrToChar(codePointStr: string, radix: number) {
  const codePoint = parseInt(codePointStr, radix);

  // A lone surrogate would make the result an ill-formed UTF-16 string.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  // Written as a positive range check so that NaN is rejected as well
  // (`String.fromCodePoint(NaN)` throws a RangeError).
  const isDecodable = codePoint > 0 && codePoint <= MAX_CODE_POINT &&
    !isSurrogate;

  return isDecodable ? String.fromCodePoint(codePoint) : "�";
}
110 changes: 110 additions & 0 deletions html/entities_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.

import { escape, unescape } from "./entities.ts";
import { assertEquals } from "../testing/asserts.ts";
import entityList from "./named_entity_list.json" assert { type: "json" };

Deno.test("escape", async (t) => {
  // Each row: [step name, raw input, expected escaped output]
  const cases: [name: string, input: string, expected: string][] = [
    ['escapes &<>"', "&<>'\"", "&amp;&lt;&gt;&#39;&quot;"],
    ["escapes ' to &#39; (not &apos;)", "'", "&#39;"],
    ["doesn't escape non-breaking space", "\xa0", "\xa0"],
    [
      "doesn't escape other characters, even if they have named entities",
      "þð",
      "þð",
    ],
  ];

  // Steps run sequentially, in table order.
  for (const [name, input, expected] of cases) {
    await t.step(name, () => {
      assertEquals(escape(input), expected);
    });
  }
});

Deno.test("unescape", async (t) => {
  // Row format: [step name, escaped input, expected unescaped output]
  type Case = readonly [string, string, string];

  // Runs every row as its own sub-step of `parent`, in table order, decoding
  // the input with `decode` and comparing against the expected output.
  const runCases = async (
    parent: typeof t,
    decode: (input: string) => string,
    cases: readonly Case[],
  ) => {
    for (const [name, input, expected] of cases) {
      await parent.step(name, () => {
        assertEquals(decode(input), expected);
      });
    }
  };

  const withDefaults = (input: string) => unescape(input);
  const withFullList = (input: string) => unescape(input, { entityList });

  await t.step("round-trips with escape", () => {
    const chars = "&<>'\"";
    assertEquals(unescape(escape(chars)), chars);
  });

  await t.step("named entities", async (t) => {
    await t.step("default options", (t) =>
      runCases(t, withDefaults, [
        ["unescapes &apos; as alias for ' &#39;", "&apos;", "'"],
        ["unescapes &nbsp;", "&nbsp;", "\xa0"],
        [
          "doesn't unescape other named entities",
          "&thorn;&eth;",
          "&thorn;&eth;",
        ],
      ]));

    await t.step("full entity list", (t) =>
      runCases(t, withFullList, [
        ["unescapes arbitrary named entities", "&thorn;&eth;", "þð"],
        [
          "unescapes truncated named entity (no trailing semicolon) if it is listed",
          "&amp",
          "&",
        ],
        [
          "consumes full named entity even when a truncated version is specified",
          "&amp;",
          "&",
        ],
        [
          "doesn't unescape truncated named entity if it isn't listed",
          "&therefore; &therefore",
          "∴ &therefore",
        ],
      ]));
  });

  await t.step("decimal", (t) =>
    runCases(t, withDefaults, [
      ["unescapes decimal", "&#46;", "."],
      ["unescapes max decimal codepoint", "&#1114111;", "\u{10ffff}"],
      ["unescapes decimal with leading zero", "&#046;", "."],
      [
        "unescapes invalid decimal codepoint to replacement character",
        "&#1114112;",
        "�",
      ],
    ]));

  await t.step("hex", (t) =>
    runCases(t, withDefaults, [
      ["unescapes lower-case hex", "&#x2e;", "."],
      ["unescapes upper-case hex", "&#x2E;", "."],
      ["unescapes hex with leading zero", "&#x02E;", "."],
      ["unescapes max hex codepoint", "&#x10ffff;", "\u{10ffff}"],
      [
        "unescapes invalid hex codepoint to replacement character",
        "&#x110000;",
        "�",
      ],
    ]));
});
10 changes: 10 additions & 0 deletions html/mod.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
// This module is browser compatible.

/**
* Functions for HTML tasks such as escaping or unescaping HTML entities
*
* @module
*/

export * from "./entities.ts";
Loading

0 comments on commit 6ab64b1

Please sign in to comment.