-
Notifications
You must be signed in to change notification settings - Fork 634
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(html): add escape and unescape functions for HTML entities (#3335)
- Loading branch information
1 parent
5199824
commit 6ab64b1
Showing
5 changed files
with
2,474 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write | ||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license. | ||
|
||
// JSON version of the full canonical list of named HTML entities | ||
// https://html.spec.whatwg.org/multipage/named-characters.html | ||
import entityList from "https://html.spec.whatwg.org/entities.json" assert { | ||
type: "json", | ||
}; | ||
|
||
// Reduce every entry of the imported spec list to just its `characters`
// field, keyed by the entity name exactly as it appears in the spec JSON.
const charactersByEntity: Record<string, string> = {};
for (const [entity, info] of Object.entries(entityList)) {
  charactersByEntity[entity] = info.characters;
}

// The output path is resolved relative to this script, not the working
// directory, so the script can be run from anywhere.
const outputUrl = new URL(import.meta.resolve("../named_entity_list.json"));

// Pretty-print with two-space indentation and end the file with a newline.
const serialized = `${JSON.stringify(charactersByEntity, null, 2)}\n`;

await Deno.writeTextFile(outputUrl, serialized);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license. | ||
// This module is browser compatible. | ||
|
||
/** Maps an entity reference as written in HTML (e.g. `"&amp;"`) to its decoded text. */
export type EntityList = Record<string, string>;

/**
 * The characters `escape` rewrites, each paired with the entity that replaces
 * it. `'` is written as `&#39;` rather than `&apos;`.
 */
const rawToEntityEntries = [
  ["&", "&amp;"],
  ["<", "&lt;"],
  [">", "&gt;"],
  ['"', "&quot;"],
  ["'", "&#39;"],
] as const;

/**
 * Named entities understood by `unescape` when no list is supplied: the
 * inverse of `rawToEntityEntries` plus `&apos;` and `&nbsp;`.
 */
const defaultEntityList: EntityList = Object.fromEntries([
  ...rawToEntityEntries.map(([raw, entity]) => [entity, raw]),
  ["&apos;", "'"],
  ["&nbsp;", "\xa0"],
]);

const rawToEntity = new Map<string, string>(rawToEntityEntries);

// Character class made of every raw character listed above.
const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g");

/**
 * Escapes text for safe interpolation into HTML text content and quoted attributes
 *
 * @example
 * ```ts
 * import { escape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * assertEquals(escape("<>'&AA"), "&lt;&gt;&#39;&amp;AA");
 *
 * // characters that don't need to be escaped will be left alone,
 * // even if named HTML entities exist for them
 * assertEquals(escape("þð"), "þð");
 * ```
 *
 * @param str The text to escape.
 * @returns `str` with every `&`, `<`, `>`, `"` and `'` replaced by its entity.
 */
export function escape(str: string): string {
  // `rawRe` is global, so `replace` rewrites every occurrence.
  return str.replace(rawRe, (m) => rawToEntity.get(m) ?? m);
}

/** Options accepted by `unescape`. */
export type UnescapeOptions = { entityList: EntityList };

const defaultUnescapeOptions: UnescapeOptions = {
  entityList: defaultEntityList,
};

const MAX_CODE_POINT = 0x10ffff;

// Pattern sources for numeric references. Each has exactly one capture group
// holding the digits; `unescape` relies on decimal being group 1 and hex
// being group 2 of the combined expression.
const DEC_ENTITY_SOURCE = "&#([0-9]+);";
const HEX_ENTITY_SOURCE = "&#x([0-9A-Fa-f]+);";

// One compiled expression per entity list object, so large lists are only
// compiled once. Entries are keyed by object identity: mutating a list after
// its first use does not refresh the cached expression.
const entityListRegexCache = new WeakMap<EntityList, RegExp>();

/** Backslash-escapes regular expression syntax characters in `text`. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Builds the single expression matching named, decimal and hex references. */
function buildEntityRegExp(entityList: EntityList): RegExp {
  const named = Object.keys(entityList)
    // An empty name would match at every position of the input.
    .filter((name) => name.length > 0)
    // Longest first, so "&amp;" is preferred over a truncated "&amp".
    .sort((a, b) => b.length - a.length)
    .map(escapeRegExp);

  return new RegExp(
    [...named, DEC_ENTITY_SOURCE, HEX_ENTITY_SOURCE].join("|"),
    "g",
  );
}

/**
 * Unescapes HTML entities in text
 *
 * Named, decimal and hexadecimal references are decoded in a single pass, so
 * text produced by decoding one reference is never decoded a second time
 * (`"&amp;#38;"` becomes `"&#38;"`, not `"&"`).
 *
 * @example
 * ```ts
 * import { unescape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * // default options (only handles &<>'" and numeric entities)
 * assertEquals(unescape("&lt;&gt;&apos;&amp;&#65;&#x41;"), "<>'&AA");
 * assertEquals(unescape("&thorn;&eth;"), "&thorn;&eth;");
 *
 * // using the full named entity list from the HTML spec (~47K unminified)
 * import entityList from "https://deno.land/std@$STD_VERSION/html/named_entity_list.json" assert { type: "json" };
 * assertEquals(unescape("&thorn;&eth;", { entityList }), "þð");
 * ```
 *
 * @param str The text to unescape.
 * @param options `entityList` selects which named entities are recognised;
 * the default covers `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and
 * `&nbsp;`. Numeric references are always decoded.
 * @returns `str` with every recognised reference replaced by its text.
 */
export function unescape(
  str: string,
  options: Partial<UnescapeOptions> = {},
): string {
  const { entityList } = { ...defaultUnescapeOptions, ...options };

  let entityRe = entityListRegexCache.get(entityList);

  if (!entityRe) {
    entityRe = buildEntityRegExp(entityList);
    entityListRegexCache.set(entityList, entityRe);
  }

  return str.replace(
    entityRe,
    (match: string, dec?: string, hex?: string) => {
      if (dec !== undefined) return codePointStrToChar(dec, 10);
      if (hex !== undefined) return codePointStrToChar(hex, 16);
      // Fall back to the matched text if the list changed after caching.
      return entityList[match] ?? match;
    },
  );
}

/**
 * Converts the digits of a numeric character reference to a string.
 *
 * @param codePointStr Digits of the reference, without the `&#` / `&#x` prefix.
 * @param radix 10 for decimal references, 16 for hexadecimal ones.
 * @returns The character for that code point, or U+FFFD REPLACEMENT CHARACTER
 * when the value is greater than U+10FFFF.
 */
function codePointStrToChar(codePointStr: string, radix: number): string {
  const codePoint = parseInt(codePointStr, radix);

  return codePoint > MAX_CODE_POINT
    ? "\uFFFD"
    : String.fromCodePoint(codePoint);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license. | ||
|
||
import { escape, unescape } from "./entities.ts"; | ||
import { assertEquals } from "../testing/asserts.ts"; | ||
import entityList from "./named_entity_list.json" assert { type: "json" }; | ||
|
||
// Covers `escape`: the five characters it rewrites and the ones it leaves alone.
Deno.test("escape", async (t) => {
  await t.step('escapes &<>"', () => {
    assertEquals(escape("&<>'\""), "&amp;&lt;&gt;&#39;&quot;");
  });
  await t.step("escapes ' to &#39; (not &apos;)", () => {
    assertEquals(escape("'"), "&#39;");
  });
  await t.step("doesn't escape non-breaking space", () => {
    assertEquals(escape("\xa0"), "\xa0");
  });
  await t.step(
    "doesn't escape other characters, even if they have named entities",
    () => {
      assertEquals(escape("þð"), "þð");
    },
  );
});
|
||
// Covers `unescape`: named entities (default and full spec list), decimal
// references and hexadecimal references.
Deno.test("unescape", async (t) => {
  await t.step("round-trips with escape", () => {
    const chars = "&<>'\"";
    assertEquals(unescape(escape(chars)), chars);
  });

  await t.step("named entities", async (t) => {
    await t.step("default options", async (t) => {
      await t.step("unescapes &apos; as alias for ' &#39;", () => {
        assertEquals(unescape("&apos;"), "'");
      });
      await t.step("unescapes &nbsp;", () => {
        assertEquals(unescape("&nbsp;"), "\xa0");
      });
      await t.step("doesn't unescape other named entities", () => {
        assertEquals(unescape("&thorn;&eth;"), "&thorn;&eth;");
      });
    });

    await t.step("full entity list", async (t) => {
      await t.step("unescapes arbitrary named entities", () => {
        assertEquals(unescape("&thorn;&eth;", { entityList }), "þð");
      });
      await t.step(
        "unescapes truncated named entity (no trailing semicolon) if it is listed",
        () => {
          assertEquals(unescape("&amp", { entityList }), "&");
        },
      );
      await t.step(
        "consumes full named entity even when a truncated version is specified",
        () => {
          assertEquals(unescape("&amp;", { entityList }), "&");
        },
      );
      await t.step(
        "doesn't unescape truncated named entity if it isn't listed",
        () => {
          assertEquals(
            unescape("&therefore; &therefore", { entityList }),
            "∴ &therefore",
          );
        },
      );
    });
  });

  await t.step("decimal", async (t) => {
    await t.step("unescapes decimal", () => {
      assertEquals(unescape("&#46;"), ".");
    });
    await t.step("unescapes max decimal codepoint", () => {
      assertEquals(unescape("&#1114111;"), "\u{10ffff}");
    });
    await t.step("unescapes decimal with leading zero", () => {
      assertEquals(unescape("&#046;"), ".");
    });
    await t.step(
      "unescapes invalid decimal codepoint to replacement character",
      () => {
        // 1114112 is 0x110000, one past the largest code point.
        assertEquals(unescape("&#1114112;"), "\uFFFD");
      },
    );
  });

  await t.step("hex", async (t) => {
    await t.step("unescapes lower-case hex", () => {
      assertEquals(unescape("&#x2e;"), ".");
    });
    await t.step("unescapes upper-case hex", () => {
      assertEquals(unescape("&#x2E;"), ".");
    });
    await t.step("unescapes hex with leading zero", () => {
      assertEquals(unescape("&#x02E;"), ".");
    });
    await t.step("unescapes max hex codepoint", () => {
      assertEquals(unescape("&#x10ffff;"), "\u{10ffff}");
    });
    await t.step(
      "unescapes invalid hex codepoint to replacement character",
      () => {
        assertEquals(unescape("&#x110000;"), "\uFFFD");
      },
    );
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license. | ||
// This module is browser compatible. | ||
|
||
/** | ||
* Functions for HTML tasks such as escaping or unescaping HTML entities | ||
* | ||
* @module | ||
*/ | ||
|
||
export * from "./entities.ts"; |
Oops, something went wrong.