-
-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: *minor breaking* fix issues with accents and the word splitter (#1330)
* fix: fix issues with accents and the word splitter
- Loading branch information
Showing
7 changed files
with
257 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
import { | ||
regExAccents, | ||
regExAllLower, | ||
regExAllUpper, | ||
regExDanglingQuote, | ||
regExFirstUpper, | ||
regExSplitWords, | ||
regExSplitWords2, | ||
regExTrailingEndings, | ||
} from './textRegex'; | ||
|
||
// Table-driven checks for the regular expressions imported from ./textRegex. | ||
// Accented inputs are passed through nfc()/nfd() so each pattern is exercised with composed and decomposed forms. | ||
describe('Validate textRegex', () => { | ||
// cspell:ignore CODE'ing | ||
// Each expected entry is one match expanded to [full match, ...capture groups], hence string[][]. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${[]} | ||
${'CODEing'} | ${[['ing']]} | ||
${"CODE'ing"} | ${[["'ing"]]} | ||
${"ERROR'd"} | ${[["'d"]]} | ||
${"ERROR's"} | ${[["'s"]]} | ||
${'ERRORs'} | ${[['s']]} | ||
${'ERRORes'} | ${[['es']]} | ||
${'ERRORth'} | ${[['th']]} | ||
${'ERRORnth'} | ${[['nth']]} | ||
${'ERRORies'} | ${[['ies']]} | ||
${nfc('CAFÉed')} | ${[['ed']]} | ||
${nfd('CAFÉed')} | ${[['ed']]} | ||
${nfd('CAFÉ’ed')} | ${[['’ed']]} | ||
${nfd('CAFÉ’s')} | ${[['’s']]} | ||
`('regExTrailingEndings on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => { | ||
// Turn every match object into a plain array so toEqual compares values only. | ||
const m = [...text.matchAll(regExTrailingEndings)].map((m) => Array.from(m)); | ||
expect(m).toEqual(expected); | ||
}); | ||
|
||
// A quote counts as dangling when it leads the word or follows a single letter. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${[]} | ||
${"ERROR's"} | ${[]} | ||
${"'thing"} | ${["'"]} | ||
${"n'cpp"} | ${["'"]} | ||
${"s'thing"} | ${["'"]} | ||
${"A'thing"} | ${["'"]} | ||
${"s 'thing"} | ${["'"]} | ||
${nfc(`é'thing`)} | ${["'"]} | ||
${nfd(`é'thing`)} | ${["'"]} | ||
`('regExDanglingQuote on "$text"', ({ text, expected }: { text: string; expected: string[] }) => { | ||
// match() returns null when nothing matches; ?? [] turns that into an empty list. | ||
const m = text.match(regExDanglingQuote) ?? []; | ||
expect([...m]).toEqual(expected); | ||
}); | ||
|
||
// In NFC the accent is part of the letter, so nothing matches; in NFD the combining mark is matched on its own. | ||
// The expected mark is built by decomposing 'á' and dropping the base letter. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${[]} | ||
${"ERROR's"} | ${[]} | ||
${nfc(`é'thing`)} | ${[]} | ||
${nfd(`é'thing`)} | ${[nfd('á').replace('a', '')]} | ||
`('regExAccents on "$text"', ({ text, expected }: { text: string; expected: string[] }) => { | ||
const m = text.match(regExAccents) ?? []; | ||
expect([...m]).toEqual(expected); | ||
}); | ||
|
||
// cspell:word érror | ||
// The whole text must be upper case for a match; the match is the text itself. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${[]} | ||
${'ERROR'} | ${['ERROR']} | ||
${'ERRORs'} | ${[]} | ||
${nfc(`érror`).toUpperCase()} | ${[nfc('ÉRROR')]} | ||
${nfd(`érror`).toUpperCase()} | ${[nfd('ÉRROR')]} | ||
`('regExAllUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => { | ||
const m = text.match(regExAllUpper) ?? []; | ||
expect([...m]).toEqual(expected); | ||
}); | ||
|
||
// The whole text must be lower case for a match; the match is the text itself. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${['hello']} | ||
${'ERROR'} | ${[]} | ||
${'Errors'} | ${[]} | ||
${nfc(`érror`)} | ${[nfc('érror')]} | ||
${nfd(`érror`)} | ${[nfd('érror')]} | ||
${nfc(`café`)} | ${[nfc('café')]} | ||
${nfd(`café`)} | ${[nfd('café')]} | ||
`('regExAllLower on "$text"', ({ text, expected }: { text: string; expected: string[] }) => { | ||
const m = text.match(regExAllLower) ?? []; | ||
expect([...m]).toEqual(expected); | ||
}); | ||
|
||
// One upper-case letter followed only by lower-case letters. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${[]} | ||
${'ERROR'} | ${[]} | ||
${'Errors'} | ${['Errors']} | ||
${nfc(`Érror`)} | ${[nfc('Érror')]} | ||
${nfd(`Érror`)} | ${[nfd('Érror')]} | ||
`('regExFirstUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => { | ||
const m = text.match(regExFirstUpper) ?? []; | ||
expect([...m]).toEqual(expected); | ||
}); | ||
|
||
// camelCase boundary: group 1 is the last lower-case letter (with its mark, if any), group 2 the upper-case letter after it. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${[]} | ||
${'errorCode'} | ${[['rC', 'r', 'C']]} | ||
${nfc('caféStyle')} | ${[[nfc('éS'), nfc('é'), 'S']]} | ||
${nfd('caféStyle')} | ${[[nfd('éS'), nfd('é'), 'S']]} | ||
${'Errors'} | ${[]} | ||
`('regExSplitWords on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => { | ||
const m = [...text.matchAll(regExSplitWords)].map((m) => Array.from(m)); | ||
expect(m).toEqual(expected); | ||
}); | ||
|
||
// CAPS-to-Word boundary: group 1 is the last letter of the upper-case run, group 2 the upper + lower pair that starts the next word. | ||
test.each` | ||
text | expected | ||
${'hello'} | ${[]} | ||
${'ERRORCode'} | ${[['RCo', 'R', 'Co']]} | ||
${nfc('CAFÉStyle')} | ${[[nfc('ÉSt'), nfc('É'), 'St']]} | ||
${nfd('CAFÉStyle')} | ${[[nfd('ÉSt'), nfd('É'), 'St']]} | ||
${nfc('CODEÉrror')} | ${[[nfc('EÉr'), 'E', nfc('Ér')]]} | ||
${nfd('CODEÉrror')} | ${[[nfd('EÉr'), 'E', nfd('Ér')]]} | ||
${'ERRORS'} | ${[]} | ||
`('regExSplitWords2 on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => { | ||
const m = [...text.matchAll(regExSplitWords2)].map((m) => Array.from(m)); | ||
expect(m).toEqual(expected); | ||
}); | ||
}); | ||
|
||
// function s(t: string, on: string | RegExp = '|'): string[] { | ||
// return t.split(on); | ||
// } | ||
|
||
// Returns `s` in Unicode NFC (canonical composition) form. | ||
// Kept as a `function` declaration so it is hoisted: the test tables earlier in this file call it. | ||
function nfc(s: string): string { | ||
return s.normalize('NFC'); | ||
} | ||
|
||
// Returns `s` in Unicode NFD (canonical decomposition) form. | ||
// Kept as a `function` declaration so it is hoisted: the test tables earlier in this file call it. | ||
function nfd(s: string): string { | ||
return s.normalize('NFD'); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,20 @@ | ||
// cspell:ignore ings ning gimuy anrvtbf | ||
|
||
// NOTE(review): this hunk is rendered without +/- markers. Where a constant is declared twice in a row, the first line is presumably the removed version and the second its replacement; confirm against the commit. | ||
// A run of non-newline characters followed by a line break or the end of input. | ||
export const regExLines = /.*(\r?\n|$)/g; | ||
// Upper-case run, optionally a backslash and an apostrophe, then one of the listed suffixes, not followed by a lower-case letter. | ||
// The second declaration also accepts combining marks (\p{M}) inside the upper-case run. | ||
export const regExUpperSOrIng = /(\p{Lu}+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu; | ||
export const regExUpperSOrIng = /([\p{Lu}\p{M}]+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu; | ||
// Lower-case letter (optional combining mark) immediately followed by an upper-case letter; both sides are captured. | ||
export const regExSplitWords = /(\p{Ll}\p{M}?)(\p{Lu})/gu; | ||
// Upper-case letter (optional mark) followed by an upper-case letter (optional mark) and a lower-case letter; both sides are captured. | ||
export const regExSplitWords2 = /(\p{Lu}\p{M}?)(\p{Lu}\p{M}?\p{Ll})/gu; | ||
// Letters, optionally joined by an apostrophe that may be preceded by a backslash. | ||
// The second declaration additionally allows one combining mark after each letter. | ||
export const regExWords = /\p{L}(?:(?:\\?['’])?\p{L})*/gu; | ||
export const regExWords = /\p{L}\p{M}?(?:(?:\\?['’])?\p{L}\p{M}?)*/gu; | ||
export const regExWordsAndDigits = /(?:\d+)?[\p{L}\p{M}_'’-](?:(?:\\?['’])?[\p{L}\p{M}\w'’-])*/gu; | ||
// Hiragana, Han, Katakana, U+30A0-U+30FF and Hangul characters. | ||
// The second declaration folds the alternation into a single character class. | ||
export const regExIgnoreCharacters = /\p{sc=Hiragana}|\p{sc=Han}|\p{sc=Katakana}|[\u30A0-\u30FF]|[\p{sc=Hangul}]/gu; | ||
export const regExIgnoreCharacters = /[\p{sc=Hiragana}\p{sc=Han}\p{sc=Katakana}\u30A0-\u30FF\p{sc=Hangul}]/gu; | ||
/**
 * Matches a whole string made of one upper-case letter followed by one or more
 * lower-case letters (for example `Errors`).
 *
 * Every letter, not only the first, may be followed by one combining mark, so a
 * decomposed (NFD) word with an accent on a later letter (for example `Café`)
 * is matched the same way as its composed (NFC) form.
 */
export const regExFirstUpper = /^\p{Lu}\p{M}?(?:\p{Ll}\p{M}?)+$/u;
// Whole-string checks: only upper-case (respectively lower-case) letters, each optionally followed by one combining mark. | ||
export const regExAllUpper = /^(?:\p{Lu}\p{M}?)+$/u; | ||
export const regExAllLower = /^(?:\p{Ll}\p{M}?)+$/u; | ||
export const regExPossibleWordBreaks = /[_-]/g; | ||
// Matches a `/pattern/flags` string, capturing the pattern (group 1) and the flags (group 2). | ||
export const regExMatchRegExParts = /^\/(.*)\/([gimuy]*)$/; | ||
// Any single character with the Unicode Mark property. | ||
export const regExAccents = /\p{M}/gu; | ||
// One of the letters a, n, r, v, t, b, f (case-insensitive) directly preceded by a backslash. | ||
export const regExEscapeCharacters = /(?<=\\)[anrvtbf]/gi; | ||
// NOTE(review): the next two constants each appear twice because this hunk is rendered without +/- markers; presumably the first of each pair is the removed line. Confirm against the commit. | ||
export const regExDanglingQuote = /(?<=\P{L}\p{L}?)[']/gu; | ||
/** Matches against leading `'` or `{single letter}'` */ | ||
export const regExDanglingQuote = /(?<=(?:^|(?!\p{M})\P{L})(?:\p{L}\p{M}?)?)[']/gu; | ||
/** Match trailing endings after CAPS words */ | ||
export const regExTrailingEndings = /(?<=\p{Lu}{2})['’]?(?:s|d|ing[s]|ies|e[ds]|ning|th|nth)(?!\p{Ll})/gu; | ||
export const regExTrailingEndings = /(?<=(?:\p{Lu}\p{M}?){2})['’]?(?:s|d|ings?|ies|e[ds]?|ning|th|nth)(?!\p{Ll})/gu; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters