forked from explosion/spaCy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Replace pytokenizations with internal alignment (explosion#6293)
* Replace pytokenizations with internal alignment

  Replace pytokenizations with an internal alignment algorithm that is restricted to only allow differences in whitespace and capitalization.
* Rename `spacy.training.align` to `spacy.training.alignment` to contain the `Alignment` dataclass
* Implement `get_alignments` in `spacy.training.align`
* Refactor trailing whitespace handling
* Remove unnecessary exception for empty docs

  Allow a non-empty whitespace-only doc to be aligned with an empty doc
* Remove empty docs exceptions completely
- Loading branch information
1 parent
a4b32b9
commit 1c4df8f
Showing
10 changed files
with
182 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
from typing import List, Tuple | ||
from itertools import chain | ||
import re | ||
|
||
from ..errors import Errors | ||
|
||
|
||
def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
    """Align two tokenizations of the same text.

    The two token sequences may differ only in whitespace and capitalization.
    Alignment is computed by walking both lowercased, concatenated texts
    character by character and recording which tokens overlap.

    A (List[str]): The first sequence of token texts.
    B (List[str]): The second sequence of token texts.
    RETURNS (Tuple[List[List[int]], List[List[int]]]): `(a2b, b2a)`, where
        `a2b[i]` is the sorted list of indices of tokens in B that share at
        least one aligned character with the i-th token of A that has
        characters, and `b2a` is the same in the other direction. Tokens
        consisting only of unaligned whitespace get an empty list.
    RAISES (ValueError): With `Errors.E949` if the texts differ in anything
        other than whitespace and capitalization.
    """
    # Create character-to-token mappings. The lengths are taken from the
    # lowercased tokens because lowercasing can change the number of
    # characters, and the mappings have to line up with str_a / str_b below.
    char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))
    char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))
    str_a = "".join(A).lower()
    str_b = "".join(B).lower()
    cdef int len_str_a = len(str_a)
    cdef int len_str_b = len(str_b)
    # Check that the two texts only differ in whitespace and capitalization
    # and that every character position maps to a token
    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
            len_str_a != len(char_to_token_a) or \
            len_str_b != len(char_to_token_b):
        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    cdef int char_idx_a = 0
    cdef int char_idx_b = 0
    cdef int token_idx_a = 0
    cdef int token_idx_b = 0
    cdef int prev_token_idx_a = -1
    cdef int prev_token_idx_b = -1
    a2b = []
    b2a = []
    while char_idx_a < len_str_a and char_idx_b < len_str_b:
        # Find the current token position from the character position
        token_idx_a = char_to_token_a[char_idx_a]
        token_idx_b = char_to_token_b[char_idx_b]
        # Add a set for the next token if a token boundary has been crossed
        if prev_token_idx_a != token_idx_a:
            a2b.append(set())
        if prev_token_idx_b != token_idx_b:
            b2a.append(set())
        # Process the alignment at the current position
        if A[token_idx_a] == B[token_idx_b] and \
                (char_idx_a == 0 or
                    char_to_token_a[char_idx_a - 1] != token_idx_a) and \
                (char_idx_b == 0 or
                    char_to_token_b[char_idx_b - 1] != token_idx_b):
            # Current tokens are identical and both character offsets are the
            # start of a token. Without the start-of-token condition, equal
            # tokens at different offsets would be skipped as a whole and
            # their overlaps with neighbouring tokens would be lost.
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            # If lowercasing lengthened the token, the remaining characters
            # are consumed one by one by the branch below
            char_idx_a += len(A[token_idx_a])
            char_idx_b += len(B[token_idx_b])
        elif str_a[char_idx_a] == str_b[char_idx_b]:
            # Current chars are identical
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            char_idx_a += 1
            char_idx_b += 1
        elif str_a[char_idx_a].isspace():
            # Skip unaligned whitespace char in A
            char_idx_a += 1
        elif str_b[char_idx_b].isspace():
            # Skip unaligned whitespace char in B
            char_idx_b += 1
        else:
            # This should never happen
            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
        prev_token_idx_a = token_idx_a
        prev_token_idx_b = token_idx_b
    # Process unaligned trailing whitespace: add an empty entry for every
    # token that has not been started yet. The token that is already open
    # (prev_token_idx_*) has its entry and must not be counted again.
    trailing_a = set(char_to_token_a[char_idx_a:]) - {prev_token_idx_a}
    trailing_b = set(char_to_token_b[char_idx_b:]) - {prev_token_idx_b}
    a2b.extend(set() for _ in trailing_a)
    b2a.extend(set() for _ in trailing_b)
    # Return values as sorted lists per token position
    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters