Nghiauet add comment for clarify #196

Open
wants to merge 7 commits into master
Changes from 1 commit
add document for function
Nghiauet committed Aug 11, 2023
commit 6106c84af52f0b0805b5b43d5599f9a91583e3e0
36 changes: 26 additions & 10 deletions utils/preprocess_data.py
@@ -258,7 +258,7 @@ def align_sequences(source_sent, target_sent):
target_sent (list): Target sentence

Returns:
list: [start transform idx, end transform idx, transform type]
string: string in which each source token is tagged with its transform operation
transform types are taken from [$APPEND, $REPLACE, $TRANSFORM, $DELETE, MERGE_{merge_type}]
"""
# check if sent is OK
@@ -304,9 +304,9 @@ def align_sequences(source_sent, target_sent):
all_edits.extend(edits)

# get labels
labels = convert_edits_into_labels(source_tokens, all_edits) # label each sentence
labels = convert_edits_into_labels(source_tokens, all_edits) # label each token
# match tags to source tokens
sent_with_tags = add_labels_to_the_tokens(source_tokens, labels)
sent_with_tags = add_labels_to_the_tokens(source_tokens, labels) # create a tagged sentence from the list of labels
return sent_with_tags
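
A usage sketch for the updated docstring may help reviewers. The snippet below is an assumption-heavy illustration: it assumes the repository root is on the Python path and that align_sequences accepts plain sentence strings; the tags mentioned in the comment are illustrative only, since the exact output depends on the transform dictionaries shipped with the repo.

```python
# Reviewer sketch, not part of this commit. Import path and inputs are assumptions.
from utils.preprocess_data import align_sequences

source = "he go to school yesterday"
target = "he went to school yesterday"

tagged = align_sequences(source, target)
# Per the updated docstring, `tagged` is a single string in which the $START
# token and every source token carry a transform tag such as $KEEP, $APPEND_*,
# $REPLACE_*, $TRANSFORM_*, $DELETE, or a MERGE tag.
print(tagged)
```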


Expand Down Expand Up @@ -415,13 +415,23 @@ def convert_alignments_into_edits(alignment, shift_idx):


def add_labels_to_the_tokens(source_tokens, labels, delimeters=SEQ_DELIMETERS):
""" create a sentence from list of labels

Args:
source_tokens (list): source tokens
labels (list): label for transform operation
delimeters (dictionary, optional): delimeter for tokens, labels and operation. Defaults to SEQ_DELIMETERS.

Returns:
string : string of tokens with all tags operation
"""
tokens_with_all_tags = []
source_tokens_with_start = [START_TOKEN] + source_tokens
source_tokens_with_start = [START_TOKEN] + source_tokens # add start token
for token, label_list in zip(source_tokens_with_start, labels):
all_tags = delimeters['operations'].join(label_list)
comb_record = token + delimeters['labels'] + all_tags
comb_record = token + delimeters['labels'] + all_tags # join the token with its operation tags
tokens_with_all_tags.append(comb_record)
return delimeters['tokens'].join(tokens_with_all_tags)
return delimeters['tokens'].join(tokens_with_all_tags)
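
To make the documented joining behaviour concrete, here is a hedged usage sketch. The delimiter values and example labels are made up for readability; the real SEQ_DELIMETERS in this module may use different separator strings, and the import path assumes the repo root is on the Python path.

```python
# Reviewer sketch, not part of this commit. The delimiter values below are
# hypothetical; only the joining behaviour mirrors the documented function.
from utils.preprocess_data import add_labels_to_the_tokens

demo_delims = {"tokens": " ", "labels": "|", "operations": ","}
source_tokens = ["he", "go"]
# label lists are aligned with [$START] + source_tokens
labels = [["$KEEP"], ["$KEEP"], ["$REPLACE_goes", "$APPEND_now"]]

print(add_labels_to_the_tokens(source_tokens, labels, delimeters=demo_delims))
# -> "$START|$KEEP he|$KEEP go|$REPLACE_goes,$APPEND_now"
```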


def convert_data_from_raw_files(source_file, target_file, output_file, chunk_size):
Expand Down Expand Up @@ -474,15 +484,21 @@ def convert_data_from_raw_files(source_file, target_file, output_file, chunk_siz
write_lines(output_file, tagged, 'a')


# This function converts the labels into edits
def convert_labels_into_edits(labels):
all_edits = []
"""Converts a list of labels into a list of edits.
Args:
labels (list of lists of str): A list of labels for each word.
Returns:
list of (int, int, list of str) tuples: A list of edits.
"""
edits = []
for i, label_list in enumerate(labels):
if label_list == ["$KEEP"]:
continue
else:
edit = [(i - 1, i), label_list]
all_edits.append(edit)
return all_edits
edits.append((i - 1, i, label_list))
return edits
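
One reviewer note on the rewritten function: the old code returned edits shaped as [(start, end), label_list], while the new code returns flat (start, end, label_list) tuples, so any caller that unpacks the old two-part shape would need the same update. A small behavioural sketch, assuming the function is importable from utils.preprocess_data:

```python
# Reviewer sketch, not part of this commit; the import path is an assumption.
from utils.preprocess_data import convert_labels_into_edits

# index 0 is the label list for the $START token, so edit spans are (i - 1, i)
labels = [["$KEEP"], ["$KEEP"], ["$REPLACE_goes"], ["$DELETE"]]

print(convert_labels_into_edits(labels))
# -> [(1, 2, ['$REPLACE_goes']), (2, 3, ['$DELETE'])]
# The previous version would have returned [[(1, 2), ['$REPLACE_goes']],
#                                           [(2, 3), ['$DELETE']]]
```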


def get_target_sent_by_levels(source_tokens, labels):