Skip to content

Commit

Permalink
fix: drop duplicate data
Browse files (browse the repository at this point in the history)
  • Loading branch information
uoo723 committed Sep 27, 2022
1 parent bcf589e commit 946fb14
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/datasets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(
os.path.join(root_data_dir, aug_filename), low_memory=False
)
assert "발화문" in df.columns and "인텐트" in df.columns
- self.df = df
+ self.df = df[["발화문", "인텐트"]].drop_duplicates().reset_index(drop=True)
return

if mode == "train":
Expand All @@ -42,7 +42,11 @@ def __init__(
elif mode == "test":
df = pd.read_csv(os.path.join(root_data_dir, "test.csv"), low_memory=False)

- self.df = df[df["QA여부"] == "q"]
+ self.df = (
+ df[df["QA여부"] == "q"][["발화문", "인텐트"]]
+ .drop_duplicates()
+ .reset_index(drop=True)
+ )

def __getitem__(self, index) -> Tuple[str, str]:
return tuple(self.df.iloc[index][["발화문", "인텐트"]].tolist())
Expand Down

0 comments on commit 946fb14

Please sign in to comment.