Commit

code and datasets public

orionw committed Aug 28, 2019
0 parents commit f3aa06a

Showing 53 changed files with 1,341,623 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
env/
*.pyc
full_datasets/reddit_jokes/reddit_cleaning/output
21 changes: 21 additions & 0 deletions LICENSE.txt
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 Orion Weller

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
22 changes: 22 additions & 0 deletions README.md
@@ -0,0 +1,22 @@
# Humor Detection
## Code and Datasets for the Paper "Humor Detection: A Transformer Gets the Last Laugh" by Orion Weller and Kevin Seppi
The repository contains the following:
- A way to regenerate the results found in the paper, by running `bash run_bert.sh`.
- The full datasets referenced in the paper (Short Jokes, Puns, and the Reddit dataset), located in `full_datasets`; the `data` folder contains the split files used for training and testing. Running `create_data.sh` recreates the splits (they differ slightly from the ones used in the paper; see the notes in `create_data.sh`).
- `pytorch_pretrained_bert`, which contains files used by the model. These files come from the [huggingface repo](https://github.com/huggingface/pytorch-transformers#Training-large-models-introduction,-tools-and-examples) and are NOT up to date with the current `pytorch-transformers` repo.

**This repository is not maintained and will not be updated.**

## How to cite this paper:
Authors of scientific papers who use this repository are encouraged to cite the following paper:
```
@article{humorDetection2019,
  title = {Humor Detection: A Transformer Gets the Last Laugh},
  author = {Weller, Orion and Seppi, Kevin},
  journal = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
  month = nov,
  year = {2019},
}
```


11 changes: 11 additions & 0 deletions create_data.sh
@@ -0,0 +1,11 @@
# This script takes the full data and regenerates the TSV files needed for the model.
# Unfortunately, due to my inexperience at the time, I didn't set random seeds in all the
# locations that need them, so regeneration is not exactly reproducible. Fortunately, the
# original data splits are saved in the `data` folder. Recreating the splits with these
# scripts gives results within ~1% (in either direction) of those reported in the paper.
pip3 install -r requirements.txt
# process the data
python3 full_datasets/reddit_jokes/reddit_cleaning/GetSplitFiles.py
python3 full_datasets/reddit_jokes/reddit_cleaning/GetTSVFileForBERT.py
cp full_datasets/reddit_jokes/reddit_cleaning/output/output_for_bert/full/*.tsv data/
# remove the validation set
rm data/dev.tsv
# move the test set for evaluation
mv data/test.tsv data/dev.tsv
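
A minimal sketch of the kind of seeding the comment above refers to; the seed value and placement are illustrative assumptions, not the paper's configuration:

```python
import random

import numpy as np

# Illustrative seed; the original runs did not pin all of these.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# scikit-learn utilities take an explicit random_state rather than a global
# seed, e.g. train_test_split(..., random_state=SEED) and
# resample(..., random_state=SEED), as GetSplitFiles.py already does in places.
```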
3 changes: 3 additions & 0 deletions data/README.md
@@ -0,0 +1,3 @@
# Data Folder For Training/Testing
The `.tsv` files in the base of this folder are the Reddit Full dataset, split into the training data and the global test set (named `dev.tsv` for convenience, so the same code can be reused). The actual dev file is located in the `reddit_full` folder.
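
For reference, a minimal sketch of loading these splits, assuming the four unnamed columns written by `GetTSVFileForBERT.py` (row index, score, placeholder letter, text); note that the generating scripts use pandas' default comma separator even though the files are named `.tsv`:

```python
import pandas as pd

# Column layout follows GetTSVFileForBERT.py: index, score (0/1),
# placeholder letter, text. The separator is a comma despite the .tsv name.
cols = ["idx", "score", "same_letter", "text"]
train = pd.read_csv("data/train.tsv", names=cols, header=None)
dev = pd.read_csv("data/dev.tsv", names=cols, header=None)

print(train["score"].value_counts())  # quick class-balance check
print(dev.head())
```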

608 changes: 608 additions & 0 deletions data/dev.tsv

603 changes: 603 additions & 0 deletions data/puns/dev.tsv

605 changes: 605 additions & 0 deletions data/puns/test.tsv

3,619 changes: 3,619 additions & 0 deletions data/puns/train.tsv

608 changes: 608 additions & 0 deletions data/reddit_body_only/dev.tsv

608 changes: 608 additions & 0 deletions data/reddit_body_only/test.tsv

19,438 changes: 19,438 additions & 0 deletions data/reddit_body_only/train.tsv

608 changes: 608 additions & 0 deletions data/reddit_full/dev.csv

608 changes: 608 additions & 0 deletions data/reddit_full/test.tsv

19,438 changes: 19,438 additions & 0 deletions data/reddit_full/train.tsv

608 changes: 608 additions & 0 deletions data/reddit_punchline_only/dev.tsv

608 changes: 608 additions & 0 deletions data/reddit_punchline_only/test.tsv

19,438 changes: 19,438 additions & 0 deletions data/reddit_punchline_only/train.tsv

57,914 changes: 57,914 additions & 0 deletions data/short_jokes/dev.tsv

57,916 changes: 57,916 additions & 0 deletions data/short_jokes/test.tsv

347,486 changes: 347,486 additions & 0 deletions data/short_jokes/train.tsv

19,438 changes: 19,438 additions & 0 deletions data/train.tsv

4,827 changes: 4,827 additions & 0 deletions full_datasets/puns/puns_pos_neg_data.csv

36 changes: 36 additions & 0 deletions full_datasets/reddit_jokes/reddit_cleaning/ConvertForCNN.py
@@ -0,0 +1,36 @@
import pandas as pd

def split_on_score(df):
    # Split into positive (label 1) and negative (label 0) examples, dropping
    # the label column from each; drop() returns a new frame here, which avoids
    # pandas' SettingWithCopyWarning on these slices.
    pos = df[df["score"] == 1].drop("score", axis=1)
    neg = df[df["score"] == 0].drop("score", axis=1)
    assert len(pos) + len(neg) == len(df)
    return pos, neg

# Convert each dataset variant (full joke, punchline only, body only) from the
# BERT-format files into separate positive/negative text files for the CNN baseline.
for variant in ["full", "punch", "body"]:
    train = pd.read_csv("output/output_for_bert/{}/train.csv".format(variant), encoding="utf-8", names=["score", "same_letter", "text"], index_col=None)
    dev = pd.read_csv("output/output_for_bert/{}/dev.csv".format(variant), encoding="utf-8", names=["score", "same_letter", "text"], index_col=None)
    test = pd.read_csv("output/output_for_bert/{}/test.csv".format(variant), encoding="utf-8", names=["score", "same_letter", "text"], index_col=None)

    # The constant placeholder column is only needed by the BERT data loader.
    train.drop("same_letter", axis=1, inplace=True)
    dev.drop("same_letter", axis=1, inplace=True)
    test.drop("same_letter", axis=1, inplace=True)

    # This is where we split into positive and negative for the CNN.
    train_pos, train_neg = split_on_score(train)
    test_pos, test_neg = split_on_score(test)
    dev_pos, dev_neg = split_on_score(dev)

    train_pos.to_csv("output/output_for_bert/{}/train_pos.txt".format(variant), encoding="utf-8", header=None, index=None, sep=' ', mode='w')
    test_pos.to_csv("output/output_for_bert/{}/test_pos.txt".format(variant), encoding="utf-8", header=None, index=None, sep=' ', mode='w')
    dev_pos.to_csv("output/output_for_bert/{}/dev_pos.txt".format(variant), encoding="utf-8", header=None, index=None, sep=' ', mode='w')

    train_neg.to_csv("output/output_for_bert/{}/train_neg.txt".format(variant), encoding="utf-8", header=None, index=None, sep=' ', mode='w')
    test_neg.to_csv("output/output_for_bert/{}/test_neg.txt".format(variant), encoding="utf-8", header=None, index=None, sep=' ', mode='w')
    dev_neg.to_csv("output/output_for_bert/{}/dev_neg.txt".format(variant), encoding="utf-8", header=None, index=None, sep=' ', mode='w')

    print(train_pos.head())

print("Output has been created")

79 changes: 79 additions & 0 deletions full_datasets/reddit_jokes/reddit_cleaning/GetSplitFiles.py
@@ -0,0 +1,79 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils import resample
import os

base_path = os.path.join("full_datasets", "reddit_jokes")
all_features = pd.read_csv(os.path.join(base_path, "reddit_full_data.csv"), encoding="utf-8", index_col=0)
print(all_features.head())

# Binarize the upvote score: jokes with more than CUTOFF upvotes count as funny (1).
CUTOFF = 200
all_features["score"] = (all_features["score"] > CUTOFF).astype(int)
print(all_features["score"].value_counts())

all_features["text"] = all_features["title"] + "_____" + all_features["selftext"]
all_features = all_features.dropna()

all_features["text"] = all_features["text"].apply(lambda s: s.replace('\n', ''))
all_features["text"] = all_features["text"].apply(lambda s: s.replace('\t', ''))
all_features["text"] = all_features["text"].apply(lambda s: s.replace('\r', ''))

def upsample(df_majority, df_minority):
    # Upsample the minority class with replacement to match the majority class size.
    df_minority_upsampled = resample(df_minority,
                                     replace=True,                    # sample with replacement
                                     n_samples=df_majority.shape[0],  # to match majority class
                                     random_state=42)                 # reproducible results
    # Combine the majority class with the upsampled minority class
    return pd.concat([df_majority, df_minority_upsampled])

def downsample(df_majority, df_minority):
    # Downsample the majority class without replacement to match the minority class size.
    df_majority_downsampled = resample(df_majority,
                                       replace=False,                   # sample without replacement
                                       n_samples=df_minority.shape[0],  # to match minority class
                                       random_state=42)                 # reproducible results
    # Combine the minority class with the downsampled majority class
    return pd.concat([df_minority, df_majority_downsampled])

# 70/30 stratified train/holdout split; the holdout is then split evenly into dev and test.
data_train, data_split = train_test_split(all_features, test_size=0.3, stratify=all_features["score"], random_state=17)
print(data_train.shape, data_split.shape)
data_train.reset_index(inplace=True, drop=True)
data_val, data_test = train_test_split(data_split, test_size=0.5, stratify=data_split["score"], random_state=17)
print(data_val.shape, data_test.shape)

# Balance the classes: downsample dev and test, upsample train.
print("Sampling")
df_majority = data_val[data_val["score"] == 0]
df_minority = data_val[data_val["score"] == 1]
data_val = downsample(df_majority, df_minority)
print(data_val.shape, "is the shape of the validation set")

df_majority = data_test[data_test["score"] == 0]
df_minority = data_test[data_test["score"] == 1]
data_test = downsample(df_majority, df_minority)
print(data_test.shape, "is the shape of the test set")

df_majority = data_train[data_train["score"] == 0]
df_minority = data_train[data_train["score"] == 1]
data_train = upsample(df_majority, df_minority)
data_train = shuffle(data_train)  # note: unseeded; one of the spots that makes regeneration inexact
print(data_train.shape, "is the shape of the train set")

print(data_train["score"].value_counts())

data_train.to_csv(os.path.join(base_path, "reddit_cleaning", "output/train.csv"), encoding="utf-8", header=True)
data_test.to_csv(os.path.join(base_path, "reddit_cleaning", "output/test.csv"), encoding="utf-8", header=True)
data_val.to_csv(os.path.join(base_path, "reddit_cleaning", "output/dev.csv"), encoding="utf-8", header=True,)

print("Done splitting")
49 changes: 49 additions & 0 deletions full_datasets/reddit_jokes/reddit_cleaning/GetTSVFileForBERT.py
@@ -0,0 +1,49 @@
import pandas as pd
import os

base_path = os.path.join("full_datasets", "reddit_jokes", "reddit_cleaning")
train = pd.read_csv(os.path.join(base_path, "output/train.csv"), encoding="utf-8")
dev = pd.read_csv(os.path.join(base_path, "output/dev.csv"), encoding="utf-8")
test = pd.read_csv(os.path.join(base_path, "output/test.csv"), encoding="utf-8")

print(train.head())

def ready_for_bert(given_df, keep_column="text"):
    # Strip newlines/tabs/carriage returns so each example fits on one line,
    # add the constant placeholder column the BERT data loader expects,
    # and keep only (score, placeholder, text).
    data = given_df.copy(deep=True)
    data[keep_column] = data[keep_column].apply(lambda s: s.replace('\n', ''))
    data[keep_column] = data[keep_column].apply(lambda s: s.replace('\t', ''))
    data[keep_column] = data[keep_column].apply(lambda s: s.replace('\r', ''))
    data["same_letter"] = "a"
    return data[["score", "same_letter", keep_column]]

train_full = ready_for_bert(train)
test_full = ready_for_bert(test)
dev_full = ready_for_bert(dev)

# The "body" variant keeps only the joke setup (the post title).
train_body = ready_for_bert(train, "title")
test_body = ready_for_bert(test, "title")
dev_body = ready_for_bert(dev, "title")

# The "punch" variant keeps only the punchline (the post selftext).
train_punch = ready_for_bert(train, "selftext")
test_punch = ready_for_bert(test, "selftext")
dev_punch = ready_for_bert(dev, "selftext")

print(train_full.head())
print(train_full["score"].value_counts())

train_full.to_csv(os.path.join(base_path, "output/output_for_bert/full/train.tsv"), encoding="utf-8", header=False)
test_full.to_csv(os.path.join(base_path, "output/output_for_bert/full/test.tsv"), encoding="utf-8", header=False)
dev_full.to_csv(os.path.join(base_path, "output/output_for_bert/full/dev.tsv"), encoding="utf-8", header=False,)


train_body.to_csv(os.path.join(base_path, "output/output_for_bert/body/train.tsv"), encoding="utf-8", header=False)
test_body.to_csv(os.path.join(base_path, "output/output_for_bert/body/test.tsv"), encoding="utf-8", header=False)
dev_body.to_csv(os.path.join(base_path, "output/output_for_bert/body/dev.tsv"), encoding="utf-8", header=False,)


train_punch.to_csv(os.path.join(base_path, "output/output_for_bert/punch/train.tsv"), encoding="utf-8", header=False)
test_punch.to_csv(os.path.join(base_path, "output/output_for_bert/punch/test.tsv"), encoding="utf-8", header=False)
dev_punch.to_csv(os.path.join(base_path, "output/output_for_bert/punch/dev.tsv"), encoding="utf-8", header=False,)
