Add vertical stacking (asreview#21)

laurens88 · Nov 16, 2022 · b40f26a · b40f26a
1 parent ff4f89b
commit b40f26a
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -8,11 +8,12 @@ LAB](https://github.com/asreview/asreview) that can be used to:
 the amount of missing data and duplicates)
 - [**Convert**](#data-convert) file formats via the command line
 - [**Deduplicate**](#data-dedup) data based on properties of the data
+- [**Stack**](#data-vstack-experimental) multiple datasets on top of each other to create a single dataset
 - [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets.
 
 ASReview datatools is available for ASReview Lab **v1.1+**.
 If you are using ASReview Lab v0.x, use [ASReview-statistics](https://pypi.org/project/asreview-statistics/) instead of ASReview datatools.
----
+
 ## Installation
 ASReview Datatools requires Python 3.7+ and [ASReview LAB](https://github.com/asreview/asreview) version 1.1 or later.
 
@@ -48,7 +49,7 @@ Each tool has its own help description which is available with
 ```bash
 asreview data NAME_OF_TOOL -h
 ```
----
+
 ## Tools
 ### Data Describe
 
@@ -164,6 +165,27 @@ platform](https://github.com/asreview/systematic-review-datasets).
 asreview data dedup benchmark:van_de_schoot_2017 -o van_de_schoot_2017_dedup.csv
 ```
 
+### Data Vstack (Experimental)
+Vertical stacking: combine as many datasets as you want into a single dataset.
+
+❗ Vstack is an experimental feature. We would love to hear your feedback.
+Please keep in mind that this feature can change in the future.
+
+Your datasets should be in any [ASReview-compatible data format](https://asreview.readthedocs.io/en/latest/data_format.html).
+All input files should be in the same format, and the output file should be in that format as well.
+
+Stack several datasets on top of each other:
+```bash
+asreview data stack output.csv MY_DATASET_1.csv MY_DATASET_2.csv MY_DATASET_3.csv
+```
+Here, three datasets are stacked and written to a single dataset, `output.csv`.
+The output path can be followed by any number of datasets to be stacked.
+
+#### Note
+Vstack does not do any deduplication.
+For deduplication you might want to use the [deduplication tool](#data-dedup).
+If you wish to create a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets containing labeling decisions while having control over duplicates and labels, use [compose](#data-compose-experimental) instead.
+
 ### Data Compose (Experimental)
 Compose is where datasets with different labels (or no labels) can be assembled into a single dataset.
 
@@ -245,8 +267,6 @@ In case any duplicate ambiguously labeled records exist, either within a dataset
 
 If there are conflicting/contradictory labels, the user is warned, records with inconsistent labels are shown, and the script is aborted.
 
----
-
 ## License
 
 This extension is published under the [MIT license](/LICENSE).

diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
@@ -1,7 +1,6 @@
 import argparse
 
 from asreview.entry_points import BaseEntryPoint
-
 from asreviewcontrib.datatools.compose import _parse_arguments_compose
 from asreviewcontrib.datatools.compose import compose
 from asreviewcontrib.datatools.convert import _parse_arguments_convert
@@ -10,9 +9,10 @@
 from asreviewcontrib.datatools.dedup import dedup
 from asreviewcontrib.datatools.describe import _parse_arguments_describe
 from asreviewcontrib.datatools.describe import describe
+from asreviewcontrib.datatools.stack import _parse_arguments_stack
+from asreviewcontrib.datatools.stack import stack
 
-DATATOOLS = ["describe", "dedup", "convert", "compose"]
-
+# Names of the data tools (sub-commands) provided by this extension.
+DATATOOLS = [
+    "describe",
+    "dedup",
+    "convert",
+    "compose",
+    "stack",
+]
 
 class DataEntryPoint(BaseEntryPoint):
     description = "Home of all data tools for ASReview."
@@ -55,6 +55,11 @@ def execute(self, argv):
                     resolve=args_compose.conflict_resolve,
                 )
 
+            if argv[0] == "stack":
+                args_stack_parser = _parse_arguments_stack()
+                args_stack = args_stack_parser.parse_args(argv[1:])
+                stack(args_stack.output_path, args_stack.datasets)
+
         # Print help message if subcommand not given or incorrect
         else:
 

diff --git a/asreviewcontrib/datatools/stack.py b/asreviewcontrib/datatools/stack.py
@@ -0,0 +1,43 @@
+import argparse
+from pathlib import Path
+
+import pandas as pd
+from asreview import ASReviewData
+from asreview.data.base import load_data
+
+
+def _check_suffix(input_files, output_file):
+    """Verify that the input files and the output file share one file family.
+
+    Suffixes are compared exactly as returned by ``pathlib.Path.suffix``;
+    ``None`` entries in ``input_files`` are ignored. A single distinct suffix
+    always passes. Several distinct suffixes pass only when all of them are
+    RIS-like (``.txt``, ``.ris``) or all of them are tabular (``.csv``,
+    ``.tab``, ``.tsv``, ``.xlsx``).
+
+    A path or URL without a file extension contributes an empty suffix, which
+    belongs to neither family and therefore fails as soon as it is combined
+    with any other suffix.
+
+    Raises:
+        ValueError: If the suffixes do not all belong to the same family.
+    """
+    ris_like = {".txt", ".ris"}
+    tabular = {".csv", ".tab", ".tsv", ".xlsx"}
+
+    found = {Path(name).suffix for name in input_files if name is not None}
+    found.add(Path(output_file).suffix)
+
+    # One distinct suffix, or several from the same family, is acceptable.
+    if len(found) <= 1 or found <= ris_like or found <= tabular:
+        return
+
+    raise ValueError(
+        "• Several file types were given; All input files, as well as the output file should be of the same "
+        "type. "
+    )
+
+
+def stack(output_file, input_files):
+    """Stack the records of several datasets vertically and write the result.
+
+    The file suffixes are validated with ``_check_suffix`` first. Each input
+    is then read with ``load_data``, the resulting data frames are
+    concatenated in the order given and re-indexed from zero, and the combined
+    frame is written to ``output_file`` through ``ASReviewData.to_file``.
+    No deduplication is performed.
+    """
+    _check_suffix(input_files, output_file)
+
+    frames = []
+    for dataset in input_files:
+        frames.append(load_data(dataset).df)
+
+    combined = pd.concat(frames).reset_index(drop=True)
+    ASReviewData(df=combined).to_file(output_file)
+
+
+def _parse_arguments_stack():
+    """Build the command-line parser for the stacking tool.
+
+    The parser takes one positional ``output_path`` followed by one or more
+    positional ``datasets``.
+    """
+    stack_parser = argparse.ArgumentParser(prog="ASReview dataset stacking")
+    stack_parser.add_argument(
+        "output_path",
+        type=str,
+        help="The output file path.",
+    )
+    stack_parser.add_argument(
+        "datasets",
+        type=str,
+        nargs="+",
+        help="Any number of datasets to stack.",
+    )
+    return stack_parser
diff --git a/tests/test_stack.py b/tests/test_stack.py
@@ -0,0 +1,20 @@
+from pathlib import Path
+
+from asreview.data import ASReviewData
+from asreviewcontrib.datatools.stack import stack
+
+
+# Demo datasets live in the "demo_data" folder next to this test module.
+test_dir = Path(__file__).parent
+file_1 = test_dir / "demo_data" / "dataset_1.ris"
+file_2 = test_dir / "demo_data" / "dataset_2.ris"
+
+
+def test_stack(tmpdir):
+    """Stack the two demo RIS files and check the size and label counts."""
+    stacked_path = Path(tmpdir, "test_output.ris")
+    stack(stacked_path, [file_1, file_2])
+
+    stacked = ASReviewData.from_file(stacked_path)
+    # Count each value of the 'included' column once and reuse the result.
+    label_counts = stacked.df['included'].value_counts()
+
+    assert len(stacked.df) == 14
+    assert label_counts[-1] == 9
+    assert label_counts[0] == 3
+    assert label_counts[1] == 2