fixes; code cleanup; replaced local gatk4 modules with repo modules; testing
nf-core · Nov 18, 2021 · 185d6d8 · 185d6d8
1 parent cad30e8
commit 185d6d8
Show file tree

Hide file tree

Showing 26 changed files with 545 additions and 538 deletions.
diff --git a/assets/samplesheet_full.csv b/assets/samplesheet_full.csv
@@ -0,0 +1,5 @@
+sample,fastq_1,fastq_2,strandedness
+GM12878,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603629_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603629_T1_2.fastq.gz,reverse
+GM12878,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603630_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603630_T1_2.fastq.gz,reverse
+K562,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603392_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603392_T1_2.fastq.gz,reverse
+K562,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603393_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603393_T1_2.fastq.gz,reverse
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
@@ -1,2 +1,4 @@
 sample,fastq_1,fastq_2,strandedness
 GM12878,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_2.fastq.gz,reverse
+TEST1,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_2.fastq.gz,reverse
+TEST2,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_2.fastq.gz,reverse
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -1,7 +1,4 @@
-#!/usr/bin/env python
-
-# TODO nf-core: Update the script to check the samplesheet
-# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
+#!/usr/bin/env python3
 
 import os
 import sys
@@ -10,7 +7,7 @@
 
 
 def parse_args(args=None):
-    Description = "Reformat nf-core/rnavar samplesheet file and check its contents."
+    Description = "Reformat nf-core/rnaseq samplesheet file and check its contents."
     Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
 
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
@@ -29,63 +26,66 @@ def make_dir(path):
 
 
 def print_error(error, context="Line", context_str=""):
-    error_str = "ERROR: Please check samplesheet -> {}".format(error)
+    error_str = f"ERROR: Please check samplesheet -> {error}"
     if context != "" and context_str != "":
-        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-            error, context.strip(), context_str.strip()
-        )
+        error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'"
     print(error_str)
     sys.exit(1)
 
 
-# TODO nf-core: Update the check_samplesheet function
 def check_samplesheet(file_in, file_out):
     """
     This function checks that the samplesheet follows the following structure:
 
-    sample,fastq_1,fastq_2
-    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
-    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
-    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
+    sample,fastq_1,fastq_2,strandedness
+    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,forward
+    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz,forward
+    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,,forward
 
     For an example see:
-    https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
+    https://github.com/nf-core/test-datasets/blob/rnaseq/samplesheet/v3.1/samplesheet_test.csv
     """
 
     sample_mapping_dict = {}
-    with open(file_in, "r") as fin:
+    with open(file_in, "r", encoding='utf-8-sig') as fin:
 
         ## Check header
-        MIN_COLS = 2
-        # TODO nf-core: Update the column names for the input samplesheet
-        HEADER = ["sample", "fastq_1", "fastq_2"]
+        MIN_COLS = 3
+        HEADER = ["sample", "fastq_1", "fastq_2", "strandedness"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
         if header[: len(HEADER)] != HEADER:
-            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
+            print(
+                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
+            )
             sys.exit(1)
 
         ## Check sample entries
         for line in fin:
             lspl = [x.strip().strip('"') for x in line.strip().split(",")]
 
-            # Check valid number of columns per row
+            ## Check valid number of columns per row
             if len(lspl) < len(HEADER):
                 print_error(
-                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    f"Invalid number of columns (minimum = {len(HEADER)})!",
                     "Line",
                     line,
                 )
+
             num_cols = len([x for x in lspl if x])
             if num_cols < MIN_COLS:
                 print_error(
-                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
+                    f"Invalid number of populated columns (minimum = {MIN_COLS})!",
                     "Line",
                     line,
                 )
 
             ## Check sample name entries
-            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
-            sample = sample.replace(" ", "_")
+            sample, fastq_1, fastq_2, strandedness = lspl[: len(HEADER)]
+            if sample.find(" ") != -1:
+                print(
+                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
+                )
+                sample = sample.replace(" ", "_")
             if not sample:
                 print_error("Sample entry has not been specified!", "Line", line)
 
@@ -101,16 +101,32 @@ def check_samplesheet(file_in, file_out):
                             line,
                         )
 
+            ## Check strandedness
+            strandednesses = ["unstranded", "forward", "reverse"]
+            if strandedness:
+                if strandedness not in strandednesses:
+                    print_error(
+                        f"Strandedness must be one of '{', '.join(strandednesses)}'!",
+                        "Line",
+                        line,
+                    )
+            else:
+                print_error(
+                    f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.",
+                    "Line",
+                    line,
+                )
+
             ## Auto-detect paired-end/single-end
-            sample_info = []  ## [single_end, fastq_1, fastq_2]
+            sample_info = []  ## [single_end, fastq_1, fastq_2, strandedness]
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2]
+                sample_info = ["0", fastq_1, fastq_2, strandedness]
             elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2]
+                sample_info = ["1", fastq_1, fastq_2, strandedness]
             else:
                 print_error("Invalid combination of columns provided!", "Line", line)
 
-            ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
+            ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, strandedness ]]}
             if sample not in sample_mapping_dict:
                 sample_mapping_dict[sample] = [sample_info]
             else:
@@ -124,17 +140,38 @@ def check_samplesheet(file_in, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
+            fout.write(
+                ",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"])
+                + "\n"
+            )
             for sample in sorted(sample_mapping_dict.keys()):
 
-                ## Check that multiple runs of the same sample are of the same datatype
-                if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
-                    print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
+                ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
+                if not all(
+                    x[0] == sample_mapping_dict[sample][0][0]
+                    for x in sample_mapping_dict[sample]
+                ):
+                    print_error(
+                        f"Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
+                        "Sample",
+                        sample,
+                    )
+
+                ## Check that multiple runs of the same sample are of the same strandedness
+                if not all(
+                    x[-1] == sample_mapping_dict[sample][0][-1]
+                    for x in sample_mapping_dict[sample]
+                ):
+                    print_error(
+                        f"Multiple runs of a sample must have the same strandedness!",
+                        "Sample",
+                        sample,
+                    )
 
                 for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
+                    fout.write(",".join([f"{sample}_T{idx+1}"] + val) + "\n")
     else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
+        print_error("No entries to process!", "Samplesheet", file_in)
 
 
 def main(args=None):

diff --git a/conf/modules.config b/conf/modules.config
@@ -112,12 +112,19 @@ params {
             publish_dir      = 'variant_calling'
             publish_files    = ['vcf.gz':'', 'vcf.gz.tbi':'']
         }
+        'gatk_indexfeaturefile' {
+            args             = ''
+            suffix           = '.haplotypecaller'
+            publish_by_meta  = true
+            publish_dir      = 'variant_calling'
+            publish_files    = ['vcf.gz':'', 'vcf.gz.tbi':'']
+        }
         'gatk_variantfilter' {
             args             = ''
             suffix           = '.filtered'
             publish_by_meta  = true
             publish_dir      = 'variant_filtering'
-            publish_files    = ['vcf':'', 'vcf.idx':'']
+            publish_files    = ['vcf.gz':'', 'vcf.gz.tbi':'']
         }
         'fastqc' {
             args             = "--quiet"

diff --git a/conf/test.config b/conf/test.config
@@ -20,7 +20,8 @@ params {
     max_time   = 6.h
 
     // Input data
-    input = 'assets/samplesheet_test.csv'
+    //input = 'assets/samplesheet_test.csv'
+    input = 'assets/samplesheet_full.csv'
 
     // Genome references
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'

diff --git a/modules.json b/modules.json
@@ -24,12 +24,21 @@
             "gatk4/createsequencedictionary": {
                 "git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
             },
+            "gatk4/indexfeaturefile": {
+                "git_sha": "1a4c7cec1b9d82fdaa15897d8e9a9e9a4767444d"
+            },
             "gatk4/intervallisttools": {
                 "git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
             },
+            "gatk4/mergevcfs": {
+                "git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
+            },
             "gatk4/splitncigarreads": {
                 "git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
             },
+            "gatk4/variantfiltration": {
+                "git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
+            },
             "gffread": {
                 "git_sha": "49da8642876ae4d91128168cd0db4f1c858d7792"
             },

diff --git a/modules/local/gatk4/applybqsr/functions.nf b/modules/local/gatk4/applybqsr/functions.nf
diff --git a/modules/local/gatk4/applybqsr/main.nf b/modules/local/gatk4/applybqsr/main.nf