Skip to content

Commit

Permalink
fixes; code cleanup; replaced local gatk4 modules with repo modules; …
Browse files Browse the repository at this point in the history
…testing
  • Loading branch information
praveenraj2018 committed Nov 18, 2021
1 parent cad30e8 commit 185d6d8
Show file tree
Hide file tree
Showing 26 changed files with 545 additions and 538 deletions.
5 changes: 5 additions & 0 deletions assets/samplesheet_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sample,fastq_1,fastq_2,strandedness
GM12878,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603629_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603629_T1_2.fastq.gz,reverse
GM12878,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603630_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603630_T1_2.fastq.gz,reverse
K562,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603392_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603392_T1_2.fastq.gz,reverse
K562,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603393_T1_1.fastq.gz,s3://nf-core-awsmegatests/rnaseq/input_data/SRX1603393_T1_2.fastq.gz,reverse
2 changes: 2 additions & 0 deletions assets/samplesheet_test.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
sample,fastq_1,fastq_2,strandedness
GM12878,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_2.fastq.gz,reverse
TEST1,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_2.fastq.gz,reverse
TEST2,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test.rnaseq_2.fastq.gz,reverse
107 changes: 72 additions & 35 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
#!/usr/bin/env python

# TODO nf-core: Update the script to check the samplesheet
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
#!/usr/bin/env python3

import os
import sys
Expand All @@ -10,7 +7,7 @@


def parse_args(args=None):
Description = "Reformat nf-core/rnavar samplesheet file and check its contents."
Description = "Reformat nf-core/rnavar samplesheet file and check its contents."
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
Expand All @@ -29,63 +26,66 @@ def make_dir(path):


def print_error(error, context="Line", context_str=""):
    """Print a samplesheet error message to stdout and exit with status 1.

    Args:
        error: Description of the problem; appended to the fixed
            "ERROR: Please check samplesheet -> " prefix.
        context: Label for the offending item (e.g. "Line" or "Sample").
        context_str: The offending item itself. The context line is only
            emitted when both ``context`` and ``context_str`` are non-empty.

    Raises:
        SystemExit: Always, with exit code 1, after the message is printed.
    """
    error_str = f"ERROR: Please check samplesheet -> {error}"
    if context != "" and context_str != "":
        # Strip surrounding whitespace so a raw samplesheet line (which still
        # carries its trailing newline) is quoted cleanly on a single line.
        error_str += f"\n{context.strip()}: '{context_str.strip()}'"
    print(error_str)
    sys.exit(1)


# TODO nf-core: Update the check_samplesheet function
def check_samplesheet(file_in, file_out):
"""
This function checks that the samplesheet follows the following structure:
sample,fastq_1,fastq_2
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
sample,fastq_1,fastq_2,strandedness
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,forward
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz,forward
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,,forward
For an example see:
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
https://github.com/nf-core/test-datasets/blob/rnaseq/samplesheet/v3.1/samplesheet_test.csv
"""

sample_mapping_dict = {}
with open(file_in, "r") as fin:
with open(file_in, "r", encoding='utf-8-sig') as fin:

## Check header
MIN_COLS = 2
# TODO nf-core: Update the column names for the input samplesheet
HEADER = ["sample", "fastq_1", "fastq_2"]
MIN_COLS = 3
HEADER = ["sample", "fastq_1", "fastq_2", "strandedness"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
print(
f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
)
sys.exit(1)

## Check sample entries
for line in fin:
lspl = [x.strip().strip('"') for x in line.strip().split(",")]

# Check valid number of columns per row
## Check valid number of columns per row
if len(lspl) < len(HEADER):
print_error(
"Invalid number of columns (minimum = {})!".format(len(HEADER)),
f"Invalid number of columns (minimum = {len(HEADER)})!",
"Line",
line,
)

num_cols = len([x for x in lspl if x])
if num_cols < MIN_COLS:
print_error(
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
f"Invalid number of populated columns (minimum = {MIN_COLS})!",
"Line",
line,
)

## Check sample name entries
sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
sample = sample.replace(" ", "_")
sample, fastq_1, fastq_2, strandedness = lspl[: len(HEADER)]
if sample.find(" ") != -1:
print(
f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
)
sample = sample.replace(" ", "_")
if not sample:
print_error("Sample entry has not been specified!", "Line", line)

Expand All @@ -101,16 +101,32 @@ def check_samplesheet(file_in, file_out):
line,
)

## Check strandedness
strandednesses = ["unstranded", "forward", "reverse"]
if strandedness:
if strandedness not in strandednesses:
print_error(
f"Strandedness must be one of '{', '.join(strandednesses)}'!",
"Line",
line,
)
else:
print_error(
f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.",
"Line",
line,
)

## Auto-detect paired-end/single-end
sample_info = [] ## [single_end, fastq_1, fastq_2]
sample_info = [] ## [single_end, fastq_1, fastq_2, strandedness]
if sample and fastq_1 and fastq_2: ## Paired-end short reads
sample_info = ["0", fastq_1, fastq_2]
sample_info = ["0", fastq_1, fastq_2, strandedness]
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
sample_info = ["1", fastq_1, fastq_2]
sample_info = ["1", fastq_1, fastq_2, strandedness]
else:
print_error("Invalid combination of columns provided!", "Line", line)

## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, strandedness ]]}
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = [sample_info]
else:
Expand All @@ -124,17 +140,38 @@ def check_samplesheet(file_in, file_out):
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
fout.write(
",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"])
+ "\n"
)
for sample in sorted(sample_mapping_dict.keys()):

## Check that multiple runs of the same sample are of the same datatype
if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
if not all(
x[0] == sample_mapping_dict[sample][0][0]
for x in sample_mapping_dict[sample]
):
print_error(
f"Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
"Sample",
sample,
)

## Check that multiple runs of the same sample are of the same strandedness
if not all(
x[-1] == sample_mapping_dict[sample][0][-1]
for x in sample_mapping_dict[sample]
):
print_error(
f"Multiple runs of a sample must have the same strandedness!",
"Sample",
sample,
)

for idx, val in enumerate(sample_mapping_dict[sample]):
fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
fout.write(",".join([f"{sample}_T{idx+1}"] + val) + "\n")
else:
print_error("No entries to process!", "Samplesheet: {}".format(file_in))
print_error("No entries to process!", "Samplesheet", file_in)


def main(args=None):
Expand Down
9 changes: 8 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,19 @@ params {
publish_dir = 'variant_calling'
publish_files = ['vcf.gz':'', 'vcf.gz.tbi':'']
}
'gatk_indexfeaturefile' {
args = ''
suffix = '.haplotypecaller'
publish_by_meta = true
publish_dir = 'variant_calling'
publish_files = ['vcf.gz':'', 'vcf.gz.tbi':'']
}
'gatk_variantfilter' {
args = ''
suffix = '.filtered'
publish_by_meta = true
publish_dir = 'variant_filtering'
publish_files = ['vcf':'', 'vcf.idx':'']
publish_files = ['vcf.gz':'', 'vcf.gz.tbi':'']
}
'fastqc' {
args = "--quiet"
Expand Down
3 changes: 2 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ params {
max_time = 6.h

// Input data
input = 'assets/samplesheet_test.csv'
//input = 'assets/samplesheet_test.csv'
input = 'assets/samplesheet_full.csv'

// Genome references
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
Expand Down
9 changes: 9 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,21 @@
"gatk4/createsequencedictionary": {
"git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
},
"gatk4/indexfeaturefile": {
"git_sha": "1a4c7cec1b9d82fdaa15897d8e9a9e9a4767444d"
},
"gatk4/intervallisttools": {
"git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
},
"gatk4/mergevcfs": {
"git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
},
"gatk4/splitncigarreads": {
"git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
},
"gatk4/variantfiltration": {
"git_sha": "3b600af50eae8264960df817277cfe303d2acd47"
},
"gffread": {
"git_sha": "49da8642876ae4d91128168cd0db4f1c858d7792"
},
Expand Down
68 changes: 0 additions & 68 deletions modules/local/gatk4/applybqsr/functions.nf

This file was deleted.

46 changes: 0 additions & 46 deletions modules/local/gatk4/applybqsr/main.nf

This file was deleted.

Loading

0 comments on commit 185d6d8

Please sign in to comment.