Commit

swap the use of merged_cohort_stats [ci skip]
abhi18av committed Jul 27, 2023
1 parent 79804aa commit 8a3e469
Showing 5 changed files with 47 additions and 17 deletions.
48 changes: 38 additions & 10 deletions bin/generate_merged_cohort_stats.py
@@ -11,21 +11,49 @@
parser.add_argument('--call_wf_cohort_stats_tsv', default="joint.cohort_stats.tsv", metavar='call_wf_cohort_stats_tsv', type=str, help='File enlisting the cohort results of CALL_WF')
parser.add_argument('--output_file', default="joint.merged_cohort_stats.tsv", metavar='output_file', type=str, help='Name of the output file merged cohort statistics')
args = vars(parser.parse_args())

-# Read all CSV files
-df_cohort_stats = pd.read_csv(args['call_wf_cohort_stats_tsv'], sep="\t", index_col="SAMPLE")
-
-df_approved_relabundance_stats = pd.read_csv(args['relabundance_approved_tsv'], sep="\t", index_col="SAMPLE")
-df_approved_relabundance_stats = df_approved_relabundance_stats.convert_dtypes()
+# Read the TSV files into dataframes
+df_cohort_stats = pd.read_csv(args['call_wf_cohort_stats_tsv'], sep="\t")
+df_cohort_stats.columns = df_cohort_stats.columns.str.strip()
+df_cohort_stats['SAMPLE'] = df_cohort_stats['SAMPLE'].str.strip()
+df_cohort_stats = df_cohort_stats.set_index('SAMPLE')
+
+
+df_approved_relabundance_stats = pd.read_csv(args['relabundance_approved_tsv'], sep="\t")
+df_approved_relabundance_stats.columns = df_approved_relabundance_stats.columns.str.strip()
+df_approved_relabundance_stats['SAMPLE'] = df_approved_relabundance_stats['SAMPLE'].str.strip()
+df_approved_relabundance_stats = df_approved_relabundance_stats.set_index('SAMPLE')
+
+
+df_rejected_relabundance_stats = pd.read_csv(args['relabundance_rejected_tsv'], sep="\t")
+df_rejected_relabundance_stats.columns = df_rejected_relabundance_stats.columns.str.strip()
+df_rejected_relabundance_stats['SAMPLE'] = df_rejected_relabundance_stats['SAMPLE'].str.strip()
+df_rejected_relabundance_stats = df_rejected_relabundance_stats.set_index('SAMPLE')


-df_rejected_relabundance_stats = pd.read_csv(args['relabundance_rejected_tsv'], sep="\t", index_col="SAMPLE")
-df_rejected_relabundance_stats = df_rejected_relabundance_stats.convert_dtypes()

# Join the datasets
df_relabundance_stats_concat = pd.concat([df_approved_relabundance_stats, df_rejected_relabundance_stats])
-df_joint_cohort_stats = df_cohort_stats.join(df_relabundance_stats_concat, on="SAMPLE", how="left")

+df_joint_cohort_stats = df_cohort_stats.join(df_relabundance_stats_concat, how="outer")


# Reorder the columns
+df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()
new_cols = ['AVG_INSERT_SIZE', 'MAPPED_PERCENTAGE', 'RAW_TOTAL_SEQS', 'AVERAGE_BASE_QUALITY', 'MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X', 'LINEAGES', 'FREQUENCIES', 'MAPPED_NTM_FRACTION_16S', 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', 'COVERAGE_THRESHOLD_MET', 'BREADTH_OF_COVERAGE_THRESHOLD_MET', 'RELABUNDANCE_THRESHOLD_MET', 'ALL_THRESHOLDS_MET']
df_final_cohort_stats= df_joint_cohort_stats[new_cols]


+# Impute the NaN value after join
+df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0)
+
+# Prepare for boolean operation
+df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'] = df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].astype('Int64')
+df_final_cohort_stats['COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].astype('Int64')
+df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].astype('Int64')
+df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].astype('Int64')
+
+# Derive the final threshold using Boolean operations
+df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].astype('bool') & df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].astype('bool') & df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].astype('bool') & df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].astype('bool')
+df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0})

# Write the final dataframe to file
df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
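
The core of this change is the move from a left join keyed on SAMPLE to an outer join on a whitespace-stripped index, followed by NaN imputation and nullable-integer casting before the threshold flags are ANDed. A minimal sketch of that pattern on made-up dataframes (the column set is trimmed to two illustrative threshold flags, not the pipeline's real output):

import pandas as pd

# Toy stand-ins for the CALL_WF cohort stats and the relabundance table;
# note the stray trailing space in "s1 " and the missing row for s2.
cohort = pd.DataFrame({"SAMPLE": ["s1 ", "s2"],
                       "MEAN_COVERAGE": [42.0, 7.0],
                       "COVERAGE_THRESHOLD_MET": [1, 0]})
relab = pd.DataFrame({"SAMPLE": ["s1"], "RELABUNDANCE_THRESHOLD_MET": [1]})

# Strip whitespace before indexing, as the script does, so the keys actually match
for df in (cohort, relab):
    df.columns = df.columns.str.strip()
    df["SAMPLE"] = df["SAMPLE"].str.strip()
cohort, relab = cohort.set_index("SAMPLE"), relab.set_index("SAMPLE")

# An outer join keeps s2 even though it has no relabundance record
joint = cohort.join(relab, how="outer")

# Impute the missing flag, cast to a nullable integer, then AND the thresholds
joint["RELABUNDANCE_THRESHOLD_MET"] = joint["RELABUNDANCE_THRESHOLD_MET"].fillna(0).astype("Int64")
joint["ALL_THRESHOLDS_MET"] = (joint["COVERAGE_THRESHOLD_MET"].astype("bool")
                               & joint["RELABUNDANCE_THRESHOLD_MET"].astype("bool")).astype(int)
print(joint)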
9 changes: 5 additions & 4 deletions main.nf
@@ -58,15 +58,16 @@ workflow {

CALL_WF( MAP_WF.out.sorted_reads_ch )

+//NOTE: Samples implicitly get filtered in BCFTOOLS_MERGE if they don't have any identified variants
MINOR_VARIANT_ANALYSIS_WF(CALL_WF.out.reformatted_lofreq_vcfs_tuple_ch)

-UTILS_MERGE_COHORT_STATS ( MINOR_VARIANT_ANALYSIS_WF.out.approved_samples_ch,
-MINOR_VARIANT_ANALYSIS_WF.out.rejected_samples_ch,
-CALL_WF.out.cohort_stats_tsv )
+UTILS_MERGE_COHORT_STATS( MINOR_VARIANT_ANALYSIS_WF.out.approved_samples_ch,
+MINOR_VARIANT_ANALYSIS_WF.out.rejected_samples_ch,
+CALL_WF.out.cohort_stats_tsv )

MERGE_WF( CALL_WF.out.gvcf_ch,
CALL_WF.out.reformatted_lofreq_vcfs_tuple_ch,
-CALL_WF.out.cohort_stats_tsv,
+UTILS_MERGE_COHORT_STATS.out.merged_cohort_stats_ch,
MINOR_VARIANT_ANALYSIS_WF.out.approved_samples_ch,
MINOR_VARIANT_ANALYSIS_WF.out.rejected_samples_ch)

2 changes: 1 addition & 1 deletion modules/utils/merge_cohort_stats.nf
@@ -8,7 +8,7 @@ process UTILS_MERGE_COHORT_STATS {
path(call_wf_cohort_stats_tsv)

output:
path("*merged_cohort_stats.tsv")
path("*merged_cohort_stats.tsv"), emit: merged_cohort_stats_ch


script:
4 changes: 2 additions & 2 deletions workflows/merge_wf.nf
@@ -14,7 +14,7 @@ workflow MERGE_WF {
take:
gvcf_ch
reformatted_lofreq_vcfs_tuple_ch
-cohort_stats_tsv
+merged_cohort_stats_tsv
approved_samples_ch
rejected_samples_ch

@@ -42,7 +42,7 @@

//NOTE: Use the stats file for the entire cohort (from CALL_WF)
// and filter out the samples which pass all thresholds
-approved_call_wf_samples_ch = cohort_stats_tsv
+approved_call_wf_samples_ch = merged_cohort_stats_tsv
.splitCsv(header: false, skip: 1, sep: '\t' )
.map { row -> [
row.first(), // SAMPLE
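
The channel above does in Nextflow roughly what a few lines of pandas would do on the merged stats file. A rough sketch, assuming the collapsed part of the diff keys off the ALL_THRESHOLDS_MET column that generate_merged_cohort_stats.py writes (the file name is that script's default output_file):

import pandas as pd

# Hypothetical pandas equivalent of approved_call_wf_samples_ch: read the merged
# cohort stats and keep the samples whose combined flag is set. Which columns the
# hidden portion of the channel actually checks is an assumption here.
stats = pd.read_csv("joint.merged_cohort_stats.tsv", sep="\t", index_col="SAMPLE")
approved_call_wf_samples = stats.index[stats["ALL_THRESHOLDS_MET"] == 1].tolist()
print(approved_call_wf_samples)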
1 change: 1 addition & 0 deletions workflows/minor_variant_analysis_wf.nf
@@ -19,6 +19,7 @@ workflow MINOR_VARIANT_ANALYSIS_WF {
.reduce { a, b -> "$a $b " }
.dump(tag:'MINOR_VARIANT_WF: vcfs_string_ch', pretty: true)

+//NOTE: Samples implicitly get filtered here if they don't have any identified variants
BCFTOOLS_MERGE(vcfs_string_ch, reformatted_lofreq_vcfs_tuple_ch)

// merge_call_resistance_lofreq
