Commit

swap the use of merged_cohort_stats [ci skip]
abhi18av committed Jul 27, 2023
1 parent 79804aa commit 8a3e469
Showing 5 changed files with 47 additions and 17 deletions.
48 changes: 38 additions & 10 deletions bin/generate_merged_cohort_stats.py
@@ -11,21 +11,49 @@
parser.add_argument('--call_wf_cohort_stats_tsv', default="joint.cohort_stats.tsv", metavar='call_wf_cohort_stats_tsv', type=str, help='File enlisting the cohort results of CALL_WF')
parser.add_argument('--output_file', default="joint.merged_cohort_stats.tsv", metavar='output_file', type=str, help='Name of the output file merged cohort statistics')
args = vars(parser.parse_args())

-# Read all CSV files
-df_cohort_stats = pd.read_csv(args['call_wf_cohort_stats_tsv'], sep="\t", index_col="SAMPLE")
-
-df_approved_relabundance_stats = pd.read_csv(args['relabundance_approved_tsv'], sep="\t", index_col="SAMPLE")
-df_approved_relabundance_stats = df_approved_relabundance_stats.convert_dtypes()
+# Read the TSV files into dataframes
+df_cohort_stats = pd.read_csv(args['call_wf_cohort_stats_tsv'], sep="\t")
+df_cohort_stats.columns = df_cohort_stats.columns.str.strip()
+df_cohort_stats['SAMPLE'] = df_cohort_stats['SAMPLE'].str.strip()
+df_cohort_stats = df_cohort_stats.set_index('SAMPLE')
+
+
+df_approved_relabundance_stats = pd.read_csv(args['relabundance_approved_tsv'], sep="\t")
+df_approved_relabundance_stats.columns = df_approved_relabundance_stats.columns.str.strip()
+df_approved_relabundance_stats['SAMPLE'] = df_approved_relabundance_stats['SAMPLE'].str.strip()
+df_approved_relabundance_stats = df_approved_relabundance_stats.set_index('SAMPLE')
+
+
+df_rejected_relabundance_stats = pd.read_csv(args['relabundance_rejected_tsv'], sep="\t")
+df_rejected_relabundance_stats.columns = df_rejected_relabundance_stats.columns.str.strip()
+df_rejected_relabundance_stats['SAMPLE'] = df_rejected_relabundance_stats['SAMPLE'].str.strip()
+df_rejected_relabundance_stats = df_rejected_relabundance_stats.set_index('SAMPLE')


-df_rejected_relabundance_stats = pd.read_csv(args['relabundance_rejected_tsv'], sep="\t", index_col="SAMPLE")
-df_rejected_relabundance_stats = df_rejected_relabundance_stats.convert_dtypes()

# Join the datasets
df_relabundance_stats_concat = pd.concat([df_approved_relabundance_stats, df_rejected_relabundance_stats])
-df_joint_cohort_stats = df_cohort_stats.join(df_relabundance_stats_concat, on="SAMPLE", how="left")

+df_joint_cohort_stats = df_cohort_stats.join(df_relabundance_stats_concat, how="outer")


# Reorder the columns
+df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()
new_cols = ['AVG_INSERT_SIZE', 'MAPPED_PERCENTAGE', 'RAW_TOTAL_SEQS', 'AVERAGE_BASE_QUALITY', 'MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X', 'LINEAGES', 'FREQUENCIES', 'MAPPED_NTM_FRACTION_16S', 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', 'COVERAGE_THRESHOLD_MET', 'BREADTH_OF_COVERAGE_THRESHOLD_MET', 'RELABUNDANCE_THRESHOLD_MET', 'ALL_THRESHOLDS_MET']
df_final_cohort_stats= df_joint_cohort_stats[new_cols]


+# Impute the NaN value after join
+df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0)
+
+# Prepare for boolean operation
+df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'] = df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].astype('Int64')
+df_final_cohort_stats['COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].astype('Int64')
+df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].astype('Int64')
+df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].astype('Int64')
+
+# Derive the final threshold using Boolean operations
+df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].astype('bool') & df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].astype('bool') & df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].astype('bool') & df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].astype('bool')
+df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0})

# Write the final dataframe to file
df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
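
The core of this change is the move from a left join keyed on SAMPLE to an outer join on a whitespace-stripped index, followed by NaN imputation and nullable-integer casting before the threshold flags are ANDed. A minimal sketch of that pattern on made-up dataframes (the column set is trimmed to two illustrative threshold flags, not the pipeline's real output):

import pandas as pd

# Toy stand-ins for the CALL_WF cohort stats and the relabundance table;
# note the stray trailing space in "s1 " and the missing row for s2.
cohort = pd.DataFrame({"SAMPLE": ["s1 ", "s2"],
                       "MEAN_COVERAGE": [42.0, 7.0],
                       "COVERAGE_THRESHOLD_MET": [1, 0]})
relab = pd.DataFrame({"SAMPLE": ["s1"], "RELABUNDANCE_THRESHOLD_MET": [1]})

# Strip whitespace before indexing, as the script does, so the keys actually match
for df in (cohort, relab):
    df.columns = df.columns.str.strip()
    df["SAMPLE"] = df["SAMPLE"].str.strip()
cohort, relab = cohort.set_index("SAMPLE"), relab.set_index("SAMPLE")

# An outer join keeps s2 even though it has no relabundance record
joint = cohort.join(relab, how="outer")

# Impute the missing flag, cast to a nullable integer, then AND the thresholds
joint["RELABUNDANCE_THRESHOLD_MET"] = joint["RELABUNDANCE_THRESHOLD_MET"].fillna(0).astype("Int64")
joint["ALL_THRESHOLDS_MET"] = (joint["COVERAGE_THRESHOLD_MET"].astype("bool")
                               & joint["RELABUNDANCE_THRESHOLD_MET"].astype("bool")).astype(int)
print(joint)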
9 changes: 5 additions & 4 deletions main.nf
@@ -58,15 +58,16 @@ workflow {

CALL_WF( MAP_WF.out.sorted_reads_ch )

+//NOTE: Samples implicitly get filtered in BCFTOOLS_MERGE if they don't have any identified variants
MINOR_VARIANT_ANALYSIS_WF(CALL_WF.out.reformatted_lofreq_vcfs_tuple_ch)

-UTILS_MERGE_COHORT_STATS ( MINOR_VARIANT_ANALYSIS_WF.out.approved_samples_ch,
-MINOR_VARIANT_ANALYSIS_WF.out.rejected_samples_ch,
-CALL_WF.out.cohort_stats_tsv )
+UTILS_MERGE_COHORT_STATS( MINOR_VARIANT_ANALYSIS_WF.out.approved_samples_ch,
+MINOR_VARIANT_ANALYSIS_WF.out.rejected_samples_ch,
+CALL_WF.out.cohort_stats_tsv )

MERGE_WF( CALL_WF.out.gvcf_ch,
CALL_WF.out.reformatted_lofreq_vcfs_tuple_ch,
-CALL_WF.out.cohort_stats_tsv,
+UTILS_MERGE_COHORT_STATS.out.merged_cohort_stats_ch,
MINOR_VARIANT_ANALYSIS_WF.out.approved_samples_ch,
MINOR_VARIANT_ANALYSIS_WF.out.rejected_samples_ch)

2 changes: 1 addition & 1 deletion modules/utils/merge_cohort_stats.nf
@@ -8,7 +8,7 @@ process UTILS_MERGE_COHORT_STATS {
path(call_wf_cohort_stats_tsv)

output:
path("*merged_cohort_stats.tsv")
path("*merged_cohort_stats.tsv"), emit: merged_cohort_stats_ch


script:
4 changes: 2 additions & 2 deletions workflows/merge_wf.nf
@@ -14,7 +14,7 @@ workflow MERGE_WF {
take:
gvcf_ch
reformatted_lofreq_vcfs_tuple_ch
-cohort_stats_tsv
+merged_cohort_stats_tsv
approved_samples_ch
rejected_samples_ch

@@ -42,7 +42,7 @@

//NOTE: Use the stats file for the entire cohort (from CALL_WF)
// and filter out the samples which pass all thresholds
-approved_call_wf_samples_ch = cohort_stats_tsv
+approved_call_wf_samples_ch = merged_cohort_stats_tsv
.splitCsv(header: false, skip: 1, sep: '\t' )
.map { row -> [
row.first(), // SAMPLE
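
The channel above does in Nextflow roughly what a few lines of pandas would do on the merged stats file. A rough sketch, assuming the collapsed part of the diff keys off the ALL_THRESHOLDS_MET column that generate_merged_cohort_stats.py writes (the file name is that script's default output_file):

import pandas as pd

# Hypothetical pandas equivalent of approved_call_wf_samples_ch: read the merged
# cohort stats and keep the samples whose combined flag is set. Which columns the
# hidden portion of the channel actually checks is an assumption here.
stats = pd.read_csv("joint.merged_cohort_stats.tsv", sep="\t", index_col="SAMPLE")
approved_call_wf_samples = stats.index[stats["ALL_THRESHOLDS_MET"] == 1].tolist()
print(approved_call_wf_samples)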
1 change: 1 addition & 0 deletions workflows/minor_variant_analysis_wf.nf
@@ -19,6 +19,7 @@ workflow MINOR_VARIANT_ANALYSIS_WF {
.reduce { a, b -> "$a $b " }
.dump(tag:'MINOR_VARIANT_WF: vcfs_string_ch', pretty: true)

+//NOTE: Samples implicitly get filtered here if they don't have any identified variants
BCFTOOLS_MERGE(vcfs_string_ch, reformatted_lofreq_vcfs_tuple_ch)

// merge_call_resistance_lofreq
