From 5f7aea915a213be99fd26884d58dc257b7338229 Mon Sep 17 00:00:00 2001
From: "Matthew A. Wyczalkowski" <m.wyczalkowski@wustl.edu>
Date: Tue, 6 Sep 2022 19:02:35 -0500
Subject: [PATCH] Bugfixes to make sure catalog2, catalog3, and demographics
 match

---
 1_run_discovery.sh              | 29 ++++++++++++++++++++++++++++-
 3_make_catalog3.sh              |  4 ++--
 results/CPTAC3.Demographics.tsv |  5 -----
 src/make_catalog2.sh            | 27 ++++++++++++++++++++++++++-
 src/make_catalog3.py            |  2 +-
 src/process_demographics.sh     |  7 ++++++-
 src/run_discovery.sh            | 22 ++--------------------
 7 files changed, 65 insertions(+), 31 deletions(-)
 delete mode 100644 results/CPTAC3.Demographics.tsv

diff --git a/1_run_discovery.sh b/1_run_discovery.sh
index 3649009..5b88171 100644
--- a/1_run_discovery.sh
+++ b/1_run_discovery.sh
@@ -2,7 +2,34 @@
 
 source discovery_config.sh
 
-CMD="bash src/run_discovery.sh $@ -J 10 -vvv -t $GDC_TOKEN $CASES "
+mkdir -p logs
+LOGE="logs/1_run_discovery.err"
+LOGO="logs/1_run_discovery.out"
+
+CMD="bash src/run_discovery.sh $@ -J 10 -vvv -t $GDC_TOKEN $CASES  > $LOGO 2> $LOGE"
 >&2 echo Running: $CMD
+>&2 echo Writing logs to $LOGO and $LOGE
 eval $CMD
 
+
+# this makes assumptions about log output.  Better to make discovery less noisy
+OUTD="logs/outputs" 
+NERR=$(grep -il error $OUTD/*/*log* | wc -l)
+if grep -q -i error $OUTD/*/*log* ; then
+    >&2 echo The following $NERR files had errors \(top 10 shown\):
+    grep -il error $OUTD/*/*log* | head
+else
+    >&2 echo No errors found
+fi
+NWRN=$(grep -il warning $OUTD/*/*log* | wc -l)
+if grep -q -i warning $OUTD/*/*log* ; then
+    >&2 echo The following $NWRN files had warnings \(top 10 shown\):
+    grep -il warning $OUTD/*/*log* | head
+
+    # Give examples of warnings found, ignoring trivial ones
+    grep -h -i warning $LOGE $LOGO | grep -v "exists. Deleting" | sort -u | head
+else
+    >&2 echo No warnings found
+fi
+
+
diff --git a/3_make_catalog3.sh b/3_make_catalog3.sh
index 5dddacf..7b4436f 100644
--- a/3_make_catalog3.sh
+++ b/3_make_catalog3.sh
@@ -1,10 +1,10 @@
 source discovery_config.sh
 
+# Making catalog2 using data from previous discovery run
 LOGE="logs/process_catalog3.err"
 LOGO="logs/process_catalog3.out"
 
-#CMD="bash src/process_catalog.sh $@ $PROJECT $CASES > $LOGO 2> $LOGE "
-CMD="bash src/process_catalog.sh $@ $PROJECT $CASES "
+CMD="bash src/process_catalog.sh $@ $PROJECT $CASES > $LOGO 2> $LOGE "
 >&2 echo Running: $CMD
 >&2 echo Writing logs to $LOGO and $LOGE
 eval $CMD
diff --git a/results/CPTAC3.Demographics.tsv b/results/CPTAC3.Demographics.tsv
deleted file mode 100644
index 219ee5c..0000000
--- a/results/CPTAC3.Demographics.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-case	disease	ethnicity	gender	race	days_to_birth
-C3L-01838	unknown	not hispanic or latino	male	white	-25663
-C3L-01839	unknown	not hispanic or latino	male	white	-25831
-C3L-01840	unknown	not hispanic or latino	female	white	-20522
-C3L-01861	unknown	not hispanic or latino	male	white	-20974
diff --git a/src/make_catalog2.sh b/src/make_catalog2.sh
index a5305fd..8fd8ad0 100644
--- a/src/make_catalog2.sh
+++ b/src/make_catalog2.sh
@@ -12,7 +12,7 @@ read -r -d '' USAGE <<'EOF'
 Write a comprehensive summary of aligned reads and methylation array from GDC.  v2.2
 
 Usage:
-  make_catalog.sh [options] CASE DISEASE
+  make_catalog2.sh [options] CASE DISEASE
 
 Options:
 -h: Print this help message
@@ -533,6 +533,18 @@ function process_reads {
 #   10 experimental strategy
 #   11 md5sum
 
+# Update summer 2022: making discovery common for catalog2 and catalog3 means
+# that "assumed reference" is better interpreted as "alignment", and can have
+# the following values:
+# * submitted_aligned
+# * submitted_unaligned
+# * harmonized
+# * NA
+# For the output, catalog2 uses the following valurs for reference:
+#    * hg19 for submitted aligned reads
+#    * NA for submitted unaligned reads
+#    * hg38 for harmonized reads
+
     # Loop over all lines in input file RFN and write catalog entry for each
     while read L; do
 
@@ -572,6 +584,19 @@ function process_reads {
             exit 1
         fi
 
+        # Rename reference to be consistent with Catalog2 usage
+        # * submitted_aligned -> hg19
+        # * submitted_unaligned -> NA
+        # * harmonized -> hg38
+        # * NA -> NA (this includes methylation)
+        if [ $REF == "submitted_aligned" ]; then
+            REF="hg19"
+        elif [ $REF == "submitted_unaligned" ]; then
+            REF="NA"
+        elif [ $REF == "harmonized" ]; then
+            REF="hg38"
+        fi                
+
         # Get result type for harmonized RNA-Seq BAMs: genomic, chimeric, transcriptome
         #   example: 73746f82-9ea4-45ac-87d8-bf0e3dc0c2fe.rna_seq.transcriptome.gdc_realn.bam
         RESULT_TYPE="NA"
diff --git a/src/make_catalog3.py b/src/make_catalog3.py
index c982d98..45cb0d3 100644
--- a/src/make_catalog3.py
+++ b/src/make_catalog3.py
@@ -114,7 +114,7 @@ def get_dv_string(rf_row):
 def get_data_variety_FASTQ(rf):
     FQ_ix = rf['data_format']=='FASTQ'
     BM_ix = rf['data_format']=='BAM' 
-    UA_ix = rf['alignment']=='unaligned'
+    UA_ix = rf['alignment']=='submitted_unaligned'
 
     target_ix = FQ_ix | (BM_ix & UA_ix)
 
diff --git a/src/process_demographics.sh b/src/process_demographics.sh
index ade99f1..91b8eeb 100644
--- a/src/process_demographics.sh
+++ b/src/process_demographics.sh
@@ -14,8 +14,10 @@ Options:
 -d: Dry run.  Print commands but do not execute queries
 -v: Verbose.  May be repeated to get verbose output from called scripts
 -L LOGBASE: base directory of runtime output.  Default ./logs
+-1: stop after processing one case
 
 CASES is a TSV file with case name and disease in first and second columns
+We are adding the disease as a field here, as it is no longer written during discovery
 
 PROJECT (e.g., CPTAC3) is passed directly to catalog3 column
 
@@ -29,7 +31,7 @@ DESTD="./results"
 
 # Using rungo as a template for parallel: https://github.com/ding-lab/TinDaisy/blob/master/src/rungo
 # http://wiki.bash-hackers.org/howto/getopts_tutorial
-while getopts ":hdvD:L:" opt; do
+while getopts ":hdvD:L:1" opt; do
   case $opt in
     h)
       echo "$USAGE"
@@ -47,6 +49,9 @@ while getopts ":hdvD:L:" opt; do
     L)
       LOGBASE="$OPTARG"
       ;;
+    1)
+      JUSTONE=1
+      ;;
     \?)
       >&2 echo "Invalid option: -$OPTARG"
       echo "$USAGE"
diff --git a/src/run_discovery.sh b/src/run_discovery.sh
index 0492619..41fb89c 100644
--- a/src/run_discovery.sh
+++ b/src/run_discovery.sh
@@ -15,7 +15,7 @@ Options:
 -v: Verbose.  May be repeated to get verbose output from called scripts
 -J N: Evaluate N cases in parallel.  If 0, disable parallel mode. Default 0
 -1: stop after processing one case
--L LOGBASE: base directory of runtime output.  Default ./dat
+-L LOGBASE: base directory of runtime output.  Default ./logs
 -t GDC_TOKEN: GDC token file
 
 CASES is a TSV file with case name and disease in first and second columns
@@ -164,7 +164,7 @@ function process_cases {
         STDOUT_FN="$LOGD/log.${CASE}.out"
         STDERR_FN="$LOGD/log.${CASE}.err"
 
-        CMD="bash src/process_case.sh $XARGS -t $GDC_TOKEN -O $LOGD $DEM $VERBOSE_ARG $CASE > $STDOUT_FN 2> $STDERR_FN"
+        CMD="bash src/process_case.sh $XARGS -t $GDC_TOKEN -O $LOGD -D $DIS $DEM $VERBOSE_ARG $CASE > $STDOUT_FN 2> $STDERR_FN"
 
         if [ $NJOBS != 0 ]; then
             JOBLOG="$LOGD/$CASE.log"
@@ -205,23 +205,5 @@ fi
 
 END=$(date)
 >&2 echo [ $END ] Discovery complete
-
-OUTD="$LOGBASE/outputs" # must match value in src/process_multi_cases.sh
-NERR=$(grep -il error $OUTD/*/*log* | wc -l)
-if grep -q -i error $OUTD/*/*log* ; then
-    >&2 echo The following $NERR files had errors \(top 10 shown\):
-    grep -il error $OUTD/*/*log* | head
-else
-    >&2 echo No errors found
-fi
-NWRN=$(grep -il warning $OUTD/*/*log* | wc -l)
-if grep -q -i warning $OUTD/*/*log* ; then
-    >&2 echo The following $NWRN files had warnings \(top 10 shown\):
-    grep -il warning $OUTD/*/*log* | head
-else
-    >&2 echo No warnings found
-fi
-
 >&2 echo Timing summary: 
 >&2 echo Discovery start: [ $START ]  End: [ $END ]
-