From 5f7aea915a213be99fd26884d58dc257b7338229 Mon Sep 17 00:00:00 2001 From: "Matthew A. Wyczalkowski" Date: Tue, 6 Sep 2022 19:02:35 -0500 Subject: [PATCH] Bugfixes to make sure catalog2, catalog3, and demographics match --- 1_run_discovery.sh | 29 ++++++++++++++++++++++++++++- 3_make_catalog3.sh | 4 ++-- results/CPTAC3.Demographics.tsv | 5 ----- src/make_catalog2.sh | 27 ++++++++++++++++++++++++++- src/make_catalog3.py | 2 +- src/process_demographics.sh | 7 ++++++- src/run_discovery.sh | 22 ++-------------------- 7 files changed, 65 insertions(+), 31 deletions(-) delete mode 100644 results/CPTAC3.Demographics.tsv diff --git a/1_run_discovery.sh b/1_run_discovery.sh index 3649009..5b88171 100644 --- a/1_run_discovery.sh +++ b/1_run_discovery.sh @@ -2,7 +2,34 @@ source discovery_config.sh -CMD="bash src/run_discovery.sh $@ -J 10 -vvv -t $GDC_TOKEN $CASES " +mkdir -p logs +LOGE="logs/1_run_discovery.err" +LOGO="logs/1_run_discovery.out" + +CMD="bash src/run_discovery.sh $@ -J 10 -vvv -t $GDC_TOKEN $CASES > $LOGO 2> $LOGE" >&2 echo Running: $CMD +>&2 echo Writing logs to $LOGO and $LOGE eval $CMD + +# this makes assumptions about log output. Better to make discovery less noisy +OUTD="logs/outputs" +NERR=$(grep -il error $OUTD/*/*log* | wc -l) +if grep -q -i error $OUTD/*/*log* ; then + >&2 echo The following $NERR files had errors \(top 10 shown\): + grep -il error $OUTD/*/*log* | head +else + >&2 echo No errors found +fi +NWRN=$(grep -il warning $OUTD/*/*log* | wc -l) +if grep -q -i warning $OUTD/*/*log* ; then + >&2 echo The following $NWRN files had warnings \(top 10 shown\): + grep -il warning $OUTD/*/*log* | head + + # Give examples of warnings found, ignoring trivial ones + grep -h -i warning $LOGE $LOGO | grep -v "exists. Deleting" | sort -u | head +else + >&2 echo No warnings found +fi + + diff --git a/3_make_catalog3.sh b/3_make_catalog3.sh index 5dddacf..7b4436f 100644 --- a/3_make_catalog3.sh +++ b/3_make_catalog3.sh @@ -1,10 +1,10 @@ source discovery_config.sh +# Making catalog2 using data from previous discovery run LOGE="logs/process_catalog3.err" LOGO="logs/process_catalog3.out" -#CMD="bash src/process_catalog.sh $@ $PROJECT $CASES > $LOGO 2> $LOGE " -CMD="bash src/process_catalog.sh $@ $PROJECT $CASES " +CMD="bash src/process_catalog.sh $@ $PROJECT $CASES > $LOGO 2> $LOGE " >&2 echo Running: $CMD >&2 echo Writing logs to $LOGO and $LOGE eval $CMD diff --git a/results/CPTAC3.Demographics.tsv b/results/CPTAC3.Demographics.tsv deleted file mode 100644 index 219ee5c..0000000 --- a/results/CPTAC3.Demographics.tsv +++ /dev/null @@ -1,5 +0,0 @@ -case disease ethnicity gender race days_to_birth -C3L-01838 unknown not hispanic or latino male white -25663 -C3L-01839 unknown not hispanic or latino male white -25831 -C3L-01840 unknown not hispanic or latino female white -20522 -C3L-01861 unknown not hispanic or latino male white -20974 diff --git a/src/make_catalog2.sh b/src/make_catalog2.sh index a5305fd..8fd8ad0 100644 --- a/src/make_catalog2.sh +++ b/src/make_catalog2.sh @@ -12,7 +12,7 @@ read -r -d '' USAGE <<'EOF' Write a comprehensive summary of aligned reads and methylation array from GDC. v2.2 Usage: - make_catalog.sh [options] CASE DISEASE + make_catalog2.sh [options] CASE DISEASE Options: -h: Print this help message @@ -533,6 +533,18 @@ function process_reads { # 10 experimental strategy # 11 md5sum +# Update summer 2022: making discovery common for catalog2 and catalog3 means +# that "assumed reference" is better interpreted as "alignment", and can have +# the following values: +# * submitted_aligned +# * submitted_unaligned +# * harmonized +# * NA +# For the output, catalog2 uses the following valurs for reference: +# * hg19 for submitted aligned reads +# * NA for submitted unaligned reads +# * hg38 for harmonized reads + # Loop over all lines in input file RFN and write catalog entry for each while read L; do @@ -572,6 +584,19 @@ function process_reads { exit 1 fi + # Rename reference to be consistent with Catalog2 usage + # * submitted_aligned -> hg19 + # * submitted_unaligned -> NA + # * harmonized -> hg38 + # * NA -> NA (this includes methylation) + if [ $REF == "submitted_aligned" ]; then + REF="hg19" + elif [ $REF == "submitted_unaligned" ]; then + REF="NA" + elif [ $REF == "harmonized" ]; then + REF="hg38" + fi + # Get result type for harmonized RNA-Seq BAMs: genomic, chimeric, transcriptome # example: 73746f82-9ea4-45ac-87d8-bf0e3dc0c2fe.rna_seq.transcriptome.gdc_realn.bam RESULT_TYPE="NA" diff --git a/src/make_catalog3.py b/src/make_catalog3.py index c982d98..45cb0d3 100644 --- a/src/make_catalog3.py +++ b/src/make_catalog3.py @@ -114,7 +114,7 @@ def get_dv_string(rf_row): def get_data_variety_FASTQ(rf): FQ_ix = rf['data_format']=='FASTQ' BM_ix = rf['data_format']=='BAM' - UA_ix = rf['alignment']=='unaligned' + UA_ix = rf['alignment']=='submitted_unaligned' target_ix = FQ_ix | (BM_ix & UA_ix) diff --git a/src/process_demographics.sh b/src/process_demographics.sh index ade99f1..91b8eeb 100644 --- a/src/process_demographics.sh +++ b/src/process_demographics.sh @@ -14,8 +14,10 @@ Options: -d: Dry run. Print commands but do not execute queries -v: Verbose. May be repeated to get verbose output from called scripts -L LOGBASE: base directory of runtime output. Default ./logs +-1: stop after processing one case CASES is a TSV file with case name and disease in first and second columns +We are adding the disease as a field here, as it is no longer written during discovery PROJECT (e.g., CPTAC3) is passed directly to catalog3 column @@ -29,7 +31,7 @@ DESTD="./results" # Using rungo as a template for parallel: https://github.com/ding-lab/TinDaisy/blob/master/src/rungo # http://wiki.bash-hackers.org/howto/getopts_tutorial -while getopts ":hdvD:L:" opt; do +while getopts ":hdvD:L:1" opt; do case $opt in h) echo "$USAGE" @@ -47,6 +49,9 @@ while getopts ":hdvD:L:" opt; do L) LOGBASE="$OPTARG" ;; + 1) + JUSTONE=1 + ;; \?) >&2 echo "Invalid option: -$OPTARG" echo "$USAGE" diff --git a/src/run_discovery.sh b/src/run_discovery.sh index 0492619..41fb89c 100644 --- a/src/run_discovery.sh +++ b/src/run_discovery.sh @@ -15,7 +15,7 @@ Options: -v: Verbose. May be repeated to get verbose output from called scripts -J N: Evaluate N cases in parallel. If 0, disable parallel mode. Default 0 -1: stop after processing one case --L LOGBASE: base directory of runtime output. Default ./dat +-L LOGBASE: base directory of runtime output. Default ./logs -t GDC_TOKEN: GDC token file CASES is a TSV file with case name and disease in first and second columns @@ -164,7 +164,7 @@ function process_cases { STDOUT_FN="$LOGD/log.${CASE}.out" STDERR_FN="$LOGD/log.${CASE}.err" - CMD="bash src/process_case.sh $XARGS -t $GDC_TOKEN -O $LOGD $DEM $VERBOSE_ARG $CASE > $STDOUT_FN 2> $STDERR_FN" + CMD="bash src/process_case.sh $XARGS -t $GDC_TOKEN -O $LOGD -D $DIS $DEM $VERBOSE_ARG $CASE > $STDOUT_FN 2> $STDERR_FN" if [ $NJOBS != 0 ]; then JOBLOG="$LOGD/$CASE.log" @@ -205,23 +205,5 @@ fi END=$(date) >&2 echo [ $END ] Discovery complete - -OUTD="$LOGBASE/outputs" # must match value in src/process_multi_cases.sh -NERR=$(grep -il error $OUTD/*/*log* | wc -l) -if grep -q -i error $OUTD/*/*log* ; then - >&2 echo The following $NERR files had errors \(top 10 shown\): - grep -il error $OUTD/*/*log* | head -else - >&2 echo No errors found -fi -NWRN=$(grep -il warning $OUTD/*/*log* | wc -l) -if grep -q -i warning $OUTD/*/*log* ; then - >&2 echo The following $NWRN files had warnings \(top 10 shown\): - grep -il warning $OUTD/*/*log* | head -else - >&2 echo No warnings found -fi - >&2 echo Timing summary: >&2 echo Discovery start: [ $START ] End: [ $END ] -