Skip to content

Commit

Permalink
Bugfixes to make sure catalog2, catalog3, and demographics match
Browse files Browse the repository at this point in the history
  • Loading branch information
mwyczalkowski committed Sep 7, 2022
1 parent 83cdc6f commit 5f7aea9
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 31 deletions.
29 changes: 28 additions & 1 deletion 1_run_discovery.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,34 @@

source discovery_config.sh

CMD="bash src/run_discovery.sh $@ -J 10 -vvv -t $GDC_TOKEN $CASES "
# Make sure the log directory exists before redirecting output into it
mkdir -p logs
LOGE="logs/1_run_discovery.err"
LOGO="logs/1_run_discovery.out"

# Run discovery directly rather than assembling a command string for eval, so
# that arguments containing spaces or shell metacharacters reach
# src/run_discovery.sh exactly as they were given to this script.
# stdout of the run is captured in $LOGO and stderr in $LOGE.
>&2 echo "Running: bash src/run_discovery.sh $* -J 10 -vvv -t $GDC_TOKEN $CASES > $LOGO 2> $LOGE"
>&2 echo "Writing logs to $LOGO and $LOGE"
bash src/run_discovery.sh "$@" -J 10 -vvv -t "$GDC_TOKEN" "$CASES" > "$LOGO" 2> "$LOGE"


# Summarize which per-case log files mention errors or warnings.
# This makes assumptions about log output. Better to make discovery less noisy
OUTD="logs/outputs"

# Report the per-case log files under $OUTD that contain a keyword.
# Arguments: $1 - keyword to search for, case-insensitively (e.g. error)
#            $2 - plural noun used in the messages (e.g. errors)
# Globals:   OUTD (read); NHITS (written) - number of files that matched
# Outputs:   header line or "No ... found" on stderr;
#            names of the first 10 matching files on stdout
# Returns:   0 if at least one file matched, 1 otherwise
report_log_matches() {
    # Scan the logs once and reuse the result for the count, the test and the listing
    _matches=$(grep -il -- "$1" "$OUTD"/*/*log*)
    if [ -z "$_matches" ]; then
        NHITS=0
        >&2 echo "No $2 found"
        return 1
    fi
    # Arithmetic expansion strips any padding some wc implementations emit
    NHITS=$(( $(printf '%s\n' "$_matches" | wc -l) ))
    >&2 echo "The following $NHITS files had $2 (top 10 shown):"
    printf '%s\n' "$_matches" | head
    return 0
}

report_log_matches error errors
NERR=$NHITS

if report_log_matches warning warnings; then

    # Give examples of warnings found, ignoring trivial ones
    grep -h -i warning "$LOGE" "$LOGO" | grep -v "exists. Deleting" | sort -u | head
fi
NWRN=$NHITS


4 changes: 2 additions & 2 deletions 3_make_catalog3.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Load project settings.
# NOTE(review): presumably defines PROJECT and CASES used below - confirm
source discovery_config.sh

# Making catalog3 using data from previous discovery run
# stderr of the run goes to $LOGE and stdout to $LOGO
LOGE="logs/process_catalog3.err"
LOGO="logs/process_catalog3.out"

#CMD="bash src/process_catalog.sh $@ $PROJECT $CASES > $LOGO 2> $LOGE "
# The second assignment below overrides the first, so the command that is
# actually run includes the redirections to $LOGO and $LOGE
CMD="bash src/process_catalog.sh $@ $PROJECT $CASES "
CMD="bash src/process_catalog.sh $@ $PROJECT $CASES > $LOGO 2> $LOGE "
>&2 echo Running: $CMD
>&2 echo Writing logs to $LOGO and $LOGE
# eval is needed so the redirections embedded in the string take effect
eval $CMD
Expand Down
5 changes: 0 additions & 5 deletions results/CPTAC3.Demographics.tsv

This file was deleted.

27 changes: 26 additions & 1 deletion src/make_catalog2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ read -r -d '' USAGE <<'EOF'
Write a comprehensive summary of aligned reads and methylation array from GDC. v2.2
Usage:
make_catalog.sh [options] CASE DISEASE
make_catalog2.sh [options] CASE DISEASE
Options:
-h: Print this help message
Expand Down Expand Up @@ -533,6 +533,18 @@ function process_reads {
# 10 experimental strategy
# 11 md5sum

# Update summer 2022: making discovery common for catalog2 and catalog3 means
# that "assumed reference" is better interpreted as "alignment", and can have
# the following values:
# * submitted_aligned
# * submitted_unaligned
# * harmonized
# * NA
# For the output, catalog2 uses the following values for reference:
# * hg19 for submitted aligned reads
# * NA for submitted unaligned reads
# * hg38 for harmonized reads

# Loop over all lines in input file RFN and write catalog entry for each
while read L; do

Expand Down Expand Up @@ -572,6 +584,19 @@ function process_reads {
exit 1
fi

# Rename reference to be consistent with Catalog2 usage
#   * submitted_aligned   -> hg19
#   * submitted_unaligned -> NA
#   * harmonized          -> hg38
#   * NA                  -> NA (this includes methylation)
# Any other value, including an empty one, is left unchanged. A quoted case
# statement is used so that an empty or multi-word REF cannot break the
# comparison the way an unquoted [ $REF == ... ] test does.
case "$REF" in
    submitted_aligned)   REF="hg19" ;;
    submitted_unaligned) REF="NA" ;;
    harmonized)          REF="hg38" ;;
esac

# Get result type for harmonized RNA-Seq BAMs: genomic, chimeric, transcriptome
# example: 73746f82-9ea4-45ac-87d8-bf0e3dc0c2fe.rna_seq.transcriptome.gdc_realn.bam
RESULT_TYPE="NA"
Expand Down
2 changes: 1 addition & 1 deletion src/make_catalog3.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def get_dv_string(rf_row):
def get_data_variety_FASTQ(rf):
FQ_ix = rf['data_format']=='FASTQ'
BM_ix = rf['data_format']=='BAM'
UA_ix = rf['alignment']=='unaligned'
UA_ix = rf['alignment']=='submitted_unaligned'

target_ix = FQ_ix | (BM_ix & UA_ix)

Expand Down
7 changes: 6 additions & 1 deletion src/process_demographics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ Options:
-d: Dry run. Print commands but do not execute queries
-v: Verbose. May be repeated to get verbose output from called scripts
-L LOGBASE: base directory of runtime output. Default ./logs
-1: stop after processing one case
CASES is a TSV file with case name and disease in first and second columns
We are adding the disease as a field here, as it is no longer written during discovery
PROJECT (e.g., CPTAC3) is passed directly to catalog3 column
Expand All @@ -29,7 +31,7 @@ DESTD="./results"

# Using rungo as a template for parallel: https://github.com/ding-lab/TinDaisy/blob/master/src/rungo
# http://wiki.bash-hackers.org/howto/getopts_tutorial
while getopts ":hdvD:L:" opt; do
while getopts ":hdvD:L:1" opt; do
case $opt in
h)
echo "$USAGE"
Expand All @@ -47,6 +49,9 @@ while getopts ":hdvD:L:" opt; do
L)
LOGBASE="$OPTARG"
;;
1)
JUSTONE=1
;;
\?)
>&2 echo "Invalid option: -$OPTARG"
echo "$USAGE"
Expand Down
22 changes: 2 additions & 20 deletions src/run_discovery.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Options:
-v: Verbose. May be repeated to get verbose output from called scripts
-J N: Evaluate N cases in parallel. If 0, disable parallel mode. Default 0
-1: stop after processing one case
-L LOGBASE: base directory of runtime output. Default ./dat
-L LOGBASE: base directory of runtime output. Default ./logs
-t GDC_TOKEN: GDC token file
CASES is a TSV file with case name and disease in first and second columns
Expand Down Expand Up @@ -164,7 +164,7 @@ function process_cases {
STDOUT_FN="$LOGD/log.${CASE}.out"
STDERR_FN="$LOGD/log.${CASE}.err"

CMD="bash src/process_case.sh $XARGS -t $GDC_TOKEN -O $LOGD $DEM $VERBOSE_ARG $CASE > $STDOUT_FN 2> $STDERR_FN"
CMD="bash src/process_case.sh $XARGS -t $GDC_TOKEN -O $LOGD -D $DIS $DEM $VERBOSE_ARG $CASE > $STDOUT_FN 2> $STDERR_FN"

if [ $NJOBS != 0 ]; then
JOBLOG="$LOGD/$CASE.log"
Expand Down Expand Up @@ -205,23 +205,5 @@ fi

# Record the finish time and announce completion on stderr
END=$(date)
>&2 echo [ $END ] Discovery complete

# Summarize per-case logs: report how many files under $OUTD mention
# "error" or "warning" (case-insensitive) and list the first 10 of each
OUTD="$LOGBASE/outputs" # must match value in src/process_multi_cases.sh
# Number of log files containing "error"
NERR=$(grep -il error $OUTD/*/*log* | wc -l)
if grep -q -i error $OUTD/*/*log* ; then
>&2 echo The following $NERR files had errors \(top 10 shown\):
# File names go to stdout; head keeps the first 10
grep -il error $OUTD/*/*log* | head
else
>&2 echo No errors found
fi
# Number of log files containing "warning"
NWRN=$(grep -il warning $OUTD/*/*log* | wc -l)
if grep -q -i warning $OUTD/*/*log* ; then
>&2 echo The following $NWRN files had warnings \(top 10 shown\):
grep -il warning $OUTD/*/*log* | head
else
>&2 echo No warnings found
fi

# START is not set in the visible lines; presumably recorded earlier in this script - confirm
>&2 echo Timing summary:
>&2 echo Discovery start: [ $START ] End: [ $END ]

0 comments on commit 5f7aea9

Please sign in to comment.