Non-GVS bits required for GVS [VS-971] #8362

Merged: 5 commits, Jun 27, 2023
@@ -32,6 +32,13 @@ sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_D

echo "Running Filtering WDL through cromwell"

cat > ${WORKING_DIR}/src/test/resources/cromwell_monitoring_script.sh <<FIN
while true
do
echo 'dummy monitoring script running...'
sleep 10
done
FIN
cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json

@@ -13,5 +13,6 @@
"JointVcfFiltering.annotations": ["ReadPosRankSum", "FS", "SOR", "QD"],
"JointVcfFiltering.output_prefix": "test_10_samples",
"JointVcfFiltering.resource_args": "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
"JointVcfFiltering.extract_extra_args": "-L chr21"
}
"JointVcfFiltering.extract_extra_args": "-L chr21",
"JointVcfFiltering.monitoring_script": "/home/runner/work/gatk/gatk/src/test/resources/cromwell_monitoring_script.sh"
}
scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl (36 changes: 33 additions & 3 deletions)
@@ -42,6 +42,8 @@ workflow JointVcfFiltering {
RuntimeAttributes? extract_runtime_attributes
RuntimeAttributes? train_runtime_attributes
RuntimeAttributes? score_runtime_attributes

File? monitoring_script
}

parameter_meta {
@@ -68,7 +70,8 @@
extra_args = extract_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
-runtime_attributes = extract_runtime_attributes
+runtime_attributes = extract_runtime_attributes,
+monitoring_script = monitoring_script
}

call TrainVariantAnnotationsModel {
@@ -82,7 +85,8 @@
extra_args = train_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
-runtime_attributes = train_runtime_attributes
+runtime_attributes = train_runtime_attributes,
+monitoring_script = monitoring_script
}

scatter (i in range(length(input_vcfs))) {
@@ -100,7 +104,8 @@
extra_args = score_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
-runtime_attributes = score_runtime_attributes
+runtime_attributes = score_runtime_attributes,
+monitoring_script = monitoring_script
}
}

@@ -116,6 +121,13 @@
Array[File] scored_vcf_idxs = ScoreVariantAnnotations.scored_vcf_idx
Array[File?] annotations_hdf5s = ScoreVariantAnnotations.annotations_hdf5
Array[File?] scores_hdf5s = ScoreVariantAnnotations.scores_hdf5

Array[File?] monitoring_logs = flatten(
[
[ExtractVariantAnnotations.monitoring_log],
[TrainVariantAnnotationsModel.monitoring_log],
ScoreVariantAnnotations.monitoring_log
])
}
}

@@ -127,6 +139,7 @@ task ExtractVariantAnnotations {
Array[String] annotations
String resource_args
String? extra_args
File? monitoring_script

String gatk_docker
File? gatk_override
@@ -143,6 +156,10 @@
set -e
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

if [ -s ~{monitoring_script} ]; then
bash ~{monitoring_script} > monitoring.log &
fi

gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \
ExtractVariantAnnotations \
-V ~{input_vcf} \
@@ -167,6 +184,7 @@
File? unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5"
File extracted_vcf = "~{output_prefix}.extract.vcf.gz" # this line will break if extra_args includes the do-not-gzip-vcf-output argument
File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument
File? monitoring_log = "monitoring.log"
}
}

@@ -179,6 +197,7 @@ task TrainVariantAnnotationsModel {
File? hyperparameters_json
String output_prefix
String? extra_args
File? monitoring_script

String gatk_docker
File? gatk_override
@@ -190,6 +209,10 @@
set -e
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

if [ -s ~{monitoring_script} ]; then
bash ~{monitoring_script} > monitoring.log &
fi

gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \
TrainVariantAnnotationsModel \
--annotations-hdf5 ~{annotations_hdf5} \
@@ -213,6 +236,7 @@

output {
Array[File] model_files = glob("~{output_prefix}.train.*")
File? monitoring_log = "monitoring.log"
}
}

@@ -228,6 +252,7 @@
String model_prefix
Array[File] model_files
String? extra_args
File? monitoring_script

String gatk_docker
File? gatk_override
@@ -245,6 +270,10 @@
command {
set -e

if [ -s ~{monitoring_script} ]; then
bash ~{monitoring_script} > monitoring.log &
fi

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

mkdir model-files
@@ -276,5 +305,6 @@
File scored_vcf_idx = "~{output_prefix}.score.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument
File? annotations_hdf5 = "~{output_prefix}.score.annot.hdf5" # this file will only be produced if the number of sites scored is nonzero
File? scores_hdf5 = "~{output_prefix}.score.scores.hdf5" # this file will only be produced if the number of sites scored is nonzero
File? monitoring_log = "monitoring.log"
}
}
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.utils.bigquery;
+package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.JobStatistics;
import com.google.cloud.bigquery.TableResult;
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.utils.bigquery;
+package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.*;
import io.grpc.StatusRuntimeException;
@@ -451,6 +451,19 @@ public static StorageAPIAvroReaderAndBigQueryStatistics executeQueryWithStorageA
}

public static boolean doRowsExistFor(String projectID, String datasetName, String tableName, String columnName, String value) {
String template = "SELECT COUNT(*) FROM `%s.%s.%s` WHERE %s = '%s'";
String query = String.format(template, projectID, datasetName, tableName, columnName, value);

BigQueryResultAndStatistics resultAndStatistics = BigQueryUtils.executeQuery(projectID, query, true, null);
for (final FieldValueList row : resultAndStatistics.result.iterateAll()) {
final long count = row.get(0).getLongValue();
return count != 0;
}
throw new GATKException(String.format("No rows returned from count of `%s.%s.%s` for %s = '%s'",
projectID, datasetName, tableName, columnName, value));
}
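
Note (illustrative, not part of this diff): a minimal usage sketch of the new String overload, assuming these helpers live on BigQueryUtils in the renamed org.broadinstitute.hellbender.utils.gvs.bigquery package. The project, dataset, table, and column names below are hypothetical, and running it for real requires GCP credentials and an existing BigQuery table.

    import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;

    public class DoRowsExistForExample {
        public static void main(String[] args) {
            // Issues: SELECT COUNT(*) FROM `my-gcp-project.my_gvs_dataset.sample_info` WHERE sample_name = 'NA12878'
            // and returns true if the count is nonzero.
            boolean alreadyLoaded = BigQueryUtils.doRowsExistFor(
                    "my-gcp-project",  // projectID (hypothetical)
                    "my_gvs_dataset",  // datasetName (hypothetical)
                    "sample_info",     // tableName (hypothetical)
                    "sample_name",     // columnName (hypothetical)
                    "NA12878");        // value to look for
            System.out.println("rows already exist: " + alreadyLoaded);
        }
    }
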

public static boolean doRowsExistFor(String projectID, String datasetName, String tableName, String columnName, Long value) {
String template = "SELECT COUNT(*) FROM `%s.%s.%s` WHERE %s = %s";
String query = String.format(template, projectID, datasetName, tableName, columnName, value);

@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.utils.bigquery;
+package org.broadinstitute.hellbender.utils.gvs.bigquery;

import htsjdk.samtools.util.CloseableIterator;
import org.apache.avro.generic.GenericRecord;
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.utils.bigquery;
+package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.storage.v1.AvroRows;
import com.google.cloud.bigquery.storage.v1.BigQueryReadClient;
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.utils.bigquery;
+package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.JobStatistics;

@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.utils.bigquery;
+package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.common.collect.ImmutableList;

@@ -208,7 +208,9 @@ their names (or descriptions) depend on some threshold. Those filters are not i
public static final String VQSR_FAILURE_PREFIX = "low_VQSLOD_";
public static final String VQSR_FAILURE_SNP = VQSR_FAILURE_PREFIX + SNP;
public static final String VQSR_FAILURE_INDEL = VQSR_FAILURE_PREFIX + INDEL;
public static final String VQS_SENS_FAILURE_PREFIX = "low_VQS_SENS_";
// Prefix for a site (SNP/INDEL) that failed calibration sensitivity cutoff. In this case, the site would be a
// failure if the sensitivity is greater than the threshold.
public static final String VQS_SENS_FAILURE_PREFIX = "high_VQS_SENS_";
Member commented:

Since this was confusing enough that it was gotten wrong initially, maybe add a comment explaining what this means?

Collaborator commented:

It's analogous to the low_VQSLOD_ prefix - "Prefix for a site (SNP/INDEL) that failed calibration sensitivity cutoff". In this case, the site would be a failure if the sensitivity is greater than the threshold.

public static final String VQS_SENS_FAILURE_SNP = VQS_SENS_FAILURE_PREFIX + SNP;
public static final String VQS_SENS_FAILURE_INDEL = VQS_SENS_FAILURE_PREFIX + INDEL;
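
Note (illustrative, not part of this diff): a minimal sketch of what the renamed prefix means in practice, per the review discussion above. A site fails the calibration-sensitivity filter when its sensitivity is above the cutoff, hence the high_VQS_SENS_ prefix; the class, method, and threshold names below are made up for illustration.

    public final class CalibrationSensitivityFilterSketch {
        public static final String SNP = "SNP";
        public static final String VQS_SENS_FAILURE_PREFIX = "high_VQS_SENS_";
        public static final String VQS_SENS_FAILURE_SNP = VQS_SENS_FAILURE_PREFIX + SNP;

        // A SNP site fails when its calibration sensitivity EXCEEDS the threshold,
        // hence the "high_" prefix rather than the original "low_" one.
        static String snpFilterFor(double calibrationSensitivity, double threshold) {
            return calibrationSensitivity > threshold ? VQS_SENS_FAILURE_SNP : "PASS";
        }

        public static void main(String[] args) {
            System.out.println(snpFilterFor(0.99, 0.95)); // prints high_VQS_SENS_SNP
            System.out.println(snpFilterFor(0.90, 0.95)); // prints PASS
        }
    }
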

@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.utils.bigquery;
+package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.FieldValueList;
import org.apache.avro.generic.GenericRecord;