Skip to content

Commit

Permalink
Tool to detect CRAM base corruption caused by GATK issue 8768 (#8819)
Browse files Browse the repository at this point in the history
A new diagnostic tool, CRAMIssue8768Detector, that analyzes a CRAM file to look for possible base corruption caused by #8768

This issue affects GATK versions 4.3.0.0 through 4.5.0.0, and is fixed in GATK 4.6.0.0.
This issue also affects Picard versions 2.27.3 through 3.1.1, and is fixed in Picard 3.2.0.

The bug is triggered when writing a CRAM file using one of the affected GATK/Picard versions,
and both of the following conditions are met:

    * At least one read is mapped to the very first base of a reference contig
    * The file contains more than one CRAM container (10,000 reads) with reads mapped to that same reference contig

When both of these conditions are met, the resulting CRAM file may have corrupt containers containing reads with an incorrect sequence.

This tool writes a report to an output text file indicating whether the CRAM file appears to have read base corruption caused by issue 8768, and listing the affected containers. By default, the output report will have a summary of the average mismatch rate for all suspected bad containers and a few presumed good containers in order to determine if there is a large difference in the base mismatch rate.

Optionally, a TSV file with the same information as the textual report, but in tabular form, can be written using the "--output-tsv" argument.

---------

Co-authored-by: David Roazen <droazen@broadinstitute.org>
  • Loading branch information
cmnbroad and droazen authored Jun 29, 2024
1 parent 92dc4ae commit ea58e61
Show file tree
Hide file tree
Showing 30 changed files with 2,454 additions and 36 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
package org.broadinstitute.hellbender.tools;

import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.hellbender.tools.filediagnostics.CRAMIssue8768Analyzer;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import picard.cmdline.programgroups.OtherProgramGroup;

/**
* A diagnostic tool that analyzes a CRAM file to look for possible base corruption caused by
* <a href="https://github.com/broadinstitute/gatk/issues/8768">GATK issue 8768</a>.
*
* <p>This issue affects GATK versions 4.3.0.0 through 4.5.0.0, and is fixed in GATK 4.6.0.0.</p>
*
* <p>This issue also affects Picard versions 2.27.3 through 3.1.1, and is fixed in Picard 3.2.0.</p>
*
* <p>The bug is triggered when writing a CRAM file using one of the affected GATK/Picard versions,
* and both of the following conditions are met:</p>
*
* <ul>
* <li>At least one read is mapped to the very first base of a reference contig</li>
* <li>The file contains more than one CRAM container (10,000 reads) with reads mapped to that same reference contig</li>
* </ul>
*
* <p>When both of these conditions are met, the resulting CRAM file may have corrupt containers containing reads
* with an incorrect sequence.</p>
*
* <p>This tool writes a report to an output text file indicating whether the CRAM file appears to have read base corruption caused by issue 8768,
* and listing the affected containers. By default, the output report will have a summary of the average mismatch rate for all suspected bad containers
* and a few presumed good containers in order to determine if there is a large difference in the base mismatch rate.</p>
*
* <p>Optionally, a TSV file with the same information as the textual report, but in tabular form, can be written
* using the "--output-tsv" argument.</p>
*
* <p>To analyze the base mismatch rate for ALL containers, use the "verbose" option.</p>
*
* <p>Works on files ending in .cram.</p>
* <br />
*
* <h4>Sample Usage:</h4>
* <pre>
* gatk CRAMIssue8768Detector \
* -I input.cram \
* -O output_report.txt \
* -R reference.fasta
* </pre>
* <pre>
* gatk CRAMIssue8768Detector \
* -I input.cram \
* -O output_report.txt \
* -R reference.fasta \
* --output-tsv output_report_as_table.tsv
* </pre>
*/
@ExperimentalFeature
@WorkflowProperties
@CommandLineProgramProperties(
summary = "Analyze a CRAM file to check for base corruption caused by GATK issue 8768",
oneLineSummary = "Analyze a CRAM file to check for base corruption caused by GATK issue 8768",
programGroup = OtherProgramGroup.class
)
public class CRAMIssue8768Detector extends CommandLineProgram {
// default average mismatch rate threshold above which we consider the file to be corrupt
private static final double DEFAULT_MISMATCH_RATE_THRESHOLD = 0.05;

@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
doc = "Input path of CRAM file to analyze",
common = true)
@WorkflowInput
public GATKPath inputPath;

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc = "Output diagnostics text file",
common = true)
@WorkflowOutput
public GATKPath textOutputPath;

public static final String OUTPUT_TSV__ARG_NAME = "output-tsv";
@Argument(fullName = OUTPUT_TSV__ARG_NAME,
shortName = OUTPUT_TSV__ARG_NAME,
doc = "Output diagnostics tsv file",
optional = true)
@WorkflowOutput
public GATKPath tsvOutputPath;

@Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME,
shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME,
doc = "Reference for the CRAM file",
common = true)
@WorkflowOutput
public GATKPath referencePath;

public static final String MISMATCH_RATE_THRESHOLD_ARG_NAME = "mismatch-rate-threshold";
@Argument(fullName = MISMATCH_RATE_THRESHOLD_ARG_NAME,
shortName = MISMATCH_RATE_THRESHOLD_ARG_NAME,
doc = "Mismatch rate threshold above which we consider the file to be corrupt",
optional = true)
public double mismatchRateThreshold = DEFAULT_MISMATCH_RATE_THRESHOLD;

public static final String VERBOSE_ARG_NAME = "verbose";
@Argument(fullName = VERBOSE_ARG_NAME,
shortName= VERBOSE_ARG_NAME,
doc="Calculate and print the mismatch rate for all containers",
optional=true)
public boolean verbose = false;

public static final String ECHO_ARG_NAME = "echo-to-stdout";
@Argument(fullName = ECHO_ARG_NAME,
shortName= ECHO_ARG_NAME,
doc="Echo text output to stdout",
optional=true)
public boolean echoToStdout = false;

private CRAMIssue8768Analyzer cramAnalyzer;

@Override
protected Object doWork() {
cramAnalyzer = new CRAMIssue8768Analyzer(
inputPath,
textOutputPath,
tsvOutputPath,
referencePath,
mismatchRateThreshold,
verbose,
echoToStdout);
cramAnalyzer.doAnalysis();
return cramAnalyzer.getRetCode();
}

@Override
protected void onShutdown() {
if ( cramAnalyzer != null ) {
try {
cramAnalyzer.close();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import org.broadinstitute.hellbender.tools.filediagnostics.HTSAnalyzerFactory;
import picard.cmdline.programgroups.OtherProgramGroup;

import java.io.File;

/**
* A diagnostic tool that prints meta information about a GATK input file.
*
Expand Down Expand Up @@ -43,8 +41,8 @@ public class PrintFileDiagnostics extends CommandLineProgram {
doc = "Outut file for diagnostics (must be a local file)",
optional = false,
common = true)
@WorkflowInput
public File outputFile;
@WorkflowOutput
public GATKPath outputPath;

@Argument(shortName="count-limit",
fullName="count-limit",
Expand All @@ -56,7 +54,7 @@ public class PrintFileDiagnostics extends CommandLineProgram {
@Override
protected void onStartup() {
super.onStartup();
htsAnalyzer = HTSAnalyzerFactory.getFileAnalyzer(inputPath, outputFile, countLimit);
htsAnalyzer = HTSAnalyzerFactory.getFileAnalyzer(inputPath, outputPath, countLimit);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,17 @@
*/
public class BAIAnalyzer extends HTSAnalyzer {

public BAIAnalyzer(final GATKPath inputPath, final File outputFile) {
super(inputPath, outputFile);
public BAIAnalyzer(final GATKPath inputPath, final GATKPath outputPath) {
super(inputPath, outputPath);
}

/**
* Run the analyzer for the file.
*/
protected void doAnalysis() {
System.out.println(String.format("\nOutput written to %s\n", outputFile));
BAMIndexer.createAndWriteIndex(inputPath.toPath().toFile(), outputFile, true);
System.out.println(String.format("\nOutput written to %s\n", outputPath));
// note this method is not nio aware
BAMIndexer.createAndWriteIndex(inputPath.toPath().toFile(), new File(outputPath.getRawInputString()), true);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,22 @@
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.hellbender.engine.GATKPath;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.*;
import java.nio.file.Files;

/**
* Analyzer for CRAM (.crai) index files.
*/
public class CRAIAnalyzer extends HTSAnalyzer {

final FileOutputStream fos;
final OutputStream fos;

public CRAIAnalyzer(final GATKPath inputPath, final File outputFile) {
super(inputPath, outputFile);
public CRAIAnalyzer(final GATKPath inputPath, final GATKPath outputPath) {
super(inputPath, outputPath);
try {
fos = new FileOutputStream(outputFile);
fos = Files.newOutputStream(outputPath.toPath());
} catch (final IOException e) {

throw new RuntimeIOException(e);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.hellbender.engine.GATKPath;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.util.Base64;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

Expand All @@ -36,13 +33,13 @@ public class CRAMAnalyzer extends HTSAnalyzer {
long coreBlocksDataSize = 0L;
long recordCount = 0;
final long countLimit;
final FileOutputStream fos;
final OutputStream fos;

public CRAMAnalyzer(final GATKPath inputPathName, final File outputFile, final long countLimit) {
super(inputPathName, outputFile);
public CRAMAnalyzer(final GATKPath inputPathName, final GATKPath outputPath, final long countLimit) {
super(inputPathName, outputPath);
this.countLimit = countLimit;
try {
fos = new FileOutputStream(outputFile);
fos = Files.newOutputStream(outputPath.toPath());
} catch (final IOException e) {
throw new RuntimeIOException(e);
}
Expand Down
Loading

0 comments on commit ea58e61

Please sign in to comment.