Skip to content

Commit

Permalink
Merge pull request MultiQC#1344 from kpalin/nanostat
Browse files Browse the repository at this point in the history
New module for NanoStat
  • Loading branch information
ewels authored Jul 3, 2021
2 parents 7594729 + be09456 commit 7637a30
Show file tree
Hide file tree
Showing 8 changed files with 347 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
- GffCompare can annotate and estimate accuracy of one or more GFF files compared with a reference annotation.
- [**Lima**](https://github.com/PacificBiosciences/barcoding)
- The PacBio Barcode Demultiplexer
- [**NanoStat**](https://github.com/wdecoster/nanostat)
- Calculate various statistics from a long read sequencing datasets
- [**odgi**](https://github.com/pangenome/odgi)
- Optimized dynamic genome/graph implementation
- [**Pangolin**](https://github.com/cov-lineages/pangolin)
Expand Down
1 change: 1 addition & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ MultiQC Modules:
mosdepth: modules/mosdepth.md
MTNucRatio: modules/mtnucratio.md
MultiVCFAnalyzer: modules/multivcfanalyzer.md
NanoStat: modules/nanostat.md
ngsderive: modules/ngsderive.md
odgi: modules/odgi.md
OptiType: modules/optitype.md
Expand Down
11 changes: 11 additions & 0 deletions docs/modules/nanostat.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
Name: NanoStat
URL: https://github.com/wdecoster/nanostat
Description: >
Calculate various statistics from a long read sequencing dataset in fastq, bam or albacore
sequencing summary format.
---

The nanostat module parses text output generated by
[NanoStat](https://github.com/wdecoster/nanostat/), a program for summarising results of sequencing
on Oxford Nanopore methods (MinION, PromethION etc.)
3 changes: 3 additions & 0 deletions multiqc/modules/nanostat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from __future__ import absolute_import

from .nanostat import MultiqcModule
318 changes: 318 additions & 0 deletions multiqc/modules/nanostat/nanostat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
#!/usr/bin/env python

""" MultiQC module to parse output from NanoStat """

from __future__ import print_function
from collections import OrderedDict
import logging
import jinja2

from multiqc import config
from multiqc.utils import mqc_colour
from multiqc.plots import bargraph, table
from multiqc.modules.base_module import BaseMultiqcModule

# Initialise the logger
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule):
"""NanoStat module"""

_KEYS_NUM = [
"Active channels",
"Number of reads",
"Total bases",
"Total bases aligned",
"Read length N50",
"Mean read length",
"Median read length",
"Median read quality",
"Mean read quality",
"Average percent identity",
"Median percent identity",
]

_KEYS_READ_Q = [
">Q5",
">Q7",
">Q10",
">Q12",
">Q15",
]
_stat_types = ("aligned", "seq summary", "unrecognized")

def __init__(self):

# Initialise the parent object
super(MultiqcModule, self).__init__(
name="NanoStat",
anchor="nanostat",
href="https://github.com/wdecoster/nanostat/",
info="various statistics from a long read sequencing dataset in fastq, bam or sequencing summary format.",
)

# Find and load any NanoStat reports
self.nanostat_data = dict()
self.has_aligned = False
self.has_seq_summary = False
for f in self.find_log_files("nanostat", filehandles=True):
self.parse_nanostat_log(f)

# Filter to strip out ignored sample names
self.nanostat_data = self.ignore_samples(self.nanostat_data)

if len(self.nanostat_data) == 0:
raise UserWarning

log.info("Found {} reports".format(len(self.nanostat_data)))

# Write parsed report data to a file
self.write_data_file(self.nanostat_data, "multiqc_nanostat")

# Stats Tables
if self.has_aligned:
self.nanostat_stats_table("aligned")
if self.has_seq_summary:
self.nanostat_stats_table("seq summary")

# Quality distribution Plot
self.reads_by_quality_plot()

def parse_nanostat_log(self, f):
"""Parse output from NanoStat
Note: Tool can be run in two different modes, giving two variants to the output.
To avoid overwriting keys from different modes, keys are given a suffix.
"""

nano_stats = {}
for line in f["f"]:

line = jinja2.escape(line)
parts = line.strip().split(":")
if len(parts) == 0:
continue

key = parts[0]

if key in self._KEYS_NUM:
val = float(parts[1].replace(",", ""))
nano_stats[key] = val
elif key in self._KEYS_READ_Q:
# Number of reads above Q score cutoff
val = int(parts[1].strip().split()[0])
nano_stats[key] = val

if "Total bases aligned" in nano_stats:
stat_type = "aligned"
self.has_aligned = True
elif "Active channels" in nano_stats:
stat_type = "seq summary"
self.has_seq_summary = True
else:
log.debug(f"Did not recognise NanoStat file '{f['fn']}' - skipping")
return

out_d = {f"{k}_{stat_type}": v for k, v in nano_stats.items()}

# Warn if we find overlapping data for the same sample
if f["s_name"] in self.nanostat_data:
# Only if the same has some keys in common
if not set(self.nanostat_data[f["s_name"]].keys()).isdisjoint(out_d.keys()):
log.debug("Duplicate sample data found! Overwriting: {}".format(f["s_name"]))

self.nanostat_data.setdefault(f["s_name"], {}).update(out_d)

self.add_data_source(f)

def nanostat_stats_table(self, stat_type):
"""Take the parsed stats from the Kallisto report and add it to the
basic stats table at the top of the report"""

headers_base = OrderedDict()
headers_base["Active channels"] = {
"title": "Active channels",
"description": "Active channels",
"scale": "Greens",
"format": "{:,.0f}",
}
headers_base["Median read length"] = {
"title": f"Median length",
"description": f"Median read length (bp)",
"suffix": " bp",
"format": "{:,.0f}",
"shared_key": "nucleotides",
"scale": "BuPu",
}
headers_base["Mean read length"] = {
"title": f"Mean length",
"description": f"Mean read length (bp)",
"suffix": " bp",
"scale": "Purples",
"format": "{:,.0f}",
"shared_key": "nucleotides",
"hidden": True,
}
headers_base["Read length N50"] = {
"title": "Read N50",
"description": "Read length N50",
"format": "{:,.0f}",
"suffix": " bp",
"scale": "RdPu",
}
headers_base["Median read quality"] = {
"title": "Median Qual",
"description": "Median read quality (Phred scale)",
"shared_key": "phred_score",
"scale": "RdYlGn",
}
headers_base["Mean read quality"] = {
"title": "Mean Qual",
"description": "Mean read quality (Phred scale)",
"scale": "PiYG",
"shared_key": "phred_score",
"hidden": True,
}
headers_base["Median percent identity"] = {
"title": "Median Identity",
"description": "Median percent identity",
"min": 0,
"max": 100,
"suffix": "%",
"scale": "RdYlBu",
"shared_key": "percent_identity",
}
headers_base["Average percent identity"] = {
"title": "Mean Identity",
"description": "Average percent identity",
"max": 100,
"suffix": "%",
"scale": "Spectral",
"shared_key": "percent_identity",
"hidden": True,
}
headers_base["Number of reads"] = {
"title": f"# Reads ({config.long_read_count_prefix})",
"description": f"Number of reads ({config.long_read_count_desc})",
"modify": lambda x: x * config.long_read_count_multiplier,
"shared_key": "long_read_count",
"scale": "YlGn",
}
headers_base["Total bases"] = {
"title": f"Total Bases ({config.base_count_prefix})",
"description": f"Total bases ({config.base_count_desc})",
"modify": lambda x: x * config.base_count_multiplier,
"shared_key": "base_count",
"scale": "BrBG",
}
headers_base["Total bases aligned"] = {
"title": f"Aligned Bases ({config.base_count_prefix})",
"description": f"Total bases aligned ({config.base_count_desc})",
"modify": lambda x: x * config.base_count_multiplier,
"shared_key": "base_count",
"scale": "PuOr",
}

# Add the stat_type suffix
headers = OrderedDict()
for k in headers_base:
key = f"{k}_{stat_type}"
headers[key] = headers_base.get(k, dict()).copy()

# Table config
table_config = {
"namespace": "NanoStat",
"id": "nanostat_{}_stats_table".format(stat_type.replace(" ", "_")),
"table_title": f"NanoStat {stat_type}",
}

# Add the report section
description = ""
if stat_type == "aligned":
description = "NanoStat statistics from FastQ, FASTA or BAM files."
if stat_type == "seq summary":
description = "NanoStat statistics from albacore or guppy summary files."
self.add_section(
name="{} stats".format(stat_type.replace("_", " ").capitalize()),
anchor="nanostat_{}_stats".format(stat_type.replace(" ", "_")),
description=description,
plot=table.plot(self.nanostat_data, headers, table_config),
)

def reads_by_quality_plot(self):
"""Make the HighCharts HTML to plot the reads by quality"""

def _get_total_reads(data_dict):
stat_type = self._stat_types[0]
for stat_type in self._stat_types:
total_key = f"Number of reads_{stat_type}"
if total_key in data_dict:
return data_dict[total_key], stat_type
return None, None

bar_data = {}
stat_type = "unrecognized"
# Order of keys, from >Q5 to >Q15
_range_names = {
">Q5": "<Q5",
">Q7": "Q5-7",
">Q10": "Q7-10",
">Q12": "Q10-12",
">Q15": "Q12-15",
"rest": ">Q15",
}
for s_name, data_dict in self.nanostat_data.items():
reads_total, stat_type = _get_total_reads(data_dict)
if s_name in bar_data and stat_type == "aligned":
log.debug("Sample '{s_name}' duplicated in the quality plot - ignoring aligned data")
continue
elif s_name in bar_data and stat_type == "seq summary":
log.debug("Sample '{s_name}' duplicated in the quality plot - overwriting with seq summary data")
bar_data[s_name] = {}

prev_reads = reads_total
for k, range_name in _range_names.items():
if k != "rest":
data_key = f"{k}_{stat_type}"
reads_gt = data_dict[data_key]

bar_data[s_name][range_name] = prev_reads - reads_gt

if bar_data[s_name][range_name] < 0:
log.error(f"Error on {s_name} {range_name} {data_key} . Negative number of reads")
prev_reads = reads_gt
else:
data_key = f"&gt;Q15_{stat_type}"
bar_data[s_name][range_name] = data_dict[data_key]

cats = OrderedDict()
keys = reversed(list(_range_names.values()))
colours = mqc_colour.mqc_colour_scale("RdYlGn-rev", 0, len(_range_names))
for idx, k in enumerate(keys):
cats[k] = {"name": "Reads " + k, "color": colours.get_colour(idx, lighten=1)}

# Config for the plot
config = {
"id": "nanostat_quality_dist",
"title": "NanoStat: Reads by quality",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
}

# Add the report section
self.add_section(
name="Reads by quality",
anchor=f"nanostat_read_qualities",
description="Read counts categorised by read quality (phred score).",
helptext="""
Sequencing machines assign each generated read a quality score using the
[Phred scale](https://en.wikipedia.org/wiki/Phred_quality_score).
The phred score represents the liklelyhood that a given read contains errors.
So, high quality reads have a high score.
Data may come from NanoPlot reports generated with sequencing summary files or alignment stats.
If a sample has data from both, the sequencing summary is preferred.
""",
plot=bargraph.plot(bar_data, cats, config),
)
7 changes: 7 additions & 0 deletions multiqc/utils/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ fn_clean_exts:
- ".vep"
- "_vep"
- "ccs"
- "_NanoStats"

# These are removed after the above, only if sample names
# start or end with this string. Again, removed in order.
Expand Down Expand Up @@ -495,6 +496,12 @@ module_order:
- qc3C:
module_tag:
- hi-c
- nanostat:
module_tag:
- DNA
- RNA
- Methylation
- WGS
- samblaster:
module_tag:
- DNA
Expand Down
4 changes: 4 additions & 0 deletions multiqc/utils/search_patterns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,10 @@ kaiju:
kallisto:
contents: "[quant] finding pseudoalignments for the reads"
shared: true
nanostat:
max_filesize: 4096
contents: "Number, percentage and megabases of reads above quality cutoffs"
num_lines: 20
kat:
fn: "*.dist_analysis.json"
kraken:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@
"mosdepth = multiqc.modules.mosdepth:MultiqcModule",
"mtnucratio = multiqc.modules.mtnucratio:MultiqcModule",
"multivcfanalyzer = multiqc.modules.multivcfanalyzer:MultiqcModule",
"nanostat = multiqc.modules.nanostat:MultiqcModule",
"ngsderive = multiqc.modules.ngsderive:MultiqcModule",
"odgi = multiqc.modules.odgi:MultiqcModule",
"optitype = multiqc.modules.optitype:MultiqcModule",
Expand Down

0 comments on commit 7637a30

Please sign in to comment.