Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/mouse #53

Merged
merged 13 commits into from
Apr 4, 2020
Prev Previous commit
Next Next commit
pval probe detection
  • Loading branch information
marcmaxson committed Mar 26, 2020
commit a7f35bda6e0b081694479c438949eb756381f1a1
29 changes: 26 additions & 3 deletions methylprep/download/miniml.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ def sample_sheet_from_miniml(geo_id, series_path, platform, samp_dict, meta_dict
Notes:
cleans up redundant columns, including Sample_Name == title and platform
"""

# id and position are NOT in the miniml file, must match with GSMID later
_dict = {'GSM_ID': [], 'Sample_Name': [], 'Sentrix_ID': [], 'Sentrix_Position': []}
for GSM_ID, sample_title in samp_dict.items():
Expand All @@ -245,6 +246,7 @@ def sample_sheet_from_miniml(geo_id, series_path, platform, samp_dict, meta_dict
if key not in _dict:
_dict[key] = []
_dict[key].append(value)

# arrays must be all same length
out = _dict.copy()
for column in _dict.keys():
Expand All @@ -261,14 +263,35 @@ def sample_sheet_from_miniml(geo_id, series_path, platform, samp_dict, meta_dict
if set(_dict[column]) == set(_dict[other_column]) and column != other_column:
LOGGER.debug(f"{column} == {other_column}; dropping {column}")
out.pop(column,None)

try:
df = pd.DataFrame(data=out)
except ValueError as e: # arrays must all be same length
from collections import Counter
LOGGER.info(f"ValueError - array lengths vary in sample meta data: {[(key, len(val)) for key,val in out.items()]}")
## would be HARD to salvage it by filling in blanks for missing rows ##
raise ValueError(f"{e}; this happens when a samplesheet is missing descriptors for one or more samples.")
LOGGER.info(f"Trying another method to save partial sample data into csv/pkl file.")
## alt method to salvage it by filling in blanks for missing rows -- but doesn't seem to capture Sentrix_ID / Sentrix_Position ##
column_counts = {'GSM_ID': [], 'Sample_Name': []} # column_name : [GSM_IDs included]
out = {} # list of dicts as sample rows, keyed to GSM_IDs
for GSM_ID, sample_title in samp_dict.items():
out[GSM_ID] = {'GSM_ID': GSM_ID, 'Sample_Name': sample_title}
column_counts['GSM_ID'].append(GSM_ID)
column_counts['Sample_Name'].append(GSM_ID)
for key, value in meta_dict[GSM_ID].items():
out[GSM_ID][key] = value
if key not in column_counts:
column_counts[key] = [GSM_ID]
else:
column_counts[key].append(GSM_ID)
# check if any fields are missing GSM_IDs, and fill in with blanks
for col_name, gsm_id_list in column_counts.items():
if len(out) != len(gsm_id_list):
for GSM_ID in out.keys():
if GSM_ID not in gsm_id_list:
out[GSM_ID][col_name] = ''
try:
df = pd.DataFrame(data=list(out.values()))
except:
raise ValueError(f"{e}; this happens when a samplesheet is missing descriptors for one or more samples.")
# filter: only retain control samples
if extract_controls:
keywords = ['control','ctrl']
Expand Down
201 changes: 201 additions & 0 deletions methylprep/processing/p_value_probe_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
try:
from methylcheck import mean_beta_compare
except ImportError:
mean_beta_compare = None
import types
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF
import pandas as pd
import numpy as np


def _pval_minfi(data_containers):
""" negative control p-value """
# Pull M and U values
meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)

for i,c in enumerate(data_containers):
sample = data_containers[i].sample
m = c._SampleDataContainer__data_frame.rename(columns={'meth':sample})
u = c._SampleDataContainer__data_frame.rename(columns={'unmeth':sample})
meth = pd.merge(left=meth,right=m[sample],left_on='IlmnID',right_on='IlmnID',)
unmeth = pd.merge(left=unmeth,right=u[sample],left_on='IlmnID',right_on='IlmnID')

# Create empty dataframes for red and green negative controls
negctlsR = pd.DataFrame(data_containers[0].ctrl_red['Extended_Type'])
negctlsG = pd.DataFrame(data_containers[0].ctrl_green['Extended_Type'])

# Fill red and green dataframes
for i,c in enumerate(data_containers):
sample = str(data_containers[i].sample)
dfR = c.ctrl_red
dfR = dfR[dfR['Control_Type']=='NEGATIVE']
dfR = dfR[['Extended_Type','mean_value']].rename(columns={'mean_value':sample})
dfG = c.ctrl_green
dfG = dfG[dfG['Control_Type']=='NEGATIVE']
dfG = dfG[['Extended_Type','mean_value']].rename(columns={'mean_value':sample})
negctlsR = pd.merge(left=negctlsR,right=dfR,on='Extended_Type')
negctlsG = pd.merge(left=negctlsG,right=dfG,on='Extended_Type')

# Reset index on dataframes
negctlsG = negctlsG.set_index('Extended_Type')
negctlsR = negctlsR.set_index('Extended_Type')

# Get M and U values for IG, IR and II

# first pull out sections of manifest (will be used to identify which probes belong to each IG, IR, II)
manifest = data_containers[0].manifest.data_frame[['Infinium_Design_Type','Color_Channel']]
IG = manifest[(manifest['Color_Channel']=='Grn') & (manifest['Infinium_Design_Type']=='I')]
IR = manifest[(manifest['Color_Channel']=='Red') & (manifest['Infinium_Design_Type']=='I')]
II = manifest[manifest['Infinium_Design_Type']=='II']

# second merge with meth and unmeth dataframes
IG_meth = pd.merge(left=IG,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IG_unmeth = pd.merge(left=IG,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_meth = pd.merge(left=IR,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_unmeth = pd.merge(left=IR,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_meth = pd.merge(left=II,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_unmeth = pd.merge(left=II,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')

# Calcuate parameters
sdG = stats.median_absolute_deviation(negctlsG)
muG = np.median(negctlsG,axis=0)
sdR = stats.median_absolute_deviation(negctlsR)
muR = np.median(negctlsR,axis=0)

# calculate p values for type 1 Red
pIR = pd.DataFrame(index=IR_meth.index,
data=1 - stats.norm.cdf(IR_meth+IR_unmeth,2*muR,2*sdR),
columns=IR_meth.columns)

# calculate p values for type 1 Green
pIG = pd.DataFrame(index=IG_meth.index,
data=1 - stats.norm.cdf(IG_meth+IG_unmeth,2*muG,2*sdG),
columns=IG_meth.columns)

# calculat4e p values for type II
pII = pd.DataFrame(index=II_meth.index,
data=1-stats.norm.cdf(II_meth+II_unmeth,muR+muG,sdR+sdG),
columns=II_meth.columns)
# concat and sort
pval = pd.concat([pIR, pIG, pII])
pval = pval.sort_values(by='IlmnID')
print(pval)
return pval

def _pval_sesame(data_containers):
""" pOOHBah """
# Pull M and U values
meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)

for i,c in enumerate(data_containers):
sample = data_containers[i].sample
m = c._SampleDataContainer__data_frame.rename(columns={'meth':sample})
u = c._SampleDataContainer__data_frame.rename(columns={'unmeth':sample})
meth = pd.merge(left=meth,right=m[sample],left_on='IlmnID',right_on='IlmnID',)
unmeth = pd.merge(left=unmeth,right=u[sample],left_on='IlmnID',right_on='IlmnID')

# Separate M and U values for IG, IR and II
# first pull out sections of manifest (will be used to identify which probes belong to each IG, IR, II)
manifest = data_containers[0].manifest.data_frame[['Infinium_Design_Type','Color_Channel']]
IG = manifest[(manifest['Color_Channel']=='Grn') & (manifest['Infinium_Design_Type']=='I')]
IR = manifest[(manifest['Color_Channel']=='Red') & (manifest['Infinium_Design_Type']=='I')]
II = manifest[manifest['Infinium_Design_Type']=='II']

# second merge with meth and unmeth dataframes
IG_meth = pd.merge(left=IG,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IG_unmeth = pd.merge(left=IG,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_meth = pd.merge(left=IR,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_unmeth = pd.merge(left=IR,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_meth = pd.merge(left=II,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_unmeth = pd.merge(left=II,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')

pval = pd.DataFrame(data=manifest.index, columns=['IlmnID'])
for i,c in enumerate(data_containers):
funcG = ECDF(data_containers[i].oob_green['mean_value'].values)
funcR = ECDF(data_containers[i].oob_red['mean_value'].values)
sample = data_containers[i].sample
pIR = pd.DataFrame(index=IR_meth.index,data=1-np.maximum(funcR(IR_meth[sample]), funcR(IR_unmeth[sample])),columns=[sample])
pIG = pd.DataFrame(index=IG_meth.index,data=1-np.maximum(funcG(IG_meth[sample]), funcG(IG_unmeth[sample])),columns=[sample])
pII = pd.DataFrame(index=II_meth.index,data=1-np.maximum(funcG(II_meth[sample]), funcR(II_unmeth[sample])),columns=[sample])
p = pd.concat([pIR,pIG,pII]).reset_index()
pval = pd.merge(pval,p)
return pval


def _pval_sesame_preprocess(data_container, column='mean_value', probe_column='IlmnID'):
"""Performs p-value detection of low signal/noise probes. This ONE SAMPLE version uses meth/unmeth before it is contructed into a _SampleDataContainer__data_frame.
- returns a dataframe of probes and their detected p-value levels.
- this will be saved to the csv output, so it can be used to drop probes at later step.
- output: index are probes (IlmnID); one column [poobah_pval] contains the sample p-values."""
meth = data_container.methylated.data_frame
unmeth = data_container.unmethylated.data_frame
manifest = data_container.manifest.data_frame[['Infinium_Design_Type','Color_Channel']]
IG = manifest[(manifest['Color_Channel']=='Grn') & (manifest['Infinium_Design_Type']=='I')]
IR = manifest[(manifest['Color_Channel']=='Red') & (manifest['Infinium_Design_Type']=='I')]
II = manifest[manifest['Infinium_Design_Type']=='II']

# merge with meth and unmeth dataframes; reindex is preferred (no warning) way of .loc[slice] now
IG_meth = meth.reindex(IG.index)
IG_unmeth = unmeth.reindex(IG.index)
IR_meth = meth.reindex(IR.index)
IR_unmeth = unmeth.reindex(IR.index)
II_meth = meth.reindex(II.index)
II_unmeth = unmeth.reindex(II.index)

funcG = ECDF(data_container.oob_green['mean_value'].values)
funcR = ECDF(data_container.oob_red['mean_value'].values)
pIR = pd.DataFrame(index=IR_meth.index, data=1-np.maximum(funcR(IR_meth[column]), funcR(IR_unmeth[column])), columns=[column])
pIG = pd.DataFrame(index=IG_meth.index, data=1-np.maximum(funcG(IG_meth[column]), funcG(IG_unmeth[column])), columns=[column])
pII = pd.DataFrame(index=II_meth.index, data=1-np.maximum(funcG(II_meth[column]), funcR(II_unmeth[column])), columns=[column])
pval = pd.concat([pIR,pIG,pII]).reset_index()
pval = pval.set_index(probe_column)
pval = pval.rename(columns={column: 'poobah_pval'})
# index is IlmnID; one column is 'poobah_pval' -- p-values
return pval


def detect_probes(data_containers, method='sesame', save=False, silent=True):
"""
About:
a wrapper for the p-value probe detection methods. Tries to check inputs and rationalize them
with methyl-suite's standard data objects.

Inputs:
a list of sample data_containers. The dataframe must have a 'IlMnID' for the index of probe names.
And probe names should be `cgxxxxxx` format to work with other methylcheck functions

To create this, use:

data_containers = methylprep.run_pipeline(data_dir,
save_uncorrected=True,
betas=True)
(if there are more than 200 samples, you'll need to load the data from disk instead, as nothing will be returned)

method:
sesame -- use the sesame ported algorithm
minfi -- use the minfi ported algorithm

Checks:
data_containers must have 'meth' and 'unmeth' columns (uncorrected data)
the values for these columns should be between 0 and 10000s
(beta: 0 to 1; m_value: -neg to pos+ range)

Returns:
dataframe of p-value filtered probes
"""
if not ('unmeth' in data_containers[0]._SampleDataContainer__data_frame.columns and
'meth' in data_containers[0]._SampleDataContainer__data_frame.columns):
raise ValueError("Provide a list of data_containers that includes uncorrected data (with 'meth' and 'unmeth' columns, using the 'save_uncorrected' option in run_pipeline)")
if method == 'minfi':
pval = pval_minfi(data_containers)
else:
pval = pval_sesame(data_containers)

if silent == False and type(mean_beta_compare) is types.FunctionType:
# plot it
# df1 and df2 are probe X sample_id matrices
mean_beta_compare(df1, df2, save=save, verbose=False, silent=silent)
return pval
16 changes: 14 additions & 2 deletions methylprep/processing/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
)
from .preprocess import preprocess_noob
from .raw_dataset import get_raw_datasets

from .p_value_probe_detection import _pval_sesame_preprocess

__all__ = ['SampleDataContainer', 'get_manifest', 'run_pipeline', 'consolidate_values_for_sheet']

Expand Down Expand Up @@ -356,14 +356,19 @@ class SampleDataContainer():
raw_dataset {RawDataset} -- A sample's RawDataset for a single well on the processed array.
manifest {Manifest} -- The Manifest for the correlated RawDataset's array type.
bit (default: float64) -- option to store data as float16 or float32 to save space.
pval (default: False) -- whether to apply p-value-detection algorithm to remove
unreliable probes (based on signal/noise ratio of fluoresence)

Jan 2020: added .snp_(un)methylated property. used in postprocess.consolidate_crontrol_snp()
Mar 2020: added p-value detection option
"""

__data_frame = None

def __init__(self, raw_dataset, manifest, retain_uncorrected_probe_intensities=False, bit='float32'):
def __init__(self, raw_dataset, manifest, retain_uncorrected_probe_intensities=False,
bit='float32', pval=False):
self.manifest = manifest
self.pval = pval
self.raw_dataset = raw_dataset
self.sample = raw_dataset.sample
self.retain_uncorrected_probe_intensities=retain_uncorrected_probe_intensities
Expand Down Expand Up @@ -411,6 +416,13 @@ def preprocess(self):
uncorrected_meth = self.methylated.data_frame.copy()
uncorrected_unmeth = self.unmethylated.data_frame.copy()

if self.pval == True:
# (self.methylated.data_frame, self.unmethylated.data_frame)
pval_probes_df = _pval_sesame_preprocess(self)
# missing: how to merge this df into the data_frame and actually drop probes
# with pd.merge


preprocess_noob(self) # apply corrections: bg subtract, then noob (in preprocess.py)

methylated = self.methylated.data_frame[['noob']]
Expand Down