FoxoTech · marcmaxson · Jul 19, 2021 · Jul 14, 2021 · Jul 19, 2021 · Jul 19, 2021
diff --git a/docs/debug_notebooks/Comparing methylprep, sesame, minfi v1.5.2.ipynb b/docs/debug_notebooks/Comparing methylprep, sesame, minfi v1.5.2.ipynb
diff --git a/docs/debug_notebooks/sesame_epic_one_GSE147430.R b/docs/debug_notebooks/sesame_epic_one_GSE147430.R
@@ -18,7 +18,7 @@ sdfs <- lapply(searchIDATprefixes(in_dir), readIDATpair) #readIdat is for one fi
 savefiles(sdfs, 1, 'raw')
 
 # INFER
-sdfs = lapply(sdfs, inferTypeIChannel, verbose=TRUE)
+sdfs.infer = lapply(sdfs, inferTypeIChannel, verbose=TRUE)
 savefiles(sdfs, 1, 'infer')
 
 # POOBAH ssets <- lapply(ssets, detectionMask) # poobah and qualityMask now set in sdf@extra$mask by readIDATpair

diff --git a/docs/example_data/read_geo/GSE111165_test.csv b/docs/example_data/read_geo/GSE111165_test.csv
diff --git a/docs/example_data/read_geo/GSE133355_processed_test.xlsx b/docs/example_data/read_geo/GSE133355_processed_test.xlsx
diff --git a/docs/example_data/read_geo/GSE72120_test.txt b/docs/example_data/read_geo/GSE72120_test.txt
diff --git a/docs/release-history.md b/docs/release-history.md
@@ -1,26 +1,35 @@
 # Release History
 
-## v1.5.0
+## v1.5.2
+- Bug fix: added 'Extended_Type' into control_probes.pkl output. Required by methylcheck.plot_controls().
+- Minor bug fixes and improved unit test coverage.
+- Fixed bug where `process --minfi` was not working with `--all`. Added more test coverage for CLI.
+- updated read_geo to handle more edge cases
+- deprecated some never-used functions.
+  - instead of methylprep.files.idat.RunInfo use IdatDataset( verbose=True )
+
+
+## v1.5.0, v1.5.1
 - MAJOR refactor/overhaul of all the internal classes. This was necessary to fully support the mouse array.
-  - new SigSet class object that mirror's sesame's SigSet and SigDF object.
-    - Combines idats, manifest, and sample sheet into one object that is inherited by SampleDataContainer
-  - RawDataset, MethylationDataset, ProbeSubtype all deprecated and replaced by SigSet
-  - SampleDataContainer class is now basically the SigSet plus all pipeline processing settings
-  - new mouse manifest covers all probes and matches sesame's output
-  - Processing will work even if a batch of IDATs have differing probe counts for same array_type, though those
-  differing probes in question may not be saved.
-  - unit tests confirm that methylprep, sesame, and minfi beta values output match to within 1% of each other now. Note that the intermediate stages of processing (after NOOB and after DYE) do not match
-  with sesame in this version. Can be +/- 100 intensity units, likely due to differences in order of
-  steps and/or oob/mask probes used.
+- new SigSet class object that mirror's sesame's SigSet and SigDF object.
+- Combines idats, manifest, and sample sheet into one object that is inherited by SampleDataContainer
+- RawDataset, MethylationDataset, ProbeSubtype all deprecated and replaced by SigSet
+- SampleDataContainer class is now basically the SigSet plus all pipeline processing settings
+- new mouse manifest covers all probes and matches sesame's output
+- Processing will work even if a batch of IDATs have differing probe counts for same array_type, though those
+differing probes in question may not be saved.
+- unit tests confirm that methylprep, sesame, and minfi beta values output match to within 1% of each other now. Note that the intermediate stages of processing (after NOOB and after DYE) do not match
+with sesame in this version. Can be +/- 100 intensity units, likely due to differences in order of
+steps and/or oob/mask probes used.
 
 ## v1.4.7
 - mouse manifest updated to conform with illumina Genome Studio / sesame probe naming convention.
-  - mouse_probes.pkl now includes different probe types. Previously, if a probe type was 'mu' (multi)
-  or 'rp' (repeat) or IlmnID started with 'uk' (unknown?), it was moved to experimental mouse_probes.pkl.
-  This was about 6300 probes.
-  Now, all 'Multi' and 'Random' probes are moved and stored in mouse_probes.pkl, about 30,000.
-  - mouse manifest has a 'design' column with tons of human-readable notes on different probe origins,
-  including analogous EPIC human-mapped probes.
+- mouse_probes.pkl now includes different probe types. Previously, if a probe type was 'mu' (multi)
+or 'rp' (repeat) or IlmnID started with 'uk' (unknown?), it was moved to experimental mouse_probes.pkl.
+This was about 6300 probes.
+Now, all 'Multi' and 'Random' probes are moved and stored in mouse_probes.pkl, about 30,000.
+- mouse manifest has a 'design' column with tons of human-readable notes on different probe origins,
+including analogous EPIC human-mapped probes.
 
 ## v1.4.6
 - pipeline CSV output will now include meth, unmeth, beta, and m-values for all probes, including failed probes.
@@ -80,4 +89,4 @@
 - methylprep.download has a new all-encompassing pipeline that will read GEO data sets and convert
     any data file type into a pickle of beta_values, whether from idats or processed matrix files.
 
-## Older versions exist on pypi, but no changelog 
+## Older versions exist on pypi, but no changelog
diff --git a/methylprep/cli.py b/methylprep/cli.py
@@ -253,9 +253,10 @@ def cli_process(cmd_args):
         args.no_meta_export = True
         args.poobah = True
         args.export_poobah = True
-        args.minfi = False
         args.no_quality_mask = False
 
+    #print(vars(args).items())
+
     run_pipeline(
         args.data_dir,
         array_type=args.array_type,

diff --git a/methylprep/files/idat.py b/methylprep/files/idat.py
@@ -87,13 +87,12 @@ class IdatSectionCode(IntEnum):
 
 # Object Definitions
 # ----------------------------------------------------------------------------
-
+""" DEPRECATED: use IdatDataset(... verbose=True) instead.
 class RunInfo():
-    """A dataclass defining IDAT run information.
+    '''A dataclass defining IDAT run information.
 
     Arguments:
-        idat_file {file-like} -- An open IDAT file.
-    """
+        idat_file {file-like} -- An open IDAT file.'''
 
     __slots__ = [
         'run_time',
@@ -104,12 +103,13 @@ class RunInfo():
     ]
 
     def __init__(self, idat_file):
-        """Initializes the RunInfo data class by reading the provided idat_file."""
+        # Initializes the RunInfo data class by reading the provided idat_file.
         self.run_time = read_string(idat_file)
         self.block_type = read_string(idat_file)
         self.block_pars = read_string(idat_file)
         self.block_code = read_string(idat_file)
         self.code_version = read_string(idat_file)
+"""
 
 class IdatDataset():
     """Validates and parses an Illumina IDAT file.

diff --git a/methylprep/files/manifests.py b/methylprep/files/manifests.py
@@ -302,9 +302,8 @@ def get_data_types(self):
         return data_types
 
     def get_probe_details(self, probe_type, channel=None):
-        """given a probe type (I, II, SnpI, SnpII, Control) and a channel (Channel.RED | Channel.GREEN),
-        This will return info needed to map probes to their names (e.g. cg0031313 or rs00542420),
-        which are NOT in the idat files."""
+        """used by infer_channel_switch. Given a probe type (I, II, SnpI, SnpII, Control) and a channel (Channel.RED | Channel.GREEN),
+        this will return info needed to map probes to their names (e.g. cg0031313 or rs00542420), which are NOT in the idat files."""
         if not isinstance(probe_type, ProbeType):
             raise Exception('probe_type is not a valid ProbeType')
 

diff --git a/methylprep/models/sigset.py b/methylprep/models/sigset.py
@@ -117,9 +117,9 @@ class SigSet():
     [x] IG - Type-I Grn channel probes;
     [x] oobG - Out-of-band Grn channel probes (matching Type-I Red channel probes in number);
     [x] oobR - Out-of-band Red channel probes (matching Type-I Grn channel probes in number);
-    [-] ctl - control probes.
+    [x] ctrl_green, ctrl_red - control probes.
     [x] methylated, unmethylated, snp_methylated, snp_unmethylated
-    [x] fg_green, fg_red (opposite of oobG and oobR)
+    [x] fg_green, fg_red (opposite of oobG and oobR) AKA ibG, ibR for in-band probes.
 
     - just tidying up how we access this stuff, and trying to stick to IlmnID everywhere because the illumina_id within IDAT files is no longer unique as a ref.
     - I checked again, and no other array breaks these rules. But sounds like Bret won’t stick to this pattern going forward with designs. So I suspect other software will break with new arrays, unless they rewrite for this too.
@@ -305,6 +305,8 @@ def update_probe_means(self, noob_green, noob_red, red_factor=None):
         upstream: container.sigset.update_probe_means(noob_green, noob_red)
 
         replaces 'bg_corrected' column with 'noob_Meth' or 'noob_Unmeth' column.
+
+        does NOT update ctrl_red or ctrl_green; these are updated within the NOOB function because structually different.
         """
 
         for probe_subset, decoder_parts in self.subsets.items():
@@ -387,14 +389,15 @@ def update_probe_means(self, noob_green, noob_red, red_factor=None):
         self.__minfi_noob = False
         self.__linear_dye = True if red_factor is not None else False
 
+    """
     # from raw_dataset; may no longer be needed, but kept for testing against new approach 2021
     def get_oob_controls(self, green_idat, red_idat, manifest, include_rs=True):
-        """ Out-of-bound controls are the mean intensity values for the
+        ''' Out-of-bound controls are the mean intensity values for the
         channel in the opposite channel's probes (IG oob and IR oob)
 
 .. todo::
     TEST -- does this give same output as SigSet.oobG and oobR?
-        """
+        '''
         param_sets = [
             {'channel': Channel.RED, 'idat': green_idat, 'manifest': manifest},
             {'channel': Channel.GREEN, 'idat': red_idat, 'manifest': manifest},
@@ -451,6 +454,7 @@ def get_oob_controls(self, green_idat, red_idat, manifest, include_rs=True):
                 ).rename(columns={'mean_value': 'Unmeth'})
                 oobR.drop(['AddressA_ID', 'AddressB_ID'], axis=1)
         return (oobG.sort_index(), oobR.sort_index())
+    """
 
     # from raw_dataset
     def filter_oob_probes(self, channel, manifest, idat_dataset, include_rs=True):

diff --git a/methylprep/models/sketchy_probes.py b/methylprep/models/sketchy_probes.py
@@ -4,8 +4,9 @@
 from importlib import resources # py3.7+
 pkg_namespace = 'methylprep.models'
 
+"""
 def sketchy_probes_warning(filepath):
-    """ not implemented anywhere yet """
+    ''' not implemented anywhere yet '''
     # used to warn user if running a Catalog_ID that contains sketchy illumina probes
 
     with resources.path(pkg_namespace, 'illumina_sketchy_probes_996.npy') as probe_filepath:
@@ -22,6 +23,7 @@ def sketchy_probes_warning(filepath):
         return Path(filepath).name
     else:
         return None
+"""
 
 with resources.path(pkg_namespace, 'qualityMask450.txt.gz') as probe_filepath:
     qualityMask450 = pd.read_csv(probe_filepath)['x']

diff --git a/methylprep/processing/p_value_probe_detection.py b/methylprep/processing/p_value_probe_detection.py
@@ -9,8 +9,80 @@
 import numpy as np
 
 
+def _pval_sesame_preprocess(data_container):
+    """Performs p-value detection of low signal/noise probes. This ONE SAMPLE version uses meth/unmeth before it is contructed into a _SampleDataContainer__data_frame.
+    - returns a dataframe of probes and their detected p-value levels.
+    - this will be saved to the csv output, so it can be used to drop probes at later step.
+    - output: index are probes (IlmnID or illumina_id); one column [poobah_pval] contains the sample p-values.
+    - called by pipeline CLI --poobah option.
+    - confirmed that this version produces identical results to the pre-v1.5.0 version on 2021-06-16
+    """
+    # 2021-03-22 assumed 'mean_value' for red and green MEANT meth and unmeth (OOBS), respectively.
+    funcG = ECDF(data_container.oobG['Unmeth'].values)
+    funcR = ECDF(data_container.oobR['Meth'].values)
+    pIR = pd.DataFrame(
+        index=data_container.IR.index,
+        data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])),
+        columns=['poobah_pval'])
+    pIG = pd.DataFrame(
+        index=data_container.IG.index,
+        data=1-np.maximum(funcG(data_container.IG['Meth']), funcG(data_container.IG['Unmeth'])),
+        columns=['poobah_pval'])
+    pII = pd.DataFrame(
+        index=data_container.II.index,
+        data=1-np.maximum(funcG(data_container.II['Meth']), funcR(data_container.II['Unmeth'])),
+        columns=['poobah_pval'])
+    # pval output: index is IlmnID; and threre's one column, 'poobah_pval' with p-values
+    pval = pd.concat([pIR,pIG,pII])
+    return pval
+
+""" DEPRECATED FUNCTIONS (<v1.5.0)
+def detect_probes(data_containers, method='sesame', save=False, silent=True):
+    '''
+About:
+    a wrapper for the p-value probe detection methods. Tries to check inputs and rationalize them
+    with methyl-suite's standard data objects.
+
+Inputs:
+    a list of sample data_containers. The dataframe must have a 'IlMnID' for the index of probe names.
+    And probe names should be `cgxxxxxx` format to work with other methylcheck functions
+
+    To create this, use:
+
+    data_containers = methylprep.run_pipeline(data_dir,
+            save_uncorrected=True,
+            betas=True)
+    (if there are more than 200 samples, you'll need to load the data from disk instead, as nothing will be returned)
+
+    method:
+        sesame -- use the sesame ported algorithm
+        minfi -- use the minfi ported algorithm
+
+Checks:
+    data_containers must have 'meth' and 'unmeth' columns (uncorrected data)
+    the values for these columns should be between 0 and 10000s
+    (beta: 0 to 1; m_value: -neg to pos+ range)
+
+Returns:
+    dataframe of p-value filtered probes
+    '''
+    if not ('unmeth' in data_containers[0]._SampleDataContainer__data_frame.columns and
+            'meth' in data_containers[0]._SampleDataContainer__data_frame.columns):
+            raise ValueError("Provide a list of data_containers that includes uncorrected data (with 'meth' and 'unmeth' columns, using the 'save_uncorrected' option in run_pipeline)")
+    if method == 'minfi':
+        pval = pval_minfi(data_containers)
+    else:
+        pval = pval_sesame(data_containers)
+
+    if silent == False and type(mean_beta_compare) is types.FunctionType:
+        # plot it
+        # df1 and df2 are probe X sample_id matrices
+        mean_beta_compare(df1, df2, save=save, verbose=False, silent=silent)
+    return pval
+
+
 def _pval_minfi(data_containers):
-    """ negative control p-value """
+    # negative control p-value
     # Pull M and U values
     meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
     unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
@@ -84,7 +156,7 @@ def _pval_minfi(data_containers):
     return pval
 
 def _pval_sesame(data_containers):
-    """ pOOHBah, called using ___ in sesame """
+    # pOOHBah, called using ___ in sesame
     # Pull M and U values
     meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
     unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
@@ -123,75 +195,4 @@ def _pval_sesame(data_containers):
         p = pd.concat([pIR,pIG,pII]).reset_index()
         pval = pd.merge(pval,p)
     return pval
-
-
-def _pval_sesame_preprocess(data_container):
-    """Performs p-value detection of low signal/noise probes. This ONE SAMPLE version uses meth/unmeth before it is contructed into a _SampleDataContainer__data_frame.
-    - returns a dataframe of probes and their detected p-value levels.
-    - this will be saved to the csv output, so it can be used to drop probes at later step.
-    - output: index are probes (IlmnID or illumina_id); one column [poobah_pval] contains the sample p-values.
-    - called by pipeline CLI --poobah option.
-    - confirmed that this version produces identical results to the pre-v1.5.0 version on 2021-06-16
-    """
-    # 2021-03-22 assumed 'mean_value' for red and green MEANT meth and unmeth (OOBS), respectively.
-    funcG = ECDF(data_container.oobG['Unmeth'].values)
-    funcR = ECDF(data_container.oobR['Meth'].values)
-    pIR = pd.DataFrame(
-        index=data_container.IR.index,
-        data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])),
-        columns=['poobah_pval'])
-    pIG = pd.DataFrame(
-        index=data_container.IG.index,
-        data=1-np.maximum(funcG(data_container.IG['Meth']), funcG(data_container.IG['Unmeth'])),
-        columns=['poobah_pval'])
-    pII = pd.DataFrame(
-        index=data_container.II.index,
-        data=1-np.maximum(funcG(data_container.II['Meth']), funcR(data_container.II['Unmeth'])),
-        columns=['poobah_pval'])
-    # pval output: index is IlmnID; and threre's one column, 'poobah_pval' with p-values
-    pval = pd.concat([pIR,pIG,pII])
-    return pval
-
-
-def detect_probes(data_containers, method='sesame', save=False, silent=True):
-    """
-About:
-    a wrapper for the p-value probe detection methods. Tries to check inputs and rationalize them
-    with methyl-suite's standard data objects.
-
-Inputs:
-    a list of sample data_containers. The dataframe must have a 'IlMnID' for the index of probe names.
-    And probe names should be `cgxxxxxx` format to work with other methylcheck functions
-
-    To create this, use:
-
-    data_containers = methylprep.run_pipeline(data_dir,
-            save_uncorrected=True,
-            betas=True)
-    (if there are more than 200 samples, you'll need to load the data from disk instead, as nothing will be returned)
-
-    method:
-        sesame -- use the sesame ported algorithm
-        minfi -- use the minfi ported algorithm
-
-Checks:
-    data_containers must have 'meth' and 'unmeth' columns (uncorrected data)
-    the values for these columns should be between 0 and 10000s
-    (beta: 0 to 1; m_value: -neg to pos+ range)
-
-Returns:
-    dataframe of p-value filtered probes
-    """
-    if not ('unmeth' in data_containers[0]._SampleDataContainer__data_frame.columns and
-            'meth' in data_containers[0]._SampleDataContainer__data_frame.columns):
-            raise ValueError("Provide a list of data_containers that includes uncorrected data (with 'meth' and 'unmeth' columns, using the 'save_uncorrected' option in run_pipeline)")
-    if method == 'minfi':
-        pval = pval_minfi(data_containers)
-    else:
-        pval = pval_sesame(data_containers)
-
-    if silent == False and type(mean_beta_compare) is types.FunctionType:
-        # plot it
-        # df1 and df2 are probe X sample_id matrices
-        mean_beta_compare(df1, df2, save=save, verbose=False, silent=silent)
-    return pval
+"""