Minor bug fixes, and coveralls stopped working (#35)
* added batch_size parameter to run_pipeline

* added CLI functionality

* batch_size python/CLI and tests

* removed test; changed default behavior: won't raise error if file-to-be-downloaded already exists

* Update setup.py

* Update test_batch_size.py

* Rename test_batch_size.py to test_pipeline_batch_size.py

* dropped redundant tests and sped up one

* Feature/public data (#21)

* download command, as well as some batch_size adjustments

* fixed string issue

* renaming update and removed redundant tests

* bs4 required for Array ingester

* tests

* workaround to return objects with batch size changes

* workaround to return objects with batch size changes

* bug

* tests pass for batch_size

* version 1.1 (#22)

* download command, as well as some batch_size adjustments

* fixed string issue

* renaming update and removed redundant tests

* bs4 required for Array ingester

* tests

* workaround to return objects with batch size changes

* workaround to return objects with batch size changes

* bug

* tests pass for batch_size

* progress bars

* documenting `download`

* Update cli.py

* restore sample_name filter

* added rawMetaDataset class and moved get_sample_sheet_s3 to more logical place here (#24)

* updated docs for 1.1.1

* Update README.md

* Update setup.py

* exposed create_sample_sheet and download no_clean options

* manifest file download in lambda

* manifest file download in lambda

* manifest file download in lambda

* v1.1.3 bump bug fix

* handles blank sample_name and ensures names are unique.

* Update setup.py

* geo downloader tweaks, fixed docs

* minor tweaks to sample_sheet parser

* v1.1.8: CLI retain --uncorrected mean prob values; sample_sheet sample_type sample_sub_type; sample_sheet accepts alt sentrix column headers

* v1.1.8: CLI retain --uncorrected mean prob values; sample_sheet sample_type sample_sub_type; sample_sheet accepts alt sentrix column headers

* v1.1.8

* v1.1.9 minor bug fix to alt filename

* bug fix: sample QC control status

* v1.1.11 generates meta_data pickle

* Update config.yml

* Update config.yml

* Update config.yml

* Update config.yml

* coveralls
Marc Maxmeister authored Nov 5, 2019
1 parent 6c0f9f1 commit 59d8691
Showing 8 changed files with 121 additions and 21 deletions.
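Taken together, these commits mostly extend `run_pipeline` (diffed under methylprep/processing/pipeline.py below). A minimal usage sketch follows; the keyword arguments all appear in the diff, but the package-level import and the IDAT folder path are assumptions:

```python
from methylprep import run_pipeline  # assumes run_pipeline is exported at the package level

# Process a folder of IDATs in batches of 50 to limit memory use.
# meta_data_frame=True (the new default) also pickles the parsed
# sample sheet to sample_sheet_meta_data.pkl for methylcheck/methylize.
data_containers = run_pipeline(
    '/path/to/idats',       # hypothetical data_dir
    betas=True,             # also save beta values
    batch_size=50,          # batching added in this series of commits
    save_uncorrected=True,  # keep meth/unmeth columns without NOOB correction
    meta_data_frame=True,   # new in this commit
)
```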
6 changes: 4 additions & 2 deletions .circleci/config.yml
@@ -12,8 +12,10 @@ jobs:
       # key: deps9-{{ .Branch }}-{{ checksum "Pipfile.lock" }}
       key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }}
   - run:
-      command: |
-        pip install pipenv
+      command: |
+        pip3 install pipenv
+  - run:
+      command: |
       pipenv install --dev --skip-lock # --ignore-pipfile
       pipenv install -e . --skip-lock # pytest recommends this: install your package in "editable" mode by running this from the same directory
   # pipenv will use requirements.txt with these params.
6 changes: 4 additions & 2 deletions index.rst
@@ -15,10 +15,12 @@ Getting Started
.. toctree::
:maxdepth: 2
:caption: Contents:

docs/methylprep_tutorial.md
-   docs/cli.md
+   docs/cli.md
docs/source/modules
methylcheck (QC) package <https://life-epigenetics-methylcheck.readthedocs-hosted.com/en/latest/>
methylize (analysis) package <https://life-epigenetics-methylize.readthedocs-hosted.com/en/latest/>

Indices and tables
==================
31 changes: 20 additions & 11 deletions methylprep/cli.py
@@ -169,14 +169,6 @@ def cli_process(cmd_args):
help='Sample(s) to process. You can pass multiple sample names with multiple -n params.',
)

-    parser.add_argument(
-        '-e', '--no_export',
-        required=False,
-        action='store_false', # if -e passed, this suppresses data export (if running as part of pipeline or something)
-        default=True, # if False, CLI returns nothing.
-        help='Default is to export data to csv in same folder where IDAT file resides. Pass in --no_export to suppress this.',
-    )
-
parser.add_argument(
'-b', '--betas',
required=False,
@@ -186,7 +178,7 @@
)

parser.add_argument(
-        '--m_value',
+        '-v', '--m_value',
required=False,
action='store_true',
default=False,
@@ -201,13 +193,29 @@
)

parser.add_argument(
-        '--uncorrected',
+        '-u', '--uncorrected',
required=False,
action='store_true',
default=False,
help='If specified, processed csv will contain two additional columns (meth and unmeth) that have not been NOOB corrected.'
)

+    parser.add_argument(
+        '-e', '--no_export',
+        required=False,
+        action='store_false', # if -e passed, this suppresses data export
+        default=True, # if False, CLI returns nothing. will set export=True
+        help='Default is to export data to csv in same folder where IDAT file resides. Pass in --no_export to suppress this.',
+    )
+
+    parser.add_argument(
+        '-x', '--no_meta_export',
+        required=False,
+        action='store_false', # if -x passed, this suppresses meta data export
+        default=True, # will set meta_data_frame == True
+        help='Default is to convert the sample sheet into a pickled DataFrame, recognized in methylcheck and methylize. Pass in --no_meta_export to suppress this.',
+    )

args = parser.parse_args(cmd_args)

array_type = args.array_type
@@ -221,7 +229,6 @@
run_pipeline(
args.data_dir,
array_type=args.array_type,
-        export=args.no_export,
manifest_filepath=args.manifest,
sample_sheet_filepath=args.sample_sheet,
sample_name=args.sample_name,
@@ -230,6 +237,8 @@
m_value=args.m_value,
batch_size=args.batch_size,
save_uncorrected=args.uncorrected,
+        export=args.no_export, # flag flips here
+        meta_data_frame=args.no_meta_export, # flag flips here
)

def cli_download(cmd_args):
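Both relocated/new flags use argparse's `store_false` action, which is why the comments say "flag flips here": the parsed attribute defaults to True and passing the flag turns it off, so `args.no_export` feeds directly into `export=`. A self-contained sketch of just that mechanic, plain argparse with nothing methylprep-specific:

```python
import argparse

parser = argparse.ArgumentParser()
# store_false: the attribute starts True and flips to False when the flag is passed
parser.add_argument('-e', '--no_export', action='store_false', default=True)
parser.add_argument('-x', '--no_meta_export', action='store_false', default=True)

args = parser.parse_args([])                # no flags passed
assert args.no_export is True               # becomes export=True in run_pipeline
args = parser.parse_args(['--no_export'])   # user suppresses csv export
assert args.no_export is False              # becomes export=False
```

So a command like `python -m methylprep process -d <data_dir> --no_meta_export` (invocation form assumed, not shown in this diff) processes samples but skips the meta-data pickle.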
7 changes: 6 additions & 1 deletion methylprep/files/sample_sheets.py
@@ -253,6 +253,8 @@ class SampleSheet():

def __init__(self, filepath_or_buffer, data_dir):
self.__samples = []
+        self.fields = {}
+        self.renamed_fields = {}

self.data_dir = data_dir
self.headers = []
@@ -345,7 +347,10 @@ def build_samples(self):
sentrix_position=sentrix_position,
**row,
)

+            if sample.renamed_fields != {}:
+                self.renamed_fields.update(sample.renamed_fields)
+            self.fields.update(sample.fields)

self.__samples.append(sample)

def contains_column(self, column_name):
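With these additions, a parsed sheet exposes a mapping from each original column header to the attribute name stored on every `Sample` (the renaming rules themselves live in models/samples.py, diffed next). Roughly, for a sheet with one nonstandard column; the `Tissue Type` header here is a hypothetical example:

```python
# after SampleSheet.build_samples() has run:
sample_sheet.renamed_fields  # {'Tissue Type': 'Tissue_Type'}  (nonstandard columns only)
sample_sheet.fields          # {'Tissue Type': 'Tissue_Type',
                             #  'Sentrix_ID': 'Sentrix_ID',
                             #  'Sample_Name': 'Sample_Name', ...}  (standard names map to themselves)
```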
37 changes: 35 additions & 2 deletions methylprep/models/samples.py
@@ -5,6 +5,9 @@
from glob import glob

LOGGER = logging.getLogger(__name__)
+REQUIRED = ['Sentrix_ID', 'Sentrix_Position', 'SentrixBarcode_A', 'SentrixPosition_A', 'Control',
+            'Sample_Group', 'Sample_Name', 'Sample_Plate', 'Pool_ID', 'Sample_Well', 'GSM_ID',
+            'Sample_Type', 'Sub_Type']

class Sample():
"""Object representing a row in a SampleSheet file
@@ -32,7 +35,22 @@ def __init__(self, data_dir, sentrix_id, sentrix_position, **addl_fields):
self.data_dir = data_dir
self.sentrix_id = sentrix_id
self.sentrix_position = sentrix_position

+        self.renamed_fields = {}
+
+        # any OTHER sample_sheet columns are passed in exactly as they appear, if possible, and if column names exist.
+        # these will pass into the meta_data pkl created, and any renamed fields must be noted in a lookup.
+        for field in addl_fields:
+            if field not in REQUIRED:
+                new_field_name = field.replace(' ','_')
+                if len(field) == 0:
+                    continue
+                if field[0].isdigit():
+                    new_field_name = field[1:]
+                if not field.isalnum(): # letters or numbers, or caps. no spaces or unicode
+                    import re
+                    new_field_name = re.sub(r'\W+', '', new_field_name)
+                setattr(self, new_field_name, addl_fields[field])
+                self.renamed_fields[field] = new_field_name
self.group = addl_fields.get('Sample_Group')
self.name = addl_fields.get('Sample_Name')
self.plate = addl_fields.get('Sample_Plate')
@@ -41,7 +59,22 @@ def __init__(self, data_dir, sentrix_id, sentrix_position, **addl_fields):
self.GSM_ID = addl_fields.get('GSM_ID') # for GEO published sample compatability
self.type = addl_fields.get('Sample_Type','Unknown') # from GEO MINiML meta data
self.sub_type = addl_fields.get('Sub_Type') # from GEO
-        self.is_control = addl_fields.get('Control',False) # from GEO MINiML meta data
+        self.is_control = True if addl_fields.get('Control') in (1,'1',True, 'True', 'true', 'TRUE') else False
+        self.fields = {}
+        self.fields.update(self.renamed_fields)
+        self.fields.update({
+            'Sentrix_ID': 'Sentrix_ID',
+            'Sentrix_Position': 'Sentrix_Position', # these will be standardized here, regardless of sample_sheet variation names
+            'Sample_Group': 'Sample_Group',
+            'Sample_Name': 'Sample_Name',
+            'Sample_Plate': 'Sample_Plate',
+            'Sample_Type': 'Sample_Type',
+            'Sub_Type': 'Sub_Type',
+            'Sample_Well': 'Sample_Well',
+            'Pool_ID': 'Pool_ID',
+            'GSM_ID': 'GSM_ID',
+            'Control': 'Control',
+        })

def __str__(self):
return f'{self.sentrix_id}_{self.sentrix_position}'
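The renaming loop above has two subtleties: when a header starts with a digit, `field[1:]` restarts from the original string and discards the space-to-underscore substitution, and `re.sub(r'\W+', '')` keeps underscores because they count as word characters. A standalone mirror of the committed rules for tracing what a header becomes; the example headers are hypothetical:

```python
import re

def sanitize(field):
    """Mirror of the renaming rules committed in Sample.__init__ above."""
    new_field_name = field.replace(' ', '_')
    if len(field) == 0:
        return None                 # the original loop `continue`s past empty headers
    if field[0].isdigit():
        new_field_name = field[1:]  # restarts from the raw header, not the underscored copy
    if not field.isalnum():         # any space or punctuation in the raw header
        new_field_name = re.sub(r'\W+', '', new_field_name)  # \W spares underscores
    return new_field_name

print(sanitize('Tissue Type'))  # 'Tissue_Type'  (underscore survives)
print(sanitize('450K Batch'))   # '50KBatch'     (leading digit dropped, space stripped)
print(sanitize('Notes:'))       # 'Notes'        (punctuation stripped)
```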
51 changes: 50 additions & 1 deletion methylprep/processing/pipeline.py
@@ -55,7 +55,7 @@ def get_manifest(raw_datasets, array_type=None, manifest_filepath=None):
def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None,
sample_sheet_filepath=None, sample_name=None,
betas=False, m_value=False, make_sample_sheet=False, batch_size=None,
-                save_uncorrected=False):
+                save_uncorrected=False, meta_data_frame=True):
"""The main CLI processing pipeline. This does every processing step and returns a data set.
Arguments:
@@ -110,6 +110,15 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
sample_sheet = get_sample_sheet(data_dir, filepath=sample_sheet_filepath)

samples = sample_sheet.get_samples()
+    if sample_sheet.renamed_fields != {}:
+        show_fields = []
+        for k,v in sample_sheet.renamed_fields.items():
+            if v != k:
+                show_fields.append(f"{k} --> {v}\n")
+            else:
+                show_fields.append(f"{k}\n")
+        LOGGER.info(f"Found {len(show_fields)} additional fields in sample_sheet: {''.join(show_fields)}")
+
batches = []
batch = []
sample_id_counter = 1
@@ -199,6 +208,46 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
continue
data_containers.extend(batch_data_containers)

+    if meta_data_frame == True:
+        #sample_sheet.fields is a complete mapping of original and renamed_fields
+        cols = list(sample_sheet.fields.values()) + ['Sample_ID']
+        meta_frame = pd.DataFrame(columns=cols)
+        field_classattr_lookup = {
+            'Sentrix_ID': 'sentrix_id',
+            'Sentrix_Position': 'sentrix_position',
+            'Sample_Group': 'group',
+            'Sample_Name': 'name',
+            'Sample_Plate': 'plate',
+            'Pool_ID': 'pool',
+            'Sample_Well': 'well',
+            'GSM_ID': 'GSM_ID',
+            'Sample_Type': 'type',
+            'Sub_Type': 'sub_type',
+            'Control': 'is_control',
+        }
+        # row contains the renamed fields, and pulls in the original data from sample_sheet
+        for sample in samples:
+            row = {}
+            for field in sample_sheet.fields.keys():
+                if sample_sheet.fields[field] in field_classattr_lookup:
+                    row[ sample_sheet.fields[field] ] = getattr(sample, field_classattr_lookup[sample_sheet.fields[field]] )
+                elif field in sample_sheet.renamed_fields:
+                    row[ sample_sheet.fields[field] ] = getattr(sample, sample_sheet.renamed_fields[field])
+                else:
+                    LOGGER.info(f"extra column: {field} ignored")
+                    # row[ sample_sheet.fields[field] ] = getattr(sample, field)
+            # add the UID that matches m_value/beta value pickles
+            #... unless there's a GSM_ID too
+            # appears that methylprep m_value and beta files only include ID_Position as column names.
+            #if row.get('GSM_ID') != None:
+            #    row['Sample_ID'] = f"{row['GSM_ID']}_{row['Sentrix_ID']}_{row['Sentrix_Position']}"
+            #else:
+            row['Sample_ID'] = f"{row['Sentrix_ID']}_{row['Sentrix_Position']}"
+            meta_frame = meta_frame.append(row, ignore_index=True)
+        meta_frame_filename = f'sample_sheet_meta_data.pkl'
+        meta_frame.to_pickle(meta_frame_filename)
+        LOGGER.info(f"[!] Exported meta_data to {meta_frame_filename}")
+
# batch processing done; consolidate and return data. This uses much more memory, but not called if in batch mode.
if batch_size and batch_size >= 200:
print("Because the batch size was >200 samples, files are saved but no data objects are returned.")
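Once `run_pipeline` writes `sample_sheet_meta_data.pkl`, it can be rejoined to processed output on `Sample_ID`, which is deliberately built as `{Sentrix_ID}_{Sentrix_Position}` to match the column names in methylprep's beta/m-value files. A short sketch; the `beta_values.pkl` filename and the `tumor` group label are assumptions, not taken from this diff:

```python
import pandas as pd

meta = pd.read_pickle('sample_sheet_meta_data.pkl')  # written by the new meta_data_frame branch
betas = pd.read_pickle('beta_values.pkl')            # hypothetical: probes x samples DataFrame

# beta columns are '{Sentrix_ID}_{Sentrix_Position}', the same UID as meta['Sample_ID']
tumor_ids = meta.loc[meta['Sample_Group'] == 'tumor', 'Sample_ID']
tumor_betas = betas[tumor_ids]
```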
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ scipy
statsmodels
pytest
coverage
-coveralls-python
+python-coveralls
sphinxcontrib-apidoc
m2r
nbsphinx
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@

setup(
name='methylprep',
-    version='1.1.9',
+    version='1.1.11',
description='Python-based Illumina methylation array preprocessing software',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
