diff --git a/.circleci/config.yml b/.circleci/config.yml
index 793b831..4d7a308 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -12,8 +12,10 @@ jobs:
# key: deps9-{{ .Branch }}-{{ checksum "Pipfile.lock" }}
key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }}
- run:
- command: |
- pip install pipenv
+ command: |
+ pip3 install pipenv
+ - run:
+ command: |
pipenv install --dev --skip-lock # --ignore-pipfile
pipenv install -e . --skip-lock # pytest recommends this: install your package in "editable" mode by running this from the same directory
# pipenv will use requirements.txt with these params.
diff --git a/index.rst b/index.rst
index 1550070..9f609f6 100644
--- a/index.rst
+++ b/index.rst
@@ -15,10 +15,12 @@ Getting Started
.. toctree::
:maxdepth: 2
:caption: Contents:
-
+
docs/methylprep_tutorial.md
- docs/cli.md
+ docs/cli.md
docs/source/modules
+ methylcheck (QC) package
+ methylize (analysis) package
Indices and tables
==================
diff --git a/methylprep/cli.py b/methylprep/cli.py
index 11102a9..f2742cc 100644
--- a/methylprep/cli.py
+++ b/methylprep/cli.py
@@ -169,14 +169,6 @@ def cli_process(cmd_args):
help='Sample(s) to process. You can pass multiple sample names with multiple -n params.',
)
- parser.add_argument(
- '-e', '--no_export',
- required=False,
- action='store_false', # if -e passed, this suppresses data export (if running as part of pipeline or something)
- default=True, # if False, CLI returns nothing.
- help='Default is to export data to csv in same folder where IDAT file resides. Pass in --no_export to suppress this.',
- )
-
parser.add_argument(
'-b', '--betas',
required=False,
@@ -186,7 +178,7 @@ def cli_process(cmd_args):
)
parser.add_argument(
- '--m_value',
+ '-v', '--m_value',
required=False,
action='store_true',
default=False,
@@ -201,13 +193,29 @@ def cli_process(cmd_args):
)
parser.add_argument(
- '--uncorrected',
+ '-u', '--uncorrected',
required=False,
action='store_true',
default=False,
help='If specified, processed csv will contain two additional columns (meth and unmeth) that have not been NOOB corrected.'
)
+ parser.add_argument(
+ '-e', '--no_export',
+ required=False,
+ action='store_false', # if -e passed, this suppresses data export
+ default=True, # True unless -e is passed; when False the CLI returns nothing.
+ help='Default is to export data to csv in same folder where IDAT file resides. Pass in --no_export to suppress this.',
+ )
+
+ parser.add_argument(
+ '-x', '--no_meta_export',
+ required=False,
+ action='store_false', # if -x passed, this suppresses meta data export
+ default=True, # True unless -x is passed; maps to meta_data_frame in run_pipeline
+ help='Default is to convert the sample sheet into a pickled DataFrame, recognized in methylcheck and methylize. Pass in --no_meta_export to suppress this.',
+ )
+
args = parser.parse_args(cmd_args)
array_type = args.array_type
@@ -221,7 +229,6 @@ def cli_process(cmd_args):
run_pipeline(
args.data_dir,
array_type=args.array_type,
- export=args.no_export,
manifest_filepath=args.manifest,
sample_sheet_filepath=args.sample_sheet,
sample_name=args.sample_name,
@@ -230,6 +237,8 @@ def cli_process(cmd_args):
m_value=args.m_value,
batch_size=args.batch_size,
save_uncorrected=args.uncorrected,
+ export=args.no_export, # flag flips here
+ meta_data_frame=args.no_meta_export, # flag flips here
)
def cli_download(cmd_args):
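Not part of the patch, just a quick sketch of how the `store_false` flags above behave: both `--no_export` and `--no_meta_export` default to True and flip to False only when the flag is passed, which is why their parsed values are forwarded to `run_pipeline` as `export=` and `meta_data_frame=`.

    import argparse

    parser = argparse.ArgumentParser()
    # store_false: attribute defaults to True; passing the flag sets it to False
    parser.add_argument('-e', '--no_export', action='store_false', default=True)

    args = parser.parse_args([])               # flag absent  -> args.no_export is True  (export happens)
    args = parser.parse_args(['--no_export'])  # flag present -> args.no_export is False (export suppressed)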
diff --git a/methylprep/files/sample_sheets.py b/methylprep/files/sample_sheets.py
index 20ef120..7c00d27 100644
--- a/methylprep/files/sample_sheets.py
+++ b/methylprep/files/sample_sheets.py
@@ -253,6 +253,8 @@ class SampleSheet():
def __init__(self, filepath_or_buffer, data_dir):
self.__samples = []
+ self.fields = {}
+ self.renamed_fields = {}
self.data_dir = data_dir
self.headers = []
@@ -345,7 +347,10 @@ def build_samples(self):
sentrix_position=sentrix_position,
**row,
)
-
+ if sample.renamed_fields != {}:
+ self.renamed_fields.update(sample.renamed_fields)
+ self.fields.update(sample.fields)
+
self.__samples.append(sample)
def contains_column(self, column_name):
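A rough picture of what the two new SampleSheet attributes hold once `build_samples()` has run (the "Tissue Source" column name is hypothetical):

    sheet.renamed_fields  # every non-standard column -> its cleaned attribute name, e.g. {'Tissue Source': 'Tissue_Source'}
    sheet.fields          # renamed_fields plus the standard columns mapped to themselves, e.g. {'Sentrix_ID': 'Sentrix_ID', ...}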
diff --git a/methylprep/models/samples.py b/methylprep/models/samples.py
index 63c7dc8..40c3eae 100644
--- a/methylprep/models/samples.py
+++ b/methylprep/models/samples.py
@@ -5,6 +5,9 @@
from glob import glob
LOGGER = logging.getLogger(__name__)
+REQUIRED = ['Sentrix_ID', 'Sentrix_Position', 'SentrixBarcode_A', 'SentrixPosition_A', 'Control',
+ 'Sample_Group', 'Sample_Name', 'Sample_Plate', 'Pool_ID', 'Sample_Well', 'GSM_ID',
+ 'Sample_Type', 'Sub_Type']
class Sample():
"""Object representing a row in a SampleSheet file
@@ -32,7 +35,22 @@ def __init__(self, data_dir, sentrix_id, sentrix_position, **addl_fields):
self.data_dir = data_dir
self.sentrix_id = sentrix_id
self.sentrix_position = sentrix_position
-
+ self.renamed_fields = {}
+
+ # Any OTHER sample_sheet columns are stored on the sample, keeping the original column name where possible.
+ # These flow into the meta_data pickle; columns that had to be renamed are tracked in a lookup (renamed_fields).
+ for field in addl_fields:
+ if field not in REQUIRED:
+ new_field_name = field.replace(' ','_')
+ if len(field) == 0:
+ continue
+ if field[0].isdigit():
+ new_field_name = field[1:]
+ if not field.isalnum(): # not purely letters/digits (spaces, punctuation, etc.): strip non-word characters
+ import re
+ new_field_name = re.sub(r'\W+', '', new_field_name)
+ setattr(self, new_field_name, addl_fields[field])
+ self.renamed_fields[field] = new_field_name
self.group = addl_fields.get('Sample_Group')
self.name = addl_fields.get('Sample_Name')
self.plate = addl_fields.get('Sample_Plate')
@@ -41,7 +59,22 @@ def __init__(self, data_dir, sentrix_id, sentrix_position, **addl_fields):
self.GSM_ID = addl_fields.get('GSM_ID') # for GEO published sample compatability
self.type = addl_fields.get('Sample_Type','Unknown') # from GEO MINiML meta data
self.sub_type = addl_fields.get('Sub_Type') # from GEO
- self.is_control = addl_fields.get('Control',False) # from GEO MINiML meta data
+ self.is_control = addl_fields.get('Control') in (1, '1', True, 'True', 'true', 'TRUE')
+ self.fields = {}
+ self.fields.update(self.renamed_fields)
+ self.fields.update({
+ 'Sentrix_ID': 'Sentrix_ID',
+ 'Sentrix_Position': 'Sentrix_Position', # these will be standardized here, regardless of sample_sheet variation names
+ 'Sample_Group': 'Sample_Group',
+ 'Sample_Name': 'Sample_Name',
+ 'Sample_Plate': 'Sample_Plate',
+ 'Sample_Type': 'Sample_Type',
+ 'Sub_Type': 'Sub_Type',
+ 'Sample_Well': 'Sample_Well',
+ 'Pool_ID': 'Pool_ID',
+ 'GSM_ID': 'GSM_ID',
+ 'Control': 'Control',
+ })
def __str__(self):
return f'{self.sentrix_id}_{self.sentrix_position}'
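The renaming loop above only has to produce a valid Python attribute name: spaces become underscores, a leading digit is dropped, and anything that is not plain letters/digits has its non-word characters stripped. A standalone approximation of those rules (not the patch itself; the example column names are made up):

    import re

    def clean_field(field):
        # approximates the renaming rules added to Sample.__init__ above
        new_field_name = field.replace(' ', '_')
        if field and field[0].isdigit():
            new_field_name = field[1:]
        if not field.isalnum():
            new_field_name = re.sub(r'\W+', '', new_field_name)
        return new_field_name

    clean_field('Tissue Source (2)')  # -> 'Tissue_Source_2'
    clean_field('Age')                # -> 'Age' (already a valid attribute name)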
diff --git a/methylprep/processing/pipeline.py b/methylprep/processing/pipeline.py
index 5c85cd7..921c591 100644
--- a/methylprep/processing/pipeline.py
+++ b/methylprep/processing/pipeline.py
@@ -55,7 +55,7 @@ def get_manifest(raw_datasets, array_type=None, manifest_filepath=None):
def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None,
sample_sheet_filepath=None, sample_name=None,
betas=False, m_value=False, make_sample_sheet=False, batch_size=None,
- save_uncorrected=False):
+ save_uncorrected=False, meta_data_frame=True):
"""The main CLI processing pipeline. This does every processing step and returns a data set.
Arguments:
@@ -110,6 +110,15 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
sample_sheet = get_sample_sheet(data_dir, filepath=sample_sheet_filepath)
samples = sample_sheet.get_samples()
+ if sample_sheet.renamed_fields != {}:
+ show_fields = []
+ for k,v in sample_sheet.renamed_fields.items():
+ if v != k:
+ show_fields.append(f"{k} --> {v}\n")
+ else:
+ show_fields.append(f"{k}\n")
+ LOGGER.info(f"Found {len(show_fields)} additional fields in sample_sheet: {''.join(show_fields)}")
+
batches = []
batch = []
sample_id_counter = 1
@@ -199,6 +208,46 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
continue
data_containers.extend(batch_data_containers)
+ if meta_data_frame:
+ # sample_sheet.fields is a complete mapping of original column names to their standardized/renamed versions
+ cols = list(sample_sheet.fields.values()) + ['Sample_ID']
+ meta_frame = pd.DataFrame(columns=cols)
+ field_classattr_lookup = {
+ 'Sentrix_ID': 'sentrix_id',
+ 'Sentrix_Position': 'sentrix_position',
+ 'Sample_Group': 'group',
+ 'Sample_Name': 'name',
+ 'Sample_Plate': 'plate',
+ 'Pool_ID': 'pool',
+ 'Sample_Well': 'well',
+ 'GSM_ID': 'GSM_ID',
+ 'Sample_Type': 'type',
+ 'Sub_Type': 'sub_type',
+ 'Control': 'is_control',
+ }
+ # row contains the renamed fields, and pulls in the original data from sample_sheet
+ for sample in samples:
+ row = {}
+ for field in sample_sheet.fields.keys():
+ if sample_sheet.fields[field] in field_classattr_lookup:
+ row[ sample_sheet.fields[field] ] = getattr(sample, field_classattr_lookup[sample_sheet.fields[field]] )
+ elif field in sample_sheet.renamed_fields:
+ row[ sample_sheet.fields[field] ] = getattr(sample, sample_sheet.renamed_fields[field])
+ else:
+ LOGGER.info(f"extra column: {field} ignored")
+ # row[ sample_sheet.fields[field] ] = getattr(sample, field)
+ # add the UID that matches m_value/beta value pickles
+ #... unless there's a GSM_ID too
+ # appears that methylprep m_value and beta files only include ID_Position as column names.
+ #if row.get('GSM_ID') != None:
+ # row['Sample_ID'] = f"{row['GSM_ID']}_{row['Sentrix_ID']}_{row['Sentrix_Position']}"
+ #else:
+ row['Sample_ID'] = f"{row['Sentrix_ID']}_{row['Sentrix_Position']}"
+ meta_frame = meta_frame.append(row, ignore_index=True)
+ meta_frame_filename = 'sample_sheet_meta_data.pkl'
+ meta_frame.to_pickle(meta_frame_filename)
+ LOGGER.info(f"[!] Exported meta_data to {meta_frame_filename}")
+
# batch processing done; consolidate and return data. This uses much more memory, but not called if in batch mode.
if batch_size and batch_size >= 200:
print("Because the batch size was >200 samples, files are saved but no data objects are returned.")
diff --git a/requirements.txt b/requirements.txt
index 734bc86..c734149 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ scipy
statsmodels
pytest
coverage
-coveralls-python
+python-coveralls
sphinxcontrib-apidoc
m2r
nbsphinx
diff --git a/setup.py b/setup.py
index ed6cc90..0000c2e 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
setup(
name='methylprep',
- version='1.1.9',
+ version='1.1.11',
description='Python-based Illumina methylation array preprocessing software',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',