diff --git a/.circleci/config.yml b/.circleci/config.yml
index 793b831..4d7a308 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -12,8 +12,10 @@ jobs:
           # key: deps9-{{ .Branch }}-{{ checksum "Pipfile.lock" }}
          key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }}
       - run:
-          command: |
-            pip install pipenv
+          command: |
+            pip3 install pipenv
+      - run:
+          command: |
             pipenv install --dev --skip-lock # --ignore-pipfile
             pipenv install -e . --skip-lock # pytest recommends this: install your package in "editable" mode by running this from the same directory
             # pipenv will use requirements.txt with these params.
diff --git a/index.rst b/index.rst
index 1550070..9f609f6 100644
--- a/index.rst
+++ b/index.rst
@@ -15,10 +15,12 @@ Getting Started
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
-
+
    docs/methylprep_tutorial.md
-   docs/cli.md
+   docs/cli.md
    docs/source/modules
+   methylcheck (QC) package
+   methylize (analysis) package
 
 Indices and tables
 ==================
diff --git a/methylprep/cli.py b/methylprep/cli.py
index 11102a9..f2742cc 100644
--- a/methylprep/cli.py
+++ b/methylprep/cli.py
@@ -169,14 +169,6 @@ def cli_process(cmd_args):
         help='Sample(s) to process. You can pass multiple sample names with multiple -n params.',
     )
 
-    parser.add_argument(
-        '-e', '--no_export',
-        required=False,
-        action='store_false', # if -e passed, this suppresses data export (if running as part of pipeline or something)
-        default=True, # if False, CLI returns nothing.
-        help='Default is to export data to csv in same folder where IDAT file resides. Pass in --no_export to suppress this.',
-    )
-
     parser.add_argument(
         '-b', '--betas',
         required=False,
@@ -186,7 +178,7 @@ def cli_process(cmd_args):
     )
 
     parser.add_argument(
-        '--m_value',
+        '-v', '--m_value',
         required=False,
         action='store_true',
         default=False,
@@ -201,13 +193,29 @@ def cli_process(cmd_args):
     )
 
     parser.add_argument(
-        '--uncorrected',
+        '-u', '--uncorrected',
         required=False,
         action='store_true',
         default=False,
         help='If specified, processed csv will contain two additional columns (meth and unmeth) that have not been NOOB corrected.'
     )
 
+    parser.add_argument(
+        '-e', '--no_export',
+        required=False,
+        action='store_false', # if -e passed, this suppresses data export
+        default=True, # if False, CLI returns nothing. will set export=True
+        help='Default is to export data to csv in same folder where IDAT file resides. Pass in --no_export to suppress this.',
+    )
+
+    parser.add_argument(
+        '-x', '--no_meta_export',
+        required=False,
+        action='store_false', # if -x passed, this suppresses meta data export
+        default=True, # will set meta_data_frame == True
+        help='Default is to convert the sample sheet into a pickled DataFrame, recognized in methylcheck and methylize. Pass in --no_meta_export to suppress this.',
+    )
+
     args = parser.parse_args(cmd_args)
 
     array_type = args.array_type
@@ -221,7 +229,6 @@ def cli_process(cmd_args):
     run_pipeline(
         args.data_dir,
         array_type=args.array_type,
-        export=args.no_export,
         manifest_filepath=args.manifest,
         sample_sheet_filepath=args.sample_sheet,
         sample_name=args.sample_name,
@@ -230,6 +237,8 @@ def cli_process(cmd_args):
         m_value=args.m_value,
         batch_size=args.batch_size,
         save_uncorrected=args.uncorrected,
+        export=args.no_export, # flag flips here
+        meta_data_frame=args.no_meta_export, # flag flips here
     )
 
 def cli_download(cmd_args):
diff --git a/methylprep/files/sample_sheets.py b/methylprep/files/sample_sheets.py
index 20ef120..7c00d27 100644
--- a/methylprep/files/sample_sheets.py
+++ b/methylprep/files/sample_sheets.py
@@ -253,6 +253,8 @@ class SampleSheet():
 
     def __init__(self, filepath_or_buffer, data_dir):
         self.__samples = []
+        self.fields = {}
+        self.renamed_fields = {}
 
         self.data_dir = data_dir
         self.headers = []
@@ -345,7 +347,10 @@ def build_samples(self):
                 sentrix_position=sentrix_position,
                 **row,
             )
-
+            if sample.renamed_fields != {}:
+                self.renamed_fields.update(sample.renamed_fields)
+            self.fields.update(sample.fields)
+
             self.__samples.append(sample)
 
     def contains_column(self, column_name):
diff --git a/methylprep/models/samples.py b/methylprep/models/samples.py
index 63c7dc8..40c3eae 100644
--- a/methylprep/models/samples.py
+++ b/methylprep/models/samples.py
@@ -5,6 +5,9 @@ from glob import glob
 
 LOGGER = logging.getLogger(__name__)
 
+REQUIRED = ['Sentrix_ID', 'Sentrix_Position', 'SentrixBarcode_A', 'SentrixPosition_A', 'Control',
+            'Sample_Group', 'Sample_Name', 'Sample_Plate', 'Pool_ID', 'Sample_Well', 'GSM_ID',
+            'Sample_Type', 'Sub_Type']
 
 class Sample():
     """Object representing a row in a SampleSheet file
@@ -32,7 +35,22 @@ def __init__(self, data_dir, sentrix_id, sentrix_position, **addl_fields):
         self.data_dir = data_dir
         self.sentrix_id = sentrix_id
         self.sentrix_position = sentrix_position
-
+        self.renamed_fields = {}
+
+        # any OTHER sample_sheet columns are passed in exactly as they appear, if possible, and if column names exist.
+        # these will pass into the meta_data pkl created, and any renamed fields must be noted in a lookup.
+        for field in addl_fields:
+            if field not in REQUIRED:
+                new_field_name = field.replace(' ','_')
+                if len(field) == 0:
+                    continue
+                if field[0].isdigit():
+                    new_field_name = field[1:]
+                if not field.isalnum(): # letters or numbers, or caps. no spaces or unicode
+                    import re
+                    new_field_name = re.sub(r'\W+', '', new_field_name)
+                setattr(self, new_field_name, addl_fields[field])
+                self.renamed_fields[field] = new_field_name
         self.group = addl_fields.get('Sample_Group')
         self.name = addl_fields.get('Sample_Name')
         self.plate = addl_fields.get('Sample_Plate')
@@ -41,7 +59,22 @@ def __init__(self, data_dir, sentrix_id, sentrix_position, **addl_fields):
         self.GSM_ID = addl_fields.get('GSM_ID') # for GEO published sample compatability
         self.type = addl_fields.get('Sample_Type','Unknown') # from GEO MINiML meta data
         self.sub_type = addl_fields.get('Sub_Type') # from GEO
-        self.is_control = addl_fields.get('Control',False) # from GEO MINiML meta data
+        self.is_control = True if addl_fields.get('Control') in (1,'1',True, 'True', 'true', 'TRUE') else False
+        self.fields = {}
+        self.fields.update(self.renamed_fields)
+        self.fields.update({
+            'Sentrix_ID': 'Sentrix_ID',
+            'Sentrix_Position': 'Sentrix_Position', # these will be standardized here, regardless of sample_sheet variation names
+            'Sample_Group': 'Sample_Group',
+            'Sample_Name': 'Sample_Name',
+            'Sample_Plate': 'Sample_Plate',
+            'Sample_Type': 'Sample_Type',
+            'Sub_Type': 'Sub_Type',
+            'Sample_Well': 'Sample_Well',
+            'Pool_ID': 'Pool_ID',
+            'GSM_ID': 'GSM_ID',
+            'Control': 'Control',
+        })
 
     def __str__(self):
         return f'{self.sentrix_id}_{self.sentrix_position}'
diff --git a/methylprep/processing/pipeline.py b/methylprep/processing/pipeline.py
index 5c85cd7..921c591 100644
--- a/methylprep/processing/pipeline.py
+++ b/methylprep/processing/pipeline.py
@@ -55,7 +55,7 @@ def get_manifest(raw_datasets, array_type=None, manifest_filepath=None):
 def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None,
                  sample_sheet_filepath=None, sample_name=None,
                  betas=False, m_value=False, make_sample_sheet=False, batch_size=None,
-                 save_uncorrected=False):
+                 save_uncorrected=False, meta_data_frame=True):
     """The main CLI processing pipeline. This does every processing step and returns a data set.
 
     Arguments:
@@ -110,6 +110,15 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None,
     sample_sheet = get_sample_sheet(data_dir, filepath=sample_sheet_filepath)
 
     samples = sample_sheet.get_samples()
+    if sample_sheet.renamed_fields != {}:
+        show_fields = []
+        for k,v in sample_sheet.renamed_fields.items():
+            if v != k:
+                show_fields.append(f"{k} --> {v}\n")
+            else:
+                show_fields.append(f"{k}\n")
+        LOGGER.info(f"Found {len(show_fields)} additional fields in sample_sheet: {''.join(show_fields)}")
+
     batches = []
     batch = []
     sample_id_counter = 1
@@ -199,6 +208,46 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None,
             continue
 
         data_containers.extend(batch_data_containers)
+    if meta_data_frame == True:
+        #sample_sheet.fields is a complete mapping of original and renamed_fields
+        cols = list(sample_sheet.fields.values()) + ['Sample_ID']
+        meta_frame = pd.DataFrame(columns=cols)
+        field_classattr_lookup = {
+            'Sentrix_ID': 'sentrix_id',
+            'Sentrix_Position': 'sentrix_position',
+            'Sample_Group': 'group',
+            'Sample_Name': 'name',
+            'Sample_Plate': 'plate',
+            'Pool_ID': 'pool',
+            'Sample_Well': 'well',
+            'GSM_ID': 'GSM_ID',
+            'Sample_Type': 'type',
+            'Sub_Type': 'sub_type',
+            'Control': 'is_control',
+        }
+        # row contains the renamed fields, and pulls in the original data from sample_sheet
+        for sample in samples:
+            row = {}
+            for field in sample_sheet.fields.keys():
+                if sample_sheet.fields[field] in field_classattr_lookup:
+                    row[ sample_sheet.fields[field] ] = getattr(sample, field_classattr_lookup[sample_sheet.fields[field]] )
+                elif field in sample_sheet.renamed_fields:
+                    row[ sample_sheet.fields[field] ] = getattr(sample, sample_sheet.renamed_fields[field])
+                else:
+                    LOGGER.info(f"extra column: {field} ignored")
+                    # row[ sample_sheet.fields[field] ] = getattr(sample, field)
+            # add the UID that matches m_value/beta value pickles
+            #... unless there's a GSM_ID too
+            # appears that methylprep m_value and beta files only include ID_Position as column names.
+            #if row.get('GSM_ID') != None:
+            #    row['Sample_ID'] = f"{row['GSM_ID']}_{row['Sentrix_ID']}_{row['Sentrix_Position']}"
+            #else:
+            row['Sample_ID'] = f"{row['Sentrix_ID']}_{row['Sentrix_Position']}"
+            meta_frame = meta_frame.append(row, ignore_index=True)
+        meta_frame_filename = f'sample_sheet_meta_data.pkl'
+        meta_frame.to_pickle(meta_frame_filename)
+        LOGGER.info(f"[!] Exported meta_data to {meta_frame_filename}")
+
     # batch processing done; consolidate and return data. This uses much more memory, but not called if in batch mode.
     if batch_size and batch_size >= 200:
         print("Because the batch size was >200 samples, files are saved but no data objects are returned.")
diff --git a/requirements.txt b/requirements.txt
index 734bc86..c734149 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ scipy
 statsmodels
 pytest
 coverage
-coveralls-python
+python-coveralls
 sphinxcontrib-apidoc
 m2r
 nbsphinx
diff --git a/setup.py b/setup.py
index ed6cc90..0000c2e 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='methylprep',
-    version='1.1.9',
+    version='1.1.11',
     description='Python-based Illumina methylation array preprocessing software',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',
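
A minimal usage sketch of the options this patch introduces, shown for context rather than as part of the diff: run_pipeline() is imported from methylprep.processing.pipeline as patched above, the data directory name below is hypothetical, and the CLI line assumes the package's existing `process` sub-command and data_dir argument, which are defined outside the hunks shown here.

# Hypothetical example exercising the new meta_data_frame option and export flags from this patch.
from methylprep.processing.pipeline import run_pipeline

result = run_pipeline(
    'example_idats/',        # hypothetical folder containing IDAT files and a sample sheet
    betas=True,              # also writes beta_values.pkl
    export=True,             # per-sample CSVs; the -e/--no_export CLI flag sets this to False
    meta_data_frame=True,    # default: writes sample_sheet_meta_data.pkl for methylcheck/methylize
)
# `result` holds the processed data (containers or a DataFrame, depending on the flags used).

# Rough CLI equivalent (assuming the existing `process` sub-command and -d/--data_dir argument):
#   python -m methylprep process -d example_idats --betas
# Adding -x / --no_meta_export would skip writing sample_sheet_meta_data.pkl.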