Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/composite #39

Merged
merged 15 commits into from
Jan 16, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
v1.2.1 progress bar
  • Loading branch information
marcmaxson committed Jan 7, 2020
commit 3062fa5baddb0092a885ed51e85beacd0d0d181d
11 changes: 8 additions & 3 deletions methylprep/processing/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from ..utils.progress_bar import * # checks environment and imports tqdm appropriately.
from collections import Counter
from pathlib import Path
# App
Expand All @@ -19,8 +19,8 @@
from .preprocess import preprocess_noob
from .raw_dataset import get_raw_datasets

__all__ = ['SampleDataContainer', 'get_manifest', 'run_pipeline', 'consolidate_values_for_sheet']

__all__ = ['SampleDataContainer', 'get_manifest', 'run_pipeline', 'consolidate_values_for_sheet']

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -170,7 +170,7 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None

batch_data_containers = []
export_paths = set() # inform CLI user where to look
for raw_dataset in tqdm(raw_datasets):
for raw_dataset in tqdm(raw_datasets, total=len(raw_datasets), desc="Processing samples"):
data_container = SampleDataContainer(
raw_dataset=raw_dataset,
manifest=manifest,
Expand All @@ -185,6 +185,11 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
data_container.export(output_path)
export_paths.add(output_path)

print(f"obj container: {pympler.asizeof.basicsize(data_container)}")
print(f"obj batch container: {pympler.asizeof.basicsize(batch_data_containers)}")

print('[finished SampleDataContainer processing]')

if betas:
df = consolidate_values_for_sheet(batch_data_containers, postprocess_func_colname='beta_value')
if not batch_size:
Expand Down
6 changes: 3 additions & 3 deletions methylprep/processing/raw_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
)
from ..files import IdatDataset
from ..utils import inner_join_data

from ..utils.progress_bar import * # checks environment and imports tqdm appropriately.

__all__ = ['RawDataset', 'RawMetaDataset', 'get_raw_datasets', 'get_raw_meta_datasets', 'get_array_type']

Expand Down Expand Up @@ -57,10 +57,10 @@ def get_raw_datasets(sample_sheet, sample_name=None, from_s3=None, meta_only=Fal
elif from_s3 and not meta_only:
parser = RawDataset.from_sample_s3
zip_reader = from_s3
raw_datasets = [parser(zip_reader, sample) for sample in samples]
raw_datasets = tqdm([parser(zip_reader, sample) for sample in samples], total=len(samples), desc='Getting raw datasets')
elif not from_s3 and not meta_only:
parser = RawDataset.from_sample
raw_datasets = [parser(sample) for sample in samples]
raw_datasets = tqdm([parser(sample) for sample in samples], total=len(samples), desc='Getting raw datasets')

if not meta_only:
# ensure all idat files have same number of probes
Expand Down
10 changes: 10 additions & 0 deletions methylprep/utils/progress_bar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
def in_notebook():
    """Return True when running inside a Jupyter notebook kernel, else False.

    Used to pick the appropriate tqdm progress-bar flavor (widget vs. text).

    Note: the previous check, ``hasattr(__builtins__, '__IPYTHON__')``, was
    unreliable — in an imported module ``__builtins__`` is a dict, so the
    attribute test always failed; and ``__IPYTHON__`` is also set by the
    terminal IPython REPL, which should NOT get the notebook widget bar.
    """
    try:
        # get_ipython is injected into builtins by IPython; absent in plain Python.
        shell = get_ipython().__class__.__name__
    except NameError:
        return False  # standard Python interpreter
    # ZMQInteractiveShell == Jupyter notebook / qtconsole kernel;
    # TerminalInteractiveShell == IPython in a terminal (use the text bar).
    return shell == 'ZMQInteractiveShell'

# Select the tqdm flavor once at import time so callers can simply
# `from ..utils.progress_bar import *` and use `tqdm` transparently.
if in_notebook():
    # Jupyter kernel: use the HTML/widget-based progress bar.
    from tqdm import tqdm_notebook as tqdm
else:
    # Plain console (or terminal IPython): use the text progress bar.
    from tqdm import tqdm