
improved docs and fixed bugs for beta_bake
marcmaxson committed Sep 4, 2020
1 parent 32a9c1f commit 5aae024
Showing 3 changed files with 61 additions and 23 deletions.
39 changes: 34 additions & 5 deletions docs/example_download.md
@@ -1,17 +1,35 @@
# Public GEO datasets

`methylprep` provides methods to use public data in a variety of formats.
- idat (preferred format, as it contains everything saved from the array processing machine)
- processed tab delimited (`txt`)
- processed `csv`
- processed `xlsx`
- any GEO file whose name contains one or more of these keywords: `['matrix', 'processed', 'signals', 'intensities', 'normalized', 'intensity', 'raw_data', 'mean', 'average', 'beta']`
- pickled dataframes (`pkl`) created using `methylprep process` or `methylprep.run_pipeline`
  - format: the dataframe should have probe names along one dimension (rows or columns) and sample probe values along the other.
  - meta data: a dataframe of sample characteristics, so long as one of those characteristics (the sample name) matches the Sentrix_Position sample names that Illumina arrays output by default.
  - Note: if you use `methylprep.load_both` and pass in the folder location of your methylprep-processed data, it will construct the meta data frame for you by reading the GEO `miniml` format (XML) file for the public data set, or your samplesheet if you provided one. (See the sketch after this list.)
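
A minimal sketch of that workflow (assuming, per the note above, that `load_both` returns both the beta dataframe and the reconstructed meta data; the folder name is hypothetical):

```python
import methylprep

# point load_both at a folder of methylprep-processed output; it returns
# the betas plus a meta data dataframe built from the GEO MINiML (XML)
# file or your samplesheet. 'GSE74013/' is a hypothetical folder name.
betas, meta = methylprep.load_both('GSE74013/')
print(betas.shape)  # probes in one dimension, samples in the other
```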

### downloading from GEO
Version 1.3.0 and greater contains one all-encompassing pipeline function for downloading and parsing
NIH GEO methylation data sets in a variety of formats, returning a Python 3 pickled dataframe of probe beta values for each sample in a published batch. The internal workflow is messy, but all you need to supply is the
GEO accession number in the format `GSExxxxxx`, where `x` is some number.
You can also use `methylprep alert -k <keyword>` to produce a CSV of all the search results and GEO IDs
that you may want to feed into this function, as in the example below.
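
For example, a hypothetical keyword search (the keyword is illustrative; per the description above, this writes a CSV of matching GEO series and their IDs):

```Shell
(base) $ python -m methylprep alert -k "glioma"
```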

![pipeline](methylprep-geo-downloading-pipeline.png)

```Shell
(base) $ python -m methylprep beta_bake -i GSE74013 -d GSE74013
INFO:methylprep.download.miniml:Downloading GSE74013_family.xml.tgz
INFO:methylprep.download.miniml:MINiML file does not provide (Sentrix_id_R00C00) for 24/24 samples.
INFO:methylprep.download.miniml:Final samplesheet contains 24 rows and 9 columns
```
Output file containing a beta_values pickled dataframe: `GSE74013_beta_values.pkl.gz`
Output file containing meta data: `GSE74013_GPL13534_meta_data.pkl.gz`
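
Both outputs are ordinary pickled dataframes, so you can inspect them directly. A minimal sketch (file names as reported above; `pandas` decompresses `.gz` pickles automatically):

```python
import pandas as pd

# meta data for the samples in this GEO series
meta = pd.read_pickle('GSE74013_GPL13534_meta_data.pkl.gz')
print(meta.shape)  # expect 24 samples and 9 columns, per the log above
```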

You can also access parts of the beta_bakery pipeline individually.

```Shell
(base) $ python -m methylprep download -i GSE122126 -d GEO/GSE122126
...
ERROR:methylprep.download.process_data:[!] Geo data set GSE123211 probably does
ERROR:methylprep.download.process_data:Series failed to download successfully.
```

The beta bakery will download and convert GEO data for you. But if you prefer to download the author's processed data yourself, you can download the `.gz` file using a web browser, then `gunzip` it to create a `txt | pkl | xlsx | csv` file, and then load that using `methylprep.read_geo` or `methylcheck.read_geo`.

### loading processed GEO data

If you're using GEO data in its original format, there is a `read_geo()` function in the methyl-suite (available in both `methylprep` and `methylcheck`) that can
read most formats.

```python
import methylprep
import methylcheck
filepath = 'GSE123211_processed.csv'  # hypothetical downloaded file name
df = methylprep.read_geo(filepath)
methylcheck.beta_density_plot(df)
```

![Fig.19](tutorial_figs/fig19.png)

Alternatively, if you run `methylprep beta_bake` on the GEO series, the output files can be opened directly with
`pandas`, as they are standard pickled dataframes.

```python
import pandas as pd
import methylcheck
betas = pd.read_pickle('GSE74013/GSE74013_beta_values.pkl')
methylcheck.sample_plot(betas)
```
13 changes: 9 additions & 4 deletions methylprep/cli.py
@@ -327,7 +327,7 @@ def cli_beta_bakery(cmd_args):
        args.clean = False
    else:
        args.clean = True
-        delattr(args,'no_clean')
+    delattr(args,'no_clean')  # fix: always remove no_clean from args, not only in the else branch
    pipeline_find_betas_any_source(**vars(args))


@@ -373,19 +373,24 @@ def cli_download(cmd_args):
    )

    parser.add_argument(
-        '-c', '--clean',
+        '-n', '--no_clean',
        required=False,
-        action="store_false",
+        action="store_true",  # fix: with store_false, no_clean defaulted to True and the flag was inverted
        help='Leave processing and raw data files in folders. By default, these files are removed during processing.'
    )

    args = parser.parse_args(cmd_args)
+    if args.no_clean == True:
+        args.clean = False
+    else:
+        args.clean = True
+    delattr(args,'no_clean')

    if args.id:
        if args.batch_size:
-            run_series(args.id, args.data_dir, dict_only=args.dict_only, batch_size=args.batch_size, clean=args.no_clean)
+            run_series(args.id, args.data_dir, dict_only=args.dict_only, batch_size=args.batch_size, clean=args.clean)
        else:
-            run_series(args.id, args.data_dir, dict_only=args.dict_only, clean=args.no_clean)
+            run_series(args.id, args.data_dir, dict_only=args.dict_only, clean=args.clean)
    elif args.list:
        if args.batch_size:
            run_series_list(args.list, args.data_dir, dict_only=args.dict_only, batch_size=args.batch_size)
32 changes: 18 additions & 14 deletions methylprep/download/geo.py
@@ -377,26 +377,31 @@ def pipeline_find_betas_any_source(**kwargs):

    #3: run EACH geo series through downloader/processor
    each_geo_result = {}
+    zipfile_names = []
    for geo_id in geo_ids:
        # this downloads processed data, or idats, or -tbl-1.txt files inside the _family.xml.tgz files.
        # also downloads meta_data
+        # download_geo_processed() gets nothing if idats exist.
        result = download_geo_processed(geo_id, working, verbose=kwargs.get('verbose', False))
        if result['found_idats'] == False and result['processed_files'] == False and result['tbl_txt_files'] == False:
            LOGGER.warning(f"No downloadable methylation data found for {geo_id}.")
            continue
        # result dict tells this function what it found.
        if result['tbl_txt_files'] == True:
-            LOGGER.info(f"FYI it found -tbl-1.txt files with meta data")
-        if result['found_idats'] == False:
-            continue
-        try:
-            # dict_only: should download without processing idats.
-            methylprep.download.process_data.run_series(geo_id,
-                working.name,
-                dict_only=True,
-                batch_size=BATCH_SIZE,
-                clean=True,
-                abort_if_no_idats=True)
-        except Exception as e:
-            LOGGER.error(f"run_series ERROR: {e}")
+            LOGGER.info(f"Found -tbl-1.txt files with meta data for {geo_id}.")
+        if result['processed_files'] == True:  # fix: was result['tbl_txt_files'], repeating the check above
+            LOGGER.info(f"Found processed csv files for {geo_id}.")
+        if result['found_idats'] == True:
+            try:
+                # dict_only: should download without processing idats.
+                methylprep.download.process_data.run_series(geo_id,
+                    working.name,
+                    dict_only=True,
+                    batch_size=BATCH_SIZE,
+                    clean=True,
+                    abort_if_no_idats=True)
+            except Exception as e:
+                LOGGER.error(f"run_series ERROR: {e}")
        each_geo_result[geo_id] = result

    #4: memory check / debug
@@ -407,7 +412,6 @@
    #LOGGER.info(f"Downloaded and extracted: {geo_ids}. Zipping and moving to S3 for processing in pipeline now.")
-    zipfile_names = [] # one for each GSExxx, or one per pkl file found that was too big and used gzip.


    #5: compress and package files
    zipfile_name = f"{geo_id}.zip"
    outfile_path = Path(working.name, zipfile_name)
