Add docs for Finder module
leonelluiscorado committed Feb 15, 2024
1 parent e03363c commit 6e79ac1
Showing 1 changed file with 56 additions and 22 deletions.
78 changes: 56 additions & 22 deletions pipeline/gedi_finder.py
@@ -2,7 +2,31 @@
import requests as r
from datetime import datetime

+ # Set up dictionary where key is GEDI shortname + version
+ concept_ids = {
+     'GEDI01_B.002': 'C1908344278-LPDAAC_ECS',
+     'GEDI02_A.002': 'C1908348134-LPDAAC_ECS',
+     'GEDI02_B.002': 'C1908350066-LPDAAC_ECS',
+     'GEDI04_A.002': 'C2237824918-ORNL_CLOUD'
+ }
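Each of these concept IDs pins a product and version to its CMR collection. As a quick illustration (not part of the module), an ID can be resolved against CMR's collection search endpoint; a minimal sketch, assuming network access:

import requests as r

resp = r.get("https://cmr.earthdata.nasa.gov/search/collections.json",
             params={"concept_id": "C1908348134-LPDAAC_ECS"})
print(resp.json()['feed']['entry'][0]['title'])  # title of the GEDI02_A.002 collection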

class GEDIFinder:
"""
The Finder :class: exports all the available URLs to download GEDI Data that passes over a given ROI and timestamp.
Args:
product: GEDI Product (without version). Products available are {'GEDI01_B'; 'GEDI02_A'; 'GEDI02_B'; 'GEDI04_A'}
version: Version of the desired GEDI Product. There are only two available versions 001 and 002
date_start: Starting datetime to search for GEDI Data. Must be in format YEAR.month.day (e.g 2020.04.01)
date_end: End datetime to search for GEDI Data. Must be in format YEAR.month.day (e.g 2020.12.31)
roi: Region of Interest to search for granules. Coordinates must be in WG84 EPSG:4326 and organized as follows: [UL_Lat, UL_Lon, LR_Lat, LR_Lon]
Example usage:
finder = GEDIFinder(product='GEDI04_A', version='002', date_start='2021.01.01', date_end='2021.12.31', roi=[])
granules = finder.find(save_file=False)
granules
>>> ["URL1", "URL2", "URL3", ...]
"""

def __init__(self, product='GEDI02_A', version='002', date_start='', date_end='', roi=None):

@@ -17,30 +41,27 @@ def __init__(self, product='GEDI02_A', version='002', date_start='', date_end=''
print("Dates provided not valid. Valid format is \"Y.m.d\" (e.g. 2019.01.01).")

if roi is not None:
- # GEDIFinder expects bbox to be (LL_lon, LL_lat, UR_lon, UR_lat)
+ # GEDI Finder expects bbox to be (LL_lon, LL_lat, UR_lon, UR_lat)
[ul_lat, ul_lon, lr_lat, lr_lon] = roi
self.roi = " ".join(map(str, [ul_lon, lr_lat, lr_lon, ul_lat]))
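A worked example of the reordering above, with a hypothetical ROI: the UL/LR corners become the (LL_lon, LL_lat, UR_lon, UR_lat) string that GEDI Finder expects.

roi = [1.0, -54.0, -1.0, -52.0]   # hypothetical [UL_Lat, UL_Lon, LR_Lat, LR_Lon]
ul_lat, ul_lon, lr_lat, lr_lon = roi
print(" ".join(map(str, [ul_lon, lr_lat, lr_lon, ul_lat])))
# -> "-54.0 -1.0 -52.0 1.0", i.e. LL_lon LL_lat UR_lon UR_lat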


- def _find_all_granules(self):
+ def __find_all_granules(self):
"""
- Requests all the links and download sizes for each granule found over the ROI provided.
- Based on the GEDI Data Resources github repository by :
+ Requests all the links and download sizes for each granule found over the provided ROI.
"""

- # Define the base CMR granule search url, including LPDAAC provider name and max page size (2000 is the max allowed)
- cmr = "https://cmr.earthdata.nasa.gov/search/granules.json?pretty=true&provider=LPDAAC_ECS&page_size=2000&concept_id="

- # Set up dictionary where key is GEDI shortname + version
- concept_ids = {'GEDI01_B.002': 'C1908344278-LPDAAC_ECS',
-                'GEDI02_A.002': 'C1908348134-LPDAAC_ECS',
-                'GEDI02_B.002': 'C1908350066-LPDAAC_ECS'}

# CMR uses pagination for queries with more features returned than the page size
page = 1
- bbox = self.roi.replace(' ', ',') # remove any white spaces
+ bbox = self.roi.replace(' ', ',') # Replace spaces with commas for the CMR query
product = self.product+"."+self.version

+ # The provider name follows the "-" in its concept_id
+ provider = concept_ids[product].split("-")[-1]

+ # Define the base CMR granule search URL, including the matching provider name and max page size (2000 is the max allowed)
+ cmr = f"https://cmr.earthdata.nasa.gov/search/granules.json?pretty=true&provider={provider}&page_size=2000&concept_id="

try:
# Send GET request to CMR granule search endpoint w/ product concept ID, bbox & page number, format return as json
cmr_response = r.get(f"{cmr}{concept_ids[product]}&bounding_box={bbox}&pageNum={page}").json()['feed']['entry']
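The collapsed lines presumably accumulate results across pages. A hedged sketch of that CMR pagination pattern, reusing the names defined in this method and assuming each entry carries a link under 'links' and a 'granule_size' in MB (matching the [link, size] pairs consumed below):

granules, page = [], 1
while True:
    entries = r.get(f"{cmr}{concept_ids[product]}&bounding_box={bbox}&pageNum={page}").json()['feed']['entry']
    granules += [[e['links'][0]['href'], e['granule_size']] for e in entries]
    if len(entries) < 2000:   # a short page means there are no further pages
        break
    page += 1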
@@ -58,16 +79,18 @@ def _find_all_granules(self):
exit(0)


- def _date_filter(self, granules):
+ def __date_filter(self, granules):
"""
- GEDI Finder, by default, finds all the granules that pass over ROI.
+ The finder outputs all the granules that pass over the ROI by default.
This function (date_filter) filters the desired granules by the dates provided.
"""
filter_g = []

for g in granules:
- # Date of granule is at the 7th section on CMR website
- date_sec = datetime.strptime(g[0].split("/")[7], "%Y.%m.%d")
+ # Date of granule is in the filename, encoded as a Julian date YYYYDDD (e.g. 2020348)
+ granule_name = g[0].split("/")[-1]
+ date_sec = granule_name.split("_")[2][0:7]
+ date_sec = datetime.strptime(date_sec, "%Y%j")

# Stop the search once the granule date passes end_date
if date_sec > self.date_end:
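To make the Julian-date parsing concrete, a worked example with a made-up granule URL (only the filename layout matters here):

from datetime import datetime

url = ".../GEDI02_A_2020348000000_O11222_01_T08392_02_003_01_V002.h5"  # illustrative
name = url.split("/")[-1]                # filename is the last path segment
stamp = name.split("_")[2][0:7]          # "2020348" -> year 2020, day-of-year 348
print(datetime.strptime(stamp, "%Y%j"))  # 2020-12-13 00:00:00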
@@ -79,26 +102,37 @@
return filter_g


- def _check_download_size(self, link_list):
+ def __check_download_size(self, link_list):
"""
Converts MB to GB and returns the total download size of all the links in *link_list*
"""
return sum(float(l[1]) for l in link_list) / 1000
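For instance, with hypothetical sizes in MB as returned by CMR, three 512 MB granules total roughly 1.5 GB:

links = [["url_a", "512.0"], ["url_b", "512.0"], ["url_c", "512.0"]]  # [link, size-in-MB] pairs
print(sum(float(l[1]) for l in links) / 1000)                         # 1.536 (GB)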


- def find(self, output_filepath, save_file=True):
+ def find(self, save_file=True, output_filepath=None) -> list:
"""
Executes the finding algorithm.
Args:
save_file: If true, saves all the download URLs to a file.
output_filepath: Filepath to URLs file.
Returns:
a list with all the date filtered granule links for download
"""

- all_granules = self._find_all_granules()
+ all_granules = self.__find_all_granules()

print(f"[Finder] Found {len(all_granules)} granules over bbox [{self.roi}]")

- granules_date_filtered = self._date_filter(all_granules)
+ granules_date_filtered = self.__date_filter(all_granules)

print(f"[Finder] Between dates ({self.date_start}) and ({self.date_end}) exist {len(granules_date_filtered)} granules over bbox [{self.roi}]")
print(f"[Finder] Estimated download size for select granules : {self._check_download_size(granules_date_filtered):.2f} GB")
print(f"[Finder] Estimated download size for select granules : {self.__check_download_size(granules_date_filtered):.2f} GB")

if save_file:
# Save txt file with all the links found
filename = f"{self.product.replace('.', '_')}_GranuleList_{datetime.now().strftime('%Y%m%d%H%M%S')}.txt"
+ output_filepath = output_filepath if output_filepath is not None else ""
# Open file and write each granule link on a new line
with open(os.path.join(output_filepath, filename), "w") as gf:
for g in granules_date_filtered:
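Putting it together, a hedged end-to-end sketch (hypothetical ROI; assumes the module is importable as pipeline.gedi_finder, per the file path in this commit):

from pipeline.gedi_finder import GEDIFinder

finder = GEDIFinder(product='GEDI02_A', version='002',
                    date_start='2020.01.01', date_end='2020.12.31',
                    roi=[1.0, -54.0, -1.0, -52.0])  # [UL_Lat, UL_Lon, LR_Lat, LR_Lon]
links = finder.find(save_file=True, output_filepath=".")
print(f"{len(links)} granule links found")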
