Removing Pipenv and adding Yapf.
igorborgest committed Jul 5, 2019
1 parent dd6deda commit 136828a
Showing 45 changed files with 457 additions and 1,002 deletions.
2 changes: 1 addition & 1 deletion .flake8
@@ -1,2 +1,2 @@
[flake8]
ignore = E501,E203,W503
ignore = E501,E126,W503
7 changes: 5 additions & 2 deletions .gitignore
@@ -130,8 +130,11 @@ output/
# Development
dev/
metrics/
python/

# SAM
.aws-sam
*parameters.json
*requirements*.txt
testing/*parameters.json
testing/*requirements*.txt
building/*parameters.json
building/*requirements*.txt
25 changes: 0 additions & 25 deletions Pipfile

This file was deleted.

510 changes: 0 additions & 510 deletions Pipfile.lock

This file was deleted.

4 changes: 3 additions & 1 deletion awswrangler/__init__.py
@@ -1,4 +1,5 @@
import logging
import importlib

from awswrangler.__version__ import __title__, __description__, __version__ # noqa
from awswrangler.session import Session # noqa
@@ -7,8 +8,9 @@
from awswrangler.athena import Athena # noqa
from awswrangler.glue import Glue # noqa
from awswrangler.redshift import Redshift # noqa
from awswrangler.spark import Spark # noqa
import awswrangler.utils # noqa

if importlib.util.find_spec("pyspark"):
from awswrangler.spark import Spark # noqa

logging.getLogger("awswrangler").addHandler(logging.NullHandler())
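Note: with this change, Spark is only re-exported when pyspark is importable, so the core package no longer hard-depends on it. Below is a minimal sketch of the same optional-import guard on its own (the fallback handling is illustrative, not taken from the commit); the sketch imports importlib.util explicitly, since the util submodule is not guaranteed to be exposed just by importing the top-level importlib package.

```python
import importlib.util  # find_spec lives in importlib.util; import it explicitly
import logging

LOGGER = logging.getLogger(__name__)

# Probe for the optional dependency without importing it.
if importlib.util.find_spec("pyspark") is not None:
    from pyspark.sql import SparkSession  # safe: pyspark is installed
else:
    SparkSession = None  # callers must check for None before using Spark features
    LOGGER.debug("pyspark not installed; Spark features are disabled")
```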
2 changes: 1 addition & 1 deletion awswrangler/__version__.py
@@ -1,4 +1,4 @@
__title__ = "awswrangler"
__description__ = "Utilities for Pandas and Apache Spark on AWS."
__version__ = "0.0b14"
__version__ = "0.0b16"
__license__ = "Apache License 2.0"
19 changes: 8 additions & 11 deletions awswrangler/athena.py
@@ -1,7 +1,6 @@
from time import sleep
import logging


LOGGER = logging.getLogger(__name__)

QUERY_WAIT_POLLING_DELAY = 0.2 # MILLISECONDS
@@ -13,8 +12,7 @@ def __init__(self, session):

def run_query(self, query, database, s3_output):
client = self._session.boto3_session.client(
service_name="athena", config=self._session.botocore_config
)
service_name="athena", config=self._session.botocore_config)
response = client.start_query_execution(
QueryString=query,
QueryExecutionContext={"Database": database},
@@ -24,14 +22,13 @@ def run_query(self, query, database, s3_output):

def wait_query(self, query_execution_id):
client = self._session.boto3_session.client(
service_name="athena", config=self._session.botocore_config
)
service_name="athena", config=self._session.botocore_config)
final_states = ["FAILED", "SUCCEEDED", "CANCELLED"]
response = client.get_query_execution(QueryExecutionId=query_execution_id)
while (
response.get("QueryExecution").get("Status").get("State")
not in final_states
):
response = client.get_query_execution(
QueryExecutionId=query_execution_id)
while (response.get("QueryExecution").get("Status").get("State") not in
final_states):
sleep(QUERY_WAIT_POLLING_DELAY)
response = client.get_query_execution(QueryExecutionId=query_execution_id)
response = client.get_query_execution(
QueryExecutionId=query_execution_id)
return response
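For context, run_query and wait_query implement a plain submit-then-poll flow: start_query_execution returns an execution id, and get_query_execution is polled until the state reaches FAILED, SUCCEEDED, or CANCELLED. A condensed, self-contained sketch of that flow is below; the database name and S3 output location are placeholders, and since time.sleep takes seconds, a 0.2 delay amounts to a 200 ms polling interval.

```python
import time

import boto3

FINAL_STATES = {"FAILED", "SUCCEEDED", "CANCELLED"}
POLL_DELAY_SECONDS = 0.2  # 200 ms between get_query_execution calls

client = boto3.client("athena")

# Submit the query; Athena writes result files under the given S3 prefix.
query_execution_id = client.start_query_execution(
    QueryString="SELECT 1",
    QueryExecutionContext={"Database": "my_database"},  # placeholder database
    ResultConfiguration={"OutputLocation": "s3://my-bucket/athena-results/"},  # placeholder bucket
)["QueryExecutionId"]

# Poll until the query reaches a terminal state.
state = None
while state not in FINAL_STATES:
    time.sleep(POLL_DELAY_SECONDS)
    response = client.get_query_execution(QueryExecutionId=query_execution_id)
    state = response["QueryExecution"]["Status"]["State"]

print(query_execution_id, state)
```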
153 changes: 92 additions & 61 deletions awswrangler/glue.py
@@ -4,7 +4,6 @@

from awswrangler.exceptions import UnsupportedType, UnsupportedFileFormat


LOGGER = logging.getLogger(__name__)


@@ -13,16 +12,16 @@ def __init__(self, session):
self._session = session

def metadata_to_glue(
self,
dataframe,
path,
objects_paths,
file_format,
database=None,
table=None,
partition_cols=None,
preserve_index=True,
mode="append",
self,
dataframe,
path,
objects_paths,
file_format,
database=None,
table=None,
partition_cols=None,
preserve_index=True,
mode="append",
):
schema = Glue._build_schema(
dataframe=dataframe,
@@ -44,8 +43,7 @@ def metadata_to_glue(
)
if partition_cols:
partitions_tuples = Glue._parse_partitions_tuples(
objects_paths=objects_paths, partition_cols=partition_cols
)
objects_paths=objects_paths, partition_cols=partition_cols)
self.add_partitions(
database=database,
table=table,
@@ -55,43 +53,43 @@ def metadata_to_glue(

def delete_table_if_exists(self, database, table):
client = self._session.boto3_session.client(
service_name="glue", config=self._session.botocore_config
)
service_name="glue", config=self._session.botocore_config)
try:
client.delete_table(DatabaseName=database, Name=table)
except client.exceptions.EntityNotFoundException:
pass

def does_table_exists(self, database, table):
client = self._session.boto3_session.client(
service_name="glue", config=self._session.botocore_config
)
service_name="glue", config=self._session.botocore_config)
try:
client.get_table(DatabaseName=database, Name=table)
return True
except client.exceptions.EntityNotFoundException:
return False

def create_table(
self, database, table, schema, path, file_format, partition_cols=None
):
def create_table(self,
database,
table,
schema,
path,
file_format,
partition_cols=None):
client = self._session.boto3_session.client(
service_name="glue", config=self._session.botocore_config
)
service_name="glue", config=self._session.botocore_config)
if file_format == "parquet":
table_input = Glue.parquet_table_definition(
table, partition_cols, schema, path
)
table, partition_cols, schema, path)
elif file_format == "csv":
table_input = Glue.csv_table_definition(table, partition_cols, schema, path)
table_input = Glue.csv_table_definition(table, partition_cols,
schema, path)
else:
raise UnsupportedFileFormat(file_format)
client.create_table(DatabaseName=database, TableInput=table_input)

def add_partitions(self, database, table, partition_paths, file_format):
client = self._session.boto3_session.client(
service_name="glue", config=self._session.botocore_config
)
service_name="glue", config=self._session.botocore_config)
if not partition_paths:
return None
partitions = list()
@@ -107,23 +105,24 @@ def add_partitions(self, database, table, partition_paths, file_format):
for _ in range(pages_num):
page = partitions[:100]
del partitions[:100]
client.batch_create_partition(
DatabaseName=database, TableName=table, PartitionInputList=page
)
client.batch_create_partition(DatabaseName=database,
TableName=table,
PartitionInputList=page)

def get_connection_details(self, name):
client = self._session.boto3_session.client(
service_name="glue", config=self._session.botocore_config
)
return client.get_connection(Name=name, HidePassword=False)["Connection"]
service_name="glue", config=self._session.botocore_config)
return client.get_connection(Name=name,
HidePassword=False)["Connection"]

@staticmethod
def _build_schema(dataframe, partition_cols, preserve_index):
if not partition_cols:
partition_cols = []
schema_built = []
if preserve_index:
name = str(dataframe.index.name) if dataframe.index.name else "index"
name = str(
dataframe.index.name) if dataframe.index.name else "index"
dataframe.index.name = "index"
dtype = str(dataframe.index.dtype)
if name not in partition_cols:
@@ -168,9 +167,14 @@ def csv_table_definition(table, partition_cols, schema, path):
if not partition_cols:
partition_cols = []
return {
"Name": table,
"PartitionKeys": [{"Name": x, "Type": "string"} for x in partition_cols],
"TableType": "EXTERNAL_TABLE",
"Name":
table,
"PartitionKeys": [{
"Name": x,
"Type": "string"
} for x in partition_cols],
"TableType":
"EXTERNAL_TABLE",
"Parameters": {
"classification": "csv",
"compressionType": "none",
@@ -180,15 +184,22 @@ def csv_table_definition(table, partition_cols, schema, path):
"areColumnsQuoted": "false",
},
"StorageDescriptor": {
"Columns": [{"Name": x[0], "Type": x[1]} for x in schema],
"Columns": [{
"Name": x[0],
"Type": x[1]
} for x in schema],
"Location": path,
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"OutputFormat":
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"Compressed": False,
"NumberOfBuckets": -1,
"SerdeInfo": {
"Parameters": {"field.delim": ","},
"SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"Parameters": {
"field.delim": ","
},
"SerializationLibrary":
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
},
"StoredAsSubDirectories": False,
"SortColumns": [],
@@ -210,8 +221,11 @@ def csv_partition_definition(partition):
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"Location": partition[0],
"SerdeInfo": {
"Parameters": {"field.delim": ","},
"SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"Parameters": {
"field.delim": ","
},
"SerializationLibrary":
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
},
"StoredAsSubDirectories": False,
},
@@ -223,24 +237,37 @@ def parquet_table_definition(table, partition_cols, schema, path):
if not partition_cols:
partition_cols = []
return {
"Name": table,
"PartitionKeys": [{"Name": x, "Type": "string"} for x in partition_cols],
"TableType": "EXTERNAL_TABLE",
"Name":
table,
"PartitionKeys": [{
"Name": x,
"Type": "string"
} for x in partition_cols],
"TableType":
"EXTERNAL_TABLE",
"Parameters": {
"classification": "parquet",
"compressionType": "none",
"typeOfData": "file",
},
"StorageDescriptor": {
"Columns": [{"Name": x[0], "Type": x[1]} for x in schema],
"Columns": [{
"Name": x[0],
"Type": x[1]
} for x in schema],
"Location": path,
"InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"InputFormat":
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"OutputFormat":
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"Compressed": False,
"NumberOfBuckets": -1,
"SerdeInfo": {
"SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"Parameters": {"serialization.format": "1"},
"SerializationLibrary":
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"Parameters": {
"serialization.format": "1"
},
},
"StoredAsSubDirectories": False,
"SortColumns": [],
@@ -260,8 +287,11 @@ def parquet_partition_definition(partition):
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"Location": partition[0],
"SerdeInfo": {
"Parameters": {"serialization.format": "1"},
"SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"Parameters": {
"serialization.format": "1"
},
"SerializationLibrary":
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
},
"StoredAsSubDirectories": False,
},
@@ -271,14 +301,15 @@ def parquet_partition_definition(partition):
@staticmethod
def _parse_partitions_tuples(objects_paths, partition_cols):
paths = {f"{path.rpartition('/')[0]}/" for path in objects_paths}
return [
(
path,
Glue._parse_partition_values(path=path, partition_cols=partition_cols),
)
for path in paths
]
return [(
path,
Glue._parse_partition_values(path=path,
partition_cols=partition_cols),
) for path in paths]

@staticmethod
def _parse_partition_values(path, partition_cols):
return [re.search(f"/{col}=(.*?)/", path).group(1) for col in partition_cols]
return [
re.search(f"/{col}=(.*?)/", path).group(1)
for col in partition_cols
]
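One detail worth noting in add_partitions above: batch_create_partition is called in pages of 100 partitions, keeping each request within the Glue API's per-call batch limit. A stripped-down sketch of that paging pattern is below; the database, table, and partition inputs are placeholders, and each PartitionInput dict would be built by csv_partition_definition or parquet_partition_definition, with Values parsed from the S3 path by _parse_partition_values.

```python
import boto3


def batch_create_partitions(database, table, partition_inputs, page_size=100):
    """Create Glue partitions in pages, mirroring the chunking in add_partitions."""
    client = boto3.client("glue")
    for start in range(0, len(partition_inputs), page_size):
        page = partition_inputs[start:start + page_size]  # at most page_size inputs per request
        client.batch_create_partition(
            DatabaseName=database,
            TableName=table,
            PartitionInputList=page,
        )
```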