Merge pull request man-group#5 from manahl/add_python_tests

Unit tests, docstrings, and minor bugfixes
lukerm · Dec 28, 2017 · e6bf78b · e6bf78b
2 parents 6531c61 + f850b5d
commit e6bf78b
Show file tree

Hide file tree

Showing 10 changed files with 389 additions and 50 deletions.
diff --git a/pynorama/sessions/memory.py b/pynorama/sessions/memory.py
@@ -2,7 +2,7 @@
 
 
 class InMemorySessionStore(SessionStore):
-    """Primitive sessions store that only uses a transient application memory."""
+    """Primitive session store that only uses transient application memory."""
     def __init__(self):
         super(InMemorySessionStore, self).__init__()
 

diff --git a/pynorama/table/pandas_table.py b/pynorama/table/pandas_table.py
@@ -7,114 +7,188 @@
 
 
 def query_transform(pandas_table, transform):
+    """Perform a pandas query on the table.
+
+    Do basic pre-processing to support aggregation functions:
+    mean, median, std, quantile. These get evaluated before
+    passing to query. For example, if the distance column
+    contains [10, 20, 30, 40], then:
+
+    'distance > quantile[distance,0.5]'
+
+    First gets evaluated to:
+
+    'distance > 25.0'
+
+    which then gets passed to pandas.DataFrame.query().
+
+    Required fields of transform:
+        query: query to pass to pandas.DataFrame.query()
+    """
     dataframe = pandas_table.dataframe
-    text = transform["query"]
+    text = transform['query']
 
     def replace_function(match):
         fn = match.group(1).strip()
         column = match.group(2).strip()
 
-        return getattr(dataframe[column], fn)()
+        return str(getattr(dataframe[column], fn)())
 
     def replace_function_with_param(match):
-        function = match.group(1).strip()
+        fn = match.group(1).strip()
         column = match.group(2).strip()
         q = float(match.group(3).strip())
 
-        return getattr(dataframe[column], function)(q)
-
-    text = re.sub(r"(mean|median|std)\[(.+),(.+)\]", replace_function_with_param, text)
-    text = re.sub(r"(quantile)\[(.+)\]", replace_function, text)
+        return str(getattr(dataframe[column], fn)(q))
 
+    text = re.sub(r"(mean|median|std)\[(.+)\]", replace_function, text)
+    text = re.sub(r"(quantile)\[(.+),(.+)\]", replace_function_with_param, text)
     return PandasTable(dataframe.query(text))
 
 
 def sample_transform(pandas_table, transform):
+    """Sample from the pandas table without replacement, keeping the original order.
+
+    Required fields of transform:
+        fraction: [0, 1] - fraction of dataframe to keep
+    """
     dataframe = pandas_table.dataframe
     # Don't use dataframe.sample, because rather keep order
-    n = int(round(len(dataframe) * transform["fraction"]))
-    chosen_idx = np.random.choice(len(dataframe), size=n)
+    n = int(round(len(dataframe) * transform['fraction']))
+    chosen_idx = np.random.choice(len(dataframe), size=n, replace=False)
     chosen_idx.sort()
 
     return PandasTable(dataframe.take(chosen_idx))
 
 
 def sort_transform(pandas_table, transform):
+    """Sort the pandas table.
+
+    Required fields of transform:
+        column: the column to sort on
+        ascending: True/False
+    """
     dataframe = pandas_table.dataframe
-    if transform["column"] == "index":
-        return PandasTable(dataframe.sort_index(ascending=transform["ascending"]))
-    return PandasTable(dataframe.sort(transform["column"],
-                       ascending=transform["ascending"]))
+    if transform['column'] == 'index':
+        return PandasTable(
+            dataframe.sort_index(ascending=transform['ascending']))
+    elif hasattr(dataframe, 'sort_values'):
+        return PandasTable(
+            dataframe.sort_values(transform['column'],
+                                  ascending=transform['ascending']))
+    else:
+        return PandasTable(
+            dataframe.sort(transform['column'],
+                           ascending=transform['ascending']))
+
 
 
 def quantile_range_transform(pandas_table, transform):
+    """Filter a table to only keep values within a given quantile range.
+
+    Required fields of transform:
+        column: the column to filter on
+        lower: [0, 1] - lower quantile (inclusive)
+        upper: [0, 1] - upper quantile (inclusive)
+    """
     dataframe = pandas_table.dataframe
-    column = transform["column"]
-    lower_quantile = dataframe[column].quantile(transform["lower"])
-    upper_quantile = dataframe[column].quantile(transform["upper"])
+    column = transform['column']
+    lower_quantile = dataframe[column].quantile(transform['lower'])
+    upper_quantile = dataframe[column].quantile(transform['upper'])
     return PandasTable(dataframe[(dataframe[column] >= lower_quantile) &
                                  (dataframe[column] <= upper_quantile)])
 
 
 def histogram_transform(pandas_table, transform):
+    """Create a histogram from the values in a given column of the Pandas table.
+
+    Both numerical and string columns are supported.
+
+    Required fields of transform:
+        column: the column whose values are used for the histogram
+        bins: (optional) number of bins to use in the transform. Default is 30.
+    """
+
     dataframe = pandas_table.dataframe
-    column = transform["column"]
-    bins = transform.get("bins", 30)
+    column = transform['column']
+    bins = int(transform.get('bins', 30))
     series = dataframe[column]
     if np.issubdtype(series.dtype, np.number):
         hist, bin_edges = np.histogram(series[(~np.isnan(series))], bins=bins)
-        width = [bin_edges[i+1] - bin_edges[i] for i in range(len(bin_edges))-1]
-        x = [(bin_edges[i+1]+bin_edges[i])/2 for i in range(len(bin_edges))-1]
+        width = [bin_edges[i + 1] - bin_edges[i] for i in range(len(bin_edges) - 1)]
+        x = [(bin_edges[i + 1] + bin_edges[i]) / 2 for i in range(len(bin_edges) - 1)]
         y = hist.tolist()
     else:
         counts = series.value_counts()
         cut = counts.iloc[0:bins]
-        cut["Other"] = counts[bins:].sum()
+        cut['Other'] = counts[bins:].sum()
+        # FIXME: Workaround for https://github.com/pandas-dev/pandas/issues/18678
+        if np.isnan(cut['Other']):
+            cut['Other'] = 0
         x = cut.index.tolist()
         y = cut.values.tolist()
+        # FIXME: Is this the right width for categorical variables? Perhaps a list of 1-s?
+        width = None
 
-    return PandasTable(dataframe, {"x": x, "y": y, "width": width})
+    return PandasTable(dataframe, {'x': x, 'y': y, 'width': width})
 
 
-# doesn't care for np.inf and -np.inf
 def nans_transform(pandas_table, transform):
+    """Filter/keep rows with at least one nan value.
+
+    Note that np.inf and -np.inf are not treated as nans.
+
+    Required fields of transform:
+        filter: "show"/"hide" - whether to show or hide rows with nans
+    """
     dataframe = pandas_table.dataframe
     matches = pd.isnull(dataframe).any(axis=1)
-    if transform["filter"] == "show":
-        return dataframe[matches]
-    if transform["filter"] == "hide":
-        return dataframe[~matches]
-    raise ValueError("invalid filter parameter")
+    if transform['filter'] == 'show':
+        return PandasTable(dataframe[matches])
+    if transform['filter'] == 'hide':
+        return PandasTable(dataframe[~matches])
+    raise ValueError('invalid filter parameter')
 
 
-def found(element, searchterm):
+def _found(element, searchterm):
+    """Return True if a given cell matches the search term, False otherwise"""
     if element is None:
         return False
-    if isinstance(element, (str, unicode)):
+    if isinstance(element, basestring):
         return fnmatch.fnmatch(element.strip().lower(), searchterm.strip().lower())
     if isinstance(element, (float, int)):
+        # FIXME: This doesn't play nicely with Pandas datetypes
         if np.isnan(element):
             return False
         return float(searchterm) == element
     raise ValueError("Can't match the type %s" % type(element))
 
 
 def search_transform(pandas_table, transform):
+    """Find all rows that match a given search term.
+
+    Do some best-effort type-casting to maximise the chances of a match.
+
+    Required fields of transform:
+        column: the column to search on
+        searchterm: the element to match
+    """
     dataframe = pandas_table.dataframe
-    column = transform["column"]
+    column = transform['column']
     return PandasTable(dataframe[dataframe[column].map(
-        lambda element: found(element, transform["searchterm"]))])
+        lambda element: _found(element, transform['searchterm']))])
 
 
 class PandasTable(Table):
+    """Pynorama table backed by a fully-materialised Pandas dataframe"""
     TRANSFORMS_MAPPING = {
-        "query": query_transform,
-        "sample": sample_transform,
-        "sort": sort_transform,
-        "quantile_range": quantile_range_transform,
-        "search": search_transform,
-        "nans": nans_transform,
-        "histogram": histogram_transform
+        'query': query_transform,
+        'sample': sample_transform,
+        'sort': sort_transform,
+        'quantile_range': quantile_range_transform,
+        'search': search_transform,
+        'nans': nans_transform,
+        'histogram': histogram_transform
     }
 
     def __init__(self, dataframe, side_result=None):

diff --git a/tests/unit/sessions/__init__.py b/tests/unit/sessions/__init__.py
diff --git a/tests/unit/sessions/test_base_store.py b/tests/unit/sessions/test_base_store.py
@@ -0,0 +1,31 @@
+import pytest
+from mock import sentinel
+from pynorama.sessions.base_store import SessionStore
+
+
+class MockSessionStore(SessionStore):
+    def save_sessions(self, view_name, sessions):
+        pass
+
+    def load_sessions(self, view_name):
+        pass
+
+
+def test_base_session_store_save_sessions():
+    store = SessionStore()
+    with pytest.raises(NotImplementedError) as e:
+        store.save_sessions(sentinel.view_name, sentinel.sessions)
+    assert 'Please implement save_sessions' in str(e)
+
+
+def test_base_session_store_load_sessions():
+    store = SessionStore()
+    with pytest.raises(NotImplementedError) as e:
+        store.load_sessions(sentinel.view_name)
+    assert 'Please implement load_sessions' in str(e)
+
+
+def test_base_session_store_get_and_set_sessions():
+    store = MockSessionStore()
+    store.set_sessions(sentinel.view_name, sentinel.sessions)
+    assert sentinel.sessions == store.get_sessions(sentinel.view_name)
diff --git a/tests/unit/test_json_sesssion_store.py → tests/unit/sessions/test_json_file.py b/tests/unit/test_json_sesssion_store.py → tests/unit/sessions/test_json_file.py
@@ -1,7 +1,8 @@
 pytest_plugins = ['pytest_shutil']
 
+import os
+
 from pynorama.sessions import JsonFileSessionStore
-from datetime import datetime
 from json import load
 
 
@@ -10,22 +11,21 @@ def test_load_sessions(workspace):
     {"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"}
     """
 
-    with open(workspace.workspace + '/test.json', 'w') as f:
+    with open(os.path.join(workspace.workspace, 'test.json'), 'w') as f:
         f.write(raw_session)
 
     store = JsonFileSessionStore(workspace.workspace)
     session_data = store.load_sessions('test')
     assert cmp({"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"},
                session_data) == 0
 
+
 def test_save_sessions(workspace):
     store = JsonFileSessionStore(workspace.workspace)
     expected = {"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"}
-    store.save_sessions('test',
-                        expected)
+    store.save_sessions('test', expected)
 
-    with open(workspace.workspace + '/test.json', 'r')  as f:
+    with open(os.path.join(workspace.workspace, 'test.json'), 'r') as f:
         actual = load(f)
 
     assert cmp(actual, expected) == 0
-
diff --git a/tests/unit/sessions/test_memory.py b/tests/unit/sessions/test_memory.py
@@ -0,0 +1,13 @@
+from mock import sentinel
+from pynorama.sessions.memory import InMemorySessionStore
+
+
+def test_memory_session_store_get_and_set_sessions():
+    store = InMemorySessionStore()
+    store.set_sessions(sentinel.view_name, sentinel.sessions)
+    assert sentinel.sessions == store.get_sessions(sentinel.view_name)
+
+
+def test_memory_session_store_get_sessions_nonexistent():
+    store = InMemorySessionStore()
+    assert {} == store.get_sessions(sentinel.view_name)
diff --git a/tests/unit/test_mongo_sesssion_store.py → tests/unit/sessions/test_mongo.py b/tests/unit/test_mongo_sesssion_store.py → tests/unit/sessions/test_mongo.py
@@ -1,14 +1,15 @@
 from pynorama.sessions import MongoSessionStore
-from datetime import datetime
 from pytest_mongodb.plugin import mongodb
 from mongomock.mongo_client import MongoClient
 
+
 def test_load_sessions(mongodb):
     assert 'sessions' in mongodb.collection_names()
     sesstion_data = MongoSessionStore(mongodb.sessions).load_sessions('test')
     assert cmp({"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"},
                sesstion_data) == 0
 
+
 def test_save_sessions():
     collection = MongoClient().db.collection
     store = MongoSessionStore(collection)
@@ -24,6 +25,3 @@ def test_save_sessions():
     assert views_data.get('view_name') == 'test'
     assert cmp({"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"},
                views_data.get('sessions')) == 0
-
-
-
diff --git a/tests/unit/table/__init__.py b/tests/unit/table/__init__.py