Skip to content
This repository has been archived by the owner on Aug 28, 2022. It is now read-only.

Commit

Permalink
Merge pull request man-group#5 from manahl/add_python_tests
Browse files Browse the repository at this point in the history
Unit tests, docstrings, and minor bugfixes
  • Loading branch information
egao1980 authored Dec 28, 2017
2 parents 6531c61 + f850b5d commit e6bf78b
Show file tree
Hide file tree
Showing 10 changed files with 389 additions and 50 deletions.
2 changes: 1 addition & 1 deletion pynorama/sessions/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


class InMemorySessionStore(SessionStore):
"""Primitive sessions store that only uses a transient application memory."""
"""Primitive session store that only uses transient application memory."""
def __init__(self):
super(InMemorySessionStore, self).__init__()

Expand Down
152 changes: 113 additions & 39 deletions pynorama/table/pandas_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,114 +7,188 @@


def query_transform(pandas_table, transform):
    """Perform a pandas query on the table.

    Basic pre-processing supports the aggregation functions mean, median,
    std and quantile: each ``fn[column]`` / ``quantile[column,q]`` token is
    replaced by its computed value before the text is handed to pandas.
    For example, if the distance column contains [10, 20, 30, 40], then:
        'distance > quantile[distance,0.5]'
    first gets evaluated to:
        'distance > 25.0'
    which then gets passed to pandas.DataFrame.query().

    Required fields of transform:
        query: query to pass to pandas.DataFrame.query()

    Returns a new PandasTable wrapping the rows selected by the query.
    """
    dataframe = pandas_table.dataframe
    text = transform['query']

    def replace_function(match):
        # fn[column] -> str(dataframe[column].fn())
        fn = match.group(1).strip()
        column = match.group(2).strip()
        return str(getattr(dataframe[column], fn)())

    def replace_function_with_param(match):
        # fn[column,q] -> str(dataframe[column].fn(q))
        fn = match.group(1).strip()
        column = match.group(2).strip()
        q = float(match.group(3).strip())
        return str(getattr(dataframe[column], fn)(q))

    # The bracket contents must not contain ']'. A greedy '.+' would treat
    # 'mean[a] > mean[b]' as a single token with the column 'a] > mean[b'.
    text = re.sub(r"(mean|median|std)\[([^\]]+)\]", replace_function, text)
    # The column part may still contain commas: the parameter is whatever
    # follows the last comma inside the brackets.
    text = re.sub(r"(quantile)\[([^\]]+),([^\],]+)\]",
                  replace_function_with_param, text)
    return PandasTable(dataframe.query(text))


def sample_transform(pandas_table, transform):
    """Sample from the pandas table, no replacement, keeping the original order.

    Required fields of transform:
        fraction: [0, 1] - fraction of dataframe to keep

    Returns a new PandasTable holding the sampled rows.
    """
    dataframe = pandas_table.dataframe
    # dataframe.sample is deliberately not used here: we would rather keep
    # the rows in their original order, so positions are drawn and sorted.
    n = int(round(len(dataframe) * transform['fraction']))
    # replace=False guarantees that no row is picked more than once.
    chosen_idx = np.random.choice(len(dataframe), size=n, replace=False)
    chosen_idx.sort()

    return PandasTable(dataframe.take(chosen_idx))


def sort_transform(pandas_table, transform):
    """Sort the pandas table.

    Required fields of transform:
        column: the column to sort on, or 'index' to sort on the index
        ascending: True/False

    Returns a new PandasTable with the rows in sorted order.
    """
    dataframe = pandas_table.dataframe
    column = transform['column']
    ascending = transform['ascending']

    if column == 'index':
        return PandasTable(dataframe.sort_index(ascending=ascending))
    if hasattr(dataframe, 'sort_values'):
        return PandasTable(dataframe.sort_values(column, ascending=ascending))
    # Fallback for dataframes that do not provide sort_values.
    return PandasTable(dataframe.sort(column, ascending=ascending))



def quantile_range_transform(pandas_table, transform):
    """Filter a table to only keep values within a given quantile range.

    Required fields of transform:
        column: the column to filter on
        lower: [0, 1] - lower quantile (inclusive)
        upper: [0, 1] - upper quantile (inclusive)

    Returns a new PandasTable with the rows whose value in `column` lies
    between the two quantile values (both ends included).
    """
    dataframe = pandas_table.dataframe
    values = dataframe[transform['column']]
    lower_quantile = values.quantile(transform['lower'])
    upper_quantile = values.quantile(transform['upper'])
    in_range = (values >= lower_quantile) & (values <= upper_quantile)
    return PandasTable(dataframe[in_range])


def histogram_transform(pandas_table, transform):
    """Create a histogram from the values in a given column of the Pandas table.

    Both numerical and string columns are supported.

    Required fields of transform:
        column: the column whose values are used for the histogram
        bins: (optional) number of bins to use in the transform. Default is 30.

    Returns a PandasTable wrapping the unchanged dataframe, with the
    histogram passed as the side result {'x': ..., 'y': ..., 'width': ...}.
    For numerical columns x holds the bin centres, y the counts and width
    the bin widths. For other columns x holds the `bins` most frequent
    values followed by 'Other', y their counts, and width is None.
    """
    dataframe = pandas_table.dataframe
    column = transform['column']
    bins = int(transform.get('bins', 30))
    series = dataframe[column]
    if np.issubdtype(series.dtype, np.number):
        # NaNs are dropped before binning.
        hist, bin_edges = np.histogram(series[~np.isnan(series)], bins=bins)
        # Walk consecutive (left, right) edge pairs: one pair per bin.
        edge_pairs = list(zip(bin_edges[:-1], bin_edges[1:]))
        width = [right - left for left, right in edge_pairs]
        x = [(right + left) / 2 for left, right in edge_pairs]
        y = hist.tolist()
    else:
        counts = series.value_counts()
        # Copy so that adding the 'Other' bucket never touches `counts`.
        cut = counts.iloc[0:bins].copy()
        cut['Other'] = counts.iloc[bins:].sum()
        # FIXME: Workaround for https://github.com/pandas-dev/pandas/issues/18678
        if np.isnan(cut['Other']):
            cut['Other'] = 0
        x = cut.index.tolist()
        y = cut.values.tolist()
        # FIXME: Is this the right width for categorical variables? Perhaps a list of 1-s?
        width = None

    return PandasTable(dataframe, {'x': x, 'y': y, 'width': width})


def nans_transform(pandas_table, transform):
    """Filter/keep rows with at least one nan value.

    Doesn't care for np.inf and -np.inf.

    Required fields of transform:
        filter: "show"/"hide" - whether to show or hide rows with nans

    Returns a new PandasTable with the selected rows.
    Raises ValueError if filter is neither "show" nor "hide".
    """
    dataframe = pandas_table.dataframe
    # True for every row that has a null in at least one column.
    matches = pd.isnull(dataframe).any(axis=1)
    mode = transform['filter']
    if mode == 'show':
        return PandasTable(dataframe[matches])
    if mode == 'hide':
        return PandasTable(dataframe[~matches])
    raise ValueError('invalid filter parameter')


try:
    _STRING_TYPES = basestring  # noqa: F821 - Python 2: covers str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3 has a single text type


def _found(element, searchterm):
    """Return True if a given cell matches the search term, False otherwise.

    None never matches. Strings are compared with shell-style wildcards
    (fnmatch) after stripping surrounding whitespace and lower-casing both
    sides. Numbers (Python or NumPy scalars) match when the search term,
    converted to float, equals the cell; NaN cells never match.

    Raises ValueError if the cell has any other type, or if the cell is
    numeric and the search term cannot be converted to float.
    """
    if element is None:
        return False
    if isinstance(element, _STRING_TYPES):
        return fnmatch.fnmatch(element.strip().lower(), searchterm.strip().lower())
    # np.number is listed explicitly because NumPy integer scalars are not
    # instances of int on Python 3.
    if isinstance(element, (float, int, np.number)):
        # FIXME: This doesn't play nicely with Pandas datetypes
        if np.isnan(element):
            return False
        return float(searchterm) == element
    raise ValueError("Can't match the type %s" % type(element))


# Backward-compatible alias for the name this helper had before it was
# made private.
found = _found


def search_transform(pandas_table, transform):
    """Find all rows that match a given search term.

    Do some best-effort type-casting to maximise the chances of a match.

    Required fields of transform:
        column: the column to search on
        searchterm: the element to match

    Returns a new PandasTable with the rows whose cell in `column` matches.
    """
    dataframe = pandas_table.dataframe
    column = transform['column']
    searchterm = transform['searchterm']
    # Test every cell of the column, then keep the rows where it matched.
    matches = dataframe[column].map(
        lambda element: _found(element, searchterm))
    return PandasTable(dataframe[matches])


class PandasTable(Table):
"""Pynorama table backed by a fully-materialised Pandas dataframe"""
TRANSFORMS_MAPPING = {
"query": query_transform,
"sample": sample_transform,
"sort": sort_transform,
"quantile_range": quantile_range_transform,
"search": search_transform,
"nans": nans_transform,
"histogram": histogram_transform
'query': query_transform,
'sample': sample_transform,
'sort': sort_transform,
'quantile_range': quantile_range_transform,
'search': search_transform,
'nans': nans_transform,
'histogram': histogram_transform
}

def __init__(self, dataframe, side_result=None):
Expand Down
Empty file added tests/unit/sessions/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions tests/unit/sessions/test_base_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pytest
from mock import sentinel
from pynorama.sessions.base_store import SessionStore


class MockSessionStore(SessionStore):
    """SessionStore whose save/load hooks do nothing, for use in tests."""

    def save_sessions(self, view_name, sessions):
        """Accept the sessions and discard them."""
        return None

    def load_sessions(self, view_name):
        """Load nothing."""
        return None


def test_base_session_store_save_sessions():
    """The base SessionStore must refuse to save sessions."""
    store = SessionStore()
    with pytest.raises(NotImplementedError) as excinfo:
        store.save_sessions(sentinel.view_name, sentinel.sessions)
    # Check the message of the raised exception itself (excinfo.value), not
    # the string form of pytest's ExceptionInfo wrapper. The check sits
    # after the `with` block so that it actually runs.
    assert 'Please implement save_sessions' in str(excinfo.value)


def test_base_session_store_load_sessions():
    """The base SessionStore must refuse to load sessions."""
    store = SessionStore()
    with pytest.raises(NotImplementedError) as excinfo:
        store.load_sessions(sentinel.view_name)
    # Check the message of the raised exception itself (excinfo.value), not
    # the string form of pytest's ExceptionInfo wrapper. The check sits
    # after the `with` block so that it actually runs.
    assert 'Please implement load_sessions' in str(excinfo.value)


def test_base_session_store_get_and_set_sessions():
    """Sessions set on a store are returned by get_sessions for that view."""
    store = MockSessionStore()
    store.set_sessions(sentinel.view_name, sentinel.sessions)
    fetched = store.get_sessions(sentinel.view_name)
    assert sentinel.sessions == fetched
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
pytest_plugins = ['pytest_shutil']

import os

from pynorama.sessions import JsonFileSessionStore
from datetime import datetime
from json import load


Expand All @@ -10,22 +11,21 @@ def test_load_sessions(workspace):
{"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"}
"""

with open(workspace.workspace + '/test.json', 'w') as f:
with open(os.path.join(workspace.workspace, 'test.json'), 'w') as f:
f.write(raw_session)

store = JsonFileSessionStore(workspace.workspace)
session_data = store.load_sessions('test')
assert cmp({"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"},
session_data) == 0


def test_save_sessions(workspace):
    """Sessions saved under 'test' are written to test.json in the workspace."""
    store = JsonFileSessionStore(workspace.workspace)
    expected = {"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"}
    store.save_sessions('test', expected)

    with open(os.path.join(workspace.workspace, 'test.json'), 'r') as f:
        actual = load(f)

    # Plain equality instead of cmp(...) == 0: cmp exists only on Python 2,
    # and == also gives pytest a readable diff on failure.
    assert actual == expected

13 changes: 13 additions & 0 deletions tests/unit/sessions/test_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from mock import sentinel
from pynorama.sessions.memory import InMemorySessionStore


def test_memory_session_store_get_and_set_sessions():
    """Sessions set on the in-memory store are returned for the same view."""
    store = InMemorySessionStore()
    store.set_sessions(sentinel.view_name, sentinel.sessions)
    fetched = store.get_sessions(sentinel.view_name)
    assert sentinel.sessions == fetched


def test_memory_session_store_get_sessions_nonexistent():
    """A view with no stored sessions yields an empty dict."""
    store = InMemorySessionStore()
    fetched = store.get_sessions(sentinel.view_name)
    assert {} == fetched
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from pynorama.sessions import MongoSessionStore
from datetime import datetime
from pytest_mongodb.plugin import mongodb
from mongomock.mongo_client import MongoClient


def test_load_sessions(mongodb):
    """Sessions stored in the `sessions` collection load for view 'test'."""
    assert 'sessions' in mongodb.collection_names()
    session_data = MongoSessionStore(mongodb.sessions).load_sessions('test')
    # Plain equality instead of cmp(...) == 0: cmp exists only on Python 2,
    # and == also gives pytest a readable diff on failure.
    assert {"foo": "bar", "xyz": 1,
            "someday": "2017-12-19T13:18:44.745Z"} == session_data


def test_save_sessions():
collection = MongoClient().db.collection
store = MongoSessionStore(collection)
Expand All @@ -24,6 +25,3 @@ def test_save_sessions():
assert views_data.get('view_name') == 'test'
assert cmp({"foo": "bar", "xyz": 1, "someday": "2017-12-19T13:18:44.745Z"},
views_data.get('sessions')) == 0



Empty file added tests/unit/table/__init__.py
Empty file.
Loading

0 comments on commit e6bf78b

Please sign in to comment.