Skip to content

Commit

Permalink
#78 Sieve with AttrValueRecordSieve implemented/tested
Browse files Browse the repository at this point in the history
  • Loading branch information
justb4 committed Jul 4, 2018
1 parent c53f21a commit 42e5658
Show file tree
Hide file tree
Showing 13 changed files with 995 additions and 622 deletions.
28 changes: 28 additions & 0 deletions docs/code.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ Components: Filters
:members:
:show-inheritance:

.. automodule:: stetl.filters.sieve
:members:
:show-inheritance:

.. automodule:: stetl.filters.stringfilter
:members:
:show-inheritance:
Expand All @@ -131,6 +135,30 @@ Components: Filters
:members:
:show-inheritance:

.. automodule:: stetl.filters.execfilter
:members:
:show-inheritance:

.. automodule:: stetl.filters.nullfilter
:members:
:show-inheritance:

.. automodule:: stetl.filters.packetbuffer
:members:
:show-inheritance:

.. automodule:: stetl.filters.packetwriter
:members:
:show-inheritance:

.. automodule:: stetl.filters.regexfilter
:members:
:show-inheritance:

.. automodule:: stetl.filters.zipfileextractor
:members:
:show-inheritance:

Components: Outputs
-------------------

Expand Down
Binary file modified examples/basics/12_gdal_ogr/output/cities.dbf
Binary file not shown.
28 changes: 28 additions & 0 deletions examples/basics/17_sieve/etl.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Trivial example Sieve filter.
# The input data is in input/cities.csv.
# We let pass through (sieve) only those records whose city attr value
# matches "amsterdam" or "otterlo"; all other records are dropped.

[etl]
chains = input_csv|attr_value_sieve|output_std,
input_csv|attr_value_sieve|output_file

[input_csv]
class = inputs.fileinput.CsvFileInput
file_path = input/cities.csv
output_format = record_array

[attr_value_sieve]
class = filters.sieve.AttrValueRecordSieve
input_format = record_array
output_format = record_array
attr_name = city
attr_values = amsterdam,otterlo

[output_std]
class = outputs.standardoutput.StandardOutput

[output_file]
class = outputs.fileoutput.FileOutput
file_path = output/cities_sieved.txt

8 changes: 8 additions & 0 deletions examples/basics/17_sieve/etl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh
#
# ETL for sieving city records from a CSV file to standard output and to a file.
#
# Shortcut to call Stetl main.py with etl config.
#
stetl -c etl.cfg

5 changes: 5 additions & 0 deletions examples/basics/17_sieve/input/cities.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
city,lat,lon
amsterdam,52.4,4.9
otterlo,52.101,5.773
rotterdam,51.9,4.5
eindhoven,51.44,5.47
1 change: 1 addition & 0 deletions examples/basics/17_sieve/output/cities_sieved.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{'lat': '52.4', 'city': 'amsterdam', 'lon': '4.9'}, {'lat': '52.101', 'city': 'otterlo', 'lon': '5.773'}]
Binary file modified examples/basics/3_shape/output/gmlcities.dbf
Binary file not shown.
1,380 changes: 759 additions & 621 deletions examples/basics/runall.log

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions stetl/filters/sieve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Lets data Packets pass-through, "sieve", based on criteria in their data.
# See issue: https://github.com/geopython/stetl/issues/78
#
# A concrete example is AttrValueRecordSieve, which sieves records matching
# specific attribute values. One can also think of Sieves based on XPath expressions
# (e.g. for XML, GML), or geospatial Sieves based on, for example, WFS-like filters such as bounding boxes.
#
# Author: Just van den Broecke
#
from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

# Module-level log object, obtained from Util under the name 'Sieve'.
log = Util.get_log('Sieve')


class Sieve(Filter):
    """
    Base class for Sieves: Filters that let Packets pass through ("sieve")
    depending on criteria found in the Packet data.

    Subclasses supply the actual criteria by overriding :meth:`sieve`.
    """

    def __init__(self, configdict, section, consumes, produces):
        # Nothing Sieve-specific to set up: hand all arguments on to Filter.
        Filter.__init__(self, configdict, section, consumes, produces)

    def invoke(self, packet):
        """
        Sieve the Packet, unless it carries no data.

        :param packet: the incoming Packet
        :return: the Packet itself when ``packet.data`` is None,
                 otherwise whatever :meth:`sieve` returns for it
        """
        # An empty Packet has nothing to sieve, so it is handed on untouched.
        return packet if packet.data is None else self.sieve(packet)

    def sieve(self, packet):
        """
        Sieving hook, to be overridden in subclasses.

        This default implementation applies no criteria at all.

        :param packet: the Packet to sieve
        :return: the same Packet, unchanged
        """
        return packet


class AttrValueRecordSieve(Sieve):
    """
    Sieves by attr/value(s) in Record Packets.

    A record passes through when it contains the attribute ``attr_name`` and,
    if ``attr_values`` is non-empty, its value for that attribute is one of
    ``attr_values``. Packet data may be a list of records or a single record.
    """

    @Config(ptype=str, required=True)
    def attr_name(self):
        """
        Name of attribute whose value(s) are to be sieved.
        """
        pass

    @Config(ptype=list, default=list(), required=False)
    def attr_values(self):
        """
        Value(s) for attribute to be sieved. If empty, any value is passed through
        (existence of attr_name is the only criterion).
        """
        pass

    def __init__(self, configdict, section):
        Sieve.__init__(self, configdict, section,
                       consumes=[FORMAT.record_array, FORMAT.record],
                       produces=[FORMAT.record_array, FORMAT.record])

    def sieve(self, packet):
        """
        Filter out records that are not matching designated attr value(s).

        The Packet is modified in place: ``packet.data`` is replaced by the
        matching record(s).

        :param packet: Packet whose data is a list of records or a single record
        :return: the same Packet; its data is the list of matching records
                 (possibly empty) for list input, the record itself for a
                 matching single record, and None otherwise
        """

        # Start with empty result: fill with matching records
        record_data = packet.data
        packet.data = None

        # Data can be list or single record. isinstance() rather than an exact
        # type() comparison, so that subclasses (e.g. collections.OrderedDict
        # records) are sieved as well instead of being silently dropped.
        if isinstance(record_data, list):
            packet.data = [record for record in record_data if self.matches_attr(record)]
        elif isinstance(record_data, dict):
            if self.matches_attr(record_data):
                packet.data = record_data

        return packet

    def matches_attr(self, record):
        """
        Tell whether a single record satisfies the sieve criteria.

        :param record: record (mapping of attribute name to value) to test
        :return: False if ``attr_name`` is not in the record; True if it is
                 and ``attr_values`` is empty; otherwise whether the record's
                 value for ``attr_name`` is in ``attr_values``
        """
        # Attr not even in record: no use going on
        if self.attr_name not in record:
            return False

        # No values configured: presence of the attribute alone is a match
        if not self.attr_values:
            return True

        return record[self.attr_name] in self.attr_values
2 changes: 1 addition & 1 deletion stetl/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.2-dev"
__version__ = "1.2-rc3"
5 changes: 5 additions & 0 deletions tests/data/cities.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
city,lat,lon
amsterdam,52.4,4.9
otterlo,52.101,5.773
rotterdam,51.9,4.5
eindhoven,51.44,5.47
21 changes: 21 additions & 0 deletions tests/filters/configs/sieve.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Config file for unit testing Sieve Filter.

[etl]
chains = input_csv_file|attr_value_sieve|packet_buffer|output_std

[input_csv_file]
class = inputs.fileinput.CsvFileInput
file_path = tests/data/cities.csv

[attr_value_sieve]
class = filters.sieve.AttrValueRecordSieve
input_format = record_array
output_format = record_array
attr_name = city
attr_values = amsterdam,otterlo

[packet_buffer]
class = filters.packetbuffer.PacketBuffer

[output_std]
class = outputs.standardoutput.StandardOutput
41 changes: 41 additions & 0 deletions tests/filters/test_sieve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os

from stetl.etl import ETL
from stetl.filters.packetbuffer import PacketBuffer
from stetl.filters.sieve import AttrValueRecordSieve
from tests.stetl_test_case import StetlTestCase

class SieveTest(StetlTestCase):
    """Unit tests for the AttrValueRecordSieve filter, driven by configs/sieve.cfg."""

    def setUp(self):
        super(SieveTest, self).setUp()

        # Build the ETL from the config file located relative to this test module
        here = os.path.dirname(os.path.realpath(__file__))
        self.etl = ETL({'config_file': os.path.join(here, 'configs/sieve.cfg')})

    def test_class(self):
        """The chain section at index 1 is configured with the sieve class."""
        etl_chain = StetlTestCase.get_chain(self.etl)
        sieve_section = StetlTestCase.get_section(etl_chain, 1)
        configured_class = self.etl.configdict.get(sieve_section, 'class')

        self.assertEqual('filters.sieve.AttrValueRecordSieve', configured_class)

    def test_instance(self):
        """The chain component at index 1 is an AttrValueRecordSieve instance."""
        component = StetlTestCase.get_chain(self.etl).get_by_index(1)

        self.assertTrue(isinstance(component, AttrValueRecordSieve))

    def test_execute(self):
        """Running the chain lets only the two configured cities through."""
        etl_chain = StetlTestCase.get_chain(self.etl)
        etl_chain.run()

        # The PacketBuffer in the chain keeps the Packets that passed the sieve
        buffered_packets = etl_chain.get_by_class(PacketBuffer).packet_list

        # Exactly two city records remain after sieving, in input order
        self.assertEqual(len(buffered_packets[0].data), 2)
        for index, expected_city in enumerate(("amsterdam", "otterlo")):
            self.assertEqual(str(buffered_packets[0].data[index]['city']), expected_city)

0 comments on commit 42e5658

Please sign in to comment.