added meta_select at example1

Former-commit-id: 87de455
DEIB-GECO · Mar 13, 2017 · 4de0a4f · 4de0a4f
1 parent f68547a
commit 4de0a4f
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -7,4 +7,14 @@ Python-Spark implementation of the GMQL system
 
 ##Set up of the project
 1. Download this repository
-2. In your IDE add the following paths to the project
+2. In your IDE, add the paths of your Spark installation to the project paths
+
+## Example of usage
+The Spark engine is instantiated when you import the library:
+```python
+import gmql as gl
+```
+After this, you can access the Spark context through `gl.sc`.
+
+## Tests
+In the [tests](./tests) folder you can find some examples of how to use the library.
diff --git a/gmql/dataset/GMQLDataset.py b/gmql/dataset/GMQLDataset.py
@@ -1,9 +1,14 @@
 from gmql import sc
 
 class GMQLDataset:
+    dataset = None
+
+    def __init__(self, dataset=None):
+        self.dataset = dataset
 
     def load_from_path(self, path, parser):
         rdd = sc.textFile(path).map(parser.parse_line)
-        return rdd
+        return GMQLDataset(dataset=rdd)
 
     def meta_select(self, predicate):
+        rdd = self.dataset.filter(predicate)
diff --git a/tests/example_1.py b/tests/example_1.py
@@ -10,4 +10,6 @@
 print('starting reading bed file')
 bed_dataset = dataset.load_from_path(path=bed_path, parser=bed_parser)
 
-S = bed_dataset.meta_select(lambda sample: "tumor" in )
+# Select only the rows whose 'name' field equals 'id-2'
+only_id2_lines_dataset = bed_dataset.meta_select(lambda row: row['name'] == 'id-2')
+