starting Bed file parsing
Former-commit-id: 48891c5
lucananni93 committed Mar 12, 2017
1 parent a2f064b commit a3eecfa
Showing 17 changed files with 332 additions and 2 deletions.
55 changes: 54 additions & 1 deletion .gitignore
@@ -1,5 +1,58 @@
# Created by https://www.gitignore.io/api/pycharm,python

### JetBrains ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

### JetBrains Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr


### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
@@ -145,4 +198,4 @@ ENV/
# Rope project settings
.ropeproject

# End of https://www.gitignore.io/api/pycharm,python
# End of https://www.gitignore.io/api/pycharm,python
148 changes: 148 additions & 0 deletions .gitignore~
@@ -0,0 +1,148 @@
# Created by https://www.gitignore.io/api/pycharm,python

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# End of https://www.gitignore.io/api/pycharm,python
2 changes: 1 addition & 1 deletion .idea/GMQL-Python.iml

Some generated files are not rendered by default.

8 changes: 8 additions & 0 deletions README.md
@@ -1,2 +1,10 @@
# GMQL-Python
Python-Spark implementation of the GMQL system

## Requirements
- A Python environment
- Apache Spark

## Set up of the project
1. Download this repository
2. In your IDE add the following paths to the project
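For orientation, a minimal end-to-end usage sketch (mirroring tests/example_1.py added in this commit; the .bed path is a placeholder to replace with a local file):

import gmql  # importing gmql creates the shared SparkContext
from gmql.dataset.GMQLDataset import GMQLDataset
from gmql.dataset.parsers.BedParser import BedParser

dataset = GMQLDataset()
bed_parser = BedParser()

# Parse the BED file into an RDD of dictionaries and inspect the first records
bed_rdd = dataset.load_from_path(path="/path/to/dataset.bed", parser=bed_parser)
print(bed_rdd.take(10))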
13 changes: 13 additions & 0 deletions gmql/__init__.py
@@ -0,0 +1,13 @@
"""
Setting up the pyspark environment
"""
# Path of the local Spark installation (hardcoded for now)
spark_home = '/home/luca/spark-2.1.0-bin-hadoop2.7'

import findspark
findspark.init(spark_home=spark_home)
import pyspark

app_name = 'gmql_spark'

# getting the Spark context
sc = pyspark.SparkContext(appName=app_name)
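Since the Spark home above is hardcoded to a local path, here is a small sketch of how it could instead be read from the conventional SPARK_HOME environment variable (an assumption for illustration, not part of this commit):

import os
import findspark

# Fall back to the hardcoded path when SPARK_HOME is not set (hypothetical fallback)
spark_home = os.environ.get('SPARK_HOME', '/home/luca/spark-2.1.0-bin-hadoop2.7')
findspark.init(spark_home=spark_home)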
7 changes: 7 additions & 0 deletions gmql/dataset/GMQLDataset.py
@@ -0,0 +1,7 @@
from gmql import sc

class GMQLDataset:

    def load_from_path(self, path, parser):
        # Parse each line of the file at `path` into a record using the given parser
        rdd = sc.textFile(path).map(parser.parse_line)
return rdd
Empty file added gmql/dataset/__init__.py
Empty file.
Empty file.
22 changes: 22 additions & 0 deletions gmql/dataset/parsers/BedParser.py
@@ -0,0 +1,22 @@
from .Parser import Parser
from pybedtools import BedTool


class BedParser(Parser):

header = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd',
'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']

    def parse_line(self, line):
        # Map each tab-separated field of the BED line to its column name
        elems = line.split("\t")
n_fields = len(elems)
dict_row = {}
for i in range(n_fields):
dict_row[self.header[i]] = elems[i]
return dict_row


def parse_to_dataframe(self, file_path):
bed_file = BedTool(file_path)
df = bed_file.to_dataframe()
return df
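A quick illustration of parse_line on a six-column BED record (sample values, not taken from the repository's test data):

parser = BedParser()
row = parser.parse_line("chr1\t1000\t5000\tpeak_1\t960\t+")
# row == {'chrom': 'chr1', 'chromStart': '1000', 'chromEnd': '5000',
#         'name': 'peak_1', 'score': '960', 'strand': '+'}
# Values stay as strings; only the columns present in the line are filled.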
7 changes: 7 additions & 0 deletions gmql/dataset/parsers/Parser.py
@@ -0,0 +1,7 @@


class Parser:

    def parse_line(self, line):
        # To be overridden by concrete parsers (e.g. BedParser)
        pass

Empty file.
Empty file added gmql/operators/ProjectMD.py
Empty file.
Empty file added gmql/operators/__init__.py
Empty file.
@@ -0,0 +1 @@
2999cfa90579accd1ca723b562464c86913bf34a
12 changes: 12 additions & 0 deletions setup.py
@@ -0,0 +1,12 @@
from setuptools import setup

setup(name='gmql',
version='0.1',
description='Python library for GMQL computation',
url='https://github.com/lucananni93/GMQL-Python',
author='Luca Nanni',
author_email='luca.nanni@mail.polimi.it',
license='MIT',
packages=['gmql'],
      install_requires=['findspark'],
zip_safe=False)
47 changes: 47 additions & 0 deletions tests/callScalaGMQLFromPython.py
@@ -0,0 +1,47 @@
"""
Example that demonstrates:
1) the usage of the spylon package
2) how to call GMQL Scala functions from Python
"""

import spylon.spark as spylon_spark  # aliased so it is not shadowed by the SparkContext below

c = spylon_spark.SparkConfiguration()
c._spark_home = "/home/luca/spark-2.1.0-bin-hadoop2.7"

# Add the GMQL jar files so that they can be accessed from pyspark
c.jars = ["/home/luca/Scrivania/GMQL/GMQL-Core/target/GMQL-Core-2.0.jar",
"/home/luca/Scrivania/GMQL/GMQL-Server/target/GMQL-Server-2.0.jar",
"/home/luca/Scrivania/GMQL/GMQL-Spark/target/GMQL-Spark-4.0.jar"]


path_file = "/home/luca/Scrivania/GMQL-Python/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed"
path_output = "/home/luca/Scrivania/GMQL-Python/resources/"

# Get the pyspark context from Spylon
sc = c.spark_context("prova_spylon")

# Instantiation of a GMQLSparkExecutor
binsize = sc._jvm.it.polimi.genomics.core.BinSize(5000, 5000, 1000)
maxBinDistance = 100000
REF_PARALLELISM = 20
testingIOFormats = False
sparkContext = sc._jvm.org.apache.spark.SparkContext.getOrCreate()
outputFormat = sc._jvm.it.polimi.genomics.core.GMQLOutputFormat.TAB()

gmql_spark_executor = sc._jvm.it.polimi.genomics.\
    spark.implementation.GMQLSparkExecutor(binsize, maxBinDistance, REF_PARALLELISM,
testingIOFormats, sparkContext, outputFormat)

# Creation of a GMQLServer given the spark executor previously defined
server = sc._jvm.it.polimi.genomics.GMQLServer.GmqlServer(gmql_spark_executor, None)

# Getting a BedParser object from GMQL
bedParser = sc._jvm.it.polimi.genomics.spark.implementation.loaders.BedParser

# Executing the query
DS1 = server.READ(path_file).USING(bedParser)



print("DONE")
12 changes: 12 additions & 0 deletions tests/example_1.py
@@ -0,0 +1,12 @@
import gmql as gl
from gmql.dataset.GMQLDataset import GMQLDataset
from gmql.dataset.parsers.BedParser import BedParser

bed_path = "/home/luca/Scrivania/GMQL-Python/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed"

dataset = GMQLDataset()
bed_parser = BedParser()

print('starting to read the bed file')
bed_rdd = dataset.load_from_path(path=bed_path, parser=bed_parser)
print(bed_rdd.take(10))  # take() already returns a list, so no collect() is needed
