From a3eecfa74985ce4ab58f76902b37ee5179c59295 Mon Sep 17 00:00:00 2001 From: lucananni93 Date: Sun, 12 Mar 2017 22:17:45 +0100 Subject: [PATCH] starting Bed file parsing Former-commit-id: 48891c52a57c57ae0786d4546bbec02288c36abb --- .gitignore | 55 ++++++- .gitignore~ | 148 ++++++++++++++++++ .idea/GMQL-Python.iml | 2 +- README.md | 8 + gmql/__init__.py | 13 ++ gmql/dataset/GMQLDataset.py | 7 + gmql/dataset/__init__.py | 0 gmql/dataset/loaders/__init__.py | 0 gmql/dataset/parsers/BedParser.py | 22 +++ gmql/dataset/parsers/Parser.py | 7 + gmql/dataset/parsers/__init__.py | 0 gmql/operators/ProjectMD.py | 0 gmql/operators/__init__.py | 0 ...iorep_filtered_hotspots.bed.REMOVED.git-id | 1 + setup.py | 12 ++ tests/callScalaGMQLFromPython.py | 47 ++++++ tests/example_1.py | 12 ++ 17 files changed, 332 insertions(+), 2 deletions(-) create mode 100644 .gitignore~ create mode 100644 gmql/__init__.py create mode 100644 gmql/dataset/GMQLDataset.py create mode 100644 gmql/dataset/__init__.py create mode 100644 gmql/dataset/loaders/__init__.py create mode 100644 gmql/dataset/parsers/BedParser.py create mode 100644 gmql/dataset/parsers/Parser.py create mode 100644 gmql/dataset/parsers/__init__.py create mode 100644 gmql/operators/ProjectMD.py create mode 100644 gmql/operators/__init__.py create mode 100644 resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed.REMOVED.git-id create mode 100644 setup.py create mode 100644 tests/callScalaGMQLFromPython.py create mode 100644 tests/example_1.py diff --git a/.gitignore b/.gitignore index 5376237..f567392 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,58 @@ # Created by https://www.gitignore.io/api/pycharm,python +### JetBrains ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml + +# Sensitive or high-churn 
files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +### JetBrains Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + + ### PyCharm ### # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 @@ -145,4 +198,4 @@ ENV/ # Rope project settings .ropeproject -# End of https://www.gitignore.io/api/pycharm,python \ No newline at end of file +# End of https://www.gitignore.io/api/pycharm,python diff --git a/.gitignore~ b/.gitignore~ new file mode 100644 index 0000000..5376237 --- /dev/null +++ b/.gitignore~ @@ -0,0 +1,148 @@ +# Created by https://www.gitignore.io/api/pycharm,python + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# Mongo Explorer plugin: 
+.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + +# End of https://www.gitignore.io/api/pycharm,python \ No newline at end of file diff --git a/.idea/GMQL-Python.iml b/.idea/GMQL-Python.iml index 6711606..1125174 100644 --- a/.idea/GMQL-Python.iml +++ b/.idea/GMQL-Python.iml @@ -2,7 +2,7 @@ - + diff --git a/README.md b/README.md index a316170..540b520 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,10 @@ # GMQL-Python Python-Spark implementation of the GMQL system + +## Requirements +- A python environment +- Apache Spark + +##Set up of the project +1. Download this repository +2. 
In your IDE add the following paths to the project
diff --git a/gmql/__init__.py b/gmql/__init__.py
new file mode 100644
index 0000000..44e7ac8
--- /dev/null
+++ b/gmql/__init__.py
@@ -0,0 +1,19 @@
+"""
+Setting up the pyspark environment
+"""
+import os
+
+import findspark
+
+# Prefer the SPARK_HOME environment variable; fall back to the original
+# developer-specific path so existing setups keep working.
+spark_home = os.environ.get('SPARK_HOME', '/home/luca/spark-2.1.0-bin-hadoop2.7')
+
+# findspark.init has to run before pyspark is imported
+findspark.init(spark_home=spark_home)
+import pyspark
+
+app_name = 'gmql_spark'
+
+# getting the Spark context
+sc = pyspark.SparkContext(appName=app_name)
diff --git a/gmql/dataset/GMQLDataset.py b/gmql/dataset/GMQLDataset.py
new file mode 100644
index 0000000..66c4d72
--- /dev/null
+++ b/gmql/dataset/GMQLDataset.py
@@ -0,0 +1,14 @@
+from gmql import sc
+
+
+class GMQLDataset:
+    """Entry point for loading a dataset file through the Spark context."""
+
+    def load_from_path(self, path, parser):
+        """Read the text file at `path` and parse every line of it.
+
+        :param path: location of the file, handed to `sc.textFile`
+        :param parser: object exposing a `parse_line(line)` method
+        :return: the RDD produced by mapping `parser.parse_line` over the lines
+        """
+        return sc.textFile(path).map(parser.parse_line)
diff --git a/gmql/dataset/__init__.py b/gmql/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gmql/dataset/loaders/__init__.py b/gmql/dataset/loaders/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gmql/dataset/parsers/BedParser.py b/gmql/dataset/parsers/BedParser.py
new file mode 100644
index 0000000..0b68401
--- /dev/null
+++ b/gmql/dataset/parsers/BedParser.py
@@ -0,0 +1,38 @@
+from .Parser import Parser
+from pybedtools import BedTool
+
+
+class BedParser(Parser):
+    """Parser for tab-separated BED files."""
+
+    # Column names assigned to the fields of a line, in positional order
+    header = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd',
+              'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
+
+    def parse_line(self, line):
+        """Turn one BED line into a dict keyed by column name.
+
+        Only the columns present in the line are returned. Fields beyond
+        the known header are ignored instead of raising an IndexError.
+
+        :param line: a single tab-separated line of a BED file
+        :return: dict mapping header names to the (string) field values
+        """
+        elems = line.rstrip("\r\n").split("\t")
+        return dict(zip(self.header, elems))
+
+    def parse_to_dataframe(self, file_path):
+        """Load the whole BED file at `file_path` through pybedtools.
+
+        :param file_path: path of the BED file
+        :return: the result of `BedTool(file_path).to_dataframe()`
+        """
+        return BedTool(file_path).to_dataframe()
+
+
+def parse_to_dataframe(self, file_path):
+    """Module-level entry point kept for backward compatibility.
+
+    `self` is ignored; prefer `BedParser().parse_to_dataframe(file_path)`.
+    """
+    return BedParser().parse_to_dataframe(file_path)
diff --git a/gmql/dataset/parsers/Parser.py b/gmql/dataset/parsers/Parser.py
new file mode 100644
index 0000000..822eeed
--- /dev/null
+++ b/gmql/dataset/parsers/Parser.py
@@ -0,0 +1,11 @@
+class Parser:
+    """Base class for line-oriented dataset parsers."""
+
+    def parse_line(self, line=None):
+        """Parse a single line of a dataset file.
+
+        Subclasses override this; the base implementation returns None.
+
+        :param line: the raw text line to parse
+        """
+        pass
diff --git a/gmql/dataset/parsers/__init__.py b/gmql/dataset/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gmql/operators/ProjectMD.py b/gmql/operators/ProjectMD.py
new file mode 100644
index 0000000..e69de29
diff --git a/gmql/operators/__init__.py b/gmql/operators/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed.REMOVED.git-id b/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed.REMOVED.git-id
new file mode 100644
index 0000000..8b6154c
--- /dev/null
+++ b/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed.REMOVED.git-id
@@ -0,0 +1 @@
+2999cfa90579accd1ca723b562464c86913bf34a
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7c97cca
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,14 @@
+from setuptools import setup, find_packages
+
+setup(name='gmql',
+      version='0.1',
+      description='Python library for GMQL computation',
+      url='https://github.com/lucananni93/GMQL-Python',
+      author='Luca Nanni',
+      author_email='luca.nanni@mail.polimi.it',
+      license='MIT',
+      # find_packages also picks up the gmql.dataset / gmql.operators subpackages
+      packages=find_packages(),
+      # install_requires (unlike the metadata-only `requires`) is honoured by pip
+      install_requires=['findspark', 'pybedtools'],
+      zip_safe=False)
diff --git a/tests/callScalaGMQLFromPython.py b/tests/callScalaGMQLFromPython.py
new file mode 100644
index 0000000..70ad1e7
--- /dev/null
+++ b/tests/callScalaGMQLFromPython.py
@@ -0,0 +1,47 @@
+"""
+    Main of example that demonstrate:
+    1) the usage of the spylon package
+    2) how to call GMQL scala functions from Python
+"""
+
+import spylon.spark as sc
+
+c = sc.SparkConfiguration()
+c._spark_home = "/home/luca/spark-2.1.0-bin-hadoop2.7"
+
+# I add the GMQL jar files for accessing them from pyspark
+c.jars = ["/home/luca/Scrivania/GMQL/GMQL-Core/target/GMQL-Core-2.0.jar",
+          "/home/luca/Scrivania/GMQL/GMQL-Server/target/GMQL-Server-2.0.jar",
+          "/home/luca/Scrivania/GMQL/GMQL-Spark/target/GMQL-Spark-4.0.jar"]
+
+
+path_file = "/home/luca/Scrivania/GMQL-Python/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed"
+path_output = "/home/luca/Scrivania/GMQL-Python/resources/"
+
+# I get the pyspark context from Spylon
+sc = c.spark_context("prova_spylon")
+
+# Instantiation of a GMQLSparkExecutor
+binsize = sc._jvm.it.polimi.genomics.core.BinSize(5000, 5000, 1000)
+maxBinDistance = 100000
+REF_PARALLILISM = 20
+testingIOFormats = False
+sparkContext = sc._jvm.org.apache.spark.SparkContext.getOrCreate()
+outputFormat = sc._jvm.it.polimi.genomics.core.GMQLOutputFormat.TAB()
+
+gmql_spark_executor = sc._jvm.it.polimi.genomics.\
+    spark.implementation.GMQLSparkExecutor(binsize, maxBinDistance, REF_PARALLILISM,
+                                           testingIOFormats, sparkContext, outputFormat)
+
+# Creation of a GMQLServer given the spark executor previously defined
+server = sc._jvm.it.polimi.genomics.GMQLServer.GmqlServer(gmql_spark_executor, None)
+
+# Getting a BedParser object from GMQL
+bedParser = sc._jvm.it.polimi.genomics.spark.implementation.loaders.BedParser
+
+# Executing the query
+DS1 = server.READ(path_file).USING(bedParser)
+
+
+
+print("DONE")
\ No newline at end of file
diff --git a/tests/example_1.py b/tests/example_1.py
new file mode 100644
index 0000000..5a3abf3
--- /dev/null
+++ b/tests/example_1.py
@@ -0,0 +1,12 @@
+import gmql as gl
+from gmql.dataset.GMQLDataset import GMQLDataset
+from gmql.dataset.parsers.BedParser import BedParser
+
+bed_path = "/home/luca/Scrivania/GMQL-Python/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed"
+
+dataset = GMQLDataset()
+bed_parser = BedParser()
+
+print('starting reading bed file')
+bed_rdd = dataset.load_from_path(path=bed_path, parser=bed_parser)
+print(bed_rdd.take(10))