starting Bed file parsing
Former-commit-id: 48891c5
lucananni93 committed Mar 12, 2017
1 parent a2f064b commit a3eecfa
Showing 17 changed files with 332 additions and 2 deletions.
55 changes: 54 additions & 1 deletion .gitignore
@@ -1,5 +1,58 @@
# Created by https://www.gitignore.io/api/pycharm,python

### JetBrains ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

### JetBrains Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr


### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
@@ -145,4 +198,4 @@ ENV/
# Rope project settings
.ropeproject

# End of https://www.gitignore.io/api/pycharm,python
# End of https://www.gitignore.io/api/pycharm,python
148 changes: 148 additions & 0 deletions .gitignore~
@@ -0,0 +1,148 @@
# Created by https://www.gitignore.io/api/pycharm,python

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# End of https://www.gitignore.io/api/pycharm,python
2 changes: 1 addition & 1 deletion .idea/GMQL-Python.iml

Some generated files are not rendered by default.

8 changes: 8 additions & 0 deletions README.md
@@ -1,2 +1,10 @@
# GMQL-Python
Python-Spark implementation of the GMQL system

## Requirements
- A Python environment
- Apache Spark

## Set up of the project
1. Download this repository
2. In your IDE add the following paths to the project
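For orientation, a minimal end-to-end usage sketch (mirroring tests/example_1.py added in this commit; the .bed path is a placeholder to replace with a local file):

import gmql  # importing gmql creates the shared SparkContext
from gmql.dataset.GMQLDataset import GMQLDataset
from gmql.dataset.parsers.BedParser import BedParser

dataset = GMQLDataset()
bed_parser = BedParser()

# Parse the BED file into an RDD of dictionaries and inspect the first records
bed_rdd = dataset.load_from_path(path="/path/to/dataset.bed", parser=bed_parser)
print(bed_rdd.take(10))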
13 changes: 13 additions & 0 deletions gmql/__init__.py
@@ -0,0 +1,13 @@
"""
Setting up the pyspark environment
"""
# Path of the local Spark installation (hardcoded for now)
spark_home = '/home/luca/spark-2.1.0-bin-hadoop2.7'

import findspark
findspark.init(spark_home=spark_home)
import pyspark

app_name = 'gmql_spark'

# getting the Spark context
sc = pyspark.SparkContext(appName=app_name)
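Since the Spark home above is hardcoded to a local path, here is a small sketch of how it could instead be read from the conventional SPARK_HOME environment variable (an assumption for illustration, not part of this commit):

import os
import findspark

# Fall back to the hardcoded path when SPARK_HOME is not set (hypothetical fallback)
spark_home = os.environ.get('SPARK_HOME', '/home/luca/spark-2.1.0-bin-hadoop2.7')
findspark.init(spark_home=spark_home)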
7 changes: 7 additions & 0 deletions gmql/dataset/GMQLDataset.py
@@ -0,0 +1,7 @@
from gmql import sc

class GMQLDataset:

    def load_from_path(self, path, parser):
        # Parse each line of the file at `path` into a record using the given parser
        rdd = sc.textFile(path).map(parser.parse_line)
return rdd
Empty file added gmql/dataset/__init__.py
Empty file.
Empty file.
22 changes: 22 additions & 0 deletions gmql/dataset/parsers/BedParser.py
@@ -0,0 +1,22 @@
from .Parser import Parser
from pybedtools import BedTool


class BedParser(Parser):

header = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd',
'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']

    def parse_line(self, line):
        # Map each tab-separated field of the BED line to its column name
        elems = line.split("\t")
n_fields = len(elems)
dict_row = {}
for i in range(n_fields):
dict_row[self.header[i]] = elems[i]
return dict_row


def parse_to_dataframe(self, file_path):
bed_file = BedTool(file_path)
df = bed_file.to_dataframe()
return df
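A quick illustration of parse_line on a six-column BED record (sample values, not taken from the repository's test data):

parser = BedParser()
row = parser.parse_line("chr1\t1000\t5000\tpeak_1\t960\t+")
# row == {'chrom': 'chr1', 'chromStart': '1000', 'chromEnd': '5000',
#         'name': 'peak_1', 'score': '960', 'strand': '+'}
# Values stay as strings; only the columns present in the line are filled.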
7 changes: 7 additions & 0 deletions gmql/dataset/parsers/Parser.py
@@ -0,0 +1,7 @@


class Parser:

    def parse_line(self, line):
        # To be overridden by concrete parsers (e.g. BedParser)
        pass

Empty file.
Empty file added gmql/operators/ProjectMD.py
Empty file.
Empty file added gmql/operators/__init__.py
Empty file.
@@ -0,0 +1 @@
2999cfa90579accd1ca723b562464c86913bf34a
12 changes: 12 additions & 0 deletions setup.py
@@ -0,0 +1,12 @@
from setuptools import setup

setup(name='gmql',
version='0.1',
description='Python library for GMQL computation',
url='https://github.com/lucananni93/GMQL-Python',
author='Luca Nanni',
author_email='luca.nanni@mail.polimi.it',
license='MIT',
packages=['gmql'],
      install_requires=['findspark'],
zip_safe=False)
47 changes: 47 additions & 0 deletions tests/callScalaGMQLFromPython.py
@@ -0,0 +1,47 @@
"""
Example that demonstrates:
1) the usage of the spylon package
2) how to call GMQL Scala functions from Python
"""

import spylon.spark as spylon_spark  # aliased so it is not shadowed by the SparkContext below

c = spylon_spark.SparkConfiguration()
c._spark_home = "/home/luca/spark-2.1.0-bin-hadoop2.7"

# Add the GMQL jar files so that they can be accessed from pyspark
c.jars = ["/home/luca/Scrivania/GMQL/GMQL-Core/target/GMQL-Core-2.0.jar",
"/home/luca/Scrivania/GMQL/GMQL-Server/target/GMQL-Server-2.0.jar",
"/home/luca/Scrivania/GMQL/GMQL-Spark/target/GMQL-Spark-4.0.jar"]


path_file = "/home/luca/Scrivania/GMQL-Python/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed"
path_output = "/home/luca/Scrivania/GMQL-Python/resources/"

# Get the pyspark context from Spylon
sc = c.spark_context("prova_spylon")

# Instantiation of a GMQLSparkExecutor
binsize = sc._jvm.it.polimi.genomics.core.BinSize(5000, 5000, 1000)
maxBinDistance = 100000
REF_PARALLELISM = 20
testingIOFormats = False
sparkContext = sc._jvm.org.apache.spark.SparkContext.getOrCreate()
outputFormat = sc._jvm.it.polimi.genomics.core.GMQLOutputFormat.TAB()

gmql_spark_executor = sc._jvm.it.polimi.genomics.\
    spark.implementation.GMQLSparkExecutor(binsize, maxBinDistance, REF_PARALLELISM,
testingIOFormats, sparkContext, outputFormat)

# Creation of a GMQLServer given the spark executor previously defined
server = sc._jvm.it.polimi.genomics.GMQLServer.GmqlServer(gmql_spark_executor, None)

# Getting a BedParser object from GMQL
bedParser = sc._jvm.it.polimi.genomics.spark.implementation.loaders.BedParser

# Executing the query
DS1 = server.READ(path_file).USING(bedParser)



print("DONE")
12 changes: 12 additions & 0 deletions tests/example_1.py
@@ -0,0 +1,12 @@
import gmql as gl
from gmql.dataset.GMQLDataset import GMQLDataset
from gmql.dataset.parsers.BedParser import BedParser

bed_path = "/home/luca/Scrivania/GMQL-Python/resources/ENCSR373BIX_rep2_1_pe_bwa_biorep_filtered_hotspots.bed"

dataset = GMQLDataset()
bed_parser = BedParser()

print('starting to read the bed file')
bed_rdd = dataset.load_from_path(path=bed_path, parser=bed_parser)
print(bed_rdd.take(10))  # take() already returns a list, so no collect() is needed
