revise (#2)

* update input for program: now only requires receptor and poses file * add test cases * clean up README / installation notes Co-authored-by: karoka <xjr,karo@gmail.com>
atfrank · Apr 27, 2020 · 866fdd6 · 866fdd6
1 parent ee45bcc
commit 866fdd6
Show file tree

Hide file tree

Showing 1,367 changed files with 111,244 additions and 2,032,225 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # RNAPosers
 RNAPosers: Machine-Learning Pose Classifiers for RNA Containing Complexes.
 
-This repo contains source code for RNAPosers' pose fingerprint and prediction (classification) modules. Pose fingerprint module is written in C++, and will generate executable `bin/featurize` once compiled. Prediction (classification) is done using Python script `src/rna_poser.py` and pre-trained classifiers with various parameter setting in `classifier/` . All the classifiers were trained with `Python 3.5.3` and `sklearn v0.19.2`, but should be compatible with both Python2 and Python 3 and any later version sklearn. The combined fingerprinting and classification process can be done by running `rna_posers.sh`.
+This repo contains source code for RNAPosers' pose fingerprint and prediction (classification) modules. The pose fingerprint module is written in C++ and produces the executable `bin/featurize` once compiled. Prediction (classification) is done using the Python script `py/inference.py` and pre-trained classifiers with various parameter settings in `classifier/`. All the classifiers were trained with `Python 3.5.3` and `sklearn v0.19.2`, but should be compatible with both Python 2 and Python 3 and any later version of sklearn. The combined fingerprinting and classification process can be done by running `src/run.sh`.
 
 **RNAPosers-plugin**: We also offer a PyMOL plugin version of RNAPosers (`rnaposerplugin.zip`) with graphical user interface as a supplement to the source code version. See [Using RNAPosers PyMOL plugin](#Using-RNAPosers-PyMOL-plugin) for installation instructions.
 
@@ -23,73 +23,76 @@ source ~/.bashrc
 
 ### Setup environment
 ```
-conda create --name rnaposers
+conda env create -f env/rnaposers.yml
 conda activate rnaposers
-conda install -c anaconda scikit-learn
-conda install -c conda-forge openbabel
 ```
+<!---
+# conda create --name rnaposers
+# conda activate rnaposers
+# conda install -c schrodinger pymol
+# conda install -c schrodinger pymol-psico
+# conda install -c openbabel openbabel
+# conda install pandas
+# conda install -c anaconda scikit-learn
+-->
 
 ### Using RNAPosers
 
 ```
-./src/rna_poser.sh -h
-usage: ./rna_poser.sh [base-directory]
-                      [pdbid]
-                      [ref-pdb-name]
-                      [ref-mol2-name]
-                      [ref-dcd-name]
-                      [rmsd-threshold: 1., 1.5, 2., or 2.5]
-                      [eta: 2, 24, or 248]
-                      [feature-file-prefix]
-                      [class-scores-prefix]
-example: ./rna_poser.sh tests/input/2b57/ 2b57 complex.pdb lig_2b57.mol2 complexes.dcd 2.5 248 output/2b57_features output/2b57_class_scores
+./src/run.sh -h
 ```
 
 #### Arguments
-- **base-directory**: the path to folder that contain the receptor-ligand pdb file, ligand mol2 file, receptor-ligand poses dcd file
-- **pdbid**: identifier of current structure
-- **ref-pdb-name**: name of reference RNA-ligand pdb file]
-- **ref-mol2-name**: name of reference ligand mol2 file]
-- **ref-dcd-name**: name of receptor-ligand pose dcd file]
-- **rmsd-threshold**: definition of nativeness. Either: `1`, `1.5`, `2`, or `2.5`
-- **eta**: a Guassian width parameter for pose fingerprint. `eta=2` means {2 Å} (used in the manuscript), `eta=24` means {2Å and 4 Å}, and `eta=248` means {2, 4, and 8 Å}. The higher the eta values, the more complex the fingerprint and longer the time will take for the computation of pose fingerprint.
-- **feature-file prefix**: prefix to pose fingerprint file
-- **class-score file prefix**: prefix to which classification scores output file
+- **receptor mol2**: .mol2 file of receptor structure. Default: tests/input/1AM0/receptor.mol2
+- **ligand poses sd**: .sd file containing all ligand poses. Default: tests/input/1AM0/poses.sd
+- **output file**: where to save scores. Default: tests/output/1AM0.txt
+- **rmsd**: 1, 1.5, 2, 2.5. Default: 2
+- **eta**: 2, 24, or 248 (2A, 2A and 4A, 2A 4A and 8A). Default: 248
+- **stop frame**: only score the first several poses. Default: -1 (using all frames)
 
 ### Example
 ```
 cd $RNAPOSERS_PATH
-pdb=2b57
-mkdir tests/output/
-./src/rna_poser.sh tests/input/${pdb}/ ${pdb} complex.pdb lig_${pdb}.mol2 complexes.dcd 2.5 2 tests/output/${pdb}_features tests/output/${pdb}_class_scores
+# Test 1: score the first 10 poses of 1AM0
+./src/run.sh tests/input/1AM0/receptor.mol2 tests/input/1AM0/poses.sd tests/output/1AM0.txt 10 2.5 248
+# Test 2: score the full set of poses of 2B57
+./src/run.sh tests/input/2B57/receptor.mol2 tests/input/2B57/poses.sd tests/output/2B57.txt
 ```
 #### Output
 ```
-cat tests/output/class_scores_${pdb}.txt
+cat tests/output/1AM0.txt
 # columns: prediction probability(0) probability(1)
+1.000000 0.027000 0.973000
+1.000000 0.023000 0.977000
+1.000000 0.009000 0.991000
+1.000000 0.062000 0.938000
+1.000000 0.008000 0.992000
+1.000000 0.050000 0.950000
+1.000000 0.031000 0.969000
+1.000000 0.010000 0.990000
+1.000000 0.284000 0.716000
+0.000000 0.687000 0.313000
+
+cat tests/output/2B57.txt
+1.000000 0.025000 0.975000
+1.000000 0.015000 0.985000
+1.000000 0.025000 0.975000
+1.000000 0.005000 0.995000
 1.000000 0.004000 0.996000
-1.000000 0.001000 0.999000
 1.000000 0.004000 0.996000
-1.000000 0.002000 0.998000
-1.000000 0.001000 0.999000
-...      ...      ...     
-...      ...      ...     
-...      ...      ...     
-0.000000 0.963000 0.037000
-0.000000 0.967000 0.033000
-0.000000 0.975000 0.025000
-0.000000 0.983000 0.017000
-0.000000 0.984000 0.016000
+1.000000 0.004000 0.996000
+1.000000 0.005000 0.995000
+1.000000 0.005000 0.995000
+1.000000 0.004000 0.996000
+...
 ```
 
 ### PyMOL plugin
 The RNAPosers PyMOL plugin is compressed as `rnaposerplugin.zip`. See the PyMOL wiki (https://pymolwiki.org/index.php/Plugin_Manager) for instructions on installing a PyMOL plugin from a local file. Note that you still have to install this repo and set up the environment to use the plugin.
 
 
 ### Additional Notes
-- RNAPosers expect that the ordering of ligand atoms in the reference receptor-ligand pdb matches the order in the reference ligand mol2 file
-- RNAPosers reads atom types from the reference ligand mol2 file
-- Atom types should to be one of the following SYBYL atom types:
+- Ligand atom types should be one of the following SYBYL atom types:
 
 Description | Type
 --- | ---
@@ -115,10 +118,6 @@ Sulphone sulphur | S.o2
 Phosphorus sp3 | P.3
 
 
-
-
-
-
 ## License
 ```
   Copyright University of Michigan.

diff --git a/env/rnaposers.yml b/env/rnaposers.yml
@@ -0,0 +1,90 @@
+name: rnaposers
+channels:
+  - openbabel
+  - anaconda
+  - schrodinger
+  - defaults
+dependencies:
+  - apbs=1.5=h1de35cc_3
+  - asn1crypto=1.3.0=py37_0
+  - blas=1.0=mkl
+  - bzip2=1.0.8=h1de35cc_0
+  - ca-certificates=2020.1.1=0
+  - cairo=1.14.12=hc4e6be7_4
+  - certifi=2020.4.5.1=py37_0
+  - cffi=1.14.0=py37hb5b8e2f_0
+  - chardet=3.0.4=py37_1003
+  - cryptography=2.8=py37ha12b0ac_0
+  - dbus=1.13.12=h90a0687_0
+  - expat=2.2.6=h0a44026_0
+  - fontconfig=2.13.0=h5d5b041_1
+  - freemol=1.158=py37_1
+  - freetype=2.9.1=hb4e5f40_0
+  - gettext=0.19.8.1=h15daf44_3
+  - glew=2.0.0=0
+  - glib=2.63.1=hd977a24_0
+  - h5py=2.10.0=py37h3134771_0
+  - hdf5=1.10.4=hfa1e0ec_0
+  - icu=58.2=h4b95b61_1
+  - idna=2.9=py_1
+  - intel-openmp=2019.4=233
+  - joblib=0.14.1=py_0
+  - jpeg=9b=he5867d9_2
+  - libcxx=4.0.1=hcfea43d_1
+  - libcxxabi=4.0.1=hcfea43d_1
+  - libedit=3.1.20181209=hb402a30_0
+  - libffi=3.2.1=h0a44026_6
+  - libgfortran=3.0.1=2000
+  - libiconv=1.15=hdd342a3_7
+  - libopenblas=0.3.6=hdc02c5d_2
+  - libpng=1.6.37=ha441bb4_0
+  - libtiff=4.1.0=hcb84e12_0
+  - libxml2=2.9.9=hf6e021a_1
+  - llvm-openmp=4.0.1=hcfea43d_1
+  - mengine=1=h1de35cc_1
+  - mkl=2019.4=233
+  - mkl-service=2.3.0=py37hfbe908c_0
+  - mkl_fft=1.0.15=py37h5e564d8_0
+  - mkl_random=1.1.0=py37ha771720_0
+  - mpeg_encode=1=h1de35cc_1
+  - mtz2ccp4_px=1.0=hdc02c5d_3
+  - ncurses=6.2=h0a44026_0
+  - numpy=1.18.1=py37h7241aed_0
+  - numpy-base=1.18.1=py37h6575580_1
+  - olefile=0.46=py37_0
+  - openbabel=2.4.1=py37_6
+  - openssl=1.1.1g=h1de35cc_0
+  - pandas=1.0.3=py37h6c726b0_0
+  - pcre=8.43=h0a44026_0
+  - pdb2pqr=2.1.1=py37_1
+  - pillow=7.0.0=py37h4655f20_0
+  - pip=20.0.2=py37_1
+  - pixman=0.38.0=h1de35cc_0
+  - pmw=2.0.1=py37_2
+  - pycparser=2.20=py_0
+  - pymol=2.3.5=py37h95b93ae_0
+  - pymol-psico=3.4.1=py_0
+  - pyopenssl=19.1.0=py37_0
+  - pyqt=5.9.2=py37h655552a_2
+  - pysocks=1.7.1=py37_0
+  - python=3.7.7=hc70fcce_0_cpython
+  - python-dateutil=2.8.1=py_0
+  - pytz=2019.3=py_0
+  - qt=5.9.7=h468cd18_1
+  - readline=8.0=h1de35cc_0
+  - requests=2.23.0=py37_0
+  - rigimol=1.3=2
+  - scikit-learn=0.22.1=py37h27c97d8_0
+  - scipy=1.4.1=py37h9fa6033_0
+  - setuptools=46.1.3=py37_0
+  - sip=4.19.8=py37h0a44026_0
+  - six=1.14.0=py37_0
+  - sqlite=3.31.1=h5c1f38d_1
+  - tk=8.6.9=x11tk0_2000
+  - urllib3=1.25.8=py37_0
+  - wheel=0.34.2=py37_0
+  - xz=5.2.5=h1de35cc_0
+  - zlib=1.2.11=h1de35cc_3
+  - zstd=1.3.7=h5bba6e5_0
+prefix: /Users/jr/anaconda3/envs/rnaposers
+
diff --git a/py/__init__.py b/py/__init__.py
diff --git a/py/generate_complex_files.py b/py/generate_complex_files.py
@@ -0,0 +1,88 @@
+import sys
+import inspect
+from glob import glob
+from pymol import cmd
+import psico.fullinit
+import tempfile
+import os
+
+def generate_complexes(receptor, poses, complexes, ref_complex, ref_ligand_mol2):
+    def fix_names(obj, is_ligand = False):
+        # utility functions to fix names
+        if is_ligand:
+            cmd.alter("%s"%(obj), "resn = 'UNK'")
+            cmd.alter("%s"%(obj), "chain = 'Z'")
+            cmd.alter("%s"%(obj), "resi = '1'")
+        else:
+            cmd.alter("%s"%(obj), "type = 'ATOM'")
+            cmd.alter("resn rC+C+RC and %s"%(obj), "resn = 'CYT'")
+            cmd.alter("resn rA+A+RA and %s"%(obj), "resn = 'ADE'")
+            cmd.alter("resn rU+U+RU and %s"%(obj), "resn = 'URA'")
+            cmd.alter("resn rG+G+RG and %s"%(obj), "resn = 'GUA'")
+
+    def split_decoys(poses, dir):
+        # utility function to split poses
+        for a in range(1, 1 + cmd.count_states("(%s)"%poses)):
+            cmd.frame(a)
+            cmd.save("%s/poses/poses_%i.pdb"%(dir, a))
+
+    # utility function to generate complexes
+    print("[RNAPosers Debugging] Generating complexes...")
+
+    # Create TemporaryDirectory to store intermediat files
+    with tempfile.TemporaryDirectory() as tmpDir:
+        os.system("mkdir -p %s/complex"%(tmpDir))
+        os.system("mkdir -p %s/poses"%(tmpDir))
+
+        # split posese
+        cmd.delete("*")
+        cmd.load(poses, "poses")
+        fix_names("poses", is_ligand = True)
+        split_decoys("poses", tmpDir)
+        poses=range(1, 1 + cmd.count_states("(poses)"))
+        cmd.delete("poses")
+
+        # combine poses + receptor
+        for a in poses:
+            cmd.delete("*")
+            cmd.load(receptor)
+            cmd.load("%s/poses/poses_%i.pdb"%(tmpDir, a))
+            cmd.create("test", "all")
+            fix_names("test", is_ligand = False)
+            cmd.save("%s/complex/complex_%i.pdb"%(tmpDir, a),"test")
+
+        # save dcd
+        cmd.delete("*")
+        for a in poses:
+            cmd.load("%s/complex/complex_%i.pdb" %(tmpDir, a), "complex")
+        psico.exporting.save_traj(filename = "%s" %complexes, selection = "complex")
+
+        # generate reference files
+        os.system("cp %s/complex/complex_1.pdb %s"%(tmpDir, ref_complex))
+
+        # XX
+        cmd.delete("*")
+        cmd.load("%s/complex/complex_1.pdb"%(tmpDir), "complex")
+        cmd.extract("lig", "resn UNK")
+        cmd.save("%s/complex/lig.pdb"%(tmpDir), "lig")
+
+        os.system("obabel -ipdb %s/complex/lig.pdb  -omol2 -O %s"%(tmpDir, ref_ligand_mol2))
+
+
+def main():
+    # Input: receptor, poses
+    # Output: complexes, ref_complex, ref_ligand_mol2
+    # Input
+    inDir = "test/receptor_and_poses/input/"
+    receptor = inDir + "receptor.mol2"
+    poses = inDir + "poses.sd"
+    # Output
+    outDir = "test/receptor_and_poses/output/" # where to save your pdb, dcd and mol2 files
+    os.system("mkdir -p %s" %outDir)
+    complexes = outDir + "complexes.dcd"
+    ref_complex = outDir + "complex.pdb"
+    ref_ligand_mol2 = outDir + "lig.mol2"
+    generate_complexes(receptor, poses, complexes, ref_complex, ref_ligand_mol2)
+
+if __name__ == '__main__':
+    main()
diff --git a/src/rna_poser.py → py/inference.py b/src/rna_poser.py → py/inference.py
@@ -8,27 +8,23 @@
 import os
 from sklearn.externals import joblib
 from sklearn.ensemble import RandomForestClassifier
-import argparse
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--classifier", help="classifier")
 parser.add_argument("--features", help="molecularize features: e.g: /home/itssahil/PROJECTS/Mol_Feturizer/B_LATEST_results/PREDICTORS_R/RF_6bfb.predictor.pkl")
 parser.add_argument("--output", help="output scores")
 
 a = parser.parse_args()
-
-def main():
-    MAXF, MINF = 3.4e38, 1.18e-38
-    Xtest = numpy.loadtxt(a.features, dtype='float64')
-    Xtest = Xtest[:, 1:]
-    # workaround overflow
-    Xtest = numpy.nan_to_num(Xtest)
-    Xtest[Xtest < MINF] = MINF
-    Xtest[Xtest > MAXF] = MAXF
-    # predict
-    pred_rf = joblib.load(a.classifier)
-    rf = pred_rf.predict(Xtest)
-    rf_p = pred_rf.predict_proba(Xtest)
-    merge_rf = numpy.column_stack((rf, rf_p))
-    numpy.savetxt(a.output, merge_rf, fmt='%f')
-main()
+MAXF, MINF = 3.4e38, 1.18e-38
+Xtest = numpy.loadtxt(a.features, dtype='float64')
+Xtest = Xtest[:, 1:]
+# workaround overflow
+Xtest = numpy.nan_to_num(Xtest)
+Xtest[Xtest < MINF] = MINF
+Xtest[Xtest > MAXF] = MAXF
+# predict
+pred_rf = joblib.load(a.classifier)
+rf = pred_rf.predict(Xtest)
+rf_p = pred_rf.predict_proba(Xtest)
+merge_rf = numpy.column_stack((rf, rf_p))
+numpy.savetxt(a.output, merge_rf, fmt='%f')