Add missing packages for the analysis and submit services.

THU-ATOM · Mar 4, 2024 · 97ad1cb · 97ad1cb
1 parent d51033d
commit 97ad1cb
Show file tree

Hide file tree

Showing 8 changed files with 227 additions and 27 deletions.
diff --git a/gui/stats.html b/gui/stats.html
diff --git a/lib/tool/colabfold/alphafold/common/protein.py b/lib/tool/colabfold/alphafold/common/protein.py
@@ -16,7 +16,7 @@
 import dataclasses
 import io
 from typing import Any, Mapping, Optional
-from pipeline.tool.colabfold.alphafold.common import residue_constants
+from lib.tool.colabfold.alphafold.common import residue_constants
 from Bio.PDB import PDBParser
 import numpy as np
 from string import ascii_uppercase,ascii_lowercase

diff --git a/lib/tool/colabfold/alphafold_runner.py b/lib/tool/colabfold/alphafold_runner.py
@@ -10,9 +10,9 @@
 import tqdm
 import pickle
 
-from pipeline.tool.colabfold import colabfold as cf
-from pipeline.tool.colabfold import colabfold_alphafold as cf_af
-from pipeline.constant import HHFILTER_PATH
+from lib.tool.colabfold import colabfold as cf
+from lib.tool.colabfold import colabfold_alphafold as cf_af
+from lib.constant import HHFILTER_PATH
 
 
 tf.config.set_visible_devices([], "GPU")
@@ -435,8 +435,8 @@ def alphafold_predict_run(args):
             # source activate ~/localcolabfold/colabfold/colabfold-conda # absolute path of the enviroment
             # conda install -c conda-forge openmm-setup
             # vim ~/localcolabfold/colabfold/alphafold/relax/cleanup.py # change simtk.openmm to openmm, or copy revised relax directory
-            from pipeline.tool.colabfold.alphafold.relax import relax
-            from pipeline.tool.colabfold.alphafold.relax import utils
+            from lib.tool.colabfold.alphafold.relax import relax
+            # from pipeline.tool.colabfold.alphafold.relax import utils
 
         with tqdm.tqdm(total=num_relax, bar_format=TQDM_BAR_FORMAT) as pbar:
             pbar.set_description(f"AMBER relaxation")

diff --git a/lib/tool/colabfold/alphafold_runner_ray.py b/lib/tool/colabfold/alphafold_runner_ray.py
@@ -46,25 +46,24 @@
 from itertools import chain
 from pathlib import Path
 from typing import Union, Dict, Any
-from pipeline.utils.systool import get_available_gpus
+from lib.utils.systool import get_available_gpus
 
 import ray
 from ray.util.queue import Queue
-from pipeline.constant import COLABFOLD_PYTHON_PATH, AF_PARAMS_ROOT
-from pipeline.base import BasePathTree
+from lib.constant import COLABFOLD_PYTHON_PATH, AF_PARAMS_ROOT
+from lib.base import BasePathTree
 
 from loguru import logger
 
-import pipeline.utils.datatool as dtool
-from pipeline.tool import metrics
-from pipeline.utils.ray_tools import ProgressBar
-from pipeline.utils.execute import (
+import lib.utils.datatool as dtool
+from lib.tool import metrics
+from lib.utils.execute import (
     cuda_visible_devices_wrapper,
     execute,
     rlaunch_wrapper,
     rlaunch_exists,
 )
-from pipeline.utils.timetool import time2str, with_time
+from lib.utils.timetool import time2str, with_time
 
 
 RUNNER_SCRIPT_PATH = Path(__file__).resolve().parent / "alphafold_runner.py"

diff --git a/lib/tool/colabfold/colabfold_alphafold.py b/lib/tool/colabfold/colabfold_alphafold.py
@@ -7,22 +7,22 @@
 import pickle
 import tensorflow as tf
 
-from pipeline.tool.colabfold.alphafold.data.tools import jackhmmer
-from pipeline.tool.colabfold.alphafold.data import parsers
-from pipeline.tool.colabfold.alphafold.data import pipeline
-from pipeline.tool.colabfold.alphafold.common import protein
-from pipeline.tool.colabfold.alphafold.model import config
-from pipeline.tool.colabfold.alphafold.model import model
-from pipeline.tool.colabfold.alphafold.model import data
-from pipeline.tool.colabfold.alphafold.model.tf import shape_placeholders
+from lib.tool.colabfold.alphafold.data.tools import jackhmmer
+from lib.tool.colabfold.alphafold.data import parsers
+from lib.tool.colabfold.alphafold.data import pipeline
+from lib.tool.colabfold.alphafold.common import protein
+from lib.tool.colabfold.alphafold.model import config
+from lib.tool.colabfold.alphafold.model import model
+from lib.tool.colabfold.alphafold.model import data
+from lib.tool.colabfold.alphafold.model.tf import shape_placeholders
 
 from string import ascii_uppercase
 
 import numpy as np
 import matplotlib.pyplot as plt
 
-import pipeline.tool.colabfold.colabfold as cf
-from pipeline.tool.colabfold import pairmsa
+import lib.tool.colabfold.colabfold as cf
+from lib.tool.colabfold import pairmsa
 
 
 IN_COLAB = False

diff --git a/lib/tool/pdb_clustering.py b/lib/tool/pdb_clustering.py
@@ -0,0 +1,184 @@
+import os
+import glob
+import shutil
+from typing import Tuple
+from loguru import logger
+from pathlib import Path
+import numpy as np
+import argparse
+
+from sklearn.cluster import SpectralClustering
+from collections import defaultdict
+
+from lib.tool.align import align_pdbs
+from lib.tool.colabfold.alphafold.common import protein
+from typing import List, Tuple
+
+
+def copy_to_dir(src_pattern: str, target_dir: str) -> None:
+    """Copy every file matching a glob pattern into one flat directory.
+
+    Each copy is named after the last three "/"-separated components of
+    its source path joined with "_", so files that share a basename but
+    live in different parent directories do not overwrite each other.
+
+    Args:
+        src_pattern: Glob pattern selecting the files to copy.
+        target_dir: Directory receiving the copies; this function does
+            not create it.
+    """
+    destination_root = Path(target_dir)
+    for matched in glob.glob(src_pattern):
+        # Flatten ".../a/b/c.pdb" into "a_b_c.pdb".
+        flat_name = "_".join(matched.split("/")[-3:])
+        shutil.copy(src=matched, dst=destination_root / flat_name)
+
+
+def get_tm_score_matrix_plddt(
+    pdb_paths: List[str],
+    threshold: float = 0.1,
+    cut_head: int = 0,
+    cut_tail: int = 0,
+) -> Tuple[np.ndarray, List[float], List[str]]:
+    """Filter structures by mean pLDDT and align the survivors pairwise.
+
+    For each PDB file the mean of the first column of the parsed
+    structure's ``b_factors`` is used as its pLDDT (presumably the
+    per-residue confidence stored in the B-factor field -- confirm
+    against the producer of the files). Only files whose mean is
+    strictly greater than ``threshold`` are passed to ``align_pdbs``.
+
+    Args:
+        pdb_paths: Paths of the PDB files to evaluate.
+        threshold: Minimum mean pLDDT (exclusive) a file needs to be kept.
+        cut_head: Forwarded to ``align_pdbs`` as ``cut_head``.
+        cut_tail: Forwarded to ``align_pdbs`` as ``cut_tail``.
+
+    Returns:
+        A 3-tuple ``(tm_score, plddts, pdbfiles)``: the ``"tm_score"``
+        entry of the ``align_pdbs`` result, the mean pLDDT of every kept
+        file, and the kept paths, the two lists being index-aligned.
+    """
+
+    plddts = []
+    pdbfiles = []
+    for pdb in pdb_paths:
+        # Read the whole file, then release the handle before parsing.
+        with open(pdb) as fd:
+            pdb_string = fd.read()
+        prot = protein.from_pdb_string(pdb_string)
+        plddt = np.mean(prot.b_factors[:, 0])
+        logger.info(f"{plddt:.2f} {pdb}")
+        if plddt > threshold:
+            plddts.append(plddt)
+            pdbfiles.append(pdb)
+
+    results = align_pdbs(*pdbfiles, cut_head=cut_head, cut_tail=cut_tail)
+    logger.info(
+        f"tm_score matrix [shape: {results['tm_score'].shape}] compute complete"
+    )
+
+    return results["tm_score"], plddts, pdbfiles
+
+
+def model_selection(
+    tm_score_matrix: np.ndarray,
+    names: List[str],
+    plddts: List[float],
+    num_cluster: int = 5,
+) -> Tuple[list, np.ndarray]:
+    """Cluster structures and pick the highest-pLDDT member of each cluster.
+
+    ``tm_score_matrix`` is handed to ``SpectralClustering`` as a
+    precomputed affinity matrix. Every cluster is logged with its
+    members ordered by descending pLDDT.
+
+    Args:
+        tm_score_matrix: Pairwise affinity matrix used for clustering.
+        names: One identifier per matrix row, in row order.
+        plddts: One pLDDT per entry of ``names``, index-aligned.
+        num_cluster: Number of clusters requested from the clusterer.
+
+    Returns:
+        ``(rets, labels)`` where ``rets`` holds one ``(name, plddt)``
+        tuple per cluster (its best-scoring member) and ``labels`` is the
+        cluster label assigned to every input.
+    """
+    clusterer = SpectralClustering(
+        num_cluster,
+        affinity="precomputed",
+        n_init=1000,
+        assign_labels="discretize",
+    )
+    labels = clusterer.fit_predict(tm_score_matrix)
+
+    # Bucket (name, plddt) pairs by the cluster label they received.
+    groups = defaultdict(list)
+    for label, name, plddt in zip(labels, names, plddts):
+        groups[label].append((name, plddt))
+
+    group_info = "\n".join(str(members) for members in groups.values())
+    logger.info(f"cluster groups:\n {group_info}")
+
+    rets = []
+    for members in groups.values():
+        rets.append(max(members, key=lambda entry: entry[-1]))
+        ranked = sorted(members, key=lambda entry: entry[-1], reverse=True)
+        best_name, best_plddt = ranked[0]
+        logger.info(f"{best_plddt:.4f} {best_name}" )
+        for other_name, other_plddt in ranked[1:]:
+            logger.info(f"  - {other_plddt:.4f} {other_name}")
+    return rets, labels
+
+
+def gen_submission(
+    submit_dir: str,
+    target: str,
+    author_code: str = "1673-5955-6191",
+) -> str:
+    """Merge the structure files of a directory into one submission text.
+
+    The output starts with a ``PFRMAT TS`` / ``TARGET`` / ``AUTHOR`` /
+    ``METHOD`` header, followed by one ``MODEL`` section per file found
+    in ``submit_dir``. Only lines beginning with ``ATOM`` are taken from
+    each file; every section is closed with ``TER`` and ``END``.
+
+    Args:
+        submit_dir: Directory whose entries are merged; every entry is
+            opened as a text file.
+        target: Target identifier written on the ``TARGET`` line.
+        author_code: Code written on the ``AUTHOR`` line.
+
+    Returns:
+        The complete submission text. When ``submit_dir`` has no entries
+        only the four header lines are returned.
+    """
+    # Models are numbered in reverse lexicographic order of their paths.
+    sorted_paths = sorted(glob.glob(f"{submit_dir}/*"), reverse=True)
+    sections = [
+        f"PFRMAT TS\n"
+        f"TARGET {target}\n"
+        f"AUTHOR {author_code}\n"
+        f"METHOD Description of methods used\n"
+    ]
+
+    for model_index, res in enumerate(sorted_paths, start=1):
+        # The context manager closes each file promptly; the previous
+        # bare open(...).readlines() left one open handle per model.
+        with open(res, "r") as fd:
+            coordinates = "".join(
+                line for line in fd if line.startswith("ATOM")
+            )
+        sections.append(
+            f"MODEL  {model_index}\nPARENT N/A\n{coordinates}TER\nEND\n"
+        )
+    return "".join(sections)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: gather candidate PDB files, keep those
+    # above the pLDDT threshold, cluster them, copy the best model of
+    # each cluster into "<tgt_dir>_submit" and write the merged
+    # submission file next to that directory.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-s", "--src_pattern", type=str, required=True)
+    parser.add_argument("-t", "--tgt_dir", type=str, required=True)
+    parser.add_argument("-p", "--plddt_threshold", type=float, default=80.0)
+    parser.add_argument("-n", "--num_cluster", type=int, default=5)
+    for short_flag, long_flag, help_text in (
+        ("-ch", "--cut_head", "cut head residues"),
+        ("-ct", "--cut_tail", "cut tail residues"),
+    ):
+        parser.add_argument(
+            short_flag,
+            long_flag,
+            type=int,
+            default=0,
+            help=help_text,
+        )
+    parser.add_argument(
+        "-a", "--author", type=str, default="air", choices=["air", "helixon"]
+    )
+    args = parser.parse_args()
+
+    # Map the author alias given on the command line to its code.
+    known_author_codes = {
+        "air": "1673-5955-6191",
+        "helixon": "1684-3203-7374",
+    }
+    if args.author not in known_author_codes:
+        raise ValueError("no such author")
+    author_code = known_author_codes[args.author]
+
+    pdbs_dir = args.tgt_dir + "_pdbs"
+    tgt_dir = args.tgt_dir + "_submit"
+    # Start from empty working directories: anything left over from a
+    # previous run is deleted.
+    for stale_dir in (pdbs_dir, tgt_dir):
+        if Path(stale_dir).exists():
+            logger.warning(f"{stale_dir} already exists, removing..")
+            shutil.rmtree(stale_dir)
+    for fresh_dir in (pdbs_dir, tgt_dir):
+        os.makedirs(fresh_dir)
+
+    copy_to_dir(args.src_pattern, pdbs_dir)
+    candidate_pdbs = glob.glob(str(Path(pdbs_dir) / "*.pdb"))
+    score_matrix, kept_plddts, kept_pdbs = get_tm_score_matrix_plddt(
+        pdb_paths=candidate_pdbs,
+        threshold=args.plddt_threshold,
+        cut_head=args.cut_head,
+        cut_tail=args.cut_tail,
+    )
+    rets, _ = model_selection(
+        score_matrix, kept_pdbs, kept_plddts, num_cluster=args.num_cluster
+    )
+
+    name = os.path.basename(args.tgt_dir)
+    # Copy each selected model under a name that embeds its pLDDT.
+    table = []
+    for pdb_path, plddt in rets:
+        file_name = f"{name}_{plddt:.2f}_{os.path.basename(pdb_path)}"
+        shutil.copy(pdb_path, os.path.join(tgt_dir, file_name))
+        table.append((plddt, file_name))
+
+    ranked = sorted(table, reverse=True)
+    for plddt, file_name in ranked:
+        logger.info(f"{plddt:.2f} {file_name}")
+    # "H" marks files whose name contains "ruihan"; all others get "A".
+    sources = ["H" if "ruihan" in file_name else "A" for _, file_name in ranked]
+    plddt_labels = [f"{plddt:.2f}" for plddt, _ in ranked]
+    logger.info("\t".join(sources))
+    logger.info("\t".join(plddt_labels))
+
+    submit_results = gen_submission(
+        submit_dir=tgt_dir,
+        target=name,
+        author_code=author_code,
+    )
+
+    merged_file_path = Path(tgt_dir).parent / f"{name}_submit.pdb"
+    merged_file_path.write_text(submit_results)
diff --git a/services/analysis/requirements.txt b/services/analysis/requirements.txt
@@ -8,4 +8,8 @@ biopython
 py3Dmol
 psutil
 gpustat
-absl-py
+absl-py
+scipy
+tensorflow
+ray
+dm-tree
diff --git a/services/submit/requirements.txt b/services/submit/requirements.txt
@@ -1,3 +1,16 @@
 celery[redis]
 PyEmail==0.0.1
-emails==0.6
+emails==0.6
+loguru
+matplotlib
+requests
+jsonlines
+rich
+biopython
+py3Dmol
+psutil
+gpustat
+absl-py
+scikit-learn
+ray
+dm-tree
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,4 +8,8 @@ biopython @@
     py3Dmol
     psutil
     gpustat
-    absl-py
+    absl-py
+    scipy
+    tensorflow
+    ray
+    dm-tree