diff --git a/README.md b/README.md index e57cf1827..ac795c9da 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,10 @@ change the following: happens inside the Multimer model. * The `preset` flag in `run_alphafold.py` and `run_docker.py` was split into `db_preset` and `model_preset`. +* The models to use are not specified using `model_names` but rather using the + `model_preset` flag. If you want to customize which models are used for each + preset, you will have to modify the `MODEL_PRESETS` dictionary in + `alphafold/model/config.py`. * Setting the `data_dir` flag is now needed when using `run_docker.py`. @@ -320,18 +324,124 @@ All steps are the same as when running the monomer system, but you will have to whether all input sequences in the given fasta file are prokaryotic. If that is not the case or the origin is unknown, set to `false` for that fasta. -An example that folds two protein complexes `multimer1` and `multimer2` where -the first is prokaryotic and the second isn't: +An example that folds a protein complex `multimer.fasta` that is prokaryotic: ```bash python3 docker/run_docker.py \ - --fasta_paths=multimer1.fasta,multimer2.fasta \ - --is_prokaryote_list=true,false \ + --fasta_paths=multimer.fasta \ + --is_prokaryote_list=true \ --max_template_date=2020-05-14 \ --model_preset=multimer \ --data_dir=$DOWNLOAD_DIR ``` +### Examples + +Below are examples on how to use AlphaFold in different scenarios. + +#### Folding a monomer + +Say we have a monomer with the sequence `<SEQUENCE>`. The input fasta should be: + +```fasta +>sequence_name +<SEQUENCE> +``` + +Then run the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=monomer.fasta \ + --max_template_date=2021-11-01 \ + --model_preset=monomer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding a homomer + +Say we have a homomer from a prokaryote with 3 copies of the same sequence +`<SEQUENCE>`. 
The input fasta should be: + +```fasta +>sequence_1 +<SEQUENCE> +>sequence_2 +<SEQUENCE> +>sequence_3 +<SEQUENCE> +``` + +Then run the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=homomer.fasta \ + --is_prokaryote_list=true \ + --max_template_date=2021-11-01 \ + --model_preset=multimer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding a heteromer + +Say we have a heteromer A2B3 of unknown origin, i.e. with 2 copies of +`<SEQUENCE A>` and 3 copies of `<SEQUENCE B>`. The input fasta should be: + +```fasta +>sequence_1 +<SEQUENCE A> +>sequence_2 +<SEQUENCE A> +>sequence_3 +<SEQUENCE B> +>sequence_4 +<SEQUENCE B> +>sequence_5 +<SEQUENCE B> +``` + +Then run the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=heteromer.fasta \ + --is_prokaryote_list=false \ + --max_template_date=2021-11-01 \ + --model_preset=multimer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding multiple monomers one after another + +Say we have two monomers, `monomer1.fasta` and `monomer2.fasta`. + +We can fold both sequentially by using the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=monomer1.fasta,monomer2.fasta \ + --max_template_date=2021-11-01 \ + --model_preset=monomer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding multiple multimers one after another + +Say we have two multimers, `multimer1.fasta` and `multimer2.fasta`. Both are +from a prokaryotic organism. 
+ +We can fold both sequentially by using the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=multimer1.fasta,multimer2.fasta \ + --is_prokaryote_list=true,true \ + --max_template_date=2021-11-01 \ + --model_preset=multimer \ + --data_dir=$DOWNLOAD_DIR +``` + ### AlphaFold output The outputs will be saved in a subdirectory of the directory provided via the diff --git a/alphafold/data/pipeline_multimer.py b/alphafold/data/pipeline_multimer.py index 75bc1a52a..46fa7a9ad 100644 --- a/alphafold/data/pipeline_multimer.py +++ b/alphafold/data/pipeline_multimer.py @@ -202,7 +202,7 @@ def _process_single_chain( msa_output_dir: str, is_homomer_or_monomer: bool) -> pipeline.FeatureDict: """Runs the monomer pipeline on a single chain.""" - chain_fasta_str = f'>{description}\n{sequence}\n' + chain_fasta_str = f'>chain_{chain_id}\n{sequence}\n' chain_msa_output_dir = os.path.join(msa_output_dir, chain_id) if not os.path.exists(chain_msa_output_dir): os.makedirs(chain_msa_output_dir) diff --git a/docker/run_docker.py b/docker/run_docker.py index 4eec39c9e..5d0f9beb0 100644 --- a/docker/run_docker.py +++ b/docker/run_docker.py @@ -32,17 +32,17 @@ 'gpu_devices', 'all', 'Comma separated list of devices to pass to NVIDIA_VISIBLE_DEVICES.') flags.DEFINE_list( - 'fasta_paths', None, - 'Paths to FASTA files, each containing one sequence. Paths should be ' + 'fasta_paths', None, 'Paths to FASTA files, each containing a prediction ' + 'target that will be folded one after another. If a FASTA file contains ' + 'multiple sequences, then it will be folded as a multimer. Paths should be ' 'separated by commas. All FASTA paths must have a unique basename as the ' 'basename is used to name the output directories for each prediction.') -flags.DEFINE_list('is_prokaryote_list', None, 'Optional for multimer system, ' - 'not used by the single chain system. 
' - 'This list should contain a boolean for each fasta ' - 'specifying true where the target complex is from a ' - 'prokaryote, and false where it is not, or where the ' - 'origin is unknown. These values determine the pairing ' - 'method for the MSA.') +flags.DEFINE_list( + 'is_prokaryote_list', None, 'Optional for multimer system, not used by the ' + 'single chain system. This list should contain a boolean for each fasta ' + 'specifying true where the target complex is from a prokaryote, and false ' + 'where it is not, or where the origin is unknown. These values determine ' + 'the pairing method for the MSA.') flags.DEFINE_string( 'output_dir', '/tmp/alphafold', 'Path to a directory that will store the results.') diff --git a/notebooks/AlphaFold.ipynb b/notebooks/AlphaFold.ipynb index 9e20dee00..d8c475d80 100644 --- a/notebooks/AlphaFold.ipynb +++ b/notebooks/AlphaFold.ipynb @@ -648,8 +648,9 @@ " total_num_res = best_unrelaxed_prot.residue_index.shape[-1]\n", " chain_ids = best_unrelaxed_prot.chain_index\n", " for chain_boundary in np.nonzero(chain_ids[:-1] - chain_ids[1:]):\n", - " plt.plot([0, total_num_res], [chain_boundary, chain_boundary], color='red')\n", - " plt.plot([chain_boundary, chain_boundary], [0, total_num_res], color='red')\n", + " if chain_boundary.size:\n", + " plt.plot([0, total_num_res], [chain_boundary, chain_boundary], color='red')\n", + " plt.plot([chain_boundary, chain_boundary], [0, total_num_res], color='red')\n", "\n", " plt.title('Predicted Aligned Error')\n", " plt.xlabel('Scored residue')\n", diff --git a/run_alphafold.py b/run_alphafold.py index 1d5403c1c..33fae99c8 100644 --- a/run_alphafold.py +++ b/run_alphafold.py @@ -43,18 +43,18 @@ logging.set_verbosity(logging.INFO) -flags.DEFINE_list('fasta_paths', None, 'Paths to FASTA files, each containing ' - 'a prediction target. Paths should be separated by commas. 
' - 'All FASTA paths must have a unique basename as the ' - 'basename is used to name the output directories for ' - 'each prediction.') -flags.DEFINE_list('is_prokaryote_list', None, 'Optional for multimer system, ' - 'not used by the single chain system. ' - 'This list should contain a boolean for each fasta ' - 'specifying true where the target complex is from a ' - 'prokaryote, and false where it is not, or where the ' - 'origin is unknown. These values determine the pairing ' - 'method for the MSA.') +flags.DEFINE_list( + 'fasta_paths', None, 'Paths to FASTA files, each containing a prediction ' + 'target that will be folded one after another. If a FASTA file contains ' + 'multiple sequences, then it will be folded as a multimer. Paths should be ' + 'separated by commas. All FASTA paths must have a unique basename as the ' + 'basename is used to name the output directories for each prediction.') +flags.DEFINE_list( + 'is_prokaryote_list', None, 'Optional for multimer system, not used by the ' + 'single chain system. This list should contain a boolean for each fasta ' + 'specifying true where the target complex is from a prokaryote, and false ' + 'where it is not, or where the origin is unknown. These values determine ' + 'the pairing method for the MSA.') flags.DEFINE_string('data_dir', None, 'Path to directory of supporting data.') flags.DEFINE_string('output_dir', None, 'Path to a directory that will '