Supplementary 2a: Preprocess PDBs#

Set up the path#

# only run for the first time
import os
import sys
import warnings

warnings.filterwarnings("ignore")


def find_project_root(marker=".git"):
    current_path = os.getcwd()
    while current_path != os.path.dirname(current_path):  # Stop at the filesystem root
        if marker in os.listdir(current_path):
            return current_path
        current_path = os.path.dirname(current_path)
    return None  # Return None if the marker is not found


project_root = find_project_root()
print(project_root)

# Add project_root to the Python path
sys.path.append(project_root)
/home/yuyang/Project_local/FragBEST-Myo

Import requried pacakges#

import MDAnalysis as mda
import numpy as np
from biopandas.pdb import PandasPdb

from utils.thirdparty.masif.source.input_output.protonate import protonate

Download and preprocess the PDBs from RCSB#

  1. Download the PDBs from RCSB.

  2. Extract the biological unit.

  3. Add hydrogens to the structures.

1. Download the PDBs from RCSB#

Here, we use the biopandas package to download the PDB files.

# ------------- start of user-defined parameters -------------
# set the path to download the examples
examples_dir = f"{project_root}/dataset/examples"
investigation_list = ["8QYP", "8QYQ", "8QYR", "8QYU", "6YSY", "5N69"]
# ------------- end of user-defined parameters ---------------

# fetch the PDB files
for each in investigation_list:
    ppdb = PandasPdb().fetch_pdb(each)
    ppdb.to_pdb(path=f"{examples_dir}/{each}.pdb", records=["ATOM", "HETATM"])

We then load the raw PDBs using MDAnalysis. We can also visualize and check the PDBs with nglview.

# read pdb files
structures = {}
for each in investigation_list:
    structures[each] = mda.Universe(f"{examples_dir}/{each}.pdb")
# --------- start of user-defined parameters ---------
query = "8QYP"  # change here to visualize different structure
# (e.g. 8QYP, 8QYQ, 8QYR, 8QYU, 6YSY, 5N69)
# --------- end of user-defined parameters ---------

# the unique segids and residue names in the structure
print(np.unique(structures[query].residues.segids))
print(np.unique(structures[query].residues.resnames))
['A']
['ADP' 'ALA' 'ARG' 'ASN' 'ASP' 'CYS' 'GLN' 'GLU' 'GLY' 'HIS' 'HOH' 'ILE'
 'LEU' 'LYS' 'M3L' 'MET' 'MG' 'PHE' 'PRO' 'SER' 'THR' 'TRP' 'TYR' 'VAL'
 'VO4']
# visualize the structure (uncomment the next few lines)
# import nglview as nv
# view = nv.show_mdanalysis(structures[query])
# view

2. Extract the biological unit#

Next, we need to select the biological unit using MDAnalysis. We rename the new files with extension of .biounit.pdb.

# select the biological unit
biological_units = {
    "5N69": ["B", "G"],
    "8QYP": ["A"],
    "8QYQ": ["B", "D"],
    "8QYR": ["B"],
    "8QYU": ["B", "G"],
    "6YSY": ["A", "B"],
}

# read and write the biological unit
structures = {}
for each in investigation_list:
    # load the structure
    structures[each] = mda.Universe(f"{examples_dir}/{each}.pdb", topology_format="pdb")

    # select the biological unit
    query_str = " or ".join([f"chainID {i}" for i in biological_units[each]])

    # write the biological unit
    structures[each].select_atoms(query_str).write(f"{examples_dir}/{each}.biounit.pdb")

3. Add hydrogens to the protein#

Due to merely existing heavy atoms in RCSB PDBs, we need to add the hydrogens before generate the protein surface and features. We use the workflow from MaSIF with minor modification (github). The hydrogen-adding step is based on reduce. The output PDB files are with extension of .biounit_addHs.pdb.

# protonate the biological unit
for each in investigation_list:
    protonate(
        in_pdb_file=f"{examples_dir}/{each}.biounit.pdb",
        out_pdb_file=f"{examples_dir}/{each}.biounit_addHs.pdb",
    )