Update system preparation and generalize custom-residue code for other uses

stefdoerr · stefdoerr · commit ce61a7797167 · 2025-05-09T11:00:16.000+03:00
diff --git a/moleculekit/tools/preparation.py b/moleculekit/tools/preparation.py
@@ -147,8 +147,13 @@ def _generate_nonstandard_residues_ff(
                 if residue_smiles is not None and res in residue_smiles:
                     smiles = residue_smiles[res]
 
-                tmol = _template_residue_from_smiles(molc, res, smiles=smiles)
-                cres = _process_custom_residue(tmol, res)
+                if smiles is not None and os.path.isfile(smiles):
+                    tmol = Molecule(smiles)
+                else:
+                    tmol = _template_residue_from_smiles(molc, res, smiles=smiles)
+                cres = _process_custom_residue(tmol)
+                # Rename to correct resname
+                cres.resname[:] = res
 
                 _mol_to_xml_def(cres, os.path.join(tmpdir, f"{res}.xml"))
                 _mol_to_dat_def(cres, os.path.join(tmpdir, f"{res}.dat"))
@@ -542,6 +547,7 @@ def systemPrepare(
     _molkit_ff=True,
     outdir=None,
     residue_smiles=None,
+    ignore_ns=False,
 ):
     """Prepare molecular systems through protonation and h-bond optimization.
 
@@ -732,15 +738,16 @@ def systemPrepare(
     _fix_protonation_resnames(mol_in)
 
     definition, forcefield = _get_custom_ff(molkit_ff=_molkit_ff)
-    definition, forcefield = _generate_nonstandard_residues_ff(
-        mol_in,
-        definition,
-        forcefield,
-        _molkit_ff,
-        outdir,
-        ignore_ns_errors=ignore_ns_errors,
-        residue_smiles=residue_smiles,
-    )
+    if not ignore_ns:
+        definition, forcefield = _generate_nonstandard_residues_ff(
+            mol_in,
+            definition,
+            forcefield,
+            _molkit_ff,
+            outdir,
+            ignore_ns_errors=ignore_ns_errors,
+            residue_smiles=residue_smiles,
+        )
 
     nonpept = []
     if hold_nonpeptidic_bonds:
diff --git a/moleculekit/tools/preparation_customres.py b/moleculekit/tools/preparation_customres.py
@@ -105,19 +105,50 @@ def _template_residue_from_smiles(inmol: Molecule, nsres: str, smiles=None):
     return mol
 
 
-def _get_idx(mol, name):
-    res = np.where(mol.name == name)
+def _reorder_residue_atoms(mol, resid):
+    # Reorder atoms. AMBER order is: N H CA HA [sidechain] C O
+    # the H atom will get added later
+    first_bbatoms = [_get_idx(mol, x, resid) for x in ["N", "CA", "HA"]]
+    first_bbatoms = [x for x in first_bbatoms if x is not None]
+    last_bbatoms = [_get_idx(mol, x, resid) for x in ["C", "O"]]
+    last_bbatoms = [x for x in last_bbatoms if x is not None]
+    other_idx = np.setdiff1d(
+        np.where(mol.resid == resid)[0], first_bbatoms + last_bbatoms
+    ).tolist()
+    prev_res = np.where(mol.resid == resid)[0][0]
+    if prev_res > 0:
+        prev_res = list(range(prev_res))
+    else:
+        prev_res = []
+    next_res = np.where(mol.resid == resid)[0][-1] + 1
+    if next_res < mol.numAtoms:
+        next_res = list(range(next_res, mol.numAtoms))
+    else:
+        next_res = []
+    mol.reorderAtoms(prev_res + first_bbatoms + other_idx + last_bbatoms + next_res)
+
+
+def _get_idx(mol, name, resid=None):
+    sel = mol.name == name
+    if resid is not None:
+        sel &= mol.resid == resid
+    res = np.where(sel)
     if len(res) == 0 or len(res[0]) == 0:
         return None
+    assert len(res[0]) == 1
     return res[0][0]
 
 
-def _process_custom_residue(mol: Molecule, resname: str):
+def _process_custom_residue(mol: Molecule, resid: int = None, align: bool = True):
     import networkx as nx
 
+    if resid is None:
+        resid = mol.resid[0]
+    resname = mol.resname[mol.resid == resid][0]
+
     gg = mol.toGraph()
-    n_idx = _get_idx(mol, "N")
-    c_idx = _get_idx(mol, "C")
+    n_idx = _get_idx(mol, "N", resid)
+    c_idx = _get_idx(mol, "C", resid)
     if n_idx is None or c_idx is None:
         raise RuntimeError(
             f"Residue {resname} does not contain N or C atoms. List of atoms: {mol.name}"
@@ -130,7 +161,7 @@ def _process_custom_residue(mol: Molecule, resname: str):
         )
 
     # Fix hydrogen names for CA / N
-    ca_idx = _get_idx(mol, "CA")
+    ca_idx = _get_idx(mol, "CA", resid)
     ca_hs = [nn for nn in gg.neighbors(ca_idx) if gg.nodes[nn]["element"] == "H"]
     if len(ca_hs) > 1:
         raise RuntimeError("Found more than 1 hydrogen on CA atom!")
@@ -139,7 +170,7 @@ def _process_custom_residue(mol: Molecule, resname: str):
 
     # Remove all N terminal hydrogens
     gg = mol.toGraph()
-    n_idx = _get_idx(mol, "N")
+    n_idx = _get_idx(mol, "N", resid)
     n_neighbours = list(gg.neighbors(n_idx))
     n_hs = [nn for nn in n_neighbours if gg.nodes[nn]["element"] == "H"]
     n_heavy = len(n_neighbours) - len(n_hs)
@@ -148,15 +179,15 @@ def _process_custom_residue(mol: Molecule, resname: str):
 
     # Remove all hydrogens attached to terminal C
     gg = mol.toGraph()
-    idx = _get_idx(mol, "C")
+    idx = _get_idx(mol, "C", resid)
     neighbours = list(gg.neighbors(idx))
     hs = [nn for nn in neighbours if gg.nodes[nn]["element"] == "H"]
     if len(hs):
         mol.remove(f"index {' '.join(map(str, hs))}", _logger=False)
 
     # Remove all hydrogens attached to C-terminal O
     gg = mol.toGraph()
-    idx = _get_idx(mol, "O")
+    idx = _get_idx(mol, "O", resid)
     neighbours = list(gg.neighbors(idx))
     hs = [nn for nn in neighbours if gg.nodes[nn]["element"] == "H"]
     if len(hs):
@@ -166,40 +197,43 @@ def _process_custom_residue(mol: Molecule, resname: str):
     hydr = mol.name == "X_H"
     mol.name[hydr] = [f"H{i}" for i in range(10, sum(hydr) + 10)]
 
-    # Reorder atoms. AMBER order is: N H CA HA [sidechain] C O
-    bbatoms = [x for x in ["N", "H", "CA", "HA", "C", "O"] if x in mol.name]
-    ordered_idx = [_get_idx(mol, nn) for nn in bbatoms]
-    other_idx = np.setdiff1d(range(mol.numAtoms), ordered_idx)
-    mol.reorderAtoms(ordered_idx[:4] + other_idx.tolist() + ordered_idx[4:])
+    _reorder_residue_atoms(mol, resid)
 
-    # Align to reference BB for pdb2pqr
-    mol.align("name N CA C", refmol=backbone)
+    if align:
+        # Align to reference BB for pdb2pqr
+        mol.align("name N CA C", refmol=backbone)
 
     if n_heavy == 1 and "N" in mol.name:
         # Add the H atom if N is only bonded to CA.
         # This is necessary to add it in the right position for pdb2pqr
         nmol = backbone.copy()
+        if not align and resid is not None:
+            nmol.align(
+                "name N CA C", refsel=f"name N CA C and resid {resid}", refmol=mol
+            )
         nmol.filter("name H", _logger=False)
-        mol.insert(nmol, 1)
-        mol.bonds = np.vstack((mol.bonds, [0, 1]))
-        mol.bondtype = np.hstack((mol.bondtype, "1"))
+        nmol.resname[:] = resname
+        nmol.resid[:] = resid
+        insert_idx = np.where(mol.resid == resid)[0][0] + 1
+        mol.insert(nmol, insert_idx)
+        mol.addBond(insert_idx - 1, insert_idx, "1")
 
-    # Rename to correct resname
-    mol.resname[:] = resname
     return mol
 
 
-def _prepare_for_parameterize(mol):
+def _prepare_for_parameterize(mol, resid=None):
     # Add OXT HXT HN2 atoms to convert it to RCSB-like structures and pass it to parameterize
     import networkx as nx
 
     mol = mol.copy()
-    resname = mol.resname[0]
+    if resid is None:
+        resid = mol.resid[0]
+    resname = mol.resname[mol.resid == resid][0]
 
     gg = mol.toGraph()
-    bb = nx.shortest_path(gg, _get_idx(mol, "N"), _get_idx(mol, "C"))
+    bb = nx.shortest_path(gg, _get_idx(mol, "N", resid), _get_idx(mol, "C", resid))
 
-    n_idx = _get_idx(mol, "N")
+    n_idx = _get_idx(mol, "N", resid)
     mol.formalcharge[n_idx] = 0
     n_neighbours = list(gg.neighbors(n_idx))
     if len(n_neighbours) == 2:
@@ -208,15 +242,22 @@ def _prepare_for_parameterize(mol):
         align_idx = [n_idx, bb[1], non_bb_idx[0]]
         nterm = alanine.copy()
         nterm.align(
-            [_get_idx(nterm, n) for n in ("N", "CA", "H")], refmol=mol, refsel=align_idx
+            [_get_idx(nterm, n) for n in ("N", "CA", "H")],
+            refmol=mol,
+            refsel=align_idx,
         )
         nterm.filter("name H2", _logger=False)
         nterm.name[0] = "HN2"
-        mol.append(nterm)
-        mol.bonds = np.vstack((mol.bonds, [n_idx, mol.numAtoms - 1]))
-        mol.bondtype = np.hstack((mol.bondtype, "1"))
+        nterm.resname[:] = resname
+        nterm.resid[:] = resid
+        insert_idx = np.where(mol.resid == resid)[0][0] + 1  # Second position
+        mol.insert(nterm, insert_idx)
+        mol.addBond(n_idx, insert_idx, "1")
 
-    c_idx = _get_idx(mol, "C")
+    gg = mol.toGraph()
+    bb = nx.shortest_path(gg, _get_idx(mol, "N", resid), _get_idx(mol, "C", resid))
+
+    c_idx = _get_idx(mol, "C", resid)
     mol.formalcharge[c_idx] = 0
     c_neighbours = list(gg.neighbors(c_idx))
     if len(c_neighbours) == 2:
@@ -225,21 +266,19 @@ def _prepare_for_parameterize(mol):
         align_idx = [bb[-2], c_idx, non_bb_idx[0]]
         cterm = alanine.copy()
         cterm.align(
-            [_get_idx(cterm, n) for n in ("CA", "C", "O")], refmol=mol, refsel=align_idx
+            [_get_idx(cterm, n) for n in ("CA", "C", "O")],
+            refmol=mol,
+            refsel=align_idx,
         )
         cterm.filter("name OXT HXT", _logger=False)
-        mol.append(cterm)
-        mol.bonds = np.vstack((mol.bonds, [c_idx, mol.numAtoms - 2]))
-        mol.bondtype = np.hstack((mol.bondtype, "1"))
-
-    # Rename to correct resname
-    mol.resname[:] = resname
+        cterm.resname[:] = resname
+        cterm.resid[:] = resid
+        insert_idx = np.where(mol.resid == resid)[0][-1] + 1  # End
+        mol.insert(cterm, insert_idx)
+        mol.addBond(c_idx, insert_idx, "1")
 
     # Reorder atoms. AMBER order is: N H CA HA [sidechain] C O
-    bbatoms = [x for x in ["N", "H", "CA", "HA", "C", "O"] if x in mol.name]
-    ordered_idx = [_get_idx(mol, nn) for nn in bbatoms]
-    other_idx = np.setdiff1d(range(mol.numAtoms), ordered_idx)
-    mol.reorderAtoms(ordered_idx[:4] + other_idx.tolist() + ordered_idx[4:])
+    _reorder_residue_atoms(mol, resid)
 
     return mol
 
@@ -280,7 +319,9 @@ def _convert_amber_prepi_to_pdb2pqr_residue(prepi, outdir, name=None):
             )
         mol.element[:] = sdf.element[:]
 
-        pmol = _process_custom_residue(mol, name)
+        pmol = _process_custom_residue(mol)
+        # Rename to correct resname
+        pmol.resname[:] = name
 
         _mol_to_xml_def(pmol, os.path.join(outdir, f"{name}.xml"))
         _mol_to_dat_def(pmol, os.path.join(outdir, f"{name}.dat"))
diff --git a/tests/test_systemprepare.py b/tests/test_systemprepare.py
@@ -71,7 +71,7 @@ def _compare_results(refpdb, refdf_f, pmol: Molecule, df):
     pmol.filter("not water", _logger=False)
     assert mol_equal(
         refmol, pmol, exceptFields=["serial"], fieldPrecision={"coords": 1e-3}
-    )
+    ), f"Failed comparison of {refpdb} vs {pmol.fileloc}"
 
 
 @pytest.mark.parametrize("pdb", ["3PTB", "1A25", "1U5U", "1UNC", "6A5J"])
@@ -179,8 +179,8 @@ def _test_auto_freezing_and_force():
         ("2QRV.pdb", "2QRV_prepared"),
     ),
 )
-def _test_nonstandard_residues(files):
-    from moleculekit.tools.preparation import autoSegment2
+def _test_nonstandard_residues(tmp_path, files):
+    from moleculekit.tools.autosegment import autoSegment2
 
     inf, outf = files
     test_home = os.path.join(
@@ -204,6 +204,8 @@ def _test_nonstandard_residues(files):
         hold_nonpeptidic_bonds=True,
         residue_smiles=res_smiles,
     )
+    pmol.fileloc.append(os.path.join(tmp_path, "prepared.pdb"))
+    pmol.write(pmol.fileloc[0])
 
     _compare_results(
         os.path.join(test_home, f"{outf}.pdb"),
@@ -213,6 +215,8 @@ def _test_nonstandard_residues(files):
     )
 
     pmol, df = systemPrepare(mol, return_details=True, hold_nonpeptidic_bonds=True)
+    pmol.fileloc.append(os.path.join(tmp_path, "prepared.pdb"))
+    pmol.write(pmol.fileloc[0])
 
     _compare_results(
         os.path.join(test_home, f"{outf}.pdb"),
@@ -307,7 +311,7 @@ def _test_nucleiclike_ligand():
     test_home = os.path.join(curr_dir, "test_systemprepare", "3U5S")
     mol = Molecule(os.path.join(test_home, "3U5S.pdb"))
 
-    pmol, df = systemPrepare(mol, return_details=True, ignore_ns_errors=True)
+    pmol, df = systemPrepare(mol, return_details=True, ignore_ns=True)
 
     _compare_results(
         os.path.join(test_home, "3U5S_prepared.pdb"),
diff --git a/tests/test_systemprepare/test-nonstandard-residues/2QRV_prepared.csv b/tests/test_systemprepare/test-nonstandard-residues/2QRV_prepared.csv
@@ -1819,7 +1819,7 @@ PHE,PHE,905,,X,P23,,
 ALA,ALA,906,,X,P23,,
 CYS,CYS,907,,X,P23,9.44095194923338,0.0
 VAL,VAL,908,,X,P23,,
-SAH,SAH,1,,Z,P25,,
-SAH,SAH,4,,b,P27,,
-SAH,SAH,5,,d,P29,,
-SAH,SAH,8,,f,P31,,
+SAH,SAH,1,,Y,P24,,
+SAH,SAH,4,,Z,P25,,
+SAH,SAH,5,,a,P26,,
+SAH,SAH,8,,b,P27,,
diff --git a/tests/test_systemprepare/test-nonstandard-residues/2QRV_prepared.pdb b/tests/test_systemprepare/test-nonstandard-residues/2QRV_prepared.pdb