Skip to content

Commit ee9c8ae

Browse files
committed
Merge branch 'master' into hetatm_parsing
2 parents 8c713d1 + 4d8dc64 commit ee9c8ae

File tree

9 files changed

+139
-36
lines changed

9 files changed

+139
-36
lines changed

.requirements/base.in

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@ pandas<2.0.0
22
biopandas>=0.5.1
33
biopython
44
bioservices>=1.10.0
5+
cpdb-protein==0.2.0
6+
cython
57
deepdiff
68
loguru
79
looseversion
810
matplotlib>=3.4.3
911
multipledispatch
1012
networkx
11-
numpy<1.24.0
13+
numpy
1214
pandas
1315
plotly
1416
pydantic

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
* Fixes progress bar for `download_pdb_multiprocessing`. [#394](https://github.com/a-r-j/graphein/pull/394)
55
* Add support for DSSP >4. Backwards compatibility is still supported. [#355](https://github.com/a-r-j/graphein/pull/355). Fixes [#353](https://github.com/a-r-j/graphein/issues/353).
66
* Fixes bug where RSA features are missing from nodes with insertion codes. [#355](https://github.com/a-r-j/graphein/pull/355). Fixes [#354](https://github.com/a-r-j/graphein/issues/354).
7+
* Fix bug where the `deprotonate` argument is not wired up to `graphein.protein.graphs.construct_graphs`. [#375](https://github.com/a-r-j/graphein/pull/375)
8+
* Add missing modified residue `AYA` to constants [#390](https://github.com/a-r-j/graphein/pull/390)
79
* Fix bug where the `deprotonate` argument is not wired up to `graphein.protein.graphs.construct_graphs` [#375](https://github.com/a-r-j/graphein/pull/375)
810
* Fix cluster file loading bug in `pdb_data.py` [#396](https://github.com/a-r-j/graphein/pull/396)
911

@@ -80,6 +82,7 @@ https://github.com/a-r-j/graphein/pull/334
8082

8183
#### Other Changes
8284

85+
- Uses [`cpdb`](https://github.com/a-r-j/CPDB) as default PDB file parser for improved performance. [#323](https://github.com/a-r-j/graphein/pull/323).
8386
- Adds transform composition to FoldComp Dataset [#312](https://github.com/a-r-j/graphein/pull/312)
8487
- Adds entry point for biopandas dataframes in `graphein.protein.tensor.io.protein_to_pyg`. [#310](https://github.com/a-r-j/graphein/pull/310)
8588
- Adds support for `.ent` files to `graphein.protein.graphs.read_pdb_to_dataframe`. [#310](https://github.com/a-r-j/graphein/pull/310)

graphein/protein/graphs.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pathlib import Path
1616
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
1717

18+
import cpdb
1819
import networkx as nx
1920
import numpy as np
2021
import pandas as pd
@@ -109,32 +110,41 @@ def read_pdb_to_dataframe(
109110
or path.endswith(".pdb.gz")
110111
or path.endswith(".ent")
111112
):
112-
atomic_df = PandasPdb().read_pdb(path)
113+
atomic_df = cpdb.parse(path)
113114
elif path.endswith(".mmtf") or path.endswith(".mmtf.gz"):
114115
atomic_df = PandasMmtf().read_mmtf(path)
116+
atomic_df = atomic_df.get_model(model_index)
117+
atomic_df = pd.concat(
118+
[atomic_df.df["ATOM"], atomic_df.df["HETATM"]]
119+
)
115120
elif (
116121
path.endswith(".cif")
117122
or path.endswith(".cif.gz")
118123
or path.endswith(".mmcif")
119124
or path.endswith(".mmcif.gz")
120125
):
121126
atomic_df = PandasMmcif().read_mmcif(path)
127+
atomic_df = atomic_df.get_model(model_index)
128+
atomic_df = atomic_df.convert_to_pandas_pdb()
129+
atomic_df = pd.concat(
130+
[atomic_df.df["ATOM"], atomic_df.df["HETATM"]]
131+
)
122132
else:
123133
raise ValueError(
124134
f"File {path} must be either .pdb(.gz), .mmtf(.gz), .(mm)cif(.gz) or .ent, not {path.split('.')[-1]}"
125135
)
126136
elif uniprot_id is not None:
127-
atomic_df = PandasPdb().fetch_pdb(
128-
uniprot_id=uniprot_id, source="alphafold2-v3"
129-
)
137+
atomic_df = cpdb.parse(uniprot_id=uniprot_id)
130138
else:
131-
atomic_df = PandasPdb().fetch_pdb(pdb_code)
132-
atomic_df = atomic_df.get_model(model_index)
133-
if len(atomic_df.df["ATOM"]) == 0:
139+
atomic_df = cpdb.parse(pdb_code=pdb_code)
140+
141+
if "model_idx" in atomic_df.columns:
142+
atomic_df = atomic_df.loc[atomic_df["model_idx"] == model_index]
143+
144+
if len(atomic_df) == 0:
134145
raise ValueError(f"No model found for index: {model_index}")
135-
if isinstance(atomic_df, PandasMmcif):
136-
atomic_df = atomic_df.convert_to_pandas_pdb()
137-
return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]])
146+
147+
return atomic_df
138148

139149

140150
def label_node_id(
@@ -285,7 +295,7 @@ def remove_alt_locs(
285295
# Unsort
286296
if keep in ["max_occupancy", "min_occupancy"]:
287297
df = df.sort_index()
288-
298+
df = df.reset_index(drop=True)
289299
return df
290300

291301

graphein/protein/resi_atoms.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,7 @@
463463
"ABA",
464464
"ACE",
465465
"AIB",
466+
"AYA",
466467
"BMT",
467468
"BOC",
468469
"CBX",
@@ -535,6 +536,7 @@
535536
"ABA",
536537
"ACE",
537538
"AIB",
539+
"AYA",
538540
"ALA",
539541
"ARG",
540542
"ASN",
@@ -639,6 +641,7 @@
639641
"ASN": "N",
640642
"ASP": "D",
641643
"ASX": "B",
644+
"AYA": "A",
642645
"BMT": "T",
643646
"BOC": "X",
644647
"CBX": "X",
@@ -795,6 +798,7 @@
795798
"ABA": "ALA",
796799
"ACE": "-",
797800
"AIB": "ALA",
801+
"AYA": "ALA",
798802
"BMT": "THR",
799803
"BOC": "-",
800804
"CBX": "-",

graphein/protein/utils.py

Lines changed: 74 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from urllib.request import urlopen
1717

1818
import networkx as nx
19+
import numpy as np
1920
import pandas as pd
2021
import requests
2122
import wget
@@ -25,6 +26,30 @@
2526

2627
from .resi_atoms import BACKBONE_ATOMS, RESI_THREE_TO_1
2728

29+
pdb_df_columns = [
30+
"record_name",
31+
"atom_number",
32+
"blank_1",
33+
"atom_name",
34+
"alt_loc",
35+
"residue_name",
36+
"blank_2",
37+
"chain_id",
38+
"residue_number",
39+
"insertion",
40+
"blank_3",
41+
"x_coord",
42+
"y_coord",
43+
"z_coord",
44+
"occupancy",
45+
"b_factor",
46+
"blank_4",
47+
"segment_id",
48+
"element_symbol",
49+
"charge",
50+
"line_idx",
51+
]
52+
2853

2954
class ProteinGraphConfigurationError(Exception):
3055
"""
@@ -418,12 +443,27 @@ def save_graph_to_pdb(
418443
:type gz: bool
419444
"""
420445
ppd = PandasPdb()
421-
atom_df = filter_dataframe(
422-
g.graph["pdb_df"], "record_name", ["ATOM"], boolean=True
423-
)
424-
hetatm_df = filter_dataframe(
425-
g.graph["pdb_df"], "record_name", ["HETATM"], boolean=True
426-
)
446+
447+
df = g.graph["pdb_df"].copy()
448+
# format charge correctly
449+
df.charge = pd.to_numeric(df.charge, errors="coerce")
450+
451+
# Add blank columns
452+
blank_cols = [
453+
"blank_1",
454+
"blank_2",
455+
"blank_3",
456+
"blank_4",
457+
"segment_id",
458+
]
459+
for col in blank_cols:
460+
if col not in df.columns:
461+
df[col] = ""
462+
df["line_idx"] = list(range(1, len(df) + 1))
463+
df = df[pdb_df_columns]
464+
atom_df = filter_dataframe(df, "record_name", ["ATOM"], boolean=True)
465+
hetatm_df = filter_dataframe(df, "record_name", ["HETATM"], boolean=True)
466+
427467
if atoms:
428468
ppd.df["ATOM"] = atom_df
429469
if hetatms:
@@ -448,9 +488,22 @@ def save_pdb_df_to_pdb(
448488
:param gz: Whether to gzip the file. Defaults to ``False``.
449489
:type gz: bool
450490
"""
491+
df = df.copy()
492+
# format charge correctly
493+
df.charge = pd.to_numeric(df.charge, errors="coerce")
494+
df.alt_loc = df.alt_loc.fillna(" ")
495+
blank_cols = ["blank_1", "blank_2", "blank_3", "blank_4", "segment_id"]
496+
for col in blank_cols:
497+
if col not in df.columns:
498+
df[col] = ""
499+
df["line_idx"] = list(range(1, len(df) + 1))
500+
df = df[pdb_df_columns]
501+
451502
atom_df = filter_dataframe(df, "record_name", ["ATOM"], boolean=True)
452503
hetatm_df = filter_dataframe(df, "record_name", ["HETATM"], boolean=True)
504+
453505
ppd = PandasPdb()
506+
454507
if atoms:
455508
ppd.df["ATOM"] = atom_df
456509
if hetatms:
@@ -481,12 +534,21 @@ def save_rgroup_df_to_pdb(
481534
:type gz: bool
482535
"""
483536
ppd = PandasPdb()
484-
atom_df = filter_dataframe(
485-
g.graph["rgroup_df"], "record_name", ["ATOM"], boolean=True
486-
)
487-
hetatm_df = filter_dataframe(
488-
g.graph["rgroup_df"], "record_name", ["HETATM"], boolean=True
489-
)
537+
df = g.graph["rgroup_df"].copy()
538+
539+
# format charge correctly
540+
df.charge = pd.to_numeric(df.charge, errors="coerce")
541+
542+
blank_cols = ["blank_1", "blank_2", "blank_3", "blank_4", "segment_id"]
543+
for col in blank_cols:
544+
if col not in df.columns:
545+
df[col] = [""] * len(df)
546+
df["line_idx"] = list(range(1, len(df) + 1))
547+
df = df[pdb_df_columns]
548+
549+
atom_df = filter_dataframe(df, "record_name", ["ATOM"], boolean=True)
550+
hetatm_df = filter_dataframe(df, "record_name", ["HETATM"], boolean=True)
551+
490552
if atoms:
491553
ppd.df["ATOM"] = atom_df
492554
if hetatms:

tests/protein/tensor/test_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,5 @@
1616
def test_save_and_load_protein():
1717
a = Protein().from_pdb_code("4hhb")
1818
torch.save(a, "4hhb.pt")
19-
b = torch.load("4hhb.pt")
19+
b = torch.load("4hhb.pt", weights_only=False)
2020
assert a == b

tests/protein/tensor/test_reconstruction.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,3 @@ def test_dist_mat_to_coords():
3030
assert torch.allclose(d, torch.cdist(X, X), atol=1e-4)
3131
X_aligned = kabsch(X, coords)
3232
assert torch.allclose(coords, X_aligned, atol=1e-4)
33-
return coords, X, X_aligned

tests/protein/test_graphs.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,10 @@ def test_alt_loc_exclusion():
439439
):
440440
config.alt_locs = opt
441441
g = construct_graph(config=config, pdb_code="2VVI")
442-
assert np.array_equal(g.nodes[node_id]["coords"], expected_coords)
442+
assert np.array_equal(
443+
g.nodes[node_id]["coords"],
444+
np.array(expected_coords, dtype=np.float32),
445+
)
443446

444447

445448
def test_alt_loc_inclusion():

tests/protein/test_utils.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,24 @@ def test_save_graph_to_pdb():
2727
# Check file exists
2828
assert os.path.isfile("/tmp/test_graph.pdb")
2929

30-
# Check for equivalence between saved and existing DFs.
31-
# We drop the line_idx columns as these will be renumbered
30+
graph_df = (
31+
g.graph["pdb_df"]
32+
.drop(
33+
[
34+
"node_id",
35+
"residue_id",
36+
],
37+
axis=1,
38+
)
39+
.reset_index(drop=True)
40+
)
41+
42+
a.reset_index(drop=True, inplace=True)
43+
a = a[graph_df.columns] # Reorder columns
44+
3245
assert_frame_equal(
33-
a.drop(["line_idx"], axis=1),
34-
g.graph["pdb_df"].drop(["line_idx", "node_id", "residue_id"], axis=1),
46+
a,
47+
graph_df,
3548
)
3649
h = construct_graph(path="/tmp/test_graph.pdb")
3750

@@ -48,10 +61,17 @@ def test_save_pdb_df_to_pdb():
4861
# Check file exists
4962
assert os.path.isfile("/tmp/test_graph.pdb")
5063

51-
# We drop the line_idx columns as these will be renumbered
5264
assert_frame_equal(
53-
a.drop(["line_idx"], axis=1),
54-
g.graph["pdb_df"].drop(["line_idx", "node_id", "residue_id"], axis=1),
65+
a,
66+
g.graph["pdb_df"]
67+
.drop(
68+
[
69+
"node_id",
70+
"residue_id",
71+
],
72+
axis=1,
73+
)
74+
.reset_index(drop=True),
5575
)
5676

5777
# Now check for raw, unprocessed DF
@@ -73,10 +93,10 @@ def test_save_rgroup_df_to_pdb():
7393

7494
# Drop the node_id and residue_id columns before comparing; line_idx is no longer dropped
7595
assert_frame_equal(
76-
a.drop(["line_idx"], axis=1),
96+
a,
7797
filter_dataframe(
7898
g.graph["rgroup_df"], "record_name", ["HETATM"], False
79-
).drop(["line_idx", "node_id", "residue_id"], axis=1),
99+
).drop(["node_id", "residue_id"], axis=1),
80100
)
81101

82102

0 commit comments

Comments
 (0)