Commit d9b9016

Fix masked atom issue. Fixes #11
1 parent e294403 commit d9b9016

4 files changed, +57 -8 lines changed

tests/encoder/attrmask.py

Lines changed: 22 additions & 1 deletion
@@ -41,5 +41,26 @@ def test_attrmask_encoder():
     os.remove(save_path)
     print(f"Cleaned up {save_path}")

+def test_attrmask_encoder_polymers():
+    # Test molecules (simple examples)
+    polymers = [
+        "*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5ccc(N*)cc5)cc4)CCC(CCCCC)CC3)cc2)cc1",
+        "*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)c(-c3ccc(C)cc3)c2-c2ccc(C)cc2)cc1",
+        "*CC(*)(C)C(=O)OCCCCCCCCCOc1ccc2cc(C(=O)Oc3ccccc3)ccc2c1"
+    ]
+    model = AttrMaskMolecularEncoder(
+        num_layer=3,
+        hidden_size=300,
+        batch_size=5,
+        epochs=5,  # Small number for testing
+        verbose=True
+    )
+    model.fit(polymers)
+    vectors = model.encode(polymers)
+    print(f"Representation shape: {vectors.shape}")
+    print(f"Representation for new molecule: {vectors[0]}")
+
+
 if __name__ == "__main__":
-    test_attrmask_encoder()
+    test_attrmask_encoder_polymers()
+    test_attrmask_encoder()

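The polymers above carry `*` wildcard atoms marking the attachment points of each repeat unit. In RDKit such atoms parse with atomic number 0, outside the ordinary 1-118 element range, which is presumably the corner case the masked-atom fix below has to accommodate. A quick standalone check (the shortened SMILES is illustrative, not taken from the test):

from rdkit import Chem

# Shortened repeat unit with two '*' attachment points (illustrative SMILES).
mol = Chem.MolFromSmiles("*CC(*)(C)C(=O)OC")
print([(atom.GetSymbol(), atom.GetAtomicNum()) for atom in mol.GetAtoms()])
# The '*' atoms report symbol '*' and atomic number 0.
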
tests/encoder/moama.py

Lines changed: 21 additions & 1 deletion
@@ -47,5 +47,25 @@ def test_moama_encoder():
     os.remove(save_path)
     print(f"Cleaned up {save_path}")

+def test_moama_encoder_polymers():
+    # Test molecules (simple examples)
+    polymers = [
+        "*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5ccc(N*)cc5)cc4)CCC(CCCCC)CC3)cc2)cc1",
+        "*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)c(-c3ccc(C)cc3)c2-c2ccc(C)cc2)cc1",
+        "*CC(*)(C)C(=O)OCCCCCCCCCOc1ccc2cc(C(=O)Oc3ccccc3)ccc2c1"
+    ]
+    model = MoamaMolecularEncoder(
+        num_layer=3,
+        hidden_size=300,
+        batch_size=5,
+        epochs=5,  # Small number for testing
+        verbose=True
+    )
+    model.fit(polymers)
+    vectors = model.encode(polymers)
+    print(f"Representation shape: {vectors.shape}")
+    print(f"Representation for new molecule: {vectors[0]}")
+
 if __name__ == "__main__":
-    test_moama_encoder()
+    test_moama_encoder_polymers()
+    test_moama_encoder()

torch_molecule/encoder/attrmask/model.py

Lines changed: 5 additions & 3 deletions
@@ -4,6 +4,7 @@

 from ...nn import GNN_node, GNN_node_Virtualnode, MLP
 from ...utils import init_weights
+from ...utils.graph.features import allowable_features

 import random

@@ -23,11 +24,12 @@ def __init__(
     ):
         super(GNN, self).__init__()
         gnn_name = encoder_type.split("-")[0]
-        self.num_atom_type = 119
+        decoding_size = len(allowable_features['possible_atomic_num_list'])
         self.hidden_size = hidden_size
         self.mask_num = mask_num
         self.mask_rate = mask_rate

+        self.mask_atom_id = 119
         encoder_params = {
             "num_layer": num_layer,
             "hidden_size": hidden_size,
@@ -50,7 +52,7 @@ def __init__(
         if self.pool is None:
             raise ValueError(f"Invalid graph pooling type {readout}.")

-        self.predictor = MLP(hidden_size, hidden_features=2 * hidden_size, out_features=self.num_atom_type)
+        self.predictor = MLP(hidden_size, hidden_features=2 * hidden_size, out_features=decoding_size)

     def initialize_parameters(self, seed=None):
         """
@@ -96,7 +98,7 @@ def compute_loss(self, batched_data):

         # mask nodes' features
         for node_idx in masked_node_indices:
-            batched_data.x[node_idx] = torch.tensor([self.num_atom_type - 1] + [0] * (batched_data.x.shape[1] - 1))
+            batched_data.x[node_idx] = torch.tensor([self.mask_atom_id - 1] + [0] * (batched_data.x.shape[1] - 1))

         # generate predictions
         h_node, _ = self.graph_encoder(batched_data)

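For orientation, a minimal standalone sketch of the masking step in the hunks above, assuming an OGB-style feature layout in which the first column of `batched_data.x` indexes the atom-type vocabulary. The vocabulary contents, the toy matrix, and its 9-column width are assumptions for illustration; only `allowable_features['possible_atomic_num_list']`, `decoding_size`, `mask_atom_id`, and the masking expression mirror the diff:

import torch

# Assumed atom-type vocabulary; the real one lives in torch_molecule/utils/graph/features.py.
possible_atomic_num_list = list(range(1, 119)) + ['misc']
decoding_size = len(possible_atomic_num_list)   # number of classes the predictor MLP decodes
mask_atom_id = 119                              # sentinel written into masked atom rows

# Toy node-feature matrix: 4 atoms, 9 integer features (layout assumed).
x = torch.zeros(4, 9, dtype=torch.long)
masked_node_indices = [1, 3]
for node_idx in masked_node_indices:
    # Same expression as the diff: sentinel index in the atom-type column, zeros elsewhere.
    x[node_idx] = torch.tensor([mask_atom_id - 1] + [0] * (x.shape[1] - 1))

print(decoding_size, x[masked_node_indices])

Keeping the mask sentinel (`mask_atom_id`) separate from the decoder width (`decoding_size`) means the predictor's output size follows the feature vocabulary instead of a hard-coded 119.
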
torch_molecule/encoder/moama/model.py

Lines changed: 9 additions & 3 deletions
@@ -6,6 +6,7 @@
 from ...utils import init_weights

 from .utils import get_mask_indices, get_fingerprint_loss
+from ...utils.graph.features import allowable_features

 class_criterion = torch.nn.CrossEntropyLoss()

@@ -23,7 +24,9 @@ def __init__(
     ):
         super(GNN, self).__init__()
         gnn_name = encoder_type.split("-")[0]
-        self.num_atom_type = 119
+        decoding_size = len(allowable_features['possible_atomic_num_list'])
+
+        self.mask_atom_id = 119
         self.hidden_size = hidden_size
         self.mask_rate = mask_rate
         self.lw_rec = lw_rec
@@ -50,7 +53,7 @@ def __init__(
         if self.pool is None:
             raise ValueError(f"Invalid graph pooling type {readout}.")

-        self.predictor = GNN_Decoder(hidden_size, self.num_atom_type)
+        self.predictor = GNN_Decoder(hidden_size, decoding_size)

     def initialize_parameters(self, seed=None):
         """
@@ -82,13 +85,16 @@ def compute_loss(self, batched_data):

         # mask nodes' features
         for node_idx in masked_node_indices:
-            batched_data.x[node_idx] = torch.tensor([self.num_atom_type - 1] + [0] * (batched_data.x.shape[1] - 1))
+            batched_data.x[node_idx] = torch.tensor([self.mask_atom_id - 1] + [0] * (batched_data.x.shape[1] - 1))

         # generate predictions
         h_node, _ = self.graph_encoder(batched_data)
         h_rep = self.pool(h_node, batched_data.batch)
         batched_data.x = h_node
         prediction_class = self.predictor(batched_data)[masked_node_indices]
+        print('prediction_class', prediction_class.max(), prediction_class.min())
+        print('batched_data.y', batched_data.y.max(), batched_data.y.min())
+

         # target_class = batched_data.y.to(torch.float32)
         loss_class = class_criterion(prediction_class.to(torch.float32), batched_data.y.long())

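The final line of this hunk scores the masked atoms with a cross-entropy: decoder logits at the masked positions against the atom-type labels held in `batched_data.y`, while the two added `print` calls report the ranges of those logits and labels. A minimal sketch of the loss step with toy tensors standing in for the model outputs (shapes and random values are assumptions):

import torch

class_criterion = torch.nn.CrossEntropyLoss()

num_masked, decoding_size = 3, 119                          # toy sizes
prediction_class = torch.randn(num_masked, decoding_size)   # stands in for the GNN_Decoder logits
labels = torch.randint(0, decoding_size, (num_masked,))     # stands in for batched_data.y

loss_class = class_criterion(prediction_class.to(torch.float32), labels.long())
print(loss_class.item())
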