update dataset load output and add zinc250k

liugangcode · liugangcode · commit 2eadef7e98d3 · 2025-08-13T17:41:26.000-04:00
diff --git a/README.md b/README.md
@@ -70,7 +70,6 @@ See the [List of Supported Models](#list-of-supported-models) section for all av
 > More examples can be found in the `examples` and `tests` folders.
 
 `torch-molecule` supports applications in broad domains from chemistry, biology, to materials science. To get started, you can load prepared datasets from `torch_molecule.datasets` (updated after v0.1.3):
-
 | Dataset | Description | Function |
 |---------|-------------|----------|
 | qm9 | Quantum chemical properties (DFT level) | `load_qm9` |
@@ -79,23 +78,30 @@ See the [List of Supported Models](#list-of-supported-models) section for all av
 | toxcast | Toxicity of chemical compounds | `load_toxcast` |
 | admet | Chemical absorption, distribution, metabolism, excretion, and toxicity | `load_admet` |
 | gasperm | Six gas permeability properties for polymeric materials | `load_gasperm` |
-
+| zinc250k | A commonly used subset of the ZINC dataset; it has no labels and can be used for unconditional generation or virtual screening | `load_zinc250k` |
 
 ```python
 from torch_molecule.datasets import load_qm9
 
 # local_dir is the local path where the dataset will be saved
-smiles_list, property_np_array = load_qm9(local_dir='torchmol_data')
+molecular_data = load_qm9(local_dir='torchmol_data')
+smiles_list, property_np_array = molecular_data.data, molecular_data.target
 
 # len(smiles_list): 133885
 # Property array shape: (133885, 1)
 
 # load_qm9 returns the target "gap" by default, but you can adjust it by passing new target_cols
 target_cols = ['homo', 'lumo', 'gap']
-smiles_list, property_np_array = load_qm9(local_dir='torchmol_data', target_cols=target_cols)
+molecular_data = load_qm9(local_dir='torchmol_data', target_cols=target_cols)
+smiles_list, property_np_array = molecular_data.data, molecular_data.target
+
+# the target is None for an unlabeled dataset (load_zinc250k is also imported from torch_molecule.datasets)
+molecular_data = load_zinc250k(local_dir='torchmol_data')
+smiles_list = molecular_data.data
+assert molecular_data.target is None
 ```
 
-(We welcome your suggestions and contributions on your datasets!)
+(We are actively adding more datasets. We welcome your suggestions and contributions of new datasets!)
 
 ### Fit a Model
 
diff --git a/tests/datasets/gasperm.py b/tests/datasets/gasperm.py
@@ -15,7 +15,9 @@ def test_gasperm_download_and_cleanup():
         print("-" * 40)
         
         # Test with default target columns
-        smiles_list, property_numpy = load_gasperm()
+        molecular_dataset = load_gasperm()
+        smiles_list = molecular_dataset.data
+        property_numpy = molecular_dataset.target
         
         # Print results
         print(f"\nResults:")
@@ -52,7 +54,9 @@ def test_gasperm_download_and_cleanup():
         print("-" * 40)
         
         custom_targets = ["CH4", "CO2"]
-        smiles_list2, property_numpy2 = load_gasperm(target_cols=custom_targets)
+        molecular_dataset2 = load_gasperm(target_cols=custom_targets)
+        smiles_list2 = molecular_dataset2.data
+        property_numpy2 = molecular_dataset2.target
         
         print(f"Custom target results:")
         print(f"- Target columns: {custom_targets}")
@@ -65,7 +69,9 @@ def test_gasperm_download_and_cleanup():
         print("-" * 40)
         
         single_target = ["H2"]
-        smiles_list3, property_numpy3 = load_gasperm(target_cols=single_target)
+        molecular_dataset3 = load_gasperm(target_cols=single_target)
+        smiles_list3 = molecular_dataset3.data
+        property_numpy3 = molecular_dataset3.target
         
         print(f"Single target results:")
         print(f"- Target columns: {single_target}")
@@ -80,7 +86,9 @@ def test_gasperm_download_and_cleanup():
         
         try:
             invalid_targets = ["INVALID_GAS"]
-            smiles_list4, property_numpy4 = load_gasperm(target_cols=invalid_targets)
+            molecular_dataset4 = load_gasperm(target_cols=invalid_targets)
+            smiles_list4 = molecular_dataset4.data
+            property_numpy4 = molecular_dataset4.target
             print("ERROR: Should have raised ValueError for invalid target column")
         except ValueError as e:
             print(f"Successfully caught expected error: {e}")
diff --git a/tests/datasets/hf.py b/tests/datasets/hf.py
@@ -1,7 +1,7 @@
 import os
 import tempfile
 import shutil
-from torch_molecule.datasets import load_qm9, load_chembl2k, load_broad6k, load_toxcast, load_admet
+from torch_molecule.datasets import load_qm9, load_chembl2k, load_broad6k, load_toxcast, load_admet, load_zinc250k
 import numpy as np
 import csv
 import gzip
@@ -17,6 +17,8 @@ def load_dataset(dataset_name="qm9"):
         return load_toxcast
     elif dataset_name == "admet":
         return load_admet
+    elif dataset_name == "zinc250k":
+        return load_zinc250k
     else:
         raise ValueError(f"Dataset {dataset_name} not found")
 
@@ -42,57 +44,70 @@ def test_download_and_cleanup(dataset_name="qm9"):
         print("-" * 40)
         
         # Test with default target columns
-        smiles_list, property_numpy, local_data_path = load_func(
+        result = load_func(
             local_dir=test_csv_path,
             return_local_data_path=True,
         )
+        molecular_dataset, local_data_path = result
         
         # Print results
         print(f"\nResults:")
-        print(f"- Number of molecules: {len(smiles_list)}")
-        print(f"- Property array shape: {property_numpy.shape}")
+        print(f"- Number of molecules: {len(molecular_dataset.data)}")
+        print(f"- Property array shape: {molecular_dataset.target.shape if molecular_dataset.target is not None else 'None'}")
         print(f"- File exists: {os.path.exists(local_data_path)}")
         print(f"- File size: {os.path.getsize(local_data_path) if os.path.exists(local_data_path) else 0} bytes")
         
         print(f"\nFirst 5 SMILES:")
-        for i, smiles in enumerate(smiles_list[:5]):
+        for i, smiles in enumerate(molecular_dataset.data[:5]):
             print(f"  {i+1}. {smiles}")
         
         print(f"\nFirst 5 property values (gap):")
-        for i, prop in enumerate(property_numpy[:5]):
-            print(f"  {i+1}. {prop[0]:.6f}")
+        if molecular_dataset.target is not None:
+            for i, prop in enumerate(molecular_dataset.target[:5]):
+                print(f"  {i+1}. {prop[0]:.6f}")
+        else:
+            print("  No property values available (target is None)")
         
         print(f"\nProperty statistics:")
         # Calculate statistics excluding NaN values
-        non_null_mask = ~np.isnan(property_numpy)
-        non_null_values = property_numpy[non_null_mask]
-        
-        print(f"  Total values: {property_numpy.size}")
-        print(f"  Non-null values: {non_null_values.size}")
-        print(f"  Null values: {property_numpy.size - non_null_values.size}")
-        print(f"  Non-null percentage: {(non_null_values.size / property_numpy.size * 100):.2f}%")
-        
-        if non_null_values.size > 0:
-            print(f"  Min (non-null): {non_null_values.min():.6f}")
-            print(f"  Max (non-null): {non_null_values.max():.6f}")
-            print(f"  Mean (non-null): {non_null_values.mean():.6f}")
-            print(f"  Std (non-null): {non_null_values.std():.6f}")
+        if molecular_dataset.target is not None:
+            non_null_mask = ~np.isnan(molecular_dataset.target)
+            non_null_values = molecular_dataset.target[non_null_mask]
+            
+            print(f"  Total values: {molecular_dataset.target.size}")
+            print(f"  Non-null values: {non_null_values.size}")
+            print(f"  Null values: {molecular_dataset.target.size - non_null_values.size}")
+            print(f"  Non-null percentage: {(non_null_values.size / molecular_dataset.target.size * 100):.2f}%")
+            
+            if non_null_values.size > 0:
+                print(f"  Min (non-null): {non_null_values.min():.6f}")
+                print(f"  Max (non-null): {non_null_values.max():.6f}")
+                print(f"  Mean (non-null): {non_null_values.mean():.6f}")
+                print(f"  Std (non-null): {non_null_values.std():.6f}")
+            else:
+                print("  No non-null values found")
         else:
-            print("  No non-null values found")
+            print("  No property statistics available (target is None)")
         
         # Test loading from existing file (should not download again)
         print(f"\n2. Testing loading from existing file")
         print("-" * 40)
         
-        smiles_list2, property_numpy2, local_data_path = load_func(
+        result2 = load_func(
             local_dir=test_csv_path,
             return_local_data_path=True,
         )
+        molecular_dataset2, local_data_path2 = result2
         
         print(f"Second load results:")
-        print(f"- Same number of molecules: {len(smiles_list2) == len(smiles_list)}")
-        print(f"- Same property shape: {property_numpy2.shape == property_numpy.shape}")
-        print(f"- Local data path: {local_data_path}")
+        print(f"- Same number of molecules: {len(molecular_dataset2.data) == len(molecular_dataset.data)}")
+        if molecular_dataset.target is not None and molecular_dataset2.target is not None:
+            print(f"- Same property shape: {molecular_dataset2.target.shape == molecular_dataset.target.shape}")
+        elif molecular_dataset.target is None and molecular_dataset2.target is None:
+            print(f"- Same property shape: True (both are None)")
+        else:
+            print(f"- Same property shape: False (different None status)")
+        print(f"- Local data path: {local_data_path2}")
         
         # Test with multiple target columns (if available)
         print(f"\n3. Testing with multiple target columns")
@@ -132,8 +147,9 @@ def test_download_and_cleanup(dataset_name="qm9"):
 
 
 if __name__ == "__main__":
-    test_download_and_cleanup(dataset_name="qm9")
-    test_download_and_cleanup(dataset_name="chembl2k")
-    test_download_and_cleanup(dataset_name="broad6k")
-    test_download_and_cleanup(dataset_name="toxcast")
-    test_download_and_cleanup(dataset_name="admet")
+    # test_download_and_cleanup(dataset_name="qm9")
+    # test_download_and_cleanup(dataset_name="chembl2k")
+    # test_download_and_cleanup(dataset_name="broad6k")
+    # test_download_and_cleanup(dataset_name="toxcast")
+    # test_download_and_cleanup(dataset_name="admet")
+    test_download_and_cleanup(dataset_name="zinc250k")
diff --git a/torch_molecule/datasets/__init__.py b/torch_molecule/datasets/__init__.py
@@ -1,4 +1,4 @@
-from .load_hf_dataset import load_qm9, load_chembl2k, load_broad6k, load_toxcast, load_admet
+from .load_hf_dataset import load_qm9, load_chembl2k, load_broad6k, load_toxcast, load_admet, load_zinc250k
 from .load_local_csv import load_gasperm
 
 __all__ = [
@@ -8,4 +8,5 @@
     "load_toxcast",
     "load_admet",
     "load_gasperm",
+    "load_zinc250k",
 ]
diff --git a/torch_molecule/datasets/constant.py b/torch_molecule/datasets/constant.py
@@ -1,3 +1,20 @@
+from dataclasses import dataclass
+from typing import List
+import numpy as np
+
+@dataclass
+class SMILESDataset:
+    """
+    Data class for storing molecular SMILES dataset with input and target data.
+    
+    Attributes:
+        data (List[str]): Input data (e.g., list of SMILES strings)
+        target (np.ndarray | None): Target property values as 2D numpy array (rows=molecules, cols=targets) or None
+    """
+    data: List[str]
+    target: np.ndarray | None
+
+
 TOXCAST_TASKS = [
     'ACEA_T47D_80hr_Negative', 'ACEA_T47D_80hr_Positive',
     'APR_HepG2_CellCycleArrest_24h_dn', 'APR_HepG2_CellCycleArrest_24h_up',
diff --git a/torch_molecule/datasets/load_hf_dataset.py b/torch_molecule/datasets/load_hf_dataset.py
diff --git a/torch_molecule/datasets/load_local_csv.py b/torch_molecule/datasets/load_local_csv.py