Skip to content

Molecule3D

Molecule3D

Bases: BaseDataset

The Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing molecules with invalid molecule files, SMILES conversion errors, RDKit warnings, sanitization problems, or damaged log files.

Usage:

from openqdc.datasets import Molecule3D
dataset = Molecule3D()

References

https://arxiv.org/abs/2110.01717

https://github.com/divelab/MoleculeX

Source code in openqdc/datasets/potential/molecule3d.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
class Molecule3D(BaseDataset):
    """
    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the
    B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing
    molecules with invalid molecule files, SMILES conversion errors, RDKit warnings, sanitization problems,
    or damaged log files.

    Usage:
    ```python
    from openqdc.datasets import Molecule3D
    dataset = Molecule3D()
    ```

    References:
        https://arxiv.org/abs/2110.01717\n
        https://github.com/divelab/MoleculeX
    """

    __name__ = "molecule3d"
    __energy_methods__ = [PotentialMethod.B3LYP_6_31G_D]  # "b3lyp/6-31g*",
    # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY
    __energy_unit__ = "ev"  # CALCULATED
    __distance_unit__ = "ang"
    __forces_unit__ = "ev/ang"
    __links__ = {"molecule3d.zip": "https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy"}

    energy_target_names = ["b3lyp/6-31g*.energy"]

    def read_raw_entries(self) -> list:
        """Read every raw SDF file of the dataset and return all entries as one flat list.

        All ``*.sdf`` files found in ``<root>/data/raw`` are passed, together with the path of
        ``properties.csv`` from the same directory, to ``_read_sdf``. The per-file lists it
        returns are concatenated in the order the files were globbed.

        Returns
        -------
        samples: list
            Flat list of the entries produced by ``_read_sdf`` for every SDF file
            (presumably one ``read_mol``-style dict per molecule -- confirm against ``_read_sdf``).
            Empty when no SDF file is found.
        """
        raw = p_join(self.root, "data", "raw")
        sdf_paths = glob(p_join(raw, "*.sdf"))
        properties_path = p_join(raw, "properties.csv")

        def _read_one(sdf_path):
            # Bind the shared properties file so the worker takes a single argument.
            return _read_sdf(sdf_path, properties_path)

        per_file = dm.parallelized(_read_one, sdf_paths, n_jobs=1)  # don't use more than 1 job
        # Single-pass flatten; `sum(lists, [])` re-copies the accumulated list at every step.
        return [sample for file_samples in per_file for sample in file_samples]

read_mol(mol, energy)

Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies

Parameters

mol: Chem.rdchem.Mol — RDKit molecule. energy: float — Energy of the molecule.

Returns

res: dict — Dictionary containing the following keys: (1) name: np.ndarray of shape (1,) containing the SMILES of the molecule; (2) atomic_inputs: np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions of the M atoms; (3) energies: np.ndarray of shape (1, 1) containing the energy of the conformer; (4) n_atoms: np.ndarray of shape (1,) containing the number of atoms in the conformer; (5) subset: np.ndarray of shape (1,) containing "molecule3d".

Source code in openqdc/datasets/potential/molecule3d.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]:
    """Build the dataset record for a single molecule and its energy.

    Parameters
    ----------
    mol: Chem.rdchem.Mol
        RDKit molecule; its conformer supplies the atomic positions.
    energy: float
        Energy of the molecule

    Returns
    -------
    res: dict
        Dictionary containing the following keys:
        - name: np.ndarray of shape (1,) containing the SMILES of the molecule (without explicit hydrogens)
        - subset: np.ndarray of shape (1,) containing "molecule3d"
        - energies: float64 np.ndarray of shape (1, 1) containing the energy of the conformer
        - atomic_inputs: float32 np.ndarray joining, along the last axis, the output of
          ``get_atomic_number_and_charge`` with the conformer positions
          (expected shape (M, 5): atomic number, charge, x, y, z -- confirm against the helper)
        - n_atoms: int32 np.ndarray of shape (1,) containing the number of atoms in the conformer
    """
    smiles_string = dm.to_smiles(mol, explicit_hs=False)
    atom_features = get_atomic_number_and_charge(mol)
    coordinates = mol.GetConformer().GetPositions()

    # Add a trailing axis so the energy is stored as a column: (1,) -> (1, 1).
    energy_column = np.expand_dims(np.array([energy]).astype(np.float64), axis=1)
    atomic_inputs = np.concatenate((atom_features, coordinates), axis=-1, dtype=np.float32)
    atom_count = np.array([atom_features.shape[0]], dtype=np.int32)

    return {
        "name": np.array([smiles_string]),
        "subset": np.array(["molecule3d"]),
        "energies": energy_column,
        "atomic_inputs": atomic_inputs,
        "n_atoms": atom_count,
    }