Skip to content

ANI

ANI1

Bases: BaseDataset

The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic molecules. The molecules contain 4 distinct elements: C, N, O and H. Electronic structure calculations use the wB97x density functional and the 6-31G(d) basis set. To generate structures, SMILES strings of the molecules are converted into 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary point using the MMFF94 force field. Finally, the geometries are optimized to an energy minimum at the chosen DFT level.

Usage:

from openqdc.datasets import ANI1
dataset = ANI1()

References

https://www.nature.com/articles/sdata2017193

https://github.com/aiqm/ANI1x_datasets

Source code in openqdc/datasets/potential/ani.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
class ANI1(BaseDataset):
    """
    ANI-1 gathers 22 x 10^6 structural conformations of 57,000 distinct small organic molecules built from
    four elements (C, N, O and H). Labels come from electronic structure calculations with the wB97x density
    functional and the 6-31G(d) basis set. Structures are generated by turning SMILES strings into 3D
    conformations with RDKit, pre-optimizing them to a stationary point with the MMFF94 force field, and
    finally optimizing the geometries to an energy minimum at the chosen DFT level.

    Usage:
    ```python
    from openqdc.datasets import ANI1
    dataset = ANI1()
    ```

    References:
        https://www.nature.com/articles/sdata2017193\n
        https://github.com/aiqm/ANI1x_datasets
    """

    __name__ = "ani1"

    # One level of theory and the label of its energy target in the raw archive.
    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]
    energy_target_names = ["ωB97x:6-31G(d) Energy"]

    # Units declared for the raw data.
    __energy_unit__ = "hartree"
    __distance_unit__ = "bohr"
    __forces_unit__ = "hartree/bohr"

    # Archive file name -> download URL.
    __links__ = {"ani1.hdf5.gz": "https://zenodo.org/record/3585840/files/214.hdf5.gz"}

    @property
    def root(self):
        """Return ``p_join(get_local_cache(), "ani")``, the root location used by this dataset."""
        cache_dir = get_local_cache()
        return p_join(cache_dir, "ani")

    @property
    def config(self):
        """Return the fetch configuration: the dataset name ``"ani"`` plus the download links.

        Raises:
            AssertionError: if ``__links__`` is empty.
        """
        links = self.__links__
        assert len(links) > 0, "No links provided for fetching"
        return {"dataset_name": "ani", "links": links}

    def __smiles_converter__(self, x):
        """Decode the ASCII bytes ``x`` and drop the last "-"-separated field together with its separator.

        An entry that contains no "-" yields an empty string.
        """
        prefix, _sep, _last_field = x.decode("ascii").rpartition("-")
        return prefix

    @property
    def preprocess_path(self):
        """Return ``p_join(root, "preprocessed", __name__)``, creating that directory when it is missing."""
        target_dir = p_join(self.root, "preprocessed", self.__name__)
        os.makedirs(target_dir, exist_ok=True)
        return target_dir

    def read_raw_entries(self):
        """Read ``<__name__>.h5.gz`` under ``root`` with ``read_qc_archive_h5`` and return what it yields."""
        archive = p_join(self.root, f"{self.__name__}.h5.gz")
        return read_qc_archive_h5(archive, self.__name__, self.energy_target_names, self.force_target_names)

ANI1CCX

Bases: ANI1

ANI-1CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of the ANI-1X dataset using active learning. The conformations are labelled using a high-accuracy CCSD(T)*/CBS method.

Usage:

from openqdc.datasets import ANI1CCX
dataset = ANI1CCX()

References

https://doi.org/10.1038/s41467-019-10827-4

https://github.com/aiqm/ANI1x_datasets

Source code in openqdc/datasets/potential/ani.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class ANI1CCX(ANI1):
    """
    ANI-1CCX contains 500k conformers subsampled, via active learning, from the 5.5M conformers of the
    ANI-1X dataset. The conformations are labelled with a high-accuracy CCSD(T)*/CBS method.

    Usage:
    ```python
    from openqdc.datasets import ANI1CCX
    dataset = ANI1CCX()
    ```

    References:
        https://doi.org/10.1038/s41467-019-10827-4\n
        https://github.com/aiqm/ANI1x_datasets
    """

    __name__ = "ani1ccx"

    # Units declared for the raw data.
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = "hartree/ang"

    # Levels of theory; the trailing comments give the lower-case method/basis spelling.
    __energy_methods__ = [
        PotentialMethod.CCSD_T_CBS,  # "ccsd(t)/cbs",
        PotentialMethod.CCSD_T_CC_PVDZ,  # "ccsd(t)/cc-pvdz",
        PotentialMethod.CCSD_T_CC_PVTZ,  # "ccsd(t)/cc-pvtz",
        PotentialMethod.TCSSD_T_CC_PVDZ,  # "tccsd(t)/cc-pvdz",
    ]

    # Energy labels as spelled in the raw archive, one per entry of ``__energy_methods__``.
    energy_target_names = [
        "CCSD(T)*:CBS Total Energy",
        "NPNO-CCSD(T):cc-pVDZ Correlation Energy",
        "NPNO-CCSD(T):cc-pVTZ Correlation Energy",
        "TPNO-CCSD(T):cc-pVDZ Correlation Energy",
    ]

    # No force targets are listed for this dataset.
    force_target_names = []

    # Archive file name -> download URL.
    # NOTE(review): the key reads "ani1x" although this class is named "ani1ccx" -- confirm the key/URL pairing.
    __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"}

    def __smiles_converter__(self, x):
        """Return ``x`` unchanged.

        Hook for turning a stored string into SMILES when the stored encoding
        differs from the display format; no conversion is performed here.
        """
        return x

__smiles_converter__(x)

Utility function to convert a string to SMILES: useful if the SMILES is encoded in a different format than its display format.

Source code in openqdc/datasets/potential/ani.py
197
198
199
200
201
def __smiles_converter__(self, x):
    """Return ``x`` unchanged.

    Utility hook for converting a string to SMILES, useful if the SMILES is
    encoded in a different format than its display format. This
    implementation performs no conversion.
    """
    return x

ANI1CCX_V2

Bases: ANI1CCX

ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels for each conformation.

Usage:

from openqdc.datasets import ANI1CCX_V2
dataset = ANI1CCX_V2()

References

https://doi.org/10.1038/s41467-019-10827-4

https://github.com/aiqm/ANI1x_datasets

Source code in openqdc/datasets/potential/ani.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class ANI1CCX_V2(ANI1CCX):
    """
    ANI1CCX_V2 extends the ANI1CCX dataset: every conformation additionally carries PM6 and GFN2_xTB
    labels.

    Usage:
    ```python
    from openqdc.datasets import ANI1CCX_V2
    dataset = ANI1CCX_V2()
    ```

    References:
        https://doi.org/10.1038/s41467-019-10827-4\n
        https://github.com/aiqm/ANI1x_datasets
    """

    __name__ = "ani1ccx_v2"

    # Parent entries come first, followed by the PM6 and GFN2_xTB additions.
    __energy_methods__ = [*ANI1CCX.__energy_methods__, PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
    energy_target_names = [*ANI1CCX.energy_target_names, "PM6", "GFN2"]

    # Two extra ``False`` entries are appended for the added methods.
    # NOTE(review): ANI1CCX does not set ``__force_mask__`` in the visible source, so the value extended
    # here is inherited from further up -- confirm the resulting length matches ``__energy_methods__``.
    __force_mask__ = ANI1CCX.__force_mask__ + [False] * 2

ANI1X

Bases: ANI1

The ANI-1X dataset consists of the ANI-1 molecules plus further molecules added using active learning, which leads to a total of 5,496,771 conformers of 63,865 unique molecules. Databases of molecules such as GDB-11 and ChEMBL, together with generated amino acids and 2-amino-acid peptides, are used for sampling new molecules. One of four techniques is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and (4) torsion sampling.

Usage:

from openqdc.datasets import ANI1X
dataset = ANI1X()

References

https://doi.org/10.1063/1.5023802

https://github.com/aiqm/ANI1x_datasets

Source code in openqdc/datasets/potential/ani.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
class ANI1X(ANI1):
    """
    ANI-1X is made of the ANI-1 molecules plus further molecules added through active learning, for a total
    of 5,496,771 conformers of 63,865 unique molecules. New molecules are sampled from databases such as
    GDB-11 and ChEMBL, as well as generated amino acids and 2-amino-acid peptides. One of four techniques is
    used to sample conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and
    (4) torsion sampling.

    Usage:
    ```python
    from openqdc.datasets import ANI1X
    dataset = ANI1X()
    ```

    References:
        https://doi.org/10.1063/1.5023802\n
        https://github.com/aiqm/ANI1x_datasets
    """

    __name__ = "ani1x"

    # Units declared for the raw data.
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = "hartree/ang"

    # Eight levels of theory, in the same order as ``energy_target_names``.
    # NOTE(review): the last entry is WB97X_CC_PVTZ while its label below reads "def2-TZVPP" -- confirm.
    __energy_methods__ = [
        PotentialMethod.HF_CC_PVDZ,
        PotentialMethod.HF_CC_PVQZ,
        PotentialMethod.HF_CC_PVTZ,
        PotentialMethod.MP2_CC_PVDZ,
        PotentialMethod.MP2_CC_PVQZ,
        PotentialMethod.MP2_CC_PVTZ,
        PotentialMethod.WB97X_6_31G_D,
        PotentialMethod.WB97X_CC_PVTZ,
    ]

    # Energy labels as spelled in the raw archive.
    energy_target_names = [
        "HF:cc-pVDZ Total Energy",
        "HF:cc-pVQZ Total Energy",
        "HF:cc-pVTZ Total Energy",
        "MP2:cc-pVDZ Correlation Energy",
        "MP2:cc-pVQZ Correlation Energy",
        "MP2:cc-pVTZ Correlation Energy",
        "wB97x:6-31G(d) Total Energy",
        "wB97x:def2-TZVPP Total Energy",
    ]

    # Force labels exist only for the two wB97x entries.
    force_target_names = [
        "wB97x:6-31G(d) Atomic Forces",
        "wB97x:def2-TZVPP Atomic Forces",
    ]

    # One flag per energy method; only the last two (the wB97x entries) are True.
    __force_mask__ = [False, False, False, False, False, False, True, True]

    # Archive file name -> download URL.
    # NOTE(review): the key reads "ani1ccx" although this class is named "ani1x" -- confirm the key/URL pairing.
    __links__ = {"ani1ccx.hdf5.gz": "https://zenodo.org/record/4081692/files/293.hdf5.gz"}

    def convert_forces(self, x):
        """Run the parent's ``convert_forces`` on ``x`` and multiply the result by 0.529177249.

        The extra factor corrects an error in the dataset.
        """
        dataset_fix = 0.529177249  # correct the Dataset error
        return super().convert_forces(x) * dataset_fix

    def __smiles_converter__(self, x):
        """Return ``x`` unchanged; no conversion is applied for this dataset."""
        return x

ANI2X

Bases: ANI1

The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, ChEMBL, and s66x8. It adds three new elements (F, Cl, S), resulting in 4.6 million conformers from 13k chemical isomers, optimized using the LBFGS algorithm and labeled with ωB97X/6-31G*. The same sampling techniques as in ANI-1X are used for generating geometries.

Usage:

from openqdc.datasets import ANI2X
dataset = ANI2X()

References

https://doi.org/10.1021/acs.jctc.0c00121

https://github.com/aiqm/ANI1x_datasets

Source code in openqdc/datasets/potential/ani.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
class ANI2X(ANI1):
    """
    ANI-2X was built with active learning from modified versions of GDB-11, ChEMBL, and s66x8. It adds
    three new elements (F, Cl, S), giving 4.6 million conformers from 13k chemical isomers, optimized with
    the LBFGS algorithm and labeled with ωB97X/6-31G*. Geometries are generated with the same sampling
    techniques as in ANI-1X.

    Usage:
    ```python
    from openqdc.datasets import ANI2X
    dataset = ANI2X()
    ```

    References:
        https://doi.org/10.1021/acs.jctc.0c00121\n
        https://github.com/aiqm/ANI1x_datasets
    """

    __name__ = "ani2x"

    # Units declared for the raw data.
    __energy_unit__ = "hartree"
    __distance_unit__ = "ang"
    __forces_unit__ = "hartree/ang"

    # Only the level of theory used in the paper is enabled; the others stay commented out.
    __energy_methods__ = [
        # PotentialMethod.NONE,  # "b973c/def2mtzvp",
        PotentialMethod.WB97X_6_31G_D,  # "wb97x/631gd", # PAPER DATASET
        # PotentialMethod.NONE,  # "wb97md3bj/def2tzvpp",
        # PotentialMethod.NONE,  # "wb97mv/def2tzvpp",
        # PotentialMethod.NONE,  # "wb97x/def2tzvpp",
    ]

    # Energy labels, matching the enabled entries of ``__energy_methods__``.
    energy_target_names = [
        # "b973c/def2mtzvp",
        "wb97x/631gd",
        # "wb97md3bj/def2tzvpp",
        # "wb97mv/def2tzvpp",
        # "wb97x/def2tzvpp",
    ]

    force_target_names = ["wb97x/631gd"]  # "b973c/def2mtzvp",

    __force_mask__ = [True]

    # Archive file name -> download URL; only the wB97X/6-31G(d) archive is enabled.
    __links__ = {
        # "ANI-2x-B973c-def2mTZVP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1", # noqa
        # "ANI-2x-wB97MV-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1", # noqa
        "ANI-2x-wB97X-631Gd.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1",  # noqa
        # "ANI-2x-wB97X-def2TZVPP.tar.gz": "https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1", # noqa
    }

    def __smiles_converter__(self, x):
        """Return ``x`` unchanged; no conversion is applied for this dataset."""
        return x

    def read_raw_entries(self):
        """Collect the samples of every archive listed in ``__links__``.

        For each archive name, the text before its first "." is used as the
        stem of an ``.h5`` file under ``<root>/final_h5``; that file is read
        with ``read_ani2_h5`` and its samples are appended to one list.
        """
        samples = []
        for archive_name in self.__links__:
            stem = archive_name.partition(".")[0]
            samples += read_ani2_h5(p_join(self.root, "final_h5", stem + ".h5"))
        return samples