Skip to content

DES

DES370K

Bases: BaseInteractionDataset, IDES

DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries, with interaction energies computed at the CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules and ions), including water and functional groups found in proteins. Dimer geometries are generated both by QM-based optimization at the DF-LMP2/aVDZ level of theory and from condensed-phase MD simulations.

Usage:

from openqdc.datasets import DES370K
dataset = DES370K()

Reference

https://www.nature.com/articles/s41597-021-00833-x

Source code in openqdc/datasets/interaction/des.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
class DES370K(BaseInteractionDataset, IDES):
    """
    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries, with interaction
    energies computed at the CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both
    neutral molecules and ions), including water and functional groups found in proteins. Dimer geometries are
    generated both by QM-based optimization at the DF-LMP2/aVDZ level of theory and from condensed-phase MD
    simulations.

    Usage:
    ```python
    from openqdc.datasets import DES370K
    dataset = DES370K()
    ```

    Reference:
        https://www.nature.com/articles/s41597-021-00833-x
    """

    __name__ = "des370k_interaction"
    __filename__ = "DES370K.csv"
    __energy_unit__ = "kcal/mol"
    __distance_unit__ = "ang"
    __forces_unit__ = "kcal/mol/ang"

    # `__energy_methods__`, `__energy_type__` and `energy_target_names` each hold 17 entries in
    # the same order; they appear to be matched up by position, so keep them in sync.

    # 7 non-SAPT methods followed by the SAPT0 method repeated for the 10 `sapt_*` columns.
    __energy_methods__ = [
        InteractionMethod.MP2_CC_PVDZ,
        InteractionMethod.MP2_CC_PVQZ,
        InteractionMethod.MP2_CC_PVTZ,
        InteractionMethod.MP2_CBS,
        InteractionMethod.CCSD_T_CC_PVDZ,
        InteractionMethod.CCSD_T_CBS,
        InteractionMethod.CCSD_T_NN,
    ] + [InteractionMethod.SAPT0_AUG_CC_PWCVXZ] * 10

    # 8 total energies (7 non-SAPT methods + `sapt_all`) followed by the 9 SAPT components.
    __energy_type__ = [InterEnergyType.TOTAL] * 8 + [
        InterEnergyType.ES,
        InterEnergyType.EX,
        InterEnergyType.EX_S2,
        InterEnergyType.IND,
        InterEnergyType.EX_IND,
        InterEnergyType.DISP,
        InterEnergyType.EX_DISP_OS,
        InterEnergyType.EX_DISP_SS,
        InterEnergyType.DELTA_HF,
    ]

    # Names handed to `parse_des_df` when reading each CSV row.
    energy_target_names = [
        "cc_MP2_all",
        "qz_MP2_all",
        "tz_MP2_all",
        "cbs_MP2_all",
        "cc_CCSD(T)_all",
        "cbs_CCSD(T)_all",
        "nn_CCSD(T)_all",
        "sapt_all",
        "sapt_es",
        "sapt_ex",
        "sapt_exs2",
        "sapt_ind",
        "sapt_exind",
        "sapt_disp",
        "sapt_exdisp_os",
        "sapt_exdisp_ss",
        "sapt_delta_HF",
    ]
    __links__ = {
        "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip",
    }

    @property
    def csv_path(self):
        """Path of the CSV file named by `__filename__` inside `self.root`."""
        root_dir = self.root
        return os.path.join(root_dir, self.__filename__)

    def _create_subsets(self, **kwargs):
        """Return the subset label built by `create_subset` from the `smiles0` and `smiles1` keyword arguments."""
        first, second = kwargs["smiles0"], kwargs["smiles1"]
        return create_subset(first, second)

    def read_raw_entries(self) -> List[Dict]:
        """
        Read the dataset CSV and turn every row into a record.

        Each row is parsed with `parse_des_df`, given a `"subset"` entry via
        `_create_subsets` (which receives the raw row as `row` plus the parsed fields),
        and finally passed through `convert_to_record`.

        Returns:
            The list of converted records, one per CSV row, in file order.
        """
        source = self.csv_path
        logger.info(f"Reading {self.__name__} interaction data from {source}")
        frame = pd.read_csv(source)
        records = []
        for _, row in tqdm(frame.iterrows(), total=len(frame)):
            entry = parse_des_df(row, self.energy_target_names)
            # Subclasses may derive the subset from the raw row rather than the parsed fields.
            entry["subset"] = self._create_subsets(row=row, **entry)
            records.append(convert_to_record(entry))
        return records

DES5M

Bases: DES370K

DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries, with interaction energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similarly to DES370K, using QM-based optimization and MD simulations.

Usage:

from openqdc.datasets import DES5M
dataset = DES5M()

Reference

https://www.nature.com/articles/s41597-021-00833-x

Source code in openqdc/datasets/interaction/des.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
class DES5M(DES370K):
    """
    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries, with interaction
    energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similarly to
    DES370K, using QM-based optimization and MD simulations.

    Usage:
    ```python
    from openqdc.datasets import DES5M
    dataset = DES5M()
    ```

    Reference:
        https://www.nature.com/articles/s41597-021-00833-x
    """

    __name__ = "des5m_interaction"
    __filename__ = "DES5M.csv"

    # `__energy_methods__`, `__energy_type__` and `energy_target_names` each hold 14 entries in
    # the same order; they appear to be matched up by position, so keep them in sync.

    # 4 non-SAPT methods followed by the SAPT0 method repeated for the 10 `sapt_*` columns.
    __energy_methods__ = [
        InteractionMethod.MP2_CC_PVQZ,
        InteractionMethod.MP2_CC_PVTZ,
        InteractionMethod.MP2_CBS,
        InteractionMethod.CCSD_T_NN,
    ] + [InteractionMethod.SAPT0_AUG_CC_PWCVXZ] * 10

    # 5 total energies (4 non-SAPT methods + `sapt_all`) followed by the 9 SAPT components.
    __energy_type__ = [InterEnergyType.TOTAL] * 5 + [
        InterEnergyType.ES,
        InterEnergyType.EX,
        InterEnergyType.EX_S2,
        InterEnergyType.IND,
        InterEnergyType.EX_IND,
        InterEnergyType.DISP,
        InterEnergyType.EX_DISP_OS,
        InterEnergyType.EX_DISP_SS,
        InterEnergyType.DELTA_HF,
    ]

    # Names handed to `parse_des_df` when reading each CSV row.
    energy_target_names = [
        "qz_MP2_all",
        "tz_MP2_all",
        "cbs_MP2_all",
        "nn_CCSD(T)_all",
        "sapt_all",
        "sapt_es",
        "sapt_ex",
        "sapt_exs2",
        "sapt_ind",
        "sapt_exind",
        "sapt_disp",
        "sapt_exdisp_os",
        "sapt_exdisp_ss",
        "sapt_delta_HF",
    ]
    # The archive name in the URL matches the `DES5M.zip` key.
    # NOTE(review): confirm the file name against the Zenodo record's file listing.
    __links__ = {
        "DES5M.zip": "https://zenodo.org/records/5706002/files/DES5M.zip?download=1",
    }

DESS66

Bases: DES370K

DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. The protocol for estimating energies is based on the DES370K paper.

Usage:

from openqdc.datasets import DESS66
dataset = DESS66()

Reference

https://www.nature.com/articles/s41597-021-00833-x

S66: https://pubs.acs.org/doi/10.1021/ct2002946

Source code in openqdc/datasets/interaction/des.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
class DESS66(DES370K):
    """
    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset, with CCSD(T)/CBS
    dimer interaction energies. Each complex has 1 equilibrium geometry, giving 66 conformers in total.
    The protocol for estimating energies is based on the DES370K paper.

    Usage:
    ```python
    from openqdc.datasets import DESS66
    dataset = DESS66()
    ```

    Reference:
        https://www.nature.com/articles/s41597-021-00833-x\n
        S66: https://pubs.acs.org/doi/10.1021/ct2002946
    """

    __name__ = "des_s66"
    __filename__ = "DESS66.csv"
    __links__ = {
        "DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1",
    }

    def _create_subsets(self, **kwargs):
        """Return the `system_name` field of the raw row passed as `row` as the subset label."""
        raw_row = kwargs["row"]
        return raw_row["system_name"]

DESS66x8

Bases: DESS66

DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.

Usage:

from openqdc.datasets import DESS66x8
dataset = DESS66x8()

Reference

https://www.nature.com/articles/s41597-021-00833-x

Source code in openqdc/datasets/interaction/des.py
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
class DESS66x8(DESS66):
    """
    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset, with CCSD(T)/CBS
    dimer interaction energies. Each complex has 1 equilibrium geometry and 8 geometries along the
    dissociation curve, giving 592 conformers in total. The protocol for estimating energies is based
    on the DES370K paper.

    Usage:
    ```python
    from openqdc.datasets import DESS66x8
    dataset = DESS66x8()
    ```

    Reference:
        https://www.nature.com/articles/s41597-021-00833-x
    """

    # NOTE(review): 66 complexes x (1 + 8) geometries would give 594, not the 592 stated in the
    # docstring — confirm the conformer count against the CSV.
    __name__ = "des_s66x8"
    __filename__ = "DESS66x8.csv"
    __links__ = {
        "DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1",
    }