Skip to content

QM1B

QM1B

Bases: BaseDataset

QM1B is a dataset containing 1 billion conformations of 1.09M small molecules, generated with a custom PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. Electronic properties for each conformation are then calculated using the B3LYP density functional and the STO-3G basis set.

Usage:

from openqdc.datasets import QM1B
dataset = QM1B()

References

https://arxiv.org/pdf/2311.01135

https://github.com/graphcore-research/qm1b-dataset/

Source code in openqdc/datasets/potential/qm1b.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class QM1B(BaseDataset):
    """
    QM1B provides 1 billion conformations of 1.09M small molecules, computed with a custom PySCF
    library that uses IPUs for hardware acceleration. Molecules have 9-11 heavy atoms and are
    subsampled from the Generated Data Bank (GDB); 1000 geometries per molecule are produced with RDKit.
    Electronic properties of every conformation are evaluated with the B3LYP density functional
    and the STO-3G basis set.

    Usage:
    ```python
    from openqdc.datasets import QM1B
    dataset = QM1B()
    ```

    References:
        https://arxiv.org/pdf/2311.01135\n
        https://github.com/graphcore-research/qm1b-dataset/
    """

    __name__ = "qm1b"

    # One energy method is declared; the force method/target lists are empty.
    __energy_methods__ = [PotentialMethod.B3LYP_STO3G]
    __force_methods__ = []

    energy_target_names = ["b3lyp/sto-3g"]
    force_target_names = []

    __energy_unit__ = "ev"
    __distance_unit__ = "bohr"
    __forces_unit__ = "ev/bohr"

    # File name -> download URL. The validation file comes first, followed by the
    # 256 numbered parts whose figshare file ids are looked up in FILE_NUM.
    __links__ = {
        "qm1b_validation.parquet": "https://ndownloader.figshare.com/files/43005175",
        **{
            f"part_{shard:03d}.parquet": f"https://ndownloader.figshare.com/files/{FILE_NUM[shard]}"
            for shard in range(256)
        },
    }

    @property
    def root(self):
        """Directory holding the raw files: the ``qm1b`` folder inside the local cache."""
        return p_join(get_local_cache(), "qm1b")

    @property
    def preprocess_path(self):
        """Directory for preprocessed data under ``root``; created on access if missing."""
        target = p_join(self.root, "preprocessed", self.__name__)
        os.makedirs(target, exist_ok=True)
        return target

    def read_raw_entries(self):
        """
        Read every parquet file of the dataset and convert each row with ``extract_from_row``.

        The 256 ``part_XXX.parquet`` files are processed first, then ``qm1b_validation.parquet``.
        Files are dispatched with the "processes" scheduler; rows within a file with the
        "threads" scheduler.

        Returns:
            A single flat list with one ``extract_from_row`` result per row, in file order.
        """
        raw_dir = self.root
        parquet_paths = [p_join(raw_dir, f"part_{shard:03d}.parquet") for shard in range(256)]
        parquet_paths.append(p_join(raw_dir, "qm1b_validation.parquet"))

        def read_entries_parallel(filename):
            frame = pd.read_parquet(filename)

            def extract_at(row_idx):
                # Positional lookup of one row of the frame loaded above.
                return extract_from_row(frame.iloc[row_idx])

            row_indices = list(range(len(frame)))
            return dm.utils.parallelized(extract_at, row_indices, scheduler="threads", progress=False)

        per_file_entries = dm.utils.parallelized(
            read_entries_parallel,
            parquet_paths,
            scheduler="processes",
            progress=True,
        )

        # Flatten the per-file lists into one list of entries.
        entries = []
        for file_entries in per_file_entries:
            entries.extend(file_entries)
        return entries

QM1B_SMALL

Bases: QM1B

QM1B_SMALL is a subset of the QM1B dataset containing at most 15 randomly selected conformers per molecule.

Usage:

from openqdc.datasets import QM1B_SMALL
dataset = QM1B_SMALL()

Source code in openqdc/datasets/potential/qm1b.py
145
146
147
148
149
150
151
152
153
154
155
156
class QM1B_SMALL(QM1B):
    """
    QM1B_SMALL is a reduced version of the QM1B dataset that keeps at most 15 random conformers
    for each molecule.

    Usage:
    ```python
    from openqdc.datasets import QM1B_SMALL
    dataset = QM1B_SMALL()
    ```
    """

    # Only the dataset name is overridden; everything else is inherited from QM1B.
    __name__ = "qm1b_small"