
PCQM

PCQM_B3LYP

Bases: PCQM_PM6

PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules, ranging from essential compounds to biomolecules. Molecular geometries are optimized using PM6. Using the optimized geometries, the electronic structure and properties are then calculated with the B3LYP/6-31G* method.

Usage:

```python
from openqdc.datasets import PCQM_B3LYP
dataset = PCQM_B3LYP()
```

References

https://arxiv.org/abs/2305.18454
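
The class attributes shown in the source listing below describe what the dataset exposes. A minimal inspection sketch, assuming only the attributes from the listing (__energy_methods__, energy_target_names) plus standard BaseDataset indexing (len(dataset), dataset[0]), which is an assumption not shown here:

```python
from openqdc.datasets import PCQM_B3LYP

dataset = PCQM_B3LYP()

# Attributes taken from the class definition below.
print(dataset.__energy_methods__)   # ["b3lyp/6-31g*"]
print(dataset.energy_target_names)  # ["b3lyp"]

# Assumed BaseDataset behaviour (not part of this listing): indexing returns a
# per-molecule entry and len() gives the number of stored conformations.
print(len(dataset))
entry = dataset[0]
```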

Source code in openqdc/datasets/potential/pcqm.py
class PCQM_B3LYP(PCQM_PM6):
    """
    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules, ranging from essential compounds to
    biomolecules. Molecular geometries are optimized using PM6. Using the optimized geometries,
    the electronic structure and properties are calculated with the B3LYP/6-31G* method.

    Usage:
    ```python
    from openqdc.datasets import PCQM_B3LYP
    dataset = PCQM_B3LYP()
    ```

    References:
        https://arxiv.org/abs/2305.18454
    """

    __name__ = "pubchemqc_b3lyp"
    __energy_methods__ = ["b3lyp/6-31g*"]
    energy_target_names = ["b3lyp"]

PCQM_PM6

Bases: BaseDataset

PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized molecular geometries and electronic properties. To generate the dataset, only molecules with molecular weights below 1000 g/mol are taken from the PubChem FTP site. The initial structure is generated using OpenBabel and then optimized with the semi-empirical PM6 method. Energies are also computed with PM6.

Usage:

```python
from openqdc.datasets import PCQM_PM6
dataset = PCQM_PM6()
```

References

https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740
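
Because the raw PM6 archives are large, the preprocess method shown in the source listing below converts them once into memory-mapped arrays in the local cache. A minimal sketch, assuming the constructor accepts energy_unit and distance_unit keyword arguments as in other openqdc datasets (an assumption; only preprocess, energy_unit and distance_unit appear in the listing itself):

```python
from openqdc.datasets import PCQM_PM6

# energy_unit / distance_unit constructor kwargs are assumed from openqdc conventions.
dataset = PCQM_PM6(energy_unit="ev", distance_unit="ang")

# preprocess() (defined in the listing below) reads the raw .pkl archives,
# collates them and caches memory-mapped arrays; it skips work if already cached.
dataset.preprocess(overwrite=False)
print(dataset.energy_unit, dataset.distance_unit)
```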

Source code in openqdc/datasets/potential/pcqm.py
class PCQM_PM6(BaseDataset):
    """
    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized
    molecular geometries and electronic properties. To generate the dataset, only molecules with molecular
    weights below 1000 g/mol are taken from the PubChem FTP site. The initial structure is generated using
    OpenBabel and then optimized with the semi-empirical PM6 method. Energies are also computed with PM6.

    Usage:
    ```python
    from openqdc.datasets import PCQM_PM6
    dataset = PCQM_PM6()
    ```

    References:
        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740
    """

    __name__ = "pubchemqc_pm6"
    __energy_methods__ = [PotentialMethod.PM6]

    energy_target_names = ["pm6"]

    __force_methods__ = []
    force_target_names = []

    @property
    def root(self):
        return p_join(get_local_cache(), "pubchemqc")

    @property
    def preprocess_path(self):
        path = p_join(self.root, "preprocessed", self.__name__)
        os.makedirs(path, exist_ok=True)
        return path

    def collate_list(self, list_entries):
        predicat = list_entries is not None and len(list_entries) > 0
        list_entries = [x for x in list_entries if x is not None]
        if predicat:
            res = super().collate_list(list_entries)
        else:
            res = None
        return res

    @property
    def data_types(self):
        return {
            "atomic_inputs": np.float32,
            "position_idx_range": np.int32,
            "energies": np.float32,
            "forces": np.float32,
        }

    def read_raw_entries(self):
        arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
        f = lambda x: self.collate_list(read_preprocessed_archive(x))
        samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True)
        samples = [x for x in samples if x is not None]
        return samples

    def preprocess(self, overwrite=False):
        if overwrite or not self.is_preprocessed():
            logger.info("Preprocessing data and saving it to cache.")
            logger.info(
                f"Dataset {self.__name__} data with the following units:\n"
                f"Energy: {self.energy_unit}, Distance: {self.distance_unit}, "
                f"Forces: {self.force_unit if self.__force_methods__ else 'None'}"
            )
            entries = self.read_raw_entries()
            self.collate_and_save_list(entries)

    def collate_and_save_list(self, list_entries):
        n_molecules, n_atoms = 0, 0
        for i in range(len(list_entries)):
            list_entries[i]["position_idx_range"] += n_atoms
            n_atoms += list_entries[i]["position_idx_range"].max()
            n_molecules += list_entries[i]["position_idx_range"].shape[0]

        for key in self.data_keys:
            first = list_entries[0][key]
            shape = (n_molecules, *first.shape[1:])
            local_path = p_join(self.preprocess_path, f"{key}.mmap")
            out = np.memmap(local_path, mode="w+", dtype=first.dtype, shape=shape)

            start = 0
            for i in range(len(list_entries)):
                x = list_entries[i].pop(key)
                n = x.shape[0]
                out[start : start + n] = x
                out.flush()
                start += n  # advance the write offset past this chunk
            push_remote(local_path, overwrite=True)

        # save molecule names (converted to InChIKeys), subset and n_atoms
        tmp, n = dict(name=[]), len(list_entries)
        local_path = p_join(self.preprocess_path, "props.pkl")
        names = [list_entries[i].pop("name") for i in range(n)]
        f = lambda xs: [dm.to_inchikey(x) for x in xs]
        res = dm.parallelized(f, names, n_jobs=-1, progress=False)
        for x in res:
            tmp["name"] += x
        for key in ["subset", "n_atoms"]:
            tmp[key] = []
            for i in range(n):
                tmp[key] += list(list_entries[i].pop(key))
        with open(local_path, "wb") as f:
            pkl.dump(tmp, f)
        push_remote(local_path, overwrite=True)
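
The central idea in collate_and_save_list is to shift each chunk's position_idx_range so that per-chunk atom indices become global indices in the concatenated memory-mapped arrays. A standalone sketch of that offsetting idea, using hypothetical toy arrays that are not part of openqdc:

```python
import numpy as np

# Two toy chunks; each row of position_idx_range is a molecule's [start, end) atom slice.
chunks = [
    {"position_idx_range": np.array([[0, 3], [3, 7]])},  # chunk 0: 2 molecules, 7 atoms
    {"position_idx_range": np.array([[0, 2], [2, 6]])},  # chunk 1: 2 molecules, 6 atoms
]

n_atoms = 0
for chunk in chunks:
    local_atoms = int(chunk["position_idx_range"].max())  # atoms in this chunk
    chunk["position_idx_range"] += n_atoms                # shift to global atom indices
    n_atoms += local_atoms

print(chunks[1]["position_idx_range"])  # [[ 7  9] [ 9 13]] -> global indices
print(n_atoms)                          # 13 atoms in total
```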