68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
class PCQM_PM6(BaseDataset):
    """
    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized
    molecular geometries and electronic properties. To generate the dataset, only molecules with weights less
    than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel
    and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also
    computed using the PM6 method.

    Usage:
    ```python
    from openqdc.datasets import PCQM_PM6
    dataset = PCQM_PM6()
    ```

    References:
        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740
    """

    __name__ = "pubchemqc_pm6"
    __energy_methods__ = [PotentialMethod.PM6]
    energy_target_names = ["pm6"]
    __force_methods__ = []
    force_target_names = []

    @property
    def root(self):
        """Local cache directory shared by the PubChemQC datasets."""
        return p_join(get_local_cache(), "pubchemqc")

    @property
    def preprocess_path(self):
        """Directory holding this dataset's preprocessed artifacts (created on first access)."""
        path = p_join(self.root, "preprocessed", self.__name__)
        os.makedirs(path, exist_ok=True)
        return path

    def collate_list(self, list_entries):
        """Collate raw entries, tolerating missing (None) items.

        Parameters
        ----------
        list_entries : list or None
            Raw entries read from one archive; individual items may be None.

        Returns
        -------
        The parent class's collated result with None items filtered out, or
        None when the input is None or empty.
        """
        predicat = list_entries is not None and len(list_entries) > 0
        list_entries = [x for x in list_entries if x is not None]
        return super().collate_list(list_entries) if predicat else None

    @property
    def data_types(self):
        """Numpy dtypes used for each memmapped data key."""
        return {
            "atomic_inputs": np.float32,
            "position_idx_range": np.int32,
            "energies": np.float32,
            "forces": np.float32,
        }

    def read_raw_entries(self):
        """Read and collate every raw ``.pkl`` archive for this energy method.

        Returns
        -------
        list
            One collated sample dict per archive; archives that collate to
            None are dropped.
        """
        # NOTE(review): the directory name is the f-string rendering of a
        # PotentialMethod member — confirm this matches the on-disk layout.
        arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
        f = lambda x: self.collate_list(read_preprocessed_archive(x))
        samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True)
        samples = [x for x in samples if x is not None]
        return samples

    def preprocess(self, overwrite=False):
        """Run preprocessing and cache the result, unless already done.

        Parameters
        ----------
        overwrite : bool, default False
            When True, re-run preprocessing even if a cached copy exists.
        """
        if overwrite or not self.is_preprocessed():
            logger.info("Preprocessing data and saving it to cache.")
            logger.info(
                f"Dataset {self.__name__} data with the following units:\n"
                f"Energy: {self.energy_unit}, Distance: {self.distance_unit}, "
                f"Forces: {self.force_unit if self.__force_methods__ else 'None'}"
            )
            entries = self.read_raw_entries()
            self.collate_and_save_list(entries)

    def collate_and_save_list(self, list_entries):
        """Merge per-archive collated entries into memmap files plus ``props.pkl``.

        Each entry's ``position_idx_range`` is shifted by the number of atoms
        that precede it, so that after concatenation the ranges index into one
        global atom array. The merged arrays are written to disk-backed
        memmaps and pushed to the remote cache.

        Parameters
        ----------
        list_entries : list of dict
            Collated entries as produced by :meth:`collate_list`.
        """
        n_molecules, n_atoms = 0, 0
        for entry in list_entries:
            entry["position_idx_range"] += n_atoms
            # BUGFIX: after the shift above, the max of the ranges already IS
            # the new global atom count; `+=` here double-counted every
            # previous offset and produced wrong ranges for entries >= 2.
            n_atoms = entry["position_idx_range"].max()
            n_molecules += entry["position_idx_range"].shape[0]

        for key in self.data_keys:
            first = list_entries[0][key]
            # NOTE(review): all keys are sized by n_molecules — assumes every
            # per-entry array for these keys has one row per molecule; confirm
            # against the collated entry layout.
            shape = (n_molecules, *first.shape[1:])
            local_path = p_join(self.preprocess_path, f"{key}.mmap")
            out = np.memmap(local_path, mode="w+", dtype=first.dtype, shape=shape)

            start = 0
            for entry in list_entries:
                x = entry.pop(key)
                n = x.shape[0]
                out[start : start + n] = x
                # BUGFIX: advance the write cursor — without this, every chunk
                # after the first overwrote offset 0 and the memmap ended up
                # holding only the last chunk's data.
                start += n
                out.flush()
            push_remote(local_path, overwrite=True)

        # Save molecule identifiers (converted to InChIKeys), subsets and atom counts.
        tmp, n = dict(name=[]), len(list_entries)
        local_path = p_join(self.preprocess_path, "props.pkl")
        names = [list_entries[i].pop("name") for i in range(n)]
        f = lambda xs: [dm.to_inchikey(x) for x in xs]
        res = dm.parallelized(f, names, n_jobs=-1, progress=False)
        for x in res:
            tmp["name"] += x
        for key in ["subset", "n_atoms"]:
            tmp[key] = []
            for i in range(n):
                tmp[key] += list(list_entries[i].pop(key))
        with open(local_path, "wb") as f:
            pkl.dump(tmp, f)
        push_remote(local_path, overwrite=True)