Skip to content

Utils

check_file(path)

Checks whether the file is present on the local filesystem.

Source code in openqdc/utils/io.py
123
124
125
def check_file(path) -> bool:
    """
    Tell whether ``path`` exists.

    Parameters:
        path : Path to test; forwarded unchanged to ``os.path.exists``.

    Returns:
        bool: the result of ``os.path.exists(path)``
    """
    exists = os.path.exists(path)
    return exists

create_hdf5_file(hdf5_file_path)

Creates hdf5 file with fsspec

Source code in openqdc/utils/io.py
179
180
181
182
183
184
def create_hdf5_file(hdf5_file_path: str):
    """
    Open ``hdf5_file_path`` through fsspec and wrap it in an ``h5py.File``.

    Parameters:
        hdf5_file_path : Location handed to ``fsspec.open`` in ``"wb"`` mode.

    Returns:
        The ``h5py.File`` built over the opened handle with mode ``"a"``.
    """
    handle = fsspec.open(hdf5_file_path, "wb")
    # When the object returned by fsspec exposes ``open``, call it to obtain
    # the file object that is given to h5py; otherwise use it directly.
    stream = handle.open() if hasattr(handle, "open") else handle
    return h5py.File(stream, "a")

get_conversion(in_unit, out_unit)

Utility function to get the conversion function between two units.

Parameters:

Name Type Description Default
in_unit

The input unit

required
out_unit

The output unit

required

Returns:

Type Description
Callable[[float], float]

The conversion function

Source code in openqdc/utils/units.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:
    """
    Look up the callable that converts values from one unit to another.

    Unit names are lower-cased and stripped of surrounding whitespace
    before they are compared or used to build the registry key.

    Parameters:
        in_unit : Name of the unit to convert from
        out_unit : Name of the unit to convert to

    Returns:
        An identity function when both names normalise to the same unit,
        otherwise the ``CONVERSION_REGISTRY`` entry stored under
        ``convert_<in_unit>_to_<out_unit>``.

    Raises:
        ConversionNotDefinedError: If the registry has no such entry.
    """
    source = in_unit.lower().strip()
    target = out_unit.lower().strip()
    registry_key = "".join(("convert_", source, "_to_", target))

    # Same unit on both sides: nothing to convert, no registry lookup needed.
    if source == target:
        return lambda x: x

    if registry_key in CONVERSION_REGISTRY:
        return CONVERSION_REGISTRY[registry_key]
    raise ConversionNotDefinedError(in_unit, out_unit)

get_local_cache()

Returns the local cache directory. It creates it if it does not exist.

Returns:

Name Type Description
str str

path to the local cache directory

Source code in openqdc/utils/io.py
48
49
50
51
52
53
54
55
56
57
def get_local_cache() -> str:
    """
    Resolve the local cache directory and make sure it exists.

    Environment variables in ``_OPENQDC_CACHE_DIR`` are expanded first,
    then a leading ``~``. The resulting directory is created (including
    parents) if it is missing.

    Returns:
        str: expanded path to the local cache directory
    """
    with_env = os.path.expandvars(_OPENQDC_CACHE_DIR)
    resolved = os.path.expanduser(with_env)
    os.makedirs(resolved, exist_ok=True)
    return resolved

get_remote_cache(write_access=False)

Returns the remote cache entry point, selected according to whether write access is requested.

Source code in openqdc/utils/io.py
60
61
62
63
64
65
66
67
68
69
70
def get_remote_cache(write_access=False) -> str:
    """
    Pick the remote cache entry point for the requested access level.

    Parameters:
        write_access : When truthy, the fixed write entry point
            ``"openqdc/v1"`` is returned. Otherwise the entry point is
            looked up in ``_OPENQDC_DOWNLOAD_API`` using the value of the
            ``OPENQDC_DOWNLOAD_API`` environment variable (``"s3"`` when
            the variable is unset).

    Returns:
        str: the selected entry point
    """
    if not write_access:
        # Earlier read endpoint, kept for reference:
        # "https://storage.googleapis.com/qmdata-public/openqdc"
        backend = os.environ.get("OPENQDC_DOWNLOAD_API", "s3")
        return _OPENQDC_DOWNLOAD_API.get(backend)

    # Earlier write endpoint, kept for reference: "gs://qmdata-public/openqdc"
    return "openqdc/v1"

load_hdf5_file(hdf5_file_path)

Loads hdf5 file with fsspec

Source code in openqdc/utils/io.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def load_hdf5_file(hdf5_file_path: str):
    """
    Open an existing HDF5 file through fsspec and return it as ``h5py.File``.

    Parameters:
        hdf5_file_path : Location of the file; it must pass ``check_file``.

    Returns:
        The ``h5py.File`` built over the handle opened in ``"rb"`` mode.

    Raises:
        FileNotFoundError: If ``check_file`` reports the path as missing.
    """
    if not check_file(hdf5_file_path):
        raise FileNotFoundError(f"File {hdf5_file_path} does not exist on GCS and local.")

    handle = fsspec.open(hdf5_file_path, "rb")
    # When the object returned by fsspec exposes ``open``, call it to obtain
    # the file object that is given to h5py; otherwise use it directly.
    stream = handle.open() if hasattr(handle, "open") else handle

    # Workaround noted for enabling multiprocessing (currently disabled), see
    # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801
    # fsspec.asyn.iothread[0] = None
    # fsspec.asyn.loop[0] = None

    return h5py.File(stream)

load_json(path)

Loads json file

Source code in openqdc/utils/io.py
187
188
189
190
def load_json(path):
    """
    Read and parse a JSON document opened through fsspec.

    Parameters:
        path : Location passed to ``fsspec.open`` in text read mode.

    Returns:
        The object produced by ``json.load``.
    """
    with fsspec.open(path, "r") as stream:
        parsed = json.load(stream)
    return parsed

load_pkl(path, check=True)

Loads a pickle (.pkl) file.

Source code in openqdc/utils/io.py
151
152
153
154
155
156
157
158
def load_pkl(path, check=True):
    """
    Read a pickled object from ``path``.

    Parameters:
        path : Path opened with the builtin ``open`` in binary mode.
        check : When truthy, verify the path with ``check_file`` first.

    Returns:
        The object produced by unpickling the file contents.

    Raises:
        FileNotFoundError: If ``check`` is truthy and ``check_file``
            reports the path as missing.
    """
    if check and not check_file(path):
        raise FileNotFoundError(f"File {path} does not exist on GCS and local.")

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as stream:
        payload = pkl.load(stream)
    return payload

makedirs(path, exist_ok=True)

Creates directory

Source code in openqdc/utils/io.py
118
119
120
def makedirs(path, exist_ok=True):
    """
    Create the directory ``path``, including any missing parents.

    Parameters:
        path : Directory to create.
        exist_ok : Forwarded to ``os.makedirs``; when true, an already
            existing directory is not treated as an error.
    """
    allow_existing = exist_ok
    os.makedirs(path, exist_ok=allow_existing)

read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)

Extracts data from the HDF5 archive file.

Source code in openqdc/utils/io.py
288
289
290
291
292
293
294
295
296
297
def read_qc_archive_h5(
    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: Optional[List[str]] = None
) -> List[Dict[str, np.ndarray]]:
    """
    Extracts data from the HDF5 archive file.

    The datasets of every top-level group are gathered into one dictionary
    keyed by dataset name (a later group overwrites an earlier one on a name
    clash), then one sample is built per entry of ``"molecule_id"``.

    Parameters:
        raw_path : Path of the HDF5 archive, opened with ``load_hdf5_file``.
        subset : Passed through to ``extract_entry``.
        energy_target_names : Passed through to ``extract_entry``.
        force_target_names : Passed through to ``extract_entry``.

    Returns:
        One ``extract_entry`` result per index of the ``"molecule_id"`` dataset.
    """
    # ``[:]`` reads each dataset fully into memory, so the file is no longer
    # needed afterwards; the ``with`` block closes it instead of leaking it.
    with load_hdf5_file(raw_path) as data:
        data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}

    n = len(data_t["molecule_id"])
    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]
    return samples

save_pkl(file, path)

Saves pkl file

Source code in openqdc/utils/io.py
134
135
136
137
138
def save_pkl(file, path):
    """
    Pickle an object to ``path`` through fsspec.

    Parameters:
        file : The object to serialise.
        path : Destination passed to ``fsspec.open`` in ``"wb"`` mode.
    """
    logger.info(f"Saving file at {path}")
    with fsspec.open(path, "wb") as sink:
        pkl.dump(file, sink)

set_cache_dir(d)

Optionally set the _OPENQDC_CACHE_DIR directory.

Parameters:

Name Type Description Default
d str

path to a local folder.

required
Source code in openqdc/utils/io.py
35
36
37
38
39
40
41
42
43
44
45
def set_cache_dir(d):
    r"""
    Override the module-level ``_OPENQDC_CACHE_DIR`` setting.

    A leading ``~`` in ``d`` is expanded and the result is normalised with
    ``os.path.normpath`` before it is stored. Passing ``None`` leaves the
    current value unchanged.

    Args:
        d (str): path to a local folder, or ``None`` to do nothing.
    """
    global _OPENQDC_CACHE_DIR
    if d is not None:
        expanded = os.path.expanduser(d)
        _OPENQDC_CACHE_DIR = os.path.normpath(expanded)