WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / IT CANNOT LOGIN OR REGISTER ACCOUNTS / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FOUND SOMETHING MAY NOT GOOD FOR EVERYONE, CONTACT ADMIN AT ilovescratch@foxmail.com
Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions data/thermoml_archive/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
git+https://github.com/sustainable-processes/thermopyl
tqdm
pyyaml
140 changes: 140 additions & 0 deletions data/thermoml_archive/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import hashlib
import pathlib
import tarfile
import warnings

import pandas as pd
import requests
import tqdm
import yaml
from thermopyl import Parser

from chemnlp.data_val.model import Dataset


def get_and_transform_data():
"""Downloads the archived version of ThermoML, extracts it and
parses the provided XML files with thermopyl to construct a flat csv.

"""
# get raw data
fname = "ThermoML.v2020-09-30.tgz"
download_path = pathlib.Path(__file__).parent / fname
remote_data_path = f"https://data.nist.gov/od/ds/mds2-2422/{fname}"
sha256_checksum = "231161b5e443dc1ae0e5da8429d86a88474cb722016e5b790817bb31c58d7ec2"
final_csv_path = pathlib.Path(__file__).parent / "thermoml_archive.csv"
final_expected_csv_checksum = ""

if not download_path.exists():
data = requests.get(remote_data_path)
with open(download_path, "wb") as f:
for chunk in tqdm.tqdm(
data.iter_content(chunk_size=8192), desc="Downloading archive"
):
f.write(chunk)

# check if checksum is correct
sha256 = hashlib.sha256()
with open(download_path, "rb") as f:
for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"):
sha256.update(chunk)

if received_hash := sha256.hexdigest() != sha256_checksum:
raise RuntimeError(
"Downloaded file did not match expected checksum -- "
"either a new version has been released or something has gone wrong!\n"
f"Expected: {sha256_checksum}\n"
f"Received: {received_hash}"
)

# Extract tar.gz archive
with tarfile.open(download_path, "r:*") as tar:
tar.extractall(pathlib.Path(__file__).parent)

# Loop through journal DOI folders and scrape files

if final_csv_path.exists():
sha256 = hashlib.sha256()
with open(final_csv_path, "rb") as f:
for chunk in tqdm.tqdm(
iter(lambda: f.read(8192), b""), desc="Checking hash"
):
sha256.update(chunk)
if sha256.hexdigest() != final_expected_csv_checksum:
warnings.warn(
"Old CSV file did not match expected checksum, will try to recreate."
)
final_csv_path.rename(final_csv_path.with_suffix(".old.csv"))

root_dois = ("10.1007", "10.1016", "10.1021")

num_points = 0
num_failed = 0
for doi in root_dois:
for path in tqdm.tqdm(
(pathlib.Path(__file__).parent / doi).glob("*.xml"),
desc=f"Looping over files in {doi}",
):
with open(path, "r") as f:
try:
pd.DataFrame(Parser(path).parse()).to_csv(final_csv_path, mode="a")
num_points += 1
except Exception:
num_failed += 1

print(f"Ingested {num_points} with {num_failed} failures.")

sha256 = hashlib.sha256()
with open(final_csv_path, "rb") as f:
for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"):
sha256.update(chunk)

if csv_hash := sha256.hexdigest() != final_expected_csv_checksum:
warnings.warn(
"Final CSV file did not match expected checksum!\n"
f"Expected: {final_expected_csv_checksum}\n"
f"Received: {csv_hash}"
)

# create metadata
meta = Dataset(
**{
"name": "thermoml_archive",
"description": "ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals.", # noqa
"identifiers": [
{
"id": "",

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it ok to not provide an id value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this is unfinished (as mentioned in PR desc), this is the main bit that needs feedback/further discussion.

"type": "inchi",
},
{
"id": "",
"type": "inchikey",
},
],
"license": "https://www.nist.gov/open/license",
"links": [
{
"url": "https://doi.org/10.18434/mds2-2422",
"description": "data publication",
},
{
"url": "https://www.nist.gov/publications/towards-improved-fairness-thermoml-archive",
"description": "NIST publication description",
},
{
"url": "https://trc.nist.gov/ThermoML",
"description": "Live database hosted at NIST Thermodynamics Research Center",
},
],
"num_points": num_points,
"bibtex": [
"@article{Riccardi2022,title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},author = {Riccardi, Demian and Trautt, Zachary and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},year = {2022},journal = {Journal of Computational Chemistry},volume = {43},number = {12},pages = {879--887},doi = {10.1002/jcc.26842},langid = {english}}", # noqa
],
}
)
with open("meta.yaml", "w") as f:
yaml.dump(meta.dict(), f, sort_keys=False)


if __name__ == "__main__":
get_and_transform_data()