diff --git a/.env.example b/.env.example index ab2c165c..cb94fd29 100644 --- a/.env.example +++ b/.env.example @@ -43,6 +43,16 @@ AWS_DEFAULT_REGION=us-east-1 AWS_BUCKET= AWS_USE_PATH_STYLE_ENDPOINT=false +# Monthly export → Zenodo (https://zenodo.org/records/13897048) +ZENODO_ENABLED=false +ZENODO_ACCESS_TOKEN= +ZENODO_LATEST_DEPOSITION_ID=13897048 +ZENODO_CONCEPT_DOI=10.5281/zenodo.13382750 +ZENODO_API_URL=https://zenodo.org/api +ZENODO_AUTO_PUBLISH=false +ZENODO_DRY_RUN=false +ZENODO_RELEASE_NOTES= + PUSHER_APP_ID= PUSHER_APP_KEY= PUSHER_APP_SECRET= diff --git a/config/services.php b/config/services.php index 748b8563..770bf6aa 100644 --- a/config/services.php +++ b/config/services.php @@ -68,4 +68,15 @@ 'cas_key' => env('CAS_KEY'), ], + 'zenodo' => [ + 'enabled' => (bool) env('ZENODO_ENABLED', false), + 'access_token' => env('ZENODO_ACCESS_TOKEN'), + 'latest_deposition_id' => env('ZENODO_LATEST_DEPOSITION_ID'), + 'concept_doi' => env('ZENODO_CONCEPT_DOI', '10.5281/zenodo.13382750'), + 'api_url' => env('ZENODO_API_URL', 'https://zenodo.org/api'), + 'auto_publish' => (bool) env('ZENODO_AUTO_PUBLISH', false), + 'dry_run' => (bool) env('ZENODO_DRY_RUN', false), + 'release_notes' => env('ZENODO_RELEASE_NOTES'), + ], + ]; diff --git a/resources/scripts/python/exports/generate_exports.py b/resources/scripts/python/exports/generate_exports.py index 8e164e21..2b289260 100644 --- a/resources/scripts/python/exports/generate_exports.py +++ b/resources/scripts/python/exports/generate_exports.py @@ -576,7 +576,9 @@ def main(): sdf_filenames["light_sdf"] # e.g. 
coconut_sdf_2d_lite-03-2025.sdf ) - # Now we can safely remove the CSV files (since we've read them into SDF) + # Zip CSV exports for S3/Zenodo downloads, then remove the uncompressed files + zip_file(csv_with_collection) + zip_file(csv_without_collection) cleanup_files(csv_with_collection, csv_without_collection) # Zip the 2D SDF files, remove originals @@ -652,5 +654,19 @@ def main(): print("All files have been successfully uploaded to S3.") + # 9. Publish to Zenodo (optional; configured via ZENODO_* env variables) + try: + from publish_zenodo import publish_monthly_release + + publish_monthly_release( + env_vars=env_vars, + backup_path=backup_path, + month_year=timestamp, + full_dump_path=full_dump_path if os.path.exists(full_dump_path) else None, + db_params=db_params, + ) + except ImportError as exc: + print(f"Zenodo publish skipped (publish_zenodo module unavailable): {exc}") + if __name__ == "__main__": main() diff --git a/resources/scripts/python/exports/publish_zenodo.py b/resources/scripts/python/exports/publish_zenodo.py new file mode 100644 index 00000000..068eb9f5 --- /dev/null +++ b/resources/scripts/python/exports/publish_zenodo.py @@ -0,0 +1,467 @@ +""" +Publish monthly COCONUT exports to Zenodo as a new dataset version. 
DEFAULT_API_URL = "https://zenodo.org/api"


def load_env(file_path: str) -> dict[str, str]:
    """Parse a dotenv-style file into a ``{key: value}`` dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Each remaining line is split on the first ``=``; the key and
    value are stripped of surrounding whitespace, and the value additionally
    has surrounding double and single quotes removed.

    Args:
        file_path: Path of the env file to read.

    Returns:
        The parsed variables, or an empty dict when the file does not exist.
    """
    env_vars: dict[str, str] = {}
    if not os.path.exists(file_path):
        return env_vars

    # Decode explicitly as UTF-8 (tolerating a BOM) instead of relying on the
    # locale: the env file may carry non-ASCII characters even in comments.
    with open(file_path, encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            # Strip the key too so that "KEY = value" is stored under "KEY".
            env_vars[key.strip()] = value.strip().strip('"').strip("'")
    return env_vars


def env_flag(env_vars: dict[str, str], key: str, default: bool = False) -> bool:
    """Interpret ``env_vars[key]`` as a boolean switch.

    The value is true when, ignoring case and surrounding whitespace, it is
    one of ``1``, ``true``, ``yes`` or ``on``. When the key is absent,
    ``default`` is interpreted the same way.
    """
    value = str(env_vars.get(key, default)).strip().lower()
    return value in ("1", "true", "yes", "on")


def get_zenodo_config(env_vars: dict[str, str]) -> dict[str, Any] | None:
    """Build the Zenodo publishing configuration from ``ZENODO_*`` variables.

    Args:
        env_vars: Mapping of environment variable names to string values.

    Returns:
        ``None`` when ``ZENODO_ENABLED`` is not truthy; otherwise a dict with
        the keys ``token``, ``deposition_id`` (int), ``api_url`` (no trailing
        slash), ``auto_publish``, ``dry_run``, ``release_notes`` and
        ``concept_doi``.

    Raises:
        ValueError: If publishing is enabled but the access token or the
            deposition id is missing, or the deposition id is not an integer.
    """
    if not env_flag(env_vars, "ZENODO_ENABLED"):
        return None

    token = env_vars.get("ZENODO_ACCESS_TOKEN", "").strip()
    deposition_id = env_vars.get("ZENODO_LATEST_DEPOSITION_ID", "").strip()

    if not token:
        raise ValueError("ZENODO_ENABLED is true but ZENODO_ACCESS_TOKEN is not set")
    if not deposition_id:
        raise ValueError("ZENODO_ENABLED is true but ZENODO_LATEST_DEPOSITION_ID is not set")

    try:
        deposition_number = int(deposition_id)
    except ValueError:
        # Same exception type as before, but naming the offending variable.
        raise ValueError(
            "ZENODO_LATEST_DEPOSITION_ID must be an integer record id, "
            f"got {deposition_id!r}"
        ) from None

    # A blank "ZENODO_API_URL=" entry must fall back to the default rather
    # than yield an empty base URL.
    api_url = env_vars.get("ZENODO_API_URL", "").strip().rstrip("/") or DEFAULT_API_URL

    return {
        "token": token,
        "deposition_id": deposition_number,
        "api_url": api_url,
        "auto_publish": env_flag(env_vars, "ZENODO_AUTO_PUBLISH"),
        "dry_run": env_flag(env_vars, "ZENODO_DRY_RUN"),
        "release_notes": env_vars.get("ZENODO_RELEASE_NOTES", "").strip(),
        "concept_doi": env_vars.get("ZENODO_CONCEPT_DOI", "").strip(),
    }


def zenodo_file_mapping(
    backup_path: str,
    month_year: str,
    full_dump_path: str | None,
) -> dict[str, str]:
    """Map Zenodo upload names to local export files.

    Args:
        backup_path: Directory holding the zipped exports.
        month_year: Suffix embedded in every file name (e.g. ``03-2025``).
        full_dump_path: Not used here; kept so existing callers keep working.

    Returns:
        Dict of Zenodo file name -> local path, with the SQL dump zip last.
    """
    local_names = {
        f"coconut-{month_year}.csv.zip": f"coconut_csv_lite-{month_year}.zip",
        f"coconut_complete-{month_year}.csv.zip": f"coconut_csv-{month_year}.zip",
        f"coconut-{month_year}.sdf.zip": f"coconut_sdf_2d_lite-{month_year}.zip",
        f"coconut_complete-{month_year}.sdf.zip": f"coconut_sdf_2d-{month_year}.zip",
        f"coconut-dump-{month_year}.sql.zip": f"coconut-dump-{month_year}.sql.zip",
    }
    return {
        zenodo_name: os.path.join(backup_path, local_name)
        for zenodo_name, local_name in local_names.items()
    }


def ensure_sql_zip(full_dump_path: str, sql_zip_path: str) -> str:
    """Create the SQL zip for Zenodo if it does not exist yet.

    Args:
        full_dump_path: Path of the uncompressed SQL dump.
        sql_zip_path: Destination path of the zip archive.

    Returns:
        ``sql_zip_path``, whether it already existed or was just created.

    Raises:
        FileNotFoundError: If the zip is absent and the dump path is empty or
            does not exist.
    """
    if os.path.exists(sql_zip_path):
        return sql_zip_path

    if not full_dump_path or not os.path.exists(full_dump_path):
        raise FileNotFoundError(
            f"SQL dump not found for Zenodo upload: {full_dump_path or '(not provided)'}"
        )

    print(f"Creating {sql_zip_path} from {full_dump_path} ...")
    # Write to a sibling temporary file and rename it into place afterwards.
    # Writing directly to the final path would leave a truncated archive
    # behind on failure, which the existence check above would then accept.
    partial_path = f"{sql_zip_path}.part"
    try:
        with zipfile.ZipFile(partial_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(full_dump_path, os.path.basename(full_dump_path))
        os.replace(partial_path, sql_zip_path)
    finally:
        # After a successful rename the partial file is gone; this only
        # removes leftovers from a failed attempt.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Created SQL zip ({os.path.getsize(sql_zip_path) / (1024 ** 3):.2f} GB)")
    return sql_zip_path
def fetch_database_stats(db_params: dict[str, str] | None) -> dict[str, int] | None:
    """Collect row counts used in the Zenodo record description.

    This is best-effort: any failure is reported as a printed warning and
    yields ``None`` rather than an exception.

    Args:
        db_params: Connection settings read via ``dbname``, ``user``,
            ``password``, ``host`` and ``port``; a falsy value skips the
            lookup entirely.

    Returns:
        A dict with the keys ``molecules``, ``collections``, ``organisms``
        and ``citations``, or ``None`` when no parameters were given,
        ``psycopg2`` is not installed, or the connection or a query failed.
    """
    if not db_params:
        return None

    # psycopg2 is imported lazily so the module still loads without it.
    try:
        import psycopg2
    except ImportError:
        print("Warning: psycopg2 not installed; skipping database stats for Zenodo metadata.")
        return None

    count_queries = (
        ("molecules", "SELECT COUNT(*) FROM molecules"),
        ("collections", "SELECT COUNT(*) FROM collections"),
        ("organisms", "SELECT COUNT(DISTINCT organism_id) FROM molecule_organism"),
        ("citations", "SELECT COUNT(*) FROM citations"),
    )

    connection = None
    try:
        connection = psycopg2.connect(
            dbname=db_params.get("dbname", "coconut"),
            user=db_params.get("user"),
            password=db_params.get("password"),
            host=db_params.get("host", "127.0.0.1"),
            port=db_params.get("port", "5432"),
        )
        counts: dict[str, int] = {}
        with connection.cursor() as cur:
            for label, sql in count_queries:
                cur.execute(sql)
                row = cur.fetchone()
                counts[label] = int(row[0])
        return counts
    except Exception as exc:
        # Deliberate catch-all: missing stats must not abort the caller.
        print(f"Warning: could not fetch database stats for Zenodo metadata: {exc}")
        return None
    finally:
        if connection:
            connection.close()
)?)(\d[\d,]*)" + ) + replacement = rf"\g<1>{values[0]}\g<3>{values[1]}\g<5>{values[2]}\g<7>{values[3]}" + + updated, count = re.subn(pattern, replacement, description, count=1) + return updated if count else description + + +def build_metadata_update( + draft: dict[str, Any], + month_year: str, + release_notes: str, + stats: dict[str, int] | None, +) -> dict[str, Any]: + metadata = dict(draft.get("metadata", {})) + metadata["publication_date"] = datetime.now().strftime("%Y-%m-%d") + + description = metadata.get("description", "") + + if stats: + description = update_description_stats(description, stats) + + if release_notes: + notes_block = ( + f"