Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
3ecc1cc
sasrec baseline
Timoniche Oct 22, 2025
2142ad3
tiger_baseline added
Timoniche Oct 23, 2025
8df00e9
baselines tensorboards
Timoniche Oct 24, 2025
2e15136
Respect max epoch count in trainer
Timoniche Oct 24, 2025
68ebc84
Respect max epoch count in trainer
Timoniche Oct 24, 2025
a67912d
kmeans tiger baseline (1 epoch wip check)
Timoniche Oct 25, 2025
baee340
ddulaev
Timoniche Oct 25, 2025
3680abd
minor: README.md typo + .idea gitignore
Timoniche Oct 25, 2025
7a484f0
built simple positive_pairs dataset
Timoniche Oct 26, 2025
da513ad
wip merge (minor)
Timoniche Oct 26, 2025
a625071
positive pairs json to txt
Timoniche Oct 26, 2025
a785403
cf dataset builder json to txt
Timoniche Oct 26, 2025
4df9d58
finetuning notebook impl
Timoniche Oct 26, 2025
07f12a0
[success] cf kmeans run
Timoniche Oct 27, 2025
6a7d95e
sasrec cold-warm-hot ndcg + recall
Timoniche Nov 4, 2025
e6a8f75
bugfix, semantic_* .ids is stored flattened in the batch
Timoniche Nov 4, 2025
10a56fa
Beauty -> Beauty_legacy
Timoniche Nov 8, 2025
2df21b5
correct inter.json & index_rqkmeans.json
Timoniche Nov 8, 2025
a1b26d3
[success] kmeans tuned/not tuned/sasrec runs on correct embeddings
Timoniche Nov 9, 2025
b5513e5
Merge branch 'main' into ddulaev/diploma
Timoniche Nov 9, 2025
eb77b32
almanah md + lsvd iskander template
Timoniche Apr 26, 2026
7e7db1e
task vk lsvd
Timoniche Apr 26, 2026
5636b87
vk lsvd run
Timoniche Apr 26, 2026
d81936c
VkDatasetProcessing fix
Timoniche Apr 26, 2026
713a06c
[successful] state pinned
Timoniche Apr 27, 2026
4746635
[vk] positive pairs builded
Timoniche Apr 27, 2026
5592745
[vk] finetune success run
Timoniche Apr 27, 2026
7311e42
vk plan added
Timoniche Apr 28, 2026
a1c7e0b
cold-warm-hot metrics plan
Timoniche Apr 28, 2026
c76cfc6
freqs_partitioning
Timoniche Apr 28, 2026
81c474c
minor comment
Timoniche Apr 28, 2026
b5ae842
logq task and plan
Timoniche Apr 28, 2026
5e50a66
[vk] item_frquencies count + finetune with logQ
Timoniche Apr 29, 2026
7b16d72
minor freqs_test
Timoniche Apr 29, 2026
e1dce2b
merge logQ, positive pairs and plans
Timoniche Apr 29, 2026
c29391a
merge freqs_test
Timoniche Apr 29, 2026
1f2c8ef
task4 smaller vk dataset plan
Timoniche Apr 29, 2026
b5b4524
[vk] dataset is shinked
Timoniche Apr 30, 2026
932034a
[vk small] new inter and pkl
Timoniche Apr 30, 2026
e1319c0
deprecate old kmeans quantizer
Timoniche Apr 30, 2026
1614fa8
[yambda] plan and first impl
Timoniche Apr 30, 2026
f32dfef
[vk small] log_q pkl is builded
Timoniche Apr 30, 2026
3eecbad
[yambda] embs + interactions
Timoniche May 1, 2026
c65ea22
freqs_test
Timoniche May 1, 2026
5522f92
[yambda] data is built
Timoniche May 1, 2026
5cf7be5
wip analytics
Timoniche May 4, 2026
d908f16
[metrics] success
Timoniche May 4, 2026
a1aef09
tiger ndcg & recall doc
Timoniche May 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.idea
.DS_Store
.DS_Store
**/*.cpython-311.pyc
vk_lsvd/metadata
**/*.pth
**/*.pkl
**/.ipynb_checkpoints
data/Beauty_legacy_datasphere/**
tensorboard_logs_legacy/**
tensorboard_logs/**
tiger/modeling/dataset/__pycache__/**
vk_lsvd/subsamples
**/*.pyc
yambda/embeddings.parquet
yambda/flat/50m/likes.parquet
366 changes: 366 additions & 0 deletions ai/vk_exps/YambdaDownload.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1161737c",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from pathlib import Path\n",
"from typing import Literal\n",
"\n",
"import numpy as np\n",
"import polars as pl\n",
"from datasets import Dataset, DatasetDict, load_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0b09001",
"metadata": {},
"outputs": [],
"source": [
"# Paths and constants used throughout the notebook.\n",
"DATA_DIR = \"../data\"\n",
"DATASET_NAME = \"yambda\"\n",
"\n",
"# Processed artifacts go to OUTPUT_DIR; raw downloads land in DATASET_PATH.\n",
"OUTPUT_DIR = f\"{DATA_DIR}/{DATASET_NAME}\"\n",
"DATASET_PATH = f\"{DATA_DIR}/{DATASET_NAME}/raw\"\n",
"\n",
"Path(DATASET_PATH).mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Number of contiguous chronological chunks the interaction log is split into later.\n",
"NUM_PARTS = 10\n",
"EMBEDDINGS_PATH = f\"{DATASET_PATH}/embeddings.parquet\""
]
},
{
"cell_type": "markdown",
"id": "ccd1b98e",
"metadata": {},
"source": [
"Download dataset with embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7c5e02f",
"metadata": {},
"outputs": [],
"source": [
"# Download only the audio-embeddings file into the raw dataset directory.\n",
"# Interpolate the configured DATASET_PATH instead of duplicating the\n",
"# hardcoded \"../data/yambda/raw\" path (IPython expands {var} in ! commands).\n",
"!hf download yandex/yambda embeddings.parquet --repo-type dataset --local-dir {DATASET_PATH}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f454d298",
"metadata": {},
"outputs": [],
"source": [
"class YambdaDataset:\n",
"    \"\"\"Accessor for the `yandex/yambda` dataset hosted on the Hugging Face hub.\n",
"\n",
"    Wraps `datasets.load_dataset` calls for the per-size interaction files\n",
"    (stored under `<dataset_type>/<dataset_size>/`) and for the repo-level\n",
"    embedding and mapping tables.\n",
"    \"\"\"\n",
"\n",
"    # Event files available under each <dataset_type>/<dataset_size> directory.\n",
"    INTERACTIONS = frozenset([\"likes\", \"listens\", \"multi_event\", \"dislikes\", \"unlikes\", \"undislikes\"])\n",
"\n",
"    def __init__(\n",
"        self, dataset_type: Literal[\"flat\", \"sequential\"] = \"flat\", dataset_size: Literal[\"50m\", \"500m\", \"5b\"] = \"50m\"\n",
"    ):\n",
"        assert dataset_type in {\"flat\", \"sequential\"}\n",
"        assert dataset_size in {\"50m\", \"500m\", \"5b\"}\n",
"        self.dataset_type = dataset_type\n",
"        self.dataset_size = dataset_size\n",
"\n",
"    def interaction(\n",
"        self, event_type: Literal[\"likes\", \"listens\", \"multi_event\", \"dislikes\", \"unlikes\", \"undislikes\"]\n",
"    ) -> Dataset:\n",
"        \"\"\"Download one interaction file (e.g. \\\"likes\\\") for the configured type/size.\"\"\"\n",
"        assert event_type in YambdaDataset.INTERACTIONS\n",
"        return self._download(f\"{self.dataset_type}/{self.dataset_size}\", event_type)\n",
"\n",
"    def audio_embeddings(self) -> Dataset:\n",
"        \"\"\"Repo-level table of per-item audio embeddings.\"\"\"\n",
"        return self._download(\"\", \"embeddings\")\n",
"\n",
"    def album_item_mapping(self) -> Dataset:\n",
"        \"\"\"Repo-level album -> item mapping table.\"\"\"\n",
"        return self._download(\"\", \"album_item_mapping\")\n",
"\n",
"    def artist_item_mapping(self) -> Dataset:\n",
"        \"\"\"Repo-level artist -> item mapping table.\"\"\"\n",
"        return self._download(\"\", \"artist_item_mapping\")\n",
"\n",
"    @staticmethod\n",
"    def _download(data_dir: str, file: str) -> Dataset:\n",
"        \"\"\"Load `<data_dir>/<file>.parquet` from the hub and return its train split.\"\"\"\n",
"        data = load_dataset(\"yandex/yambda\", data_dir=data_dir, data_files=f\"{file}.parquet\")\n",
"        assert isinstance(data, DatasetDict)\n",
"        return data[\"train\"]\n",
"\n",
"\n",
"dataset = YambdaDataset(\"flat\", \"50m\")\n",
"likes = dataset.interaction(\"likes\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e9bb366",
"metadata": {},
"outputs": [],
"source": [
"# Restrict the likes log to items that have an audio embedding, order it\n",
"# chronologically, and remember the original position for later partitioning.\n",
"item_ids = pl.read_parquet(EMBEDDINGS_PATH)[\"item_id\"].to_numpy()\n",
"\n",
"likes_df = likes.to_polars()\n",
"likes_df = likes_df.filter(pl.col(\"item_id\").is_in(item_ids))\n",
"likes_df = likes_df.sort([\"timestamp\"])\n",
"likes_df = likes_df.with_row_index(\"original_order\")\n",
"all_data_interactions = likes_df.rename({\"uid\": \"user_id\"})\n",
"all_data_interactions.head()"
]
},
{
"cell_type": "markdown",
"id": "5d4c0c61",
"metadata": {},
"source": [
"Remap User IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccf2d358",
"metadata": {},
"outputs": [],
"source": [
"# Build dense 0..n_users-1 ids: unique user ids, sorted, indexed by row position.\n",
"user_mapping = all_data_interactions.select(pl.col(\"user_id\")).unique().sort(\"user_id\").with_row_index(\"new_user_id\")\n",
"\n",
"# Join the mapping back and overwrite `user_id` with the dense id cast to Int64.\n",
"all_data_interactions = (\n",
"    all_data_interactions.join(user_mapping, on=\"user_id\", how=\"left\")\n",
"    .with_columns(pl.col(\"new_user_id\").cast(pl.Int64).alias(\"user_id\"))\n",
"    .drop(\"new_user_id\")\n",
")\n",
"all_data_interactions.head()"
]
},
{
"cell_type": "markdown",
"id": "932e82db",
"metadata": {},
"source": [
"Remap Item IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12f096cf",
"metadata": {},
"outputs": [],
"source": [
"all_data_items = all_data_interactions.select(\"item_id\").unique()\n",
"all_data_users = all_data_interactions.select(\"user_id\").unique()\n",
"\n",
"# Dense 0..n_items-1 ids: unique item ids sorted, indexed by row position.\n",
"# strict=True guards the zip against a length mismatch between the two columns.\n",
"unique_items_sorted = all_data_items.sort(\"item_id\").with_row_index(\"new_item_id\")\n",
"global_item_mapping = dict(zip(unique_items_sorted[\"item_id\"], unique_items_sorted[\"new_item_id\"], strict=True))\n",
"\n",
"print(f\"Total users: {all_data_users.shape[0]}, Total items: {len(global_item_mapping)}\")"
]
},
{
"cell_type": "markdown",
"id": "35cf8f24",
"metadata": {},
"source": [
"Save Item IDs Mapping"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c3efd1a",
"metadata": {},
"outputs": [],
"source": [
"# Persist the raw-id -> dense-id mapping; JSON object keys must be strings.\n",
"mapping_output_path = f\"{OUTPUT_DIR}/global_item_mapping.json\"\n",
"\n",
"serializable_mapping = {str(raw_id): dense_id for raw_id, dense_id in global_item_mapping.items()}\n",
"with open(mapping_output_path, \"w\") as f:\n",
"    json.dump(serializable_mapping, f, indent=2)\n",
"\n",
"print(f\"Mapping saved: {mapping_output_path}\")"
]
},
{
"cell_type": "markdown",
"id": "f5184112",
"metadata": {},
"source": [
"Filter out Items without Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a154c523",
"metadata": {},
"outputs": [],
"source": [
"# Read the embeddings parquet once (it was previously read twice, costing a\n",
"# full extra pass over the file) and keep only items present in the log.\n",
"embeddings_df = pl.read_parquet(EMBEDDINGS_PATH)\n",
"item_ids = embeddings_df[\"item_id\"].to_numpy()\n",
"item_embeddings = embeddings_df[\"normalized_embed\"].to_numpy()\n",
"\n",
"mask = np.isin(item_ids, all_data_items[\"item_id\"].to_numpy())\n",
"\n",
"item_ids = item_ids[mask]\n",
"# Stack the per-item embedding vectors into one dense (n_items, dim) matrix;\n",
"# np.stack avoids the Python-level tolist() round-trip of the original.\n",
"# NOTE(review): dtype now follows the stored embeddings instead of being\n",
"# upcast to float64 by the list round-trip - confirm downstream expectations.\n",
"item_embeddings = np.stack([np.asarray(x) for x in item_embeddings[mask]])"
]
},
{
"cell_type": "markdown",
"id": "66b1d814",
"metadata": {},
"source": [
"Save Item Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ed2614",
"metadata": {},
"outputs": [],
"source": [
"# Table of item_id -> embedding for items present in the interaction log.\n",
"items_metadata = pl.DataFrame({\"item_id\": item_ids, \"embedding\": item_embeddings})\n",
"\n",
"# NOTE: writing to EMBEDDINGS_PATH here would overwrite the raw download.\n",
"# items_metadata.write_parquet(EMBEDDINGS_PATH) not needed since it's already saved\n",
"items_metadata.head()"
]
},
{
"cell_type": "markdown",
"id": "da336dca",
"metadata": {},
"source": [
"Remap Interactions Item IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c7e1e12",
"metadata": {},
"outputs": [],
"source": [
"def remap_interactions(df: pl.DataFrame, mapping: dict) -> pl.DataFrame:\n",
"    \"\"\"Replace raw `item_id` values with their dense ids from `mapping`.\n",
"\n",
"    `replace_strict` errors on any item id absent from the mapping, so an\n",
"    unmapped item surfaces immediately instead of silently becoming null.\n",
"    Despite the name, this works for any frame with an `item_id` column\n",
"    (it is also applied to `items_metadata` below).\n",
"    \"\"\"\n",
"    return df.with_columns(pl.col(\"item_id\").replace_strict(mapping, return_dtype=pl.UInt32))\n",
"\n",
"\n",
"all_data_interactions_remapped = remap_interactions(all_data_interactions, global_item_mapping)\n",
"items_metadata_remapped = remap_interactions(items_metadata, global_item_mapping)"
]
},
{
"cell_type": "markdown",
"id": "05510701",
"metadata": {},
"source": [
"Assign Parts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55773fcd",
"metadata": {},
"outputs": [],
"source": [
"# Split the chronologically ordered log into NUM_PARTS contiguous chunks.\n",
"# `row_nr * NUM_PARTS // height` yields exactly NUM_PARTS parts whose sizes\n",
"# differ by at most one row. The previous `row_nr // (base_size + 1)` could\n",
"# produce fewer than NUM_PARTS parts and a much smaller final part.\n",
"all_data_interactions_remapped_sorted = all_data_interactions_remapped.sort(\"original_order\")\n",
"all_data_interactions_remapped_sorted = all_data_interactions_remapped_sorted.with_row_index(\"row_nr\")\n",
"\n",
"total_rows = all_data_interactions_remapped_sorted.height\n",
"\n",
"all_data_interactions_with_groups = all_data_interactions_remapped_sorted.with_columns(\n",
"    # Cast first: row_nr is UInt32 and row_nr * NUM_PARTS could overflow it.\n",
"    (pl.col(\"row_nr\").cast(pl.Int64) * NUM_PARTS // total_rows).alias(\"part\")\n",
").drop(\"row_nr\")\n",
"\n",
"all_data_interactions_with_groups.head()"
]
},
{
"cell_type": "markdown",
"id": "f9ee5178",
"metadata": {},
"source": [
"Check Stats"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3def0005",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check the partitioning: per-part row counts and order ranges.\n",
"parts_distribution = (\n",
"    all_data_interactions_with_groups.group_by(\"part\")\n",
"    .agg(\n",
"        # pl.len() replaces pl.count(), which is deprecated in recent polars.\n",
"        pl.len().alias(\"count\"),\n",
"        pl.col(\"original_order\").min().alias(\"min_order\"),\n",
"        pl.col(\"original_order\").max().alias(\"max_order\"),\n",
"    )\n",
"    .sort(\"part\")\n",
")\n",
"\n",
"print(\"Distribution by part:\")\n",
"print(parts_distribution)\n",
"\n",
"print(f\"Minimum part: {parts_distribution['part'].min()}\")\n",
"print(f\"Maximum part: {parts_distribution['part'].max()}\")\n",
"print(f\"Total number of events from all parts: {parts_distribution['count'].sum()}\")"
]
},
{
"cell_type": "markdown",
"id": "8eb66774",
"metadata": {},
"source": [
"Save Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f27dd7f7",
"metadata": {},
"outputs": [],
"source": [
"def write_parquet(output_dir: str, data: pl.DataFrame, file_name: str) -> None:\n",
"    \"\"\"Write `data` to `<output_dir>/<file_name>.parquet`, logging its shape.\"\"\"\n",
"    print(f\"Shape: {data.shape}\")\n",
"    output_parquet_path = f\"{output_dir}/{file_name}.parquet\"\n",
"    data.write_parquet(output_parquet_path)\n",
"    print(f\"File saved: {file_name}\")\n",
"\n",
"\n",
"# Save the final processed tables produced by this notebook.\n",
"write_parquet(OUTPUT_DIR, items_metadata_remapped, \"items_metadata_remapped\")\n",
"write_parquet(OUTPUT_DIR, all_data_interactions_with_groups, \"all_data_interactions_with_groups\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deeprec",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading