Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
3ecc1cc
sasrec baseline
Timoniche Oct 22, 2025
2142ad3
tiger_baseline added
Timoniche Oct 23, 2025
8df00e9
baselines tensorboards
Timoniche Oct 24, 2025
2e15136
Respect max epoch count in trainer
Timoniche Oct 24, 2025
68ebc84
Respect max epoch count in trainer
Timoniche Oct 24, 2025
a67912d
kmeans tiger baseline (1 epoch wip check)
Timoniche Oct 25, 2025
baee340
ddulaev
Timoniche Oct 25, 2025
3680abd
minor: README.md typo + .idea gitignore
Timoniche Oct 25, 2025
7a484f0
built simple positive_pairs dataset
Timoniche Oct 26, 2025
da513ad
wip merge (minor)
Timoniche Oct 26, 2025
a625071
positive pairs json to txt
Timoniche Oct 26, 2025
a785403
cf dataset builder json to txt
Timoniche Oct 26, 2025
4df9d58
finetuning notebook impl
Timoniche Oct 26, 2025
07f12a0
[success] cf kmeans run
Timoniche Oct 27, 2025
6a7d95e
sasrec cold-warm-hot ndcg + recall
Timoniche Nov 4, 2025
e6a8f75
bugfix, semantic_* .ids is stored flattened in the batch
Timoniche Nov 4, 2025
10a56fa
Beauty -> Beauty_legacy
Timoniche Nov 8, 2025
2df21b5
correct inter.json & index_rqkmeans.json
Timoniche Nov 8, 2025
a1b26d3
[success] kmeans tuned/not tuned/sasrec runs on correct embeddings
Timoniche Nov 9, 2025
b5513e5
Merge branch 'main' into ddulaev/diploma
Timoniche Nov 9, 2025
eb77b32
almanah md + lsvd iskander template
Timoniche Apr 26, 2026
7e7db1e
task vk lsvd
Timoniche Apr 26, 2026
5636b87
vk lsvd run
Timoniche Apr 26, 2026
d81936c
VkDatasetProcessing fix
Timoniche Apr 26, 2026
713a06c
[successful] state pinned
Timoniche Apr 27, 2026
4746635
[vk] positive pairs builded
Timoniche Apr 27, 2026
5592745
[vk] finetune success run
Timoniche Apr 27, 2026
7311e42
vk plan added
Timoniche Apr 28, 2026
a1c7e0b
cold-warm-hot metrics plan
Timoniche Apr 28, 2026
c76cfc6
freqs_partitioning
Timoniche Apr 28, 2026
81c474c
minor comment
Timoniche Apr 28, 2026
b5ae842
logq task and plan
Timoniche Apr 28, 2026
5e50a66
[vk] item_frquencies count + finetune with logQ
Timoniche Apr 29, 2026
7b16d72
minor freqs_test
Timoniche Apr 29, 2026
e1dce2b
merge logQ, positive pairs and plans
Timoniche Apr 29, 2026
c29391a
merge freqs_test
Timoniche Apr 29, 2026
1f2c8ef
task4 smaller vk dataset plan
Timoniche Apr 29, 2026
b5b4524
[vk] dataset is shinked
Timoniche Apr 30, 2026
932034a
[vk small] new inter and pkl
Timoniche Apr 30, 2026
e1319c0
deprecate old kmeans quantizer
Timoniche Apr 30, 2026
1614fa8
[yambda] plan and first impl
Timoniche Apr 30, 2026
f32dfef
[vk small] log_q pkl is builded
Timoniche Apr 30, 2026
3eecbad
[yambda] embs + interactions
Timoniche May 1, 2026
c65ea22
freqs_test
Timoniche May 1, 2026
5522f92
[yambda] data is built
Timoniche May 1, 2026
5cf7be5
wip analytics
Timoniche May 4, 2026
d908f16
[metrics] success
Timoniche May 4, 2026
a1aef09
tiger ndcg & recall doc
Timoniche May 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.idea
.DS_Store
.DS_Store
**/*.cpython-311.pyc
vk_lsvd/metadata
**/*.pth
**/*.pkl
**/.ipynb_checkpoints
data/Beauty_legacy_datasphere/**
tensorboard_logs_legacy/**
tensorboard_logs/**
tiger/modeling/dataset/__pycache__/**
vk_lsvd/subsamples
**/*.pyc
yambda/embeddings.parquet
yambda/flat/50m/likes.parquet
366 changes: 366 additions & 0 deletions ai/vk_exps/YambdaDownload.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1161737c",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from pathlib import Path\n",
"from typing import Literal\n",
"\n",
"import numpy as np\n",
"import polars as pl\n",
"from datasets import Dataset, DatasetDict, load_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0b09001",
"metadata": {},
"outputs": [],
"source": [
"# Paths and constants used throughout the notebook.\n",
"DATA_DIR = \"../data\"\n",
"DATASET_NAME = \"yambda\"\n",
"\n",
"# Processed artifacts go to OUTPUT_DIR; raw downloads land in DATASET_PATH.\n",
"OUTPUT_DIR = f\"{DATA_DIR}/{DATASET_NAME}\"\n",
"DATASET_PATH = f\"{DATA_DIR}/{DATASET_NAME}/raw\"\n",
"\n",
"Path(DATASET_PATH).mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Number of contiguous chronological chunks the interaction log is split into later.\n",
"NUM_PARTS = 10\n",
"EMBEDDINGS_PATH = f\"{DATASET_PATH}/embeddings.parquet\""
]
},
{
"cell_type": "markdown",
"id": "ccd1b98e",
"metadata": {},
"source": [
"Download dataset with embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7c5e02f",
"metadata": {},
"outputs": [],
"source": [
"# Download only the audio-embeddings file into the raw dataset directory.\n",
"# Interpolate the configured DATASET_PATH instead of duplicating the\n",
"# hardcoded \"../data/yambda/raw\" path (IPython expands {var} in ! commands).\n",
"!hf download yandex/yambda embeddings.parquet --repo-type dataset --local-dir {DATASET_PATH}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f454d298",
"metadata": {},
"outputs": [],
"source": [
"class YambdaDataset:\n",
"    \"\"\"Accessor for the `yandex/yambda` dataset hosted on the Hugging Face hub.\n",
"\n",
"    Wraps `datasets.load_dataset` calls for the per-size interaction files\n",
"    (stored under `<dataset_type>/<dataset_size>/`) and for the repo-level\n",
"    embedding and mapping tables.\n",
"    \"\"\"\n",
"\n",
"    # Event files available under each <dataset_type>/<dataset_size> directory.\n",
"    INTERACTIONS = frozenset([\"likes\", \"listens\", \"multi_event\", \"dislikes\", \"unlikes\", \"undislikes\"])\n",
"\n",
"    def __init__(\n",
"        self, dataset_type: Literal[\"flat\", \"sequential\"] = \"flat\", dataset_size: Literal[\"50m\", \"500m\", \"5b\"] = \"50m\"\n",
"    ):\n",
"        assert dataset_type in {\"flat\", \"sequential\"}\n",
"        assert dataset_size in {\"50m\", \"500m\", \"5b\"}\n",
"        self.dataset_type = dataset_type\n",
"        self.dataset_size = dataset_size\n",
"\n",
"    def interaction(\n",
"        self, event_type: Literal[\"likes\", \"listens\", \"multi_event\", \"dislikes\", \"unlikes\", \"undislikes\"]\n",
"    ) -> Dataset:\n",
"        \"\"\"Download one interaction file (e.g. \\\"likes\\\") for the configured type/size.\"\"\"\n",
"        assert event_type in YambdaDataset.INTERACTIONS\n",
"        return self._download(f\"{self.dataset_type}/{self.dataset_size}\", event_type)\n",
"\n",
"    def audio_embeddings(self) -> Dataset:\n",
"        \"\"\"Repo-level table of per-item audio embeddings.\"\"\"\n",
"        return self._download(\"\", \"embeddings\")\n",
"\n",
"    def album_item_mapping(self) -> Dataset:\n",
"        \"\"\"Repo-level album -> item mapping table.\"\"\"\n",
"        return self._download(\"\", \"album_item_mapping\")\n",
"\n",
"    def artist_item_mapping(self) -> Dataset:\n",
"        \"\"\"Repo-level artist -> item mapping table.\"\"\"\n",
"        return self._download(\"\", \"artist_item_mapping\")\n",
"\n",
"    @staticmethod\n",
"    def _download(data_dir: str, file: str) -> Dataset:\n",
"        \"\"\"Load `<data_dir>/<file>.parquet` from the hub and return its train split.\"\"\"\n",
"        data = load_dataset(\"yandex/yambda\", data_dir=data_dir, data_files=f\"{file}.parquet\")\n",
"        assert isinstance(data, DatasetDict)\n",
"        return data[\"train\"]\n",
"\n",
"\n",
"dataset = YambdaDataset(\"flat\", \"50m\")\n",
"likes = dataset.interaction(\"likes\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e9bb366",
"metadata": {},
"outputs": [],
"source": [
"# Restrict the likes log to items that have an audio embedding, order it\n",
"# chronologically, and remember the original position for later partitioning.\n",
"item_ids = pl.read_parquet(EMBEDDINGS_PATH)[\"item_id\"].to_numpy()\n",
"\n",
"likes_df = likes.to_polars()\n",
"likes_df = likes_df.filter(pl.col(\"item_id\").is_in(item_ids))\n",
"likes_df = likes_df.sort([\"timestamp\"])\n",
"likes_df = likes_df.with_row_index(\"original_order\")\n",
"all_data_interactions = likes_df.rename({\"uid\": \"user_id\"})\n",
"all_data_interactions.head()"
]
},
{
"cell_type": "markdown",
"id": "5d4c0c61",
"metadata": {},
"source": [
"Remap User IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccf2d358",
"metadata": {},
"outputs": [],
"source": [
"# Build dense 0..n_users-1 ids: unique user ids, sorted, indexed by row position.\n",
"user_mapping = all_data_interactions.select(pl.col(\"user_id\")).unique().sort(\"user_id\").with_row_index(\"new_user_id\")\n",
"\n",
"# Join the mapping back and overwrite `user_id` with the dense id cast to Int64.\n",
"all_data_interactions = (\n",
"    all_data_interactions.join(user_mapping, on=\"user_id\", how=\"left\")\n",
"    .with_columns(pl.col(\"new_user_id\").cast(pl.Int64).alias(\"user_id\"))\n",
"    .drop(\"new_user_id\")\n",
")\n",
"all_data_interactions.head()"
]
},
{
"cell_type": "markdown",
"id": "932e82db",
"metadata": {},
"source": [
"Remap Item IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12f096cf",
"metadata": {},
"outputs": [],
"source": [
"all_data_items = all_data_interactions.select(\"item_id\").unique()\n",
"all_data_users = all_data_interactions.select(\"user_id\").unique()\n",
"\n",
"# Dense 0..n_items-1 ids: unique item ids sorted, indexed by row position.\n",
"# strict=True guards the zip against a length mismatch between the two columns.\n",
"unique_items_sorted = all_data_items.sort(\"item_id\").with_row_index(\"new_item_id\")\n",
"global_item_mapping = dict(zip(unique_items_sorted[\"item_id\"], unique_items_sorted[\"new_item_id\"], strict=True))\n",
"\n",
"print(f\"Total users: {all_data_users.shape[0]}, Total items: {len(global_item_mapping)}\")"
]
},
{
"cell_type": "markdown",
"id": "35cf8f24",
"metadata": {},
"source": [
"Save Item IDs Mapping"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c3efd1a",
"metadata": {},
"outputs": [],
"source": [
"# Persist the raw-id -> dense-id mapping; JSON object keys must be strings.\n",
"mapping_output_path = f\"{OUTPUT_DIR}/global_item_mapping.json\"\n",
"\n",
"serializable_mapping = {str(raw_id): dense_id for raw_id, dense_id in global_item_mapping.items()}\n",
"with open(mapping_output_path, \"w\") as f:\n",
"    json.dump(serializable_mapping, f, indent=2)\n",
"\n",
"print(f\"Mapping saved: {mapping_output_path}\")"
]
},
{
"cell_type": "markdown",
"id": "f5184112",
"metadata": {},
"source": [
"Filter out Items without Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a154c523",
"metadata": {},
"outputs": [],
"source": [
"# Read the embeddings parquet once (it was previously read twice, costing a\n",
"# full extra pass over the file) and keep only items present in the log.\n",
"embeddings_df = pl.read_parquet(EMBEDDINGS_PATH)\n",
"item_ids = embeddings_df[\"item_id\"].to_numpy()\n",
"item_embeddings = embeddings_df[\"normalized_embed\"].to_numpy()\n",
"\n",
"mask = np.isin(item_ids, all_data_items[\"item_id\"].to_numpy())\n",
"\n",
"item_ids = item_ids[mask]\n",
"# Stack the per-item embedding vectors into one dense (n_items, dim) matrix;\n",
"# np.stack avoids the Python-level tolist() round-trip of the original.\n",
"# NOTE(review): dtype now follows the stored embeddings instead of being\n",
"# upcast to float64 by the list round-trip - confirm downstream expectations.\n",
"item_embeddings = np.stack([np.asarray(x) for x in item_embeddings[mask]])"
]
},
{
"cell_type": "markdown",
"id": "66b1d814",
"metadata": {},
"source": [
"Save Item Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ed2614",
"metadata": {},
"outputs": [],
"source": [
"# Table of item_id -> embedding for items present in the interaction log.\n",
"items_metadata = pl.DataFrame({\"item_id\": item_ids, \"embedding\": item_embeddings})\n",
"\n",
"# NOTE: writing to EMBEDDINGS_PATH here would overwrite the raw download.\n",
"# items_metadata.write_parquet(EMBEDDINGS_PATH) not needed since it's already saved\n",
"items_metadata.head()"
]
},
{
"cell_type": "markdown",
"id": "da336dca",
"metadata": {},
"source": [
"Remap Interactions Item IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c7e1e12",
"metadata": {},
"outputs": [],
"source": [
"def remap_interactions(df: pl.DataFrame, mapping: dict) -> pl.DataFrame:\n",
"    \"\"\"Replace raw `item_id` values with their dense ids from `mapping`.\n",
"\n",
"    `replace_strict` errors on any item id absent from the mapping, so an\n",
"    unmapped item surfaces immediately instead of silently becoming null.\n",
"    Despite the name, this works for any frame with an `item_id` column\n",
"    (it is also applied to `items_metadata` below).\n",
"    \"\"\"\n",
"    return df.with_columns(pl.col(\"item_id\").replace_strict(mapping, return_dtype=pl.UInt32))\n",
"\n",
"\n",
"all_data_interactions_remapped = remap_interactions(all_data_interactions, global_item_mapping)\n",
"items_metadata_remapped = remap_interactions(items_metadata, global_item_mapping)"
]
},
{
"cell_type": "markdown",
"id": "05510701",
"metadata": {},
"source": [
"Assign Parts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55773fcd",
"metadata": {},
"outputs": [],
"source": [
"# Split the chronologically ordered log into NUM_PARTS contiguous chunks.\n",
"# `row_nr * NUM_PARTS // height` yields exactly NUM_PARTS parts whose sizes\n",
"# differ by at most one row. The previous `row_nr // (base_size + 1)` could\n",
"# produce fewer than NUM_PARTS parts and a much smaller final part.\n",
"all_data_interactions_remapped_sorted = all_data_interactions_remapped.sort(\"original_order\")\n",
"all_data_interactions_remapped_sorted = all_data_interactions_remapped_sorted.with_row_index(\"row_nr\")\n",
"\n",
"total_rows = all_data_interactions_remapped_sorted.height\n",
"\n",
"all_data_interactions_with_groups = all_data_interactions_remapped_sorted.with_columns(\n",
"    # Cast first: row_nr is UInt32 and row_nr * NUM_PARTS could overflow it.\n",
"    (pl.col(\"row_nr\").cast(pl.Int64) * NUM_PARTS // total_rows).alias(\"part\")\n",
").drop(\"row_nr\")\n",
"\n",
"all_data_interactions_with_groups.head()"
]
},
{
"cell_type": "markdown",
"id": "f9ee5178",
"metadata": {},
"source": [
"Check Stats"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3def0005",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check the partitioning: per-part row counts and order ranges.\n",
"parts_distribution = (\n",
"    all_data_interactions_with_groups.group_by(\"part\")\n",
"    .agg(\n",
"        # pl.len() replaces pl.count(), which is deprecated in recent polars.\n",
"        pl.len().alias(\"count\"),\n",
"        pl.col(\"original_order\").min().alias(\"min_order\"),\n",
"        pl.col(\"original_order\").max().alias(\"max_order\"),\n",
"    )\n",
"    .sort(\"part\")\n",
")\n",
"\n",
"print(\"Distribution by part:\")\n",
"print(parts_distribution)\n",
"\n",
"print(f\"Minimum part: {parts_distribution['part'].min()}\")\n",
"print(f\"Maximum part: {parts_distribution['part'].max()}\")\n",
"print(f\"Total number of events from all parts: {parts_distribution['count'].sum()}\")"
]
},
{
"cell_type": "markdown",
"id": "8eb66774",
"metadata": {},
"source": [
"Save Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f27dd7f7",
"metadata": {},
"outputs": [],
"source": [
"def write_parquet(output_dir: str, data: pl.DataFrame, file_name: str) -> None:\n",
"    \"\"\"Write `data` to `<output_dir>/<file_name>.parquet`, logging its shape.\"\"\"\n",
"    print(f\"Shape: {data.shape}\")\n",
"    output_parquet_path = f\"{output_dir}/{file_name}.parquet\"\n",
"    data.write_parquet(output_parquet_path)\n",
"    print(f\"File saved: {file_name}\")\n",
"\n",
"\n",
"# Save the final processed tables produced by this notebook.\n",
"write_parquet(OUTPUT_DIR, items_metadata_remapped, \"items_metadata_remapped\")\n",
"write_parquet(OUTPUT_DIR, all_data_interactions_with_groups, \"all_data_interactions_with_groups\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deeprec",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading