codepawl · nxank4 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
@@ -0,0 +1,129 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 🔗 Entity Resolution\n",
+    "\n",
+    "Map messy string variations (typos, abbreviations, inconsistent casing) to a single canonical label using `loclean.resolve_entities`.\n",
+    "\n",
+    "**Use case:** You have a column of company names entered by different people — some wrote \"Google\", others \"google\", \"GOOGLE Inc.\", or \"Alphabet / Google\". Entity resolution merges them into one canonical form."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "\n",
+    "import loclean"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create messy data\n",
+    "\n",
+    "16 company name variations across 5 real companies:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pl.DataFrame(\n",
+    "    {\n",
+    "        \"company\": [\n",
+    "            \"Google LLC\",\n",
+    "            \"google\",\n",
+    "            \"GOOGLE Inc.\",\n",
+    "            \"Alphabet / Google\",\n",
+    "            \"Microsoft Corp\",\n",
+    "            \"microsoft\",\n",
+    "            \"MSFT\",\n",
+    "            \"Apple Inc.\",\n",
+    "            \"apple\",\n",
+    "            \"AAPL\",\n",
+    "            \"Amazon.com Inc\",\n",
+    "            \"amazon\",\n",
+    "            \"AMZN\",\n",
+    "            \"Meta Platforms\",\n",
+    "            \"meta\",\n",
+    "            \"Facebook (Meta)\",\n",
+    "        ]\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "print(f\"Unique values before: {df['company'].n_unique()}\")\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Resolve entities\n",
+    "\n",
+    "The `threshold` parameter controls how aggressively values are merged (0 merges nothing, 1 merges everything, so higher values merge more)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = loclean.resolve_entities(df, \"company\", threshold=0.8)\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare before vs. after"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unique_before = df[\"company\"].n_unique()\n",
+    "unique_after = result[\"company_canonical\"].n_unique()\n",
+    "merged = unique_before - unique_after\n",
+    "\n",
+    "print(f\"Unique values: {unique_before} → {unique_after} ({merged} merged)\")\n",
+    "result.select([\"company\", \"company_canonical\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -0,0 +1,153 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ⚖️ Semantic Oversampling\n",
+    "\n",
+    "Generate synthetic minority-class records using `loclean.oversample`.\n",
+    "\n",
+    "**Use case:** Your dataset has 8 \"healthy\" patients but only 2 \"hypertension\" patients — `loclean.oversample` uses an LLM to generate semantically plausible synthetic hypertension records to balance the classes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "from pydantic import BaseModel, Field\n",
+    "\n",
+    "import loclean"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define schema and create imbalanced data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class PatientRecord(BaseModel):\n",
+    "    \"\"\"Schema for synthetic patient records.\"\"\"\n",
+    "\n",
+    "    age: int = Field(..., ge=0, le=120, description=\"Patient age\")\n",
+    "    blood_pressure: str = Field(\n",
+    "        ..., description=\"Blood pressure reading, e.g. '120/80'\"\n",
+    "    )\n",
+    "    cholesterol: str = Field(..., description=\"Cholesterol level: Low, Normal, or High\")\n",
+    "    diagnosis: str = Field(..., description=\"Medical diagnosis label\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pl.DataFrame(\n",
+    "    {\n",
+    "        \"age\": [45, 52, 38, 61, 55, 42, 35, 67, 48, 50],\n",
+    "        \"blood_pressure\": [\n",
+    "            \"120/80\",\n",
+    "            \"140/90\",\n",
+    "            \"130/85\",\n",
+    "            \"150/95\",\n",
+    "            \"128/82\",\n",
+    "            \"135/88\",\n",
+    "            \"118/76\",\n",
+    "            \"155/100\",\n",
+    "            \"125/80\",\n",
+    "            \"138/92\",\n",
+    "        ],\n",
+    "        \"cholesterol\": [\n",
+    "            \"Normal\",\n",
+    "            \"High\",\n",
+    "            \"Normal\",\n",
+    "            \"High\",\n",
+    "            \"Normal\",\n",
+    "            \"Normal\",\n",
+    "            \"Low\",\n",
+    "            \"High\",\n",
+    "            \"Normal\",\n",
+    "            \"Normal\",\n",
+    "        ],\n",
+    "        \"diagnosis\": [\n",
+    "            \"healthy\",\n",
+    "            \"healthy\",\n",
+    "            \"healthy\",\n",
+    "            \"hypertension\",\n",
+    "            \"healthy\",\n",
+    "            \"healthy\",\n",
+    "            \"healthy\",\n",
+    "            \"hypertension\",\n",
+    "            \"healthy\",\n",
+    "            \"healthy\",\n",
+    "        ],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "print(\"Class distribution (before):\")\n",
+    "print(df[\"diagnosis\"].value_counts())\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
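+    "## Generate synthetic minority records\n",
+    "\n",
+    "The data has 8 \"healthy\" rows and 2 \"hypertension\" rows, so requesting `n=6` synthetic \"hypertension\" records should bring both classes to 8 rows each."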
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = loclean.oversample(\n",
+    "    df,\n",
+    "    target_col=\"diagnosis\",\n",
+    "    target_value=\"hypertension\",\n",
+    "    n=6,\n",
+    "    schema=PatientRecord,\n",
+    "    batch_size=3,\n",
+    ")\n",
+    "\n",
+    "print(f\"Rows: {len(df)} → {len(result)} (+{len(result) - len(df)} synthetic)\")\n",
+    "print(\"\\nClass distribution (after):\")\n",
+    "print(result[\"diagnosis\"].value_counts())\n",
+    "result"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}