diff --git a/src/homeworks/homework6/ensembles.ipynb b/src/homeworks/homework6/ensembles.ipynb new file mode 100644 index 0000000..5c706c5 --- /dev/null +++ b/src/homeworks/homework6/ensembles.ipynb @@ -0,0 +1,1011 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "1PjQglGj4q54" + }, + "source": [ + "# Случайные леса\n", + "\n", + "В этом задании вам предстоит реализовать ансамбль деревьев решений, известный как случайный лес, применить его к публичным данным пользователей социальной сети Вконтакте, и сравнить его эффективность с бустингом, предоставляемым библиотекой `CatBoost`.\n", + "\n", + "В результате мы сможем определить, какие подписки пользователей больше всего влияют на определение возраста и пола человека." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "LH5PiGz04q5-" + }, + "outputs": [], + "source": [ + "import inspect\n", + "import random\n", + "from collections import Counter\n", + "from dataclasses import dataclass\n", + "from itertools import product\n", + "from typing import Callable, List, Tuple, Union\n", + "\n", + "import numpy as np\n", + "import numpy.typing as npt\n", + "import pandas\n", + "from catboost import CatBoostClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from tqdm.notebook import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "uB2AMtmBsjr0", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def set_seed(seed=42):\n", + " np.random.seed(seed)\n", + " random.seed(seed)\n", + "\n", + "\n", + "# Этой функцией будут помечены все места, которые необходимо дозаполнить\n", + "# Это могут быть как целые функции, так и отдельные части внутри них\n", + "# Всегда можно воспользоваться интроспекцией и найти места использования этой функции :)\n", + "def todo():\n", + " stack = inspect.stack()\n", + " caller_frame = stack[1]\n", + " function_name = caller_frame.function\n", + " line_number = caller_frame.lineno\n", + " raise NotImplementedError(f\"TODO at {function_name}, line {line_number}\")\n", + "\n", + "\n", + "SEED = 0xC0FFEE\n", + "set_seed(SEED)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "fANg2rPVsjr0" + }, + "outputs": [], + "source": [ + "def mode(data):\n", + " counts = Counter(data)\n", + " return counts.most_common(n=1)[0][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tfxycK0Q4q5_" + }, + "source": [ + "### Задание 1 (2 балла)\n", + "Random Forest состоит из деревьев решений. Каждое такое дерево строится на одной из выборок, полученных при помощи bootstrap. Элементы, которые не вошли в новую обучающую выборку, образуют **out-of-bag** выборку. Кроме того, в каждом узле дерева мы случайным образом выбираем набор из `max_features` и ищем признак для предиката разбиения только в этом наборе.\n", + "\n", + "Сегодня мы будем работать только с бинарными признаками, поэтому нет необходимости выбирать значение признака для разбиения.\n", + "\n", + "#### Методы\n", + "`predict(X)` - возвращает предсказанные метки для элементов выборки `X`\n", + "\n", + "#### Параметры конструктора\n", + "`X, y` - обучающая выборка и соответствующие ей метки классов. Из нее нужно получить выборку для построения дерева при помощи bootstrap. Out-of-bag выборку нужно запомнить, она понадобится потом.\n", + "\n", + "`criterion=\"gini\"` - задает критерий, который будет использоваться при построении дерева. Возможные значения: `\"gini\"`, `\"entropy\"`.\n", + "\n", + "`max_depth=None` - ограничение глубины дерева. Если `None` - глубина не ограничена\n", + "\n", + "`min_samples_leaf=1` - минимальное количество элементов в каждом листе дерева.\n", + "\n", + "`max_features=\"auto\"` - количество признаков, которые могут использоваться в узле. Если `\"auto\"` - равно `sqrt(X.shape[1])`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "bQUJHTjS4q5-" + }, + "outputs": [], + "source": [ + "# Для начала реализуем сами критерии\n", + "\n", + "\n", + "def gini(x: npt.ArrayLike) -> float:\n", + " \"\"\"\n", + " Calculate the Gini impurity of a list or array of class labels.\n", + "\n", + " Args:\n", + " x (ArrayLike): Array-like object containing class labels.\n", + "\n", + " Returns:\n", + " float: Gini impurity value.\n", + " \"\"\"\n", + " if len(x) == 0:\n", + " return 0.0\n", + " \n", + " x = np.asarray(x)\n", + "\n", + " _, count = np.unique(x, return_counts=True)\n", + " probabilities = count / len(x)\n", + "\n", + " return np.sum(probabilities * (1 - probabilities))\n", + "\n", + "\n", + "def entropy(x: npt.ArrayLike) -> float:\n", + " \"\"\"\n", + " Calculate the entropy of a list or array of class labels.\n", + "\n", + " Args:\n", + " x (ArrayLike): Array-like object containing class labels.\n", + "\n", + " Returns:\n", + " float: Entropy value.\n", + " \"\"\"\n", + " if len(x) == 0:\n", + " return 0.0\n", + " \n", + " x = np.asarray(x)\n", + "\n", + " _, count = np.unique(x, return_counts=True)\n", + " probabilities = count / len(x)\n", + "\n", + " return -np.sum(probabilities * np.log2(probabilities, where=(probabilities > 0)))\n", + "\n", + "\n", + "def gain(left_y: npt.ArrayLike, right_y: npt.ArrayLike, criterion: Callable[[npt.ArrayLike], float]) -> float:\n", + " \"\"\"\n", + " Calculate the information gain of a split using a specified criterion.\n", + "\n", + " Args:\n", + " left_y (ArrayLike): Class labels for the left split.\n", + " right_y (ArrayLike): Class labels for the right split.\n", + " criterion (Callable): Function to calculate impurity (e.g., gini or entropy).\n", + "\n", + " Returns:\n", + " float: Information gain from the split.\n", + " \"\"\"\n", + " left_y, right_y = np.asarray(left_y), np.asarray(right_y)\n", + "\n", + " y = np.concatenate([left_y, right_y])\n", + " R = len(y)\n", + " R_l, R_r = len(left_y), len(right_y)\n", + "\n", + " return criterion(y) - (R_l / R) * criterion(left_y) - (R_r / R) * criterion(right_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "Nctm_I99sjr2", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "@dataclass\n", + "class DecisionTreeLeaf:\n", + " classes: np.ndarray\n", + "\n", + " def __post_init__(self):\n", + " self.max_class = mode(self.classes)\n", + "\n", + "\n", + "@dataclass\n", + "class DecisionTreeInternalNode:\n", + " split_dim: int\n", + " left: Union[\"DecisionTreeInternalNode\", DecisionTreeLeaf]\n", + " right: Union[\"DecisionTreeInternalNode\", DecisionTreeLeaf]\n", + "\n", + "\n", + "DecisionTreeNode = Union[DecisionTreeInternalNode, DecisionTreeLeaf]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "8smLW2V_4q5_" + }, + "outputs": [], + "source": [ + "class DecisionTree:\n", + " def __init__(self, X, y, criterion=\"gini\", max_depth=None, min_samples_leaf=1, max_features=\"auto\"):\n", + " self.criterion: Callable = gini if criterion == \"gini\" else entropy\n", + " self.max_depth: int | None = max_depth\n", + " self.min_samples: int = min_samples_leaf\n", + " self.max_features: int = int(np.sqrt(X.shape[1])) if max_features == \"auto\" else int(max_features)\n", + "\n", + " n_samples = X.shape[0]\n", + " bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)\n", + " oob_mask = np.ones(n_samples, dtype=bool)\n", + " oob_mask[bootstrap_indices] = False\n", + " self._out_of_bag_X = X[oob_mask]\n", + " self._out_of_bag_y = y[oob_mask]\n", + " X = X[bootstrap_indices]\n", + " y = y[bootstrap_indices]\n", + "\n", + " self.root = self._build_node(X, y, depth=0)\n", + " \n", + " @property\n", + " def out_of_bag(self) -> Tuple[np.ndarray, np.ndarray]:\n", + " return self._out_of_bag_X, self._out_of_bag_y\n", + "\n", + " def _build_node(self, points: np.ndarray, classes: np.ndarray, depth: int) -> DecisionTreeNode:\n", + " if len(points) <= self.min_samples or len(np.unique(classes)) == 1 or (self.max_depth is not None and depth >= self.max_depth):\n", + " return DecisionTreeLeaf(classes)\n", + " \n", + " ind, mask = self._find_best_split(points, classes)\n", + " if mask is None:\n", + " return DecisionTreeLeaf(classes)\n", + " \n", + " X_left, X_right = points[mask], points[~mask]\n", + " y_left, y_right = classes[mask], classes[~mask]\n", + "\n", + " if len(y_right) <= self.min_samples or len(y_left) <= self.min_samples or ind is None:\n", + " return DecisionTreeLeaf(classes)\n", + "\n", + " return DecisionTreeInternalNode(ind, self._build_node(X_left, y_left, depth + 1), self._build_node(X_right, y_right, depth + 1))\n", + " \n", + " def _find_best_split(self, X: np.ndarray, y: np.ndarray):\n", + " max_gain = -1\n", + " best_mask = None\n", + " best_ind = None \n", + "\n", + " feature_inds = np.random.choice(np.arange(0, X.shape[1]), size=self.max_features, replace=False)\n", + " for ind in feature_inds:\n", + " mask = X[:, ind] < 0.5\n", + " y_left = y[mask]\n", + " y_right = y[~mask]\n", + "\n", + " if len(y_left) >= self.min_samples and len(y_right) >= self.min_samples:\n", + " gain_ind = gain(y_left, y_right, self.criterion)\n", + " if gain_ind > max_gain:\n", + " max_gain = gain_ind\n", + " best_mask = mask\n", + " best_ind = ind\n", + "\n", + " return best_ind, best_mask\n", + "\n", + " def _predict(self, point: np.ndarray, node: DecisionTreeNode) -> int:\n", + " if isinstance(node, DecisionTreeLeaf):\n", + " return node.max_class\n", + " \n", + " if point[node.split_dim] < 0.5:\n", + " return self._predict(point, node.left)\n", + " \n", + " return self._predict(point, node.right)\n", + "\n", + " def predict(self, points: np.ndarray) -> np.ndarray:\n", + " classes_pred = []\n", + " for point in points:\n", + " classes_pred.append(self._predict(point, self.root))\n", + "\n", + " return np.array(classes_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9oijgwLt4q6A" + }, + "source": [ + "### Задание 2 (2 балла)\n", + "Теперь реализуем сам Random Forest. Идея очень простая: строим `n` деревьев, а затем берем модальное предсказание.\n", + "\n", + "#### Параметры конструктора\n", + "`n_estimators` - количество используемых для предсказания деревьев.\n", + "\n", + "Остальное - параметры деревьев.\n", + "\n", + "#### Методы\n", + "`fit(X, y)` - строит `n_estimators` деревьев по выборке `X`.\n", + "\n", + "`predict(X)` - для каждого элемента выборки `X` возвращает самый частый класс, который предсказывают для него деревья." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "APIy88YW4q6A" + }, + "outputs": [], + "source": [ + "class RandomForestClassifier:\n", + "\n", + " _n_features: int = None\n", + "\n", + " def __init__(self, criterion=\"gini\", max_depth=None, min_samples_leaf=1, max_features=\"auto\", n_estimators=10):\n", + " self._criterion = criterion\n", + " self._max_depth = max_depth\n", + " self._min_samples_leaf = min_samples_leaf\n", + " self._max_features = max_features\n", + " self._n_estimators = n_estimators\n", + " self._estimators: list[DecisionTree | None] = []\n", + "\n", + " @property\n", + " def estimators(self) -> List[DecisionTree]:\n", + " return self._estimators\n", + "\n", + " @property\n", + " def n_features(self) -> int:\n", + " if self._n_features is None:\n", + " raise RuntimeError(\"Fit random forest before accessing to number of features properties\")\n", + " return self._n_features\n", + "\n", + " def fit(self, X, y):\n", + " self._n_features = X.shape[1]\n", + "\n", + " for _ in range(self._n_estimators):\n", + " tree = DecisionTree(X, y, self._criterion, self._max_depth, self._min_samples_leaf, self._max_features)\n", + " self.estimators.append(tree)\n", + "\n", + " def predict(self, X) -> np.ndarray:\n", + " all_preds = []\n", + " for estimator in self.estimators:\n", + " all_preds.append(estimator.predict(X))\n", + " \n", + " y_pred = np.stack(all_preds, axis=1)\n", + " preds = []\n", + " for pred in y_pred:\n", + " preds.append(mode(pred))\n", + " \n", + " return np.array(preds)\n", + " \n", + " def get_params(self, deep=True) -> dict:\n", + " return {\n", + " \"criterion\": self._criterion,\n", + " \"max_depth\": self._max_depth,\n", + " \"min_samples_leaf\": self._min_samples_leaf,\n", + " \"max_features\": self._max_features,\n", + " \"n_estimators\": self._n_estimators\n", + " }\n", + "\n", + " def set_params(self, **params) -> \"RandomForestClassifier\":\n", + " for key, value in params.items():\n", + " setattr(self, key, value)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i80pffMn4q6A" + }, + "source": [ + "### Задание 3 (2 балла)\n", + "Часто хочется понимать, насколько большую роль играет тот или иной признак для предсказания класса объекта. Есть различные способы посчитать его важность. Один из простых способов сделать это для Random Forest - посчитать out-of-bag ошибку предсказания `err_oob`, а затем перемешать значения признака `j` и посчитать ее (`err_oob_j`) еще раз. Оценкой важности признака `j` для одного дерева будет разность `err_oob_j - err_oob`, важность для всего леса считается как среднее значение важности по деревьям.\n", + "\n", + "Реализуйте функцию `feature_importance`, которая принимает на вход Random Forest и возвращает массив, в котором содержится важность для каждого признака." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "rEmVG1Fl4q6B" + }, + "outputs": [], + "source": [ + "def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:\n", + " y_true = y_true.reshape(-1)\n", + " y_pred = y_pred.reshape(-1)\n", + " return np.mean(y_true == y_pred)\n", + "\n", + "\n", + "def feature_importance(rfc: RandomForestClassifier):\n", + " matrix_importance = np.zeros(rfc.n_features)\n", + "\n", + " for estimator in rfc.estimators:\n", + " X, y = estimator.out_of_bag\n", + " if len(X) == 0:\n", + " continue\n", + "\n", + " y_pred = estimator.predict(X)\n", + " err_obb = accuracy_score(y, y_pred)\n", + "\n", + " for j in range(rfc.n_features):\n", + " X_shuffle = X.copy()\n", + " np.random.shuffle(X_shuffle[:, j])\n", + " y_pred_shuffled = estimator.predict(X_shuffle)\n", + " err_obb_j = accuracy_score(y, y_pred_shuffled)\n", + " matrix_importance[j] += err_obb - err_obb_j\n", + " \n", + " valid_estimators = sum(1 for est in rfc.estimators if len(est.out_of_bag[0]) > 0)\n", + " if valid_estimators == 0:\n", + " return matrix_importance\n", + " return matrix_importance / valid_estimators\n", + "\n", + "\n", + "def most_important_features(importance, names, k=20):\n", + " # Выводит названия k самых важных признаков\n", + " indices = np.argsort(importance)[::-1][:k]\n", + " return np.array(names)[indices]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JooN_YKm4q6B" + }, + "source": [ + "Наконец, пришло время протестировать наше дерево на простом синтетическом наборе данных. В результате точность должна быть примерно равна `1.0`, наибольшее значение важности должно быть у признака с индексом `4`, признаки с индексами `2` и `3` должны быть одинаково важны, а остальные признаки - не важны совсем." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "8gqYMp994q6B" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 1.0\n", + "Importance: [-1.24005774e-03 -1.08250356e-03 1.70921794e-01 1.65304361e-01\n", + " 3.35875415e-01 1.68137126e-04]\n" + ] + } + ], + "source": [ + "def synthetic_dataset(size):\n", + " X = [\n", + " (np.random.randint(0, 2), np.random.randint(0, 2), i % 6 == 3, i % 6 == 0, i % 3 == 2, np.random.randint(0, 2))\n", + " for i in range(size)\n", + " ]\n", + " y = [i % 3 for i in range(size)]\n", + " return np.array(X), np.array(y)\n", + "\n", + "\n", + "X, y = synthetic_dataset(1000)\n", + "rfc = RandomForestClassifier(n_estimators=100)\n", + "rfc.fit(X, y)\n", + "print(\"Accuracy:\", np.mean(rfc.predict(X) == y))\n", + "print(\"Importance:\", feature_importance(rfc))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vRtGOs164q6C" + }, + "source": [ + "### Задание 4 (1 балл)\n", + "Теперь поработаем с реальными данными.\n", + "\n", + "Выборка состоит из публичных анонимизированных данных пользователей социальной сети Вконтакте. Первые два столбца отражают возрастную группу (`zoomer`, `doomer` и `boomer`) и пол (`female`, `male`). Все остальные столбцы являются бинарными признаками, каждый из них определяет, подписан ли пользователь на определенную группу/публичную страницу или нет.\\\n", + "\\\n", + "Необходимо обучить два классификатора, один из которых определяет возрастную группу, а второй - пол.\\\n", + "\\\n", + "Эксперименты с множеством используемых признаков и подбор гиперпараметров приветствуются. Лес должен строиться за какое-то разумное время." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "HruobK-q4q6C" + }, + "outputs": [], + "source": [ + "def read_dataset(path):\n", + " dataframe = pandas.read_csv(path, header=0)\n", + " dataset = dataframe.values.tolist()\n", + " random.shuffle(dataset)\n", + " y_age = [row[0] for row in dataset]\n", + " y_sex = [row[1] for row in dataset]\n", + " X = [row[2:] for row in dataset]\n", + "\n", + " return np.array(X), np.array(y_age), np.array(y_sex), list(dataframe.columns)[2:]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "K0QXWr3b4q6C" + }, + "outputs": [], + "source": [ + "X, y_age, y_sex, features = read_dataset(\"src/homeworks/homework6/vk.csv\")\n", + "X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(X, y_age, y_sex, train_size=0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I0y8J97m4q6C" + }, + "source": [ + "#### Возраст" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие параметры для rfc: {'_criterion': 'entropy', '_max_depth': None, '_max_features': 'auto', '_min_samples_leaf': 1, '_n_estimators': 40}\n" + ] + } + ], + "source": [ + "rfc = RandomForestClassifier(n_estimators=10)\n", + "\n", + "params = {\n", + " \"_criterion\": [\"gini\", \"entropy\"], \n", + " \"_max_depth\" : [None, 10, 20], \n", + " \"_min_samples_leaf\": [1, 3, 15],\n", + " \"_max_features\": [\"auto\", np.log2(149)],\n", + " \"_n_estimators\": [10, 40]}\n", + "\n", + "grid = GridSearchCV(\n", + " estimator=rfc,\n", + " param_grid=params,\n", + " scoring='accuracy',\n", + " cv=5,\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid.fit(X_train, y_age_train)\n", + "print(\"Лучшие параметры для rfc:\", grid.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "MLJykJZH4q6C" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7238335435056746\n", + "Most important features:\n", + "1. ovsyanochan\n", + "2. 4ch\n", + "3. styd.pozor\n", + "4. mudakoff\n", + "5. rhymes\n", + "6. dayvinchik\n", + "7. rapnewrap\n", + "8. pravdashowtop\n", + "9. pixel_stickers\n", + "10. tumblr_vacuum\n", + "11. reflexia_our_feelings\n", + "12. bot_maxim\n", + "13. iwantyou\n", + "14. leprum\n", + "15. bestad\n", + "16. i_d_t\n", + "17. xfilm\n", + "18. ohhluul\n", + "19. ne1party\n", + "20. bog_memes\n" + ] + } + ], + "source": [ + "rfc = RandomForestClassifier(\"entropy\", None, 1, \"auto\", 40)\n", + "\n", + "rfc.fit(X_train, y_age_train)\n", + "print(\"Accuracy:\", np.mean(rfc.predict(X_test) == y_age_test))\n", + "print(\"Most important features:\")\n", + "for i, name in enumerate(most_important_features(feature_importance(rfc), features, 20)):\n", + " print(str(i + 1) + \".\", name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cgNpaAKH4q6D" + }, + "source": [ + "#### Пол" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rfc = RandomForestClassifier(n_estimators=10)\n", + "\n", + "params = {\n", + " \"_criterion\": [\"gini\", \"entropy\"], \n", + " \"_max_depth\" : [None, 10, 20], \n", + " \"_min_samples_leaf\": [1, 3, 15],\n", + " \"_max_features\": [\"auto\", np.log2(149)],\n", + " \"_n_estimators\": [10, 40]}\n", + "\n", + "grid = GridSearchCV(\n", + " estimator=rfc,\n", + " param_grid=params,\n", + " scoring='accuracy',\n", + " cv=5,\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid.fit(X_train, y_sex_train)\n", + "print(\"Лучшие параметры для rfc:\", grid.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X-zne5-R4q6D" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.849936948297604\n", + "Most important features:\n", + "1. 40kg\n", + "2. girlmeme\n", + "3. modnailru\n", + "4. zerofat\n", + "5. 9o_6o_9o\n", + "6. mudakoff\n", + "7. be.beauty\n", + "8. i_d_t\n", + "9. woman.blog\n", + "10. 4ch\n", + "11. reflexia_our_feelings\n", + "12. igm\n", + "13. cook_good\n", + "14. beauty\n", + "15. femalemem\n", + "16. recipes40kg\n", + "17. thesmolny\n", + "18. sh.cook\n", + "19. be.women\n", + "20. rapnewrap\n" + ] + } + ], + "source": [ + "rfc = RandomForestClassifier(\"entropy\", None, 1, \"auto\", 40)\n", + "\n", + "rfc.fit(X_train, y_sex_train)\n", + "print(\"Accuracy:\", np.mean(rfc.predict(X_test) == y_sex_test))\n", + "print(\"Most important features:\")\n", + "for i, name in enumerate(most_important_features(feature_importance(rfc), features, 20)):\n", + " print(str(i + 1) + \".\", name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pxeTQylQ4q6D" + }, + "source": [ + "### CatBoost\n", + "В качестве аьтернативы попробуем CatBoost.\n", + "\n", + "Устаниовить его можно просто с помощью `pip install catboost`. Туториалы можно найти, например, [здесь](https://catboost.ai/docs/concepts/python-usages-examples.html#multiclassification) и [здесь](https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb). Главное - не забудьте использовать `loss_function='MultiClass'`.\\\n", + "\\\n", + "Сначала протестируйте CatBoost на синтетических данных. Выведите точность и важность признаков." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "DOqVkEnd4q6D" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 1.0\n", + "Importance: [6.91453034e-04 4.02295744e-04 2.80921406e+01 2.82148745e+01\n", + " 4.36910485e+01 8.42592869e-04]\n" + ] + } + ], + "source": [ + "X, y = synthetic_dataset(1000)\n", + "\n", + "cb_model = CatBoostClassifier(iterations=10, learning_rate=0.01, depth=10, loss_function='MultiClass', verbose=False)\n", + "cb_model.fit(X, y)\n", + "y_pred = cb_model.predict(X)\n", + "\n", + "print(\"Accuracy:\", accuracy_score(y_pred, y))\n", + "print(\"Importance:\", cb_model.feature_importances_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tcLRsSNG4q6E" + }, + "source": [ + "### Задание 5 (3 балла)\n", + "Попробуем применить один из используемых на практике алгоритмов. В этом нам поможет CatBoost. Также, как и реализованный ними RandomForest, применим его для определения пола и возраста пользователей сети Вконтакте, выведите названия наиболее важных признаков так же, как в задании 3.\\\n", + "\\\n", + "Эксперименты с множеством используемых признаков и подбор гиперпараметров приветствуются." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "hJGrQcO-4q6E" + }, + "outputs": [], + "source": [ + "X, y_age, y_sex, features = read_dataset(\"src/homeworks/homework6/vk.csv\")\n", + "X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(X, y_age, y_sex, train_size=0.9)\n", + "X_train, X_eval, y_age_train, y_age_eval, y_sex_train, y_sex_eval = train_test_split(\n", + " X_train, y_age_train, y_sex_train, train_size=0.8\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "6Rlz_kDlsjr6", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "max_depth = range(1, 10, 3)\n", + "min_samples_leaf = range(1, 10, 3)\n", + "learning_rate = np.linspace(0.001, 1.0, 5)\n", + "\n", + "\n", + "def get_best_params(y_train, y_eval):\n", + " best_score, best_params = None, None\n", + " for lr, md, msl in tqdm(list(product(learning_rate, max_depth, min_samples_leaf))):\n", + " cb_model = CatBoostClassifier(iterations=100, learning_rate=lr, depth=md, min_data_in_leaf=msl, loss_function='MultiClass', verbose=False, random_seed=SEED)\n", + " cb_model.fit(X_train, y_train)\n", + " y_pred = cb_model.predict(X_eval)\n", + " score = accuracy_score(y_eval, y_pred)\n", + "\n", + " if best_score is None or score > best_score:\n", + " best_score = score\n", + " best_params = {\n", + " 'learning_rate': lr,\n", + " 'depth': md,\n", + " 'min_data_in_leaf': msl\n", + " }\n", + " return best_params, best_score" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XA5f_8eC4q6E" + }, + "source": [ + "#### Возраст" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "rOXsfxY7sjr6", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2651344d76924155a8acf61144b07691", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/45 [00:00