diff --git a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb index 31c3408..0c369e2 100644 --- a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb +++ b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb @@ -1,338 +1,439 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\"Open\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 05-1.ChromaDB\n", - "\n", - "## Overview \n", - "In this exercise, we will explore how to utilize ChromaDB to embed documents and construct a vectorspace. Additionally, we will gain insight into the creation of a Retriever object to facilitate efficient query searches within documents. This tutorial will guide you through the process of embedding documents and using a vectorspace for effective information retrieval.\n", - " \n", - "## Purpose of the Exercise\n", - "The purpose of this exercise is to demonstrate the use of the Solar Embedding API to generate embeddings and create a vectorspace. By the end of this tutorial, users will be able to create a Retriever object and conduct efficient searches within the vectorspace, thereby enhancing the ability to retrieve relevant information from embedded documents.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Keyword VS Semantic Search \n", - "![Vector](https://blog.dataiku.com/hs-fs/hubfs/dftt%202.webp?width=1346&height=632&name=dftt%202.webp)\n", - "\n", - "from https://blog.dataiku.com/semantic-search-an-overlooked-nlp-superpower" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Emb_search](figures/emb_search.png)\n", - "\n", - "from https://sreent.medium.com/llms-embeddings-and-vector-search-d4bd9362df56" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "! pip3 install -qU markdownify langchain-upstage rank_bm25 python-dotenv" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# @title set API key\n", - "import os\n", - "import getpass\n", - "from pprint import pprint\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "from IPython import get_ipython\n", - "\n", - "if \"google.colab\" in str(get_ipython()):\n", - " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n", - " from google.colab import userdata\n", - " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n", - "else:\n", - " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n", - " from dotenv import load_dotenv\n", - "\n", - " load_dotenv()\n", - "\n", - "if \"UPSTAGE_API_KEY\" not in os.environ:\n", - " os.environ[\"UPSTAGE_API_KEY\"] = getpass.getpass(\"Enter your Upstage API key: \")\n" -] - - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Document(page_content='Korea is a beautiful country to visit in the spring.'), Document(page_content='The best time to visit Korea is in the fall.'), Document(page_content='Best way to find bug is using unit test.'), Document(page_content='Python is a great programming language for beginners.'), Document(page_content='Sung Kim is a great teacher.')]\n" - ] - } - ], - "source": [ - "from langchain_chroma import Chroma\n", - "from langchain_upstage import UpstageEmbeddings\n", - "from langchain.docstore.document import Document\n", - "\n", - "from langchain_text_splitters import (\n", - " Language,\n", - " RecursiveCharacterTextSplitter,\n", - ")\n", - "\n", - "sample_text = [\n", - " \"Korea is a beautiful country to visit in the spring.\",\n", - " \"The best time to visit Korea is in the fall.\",\n", - " \"Best way to find bug is using unit test.\",\n", - " \"Python is a great programming language for beginners.\",\n", - " \"Sung Kim is a great teacher.\",\n", - "]\n", - "\n", - "splits = RecursiveCharacterTextSplitter().create_documents(sample_text)\n", - "\n", - "print(splits)\n", - "\n", - "vectorstore = Chroma.from_documents(\n", - " documents=splits,\n", - " ids=[doc.page_content for doc in splits],\n", - " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "# check if text is in the vector store\n", - "def is_in_vectorstore(vectorstore, text):\n", - " search_results = vectorstore.get(ids=[text])\n", - " if search_results and search_results[\"ids\"]:\n", - " return True\n", - " else:\n", - " return False" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "bHQUCbh2aim1" + }, + "source": [ + "\n", + "\"Open\n", + "" + ] + }, { - "data": { - "text/plain": [ - "False" + "cell_type": "markdown", + "metadata": { + "id": "yRnbgQpzaim2" + }, + "source": [ + "# 05-1.ChromaDB\n", + "\n", + "## Overview \n", + "In this exercise, we will explore how to utilize ChromaDB to embed documents and construct a vectorspace. Additionally, we will gain insight into the creation of a Retriever object to facilitate efficient query searches within documents. This tutorial will guide you through the process of embedding documents and using a vectorspace for effective information retrieval.\n", + "\n", + "## Purpose of the Exercise\n", + "The purpose of this exercise is to demonstrate the use of the Solar Embedding API to generate embeddings and create a vectorspace. By the end of this tutorial, users will be able to create a Retriever object and conduct efficient searches within the vectorspace, thereby enhancing the ability to retrieve relevant information from embedded documents.\n", + "\n" ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "is_in_vectorstore(vectorstore, \"Hello, new sentence\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ + }, { - "data": { - "text/plain": [ - "True" + "cell_type": "markdown", + "metadata": { + "id": "lDjVZoaxaim3" + }, + "source": [ + "## Keyword VS Semantic Search\n", + "![Vector](https://blog.dataiku.com/hs-fs/hubfs/dftt%202.webp?width=1346&height=632&name=dftt%202.webp)\n", + "\n", + "from https://blog.dataiku.com/semantic-search-an-overlooked-nlp-superpower" ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "is_in_vectorstore(vectorstore, splits[0].page_content)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_upstage import UpstageLayoutAnalysisLoader\n", - "\n", - "\n", - "layzer = UpstageLayoutAnalysisLoader(\"pdfs/kim-tse-2008.pdf\", output_type=\"html\")\n", - "# For improved memory efficiency, consider using the lazy_load method to load documents page by page.\n", - "docs = layzer.load() # or layzer.lazy_load()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Splits: 125\n" - ] - } - ], - "source": [ - "from langchain_text_splitters import (\n", - " Language,\n", - " RecursiveCharacterTextSplitter,\n", - ")\n", - "\n", - "# 2. Split\n", - "text_splitter = RecursiveCharacterTextSplitter.from_language(\n", - " chunk_size=1000, chunk_overlap=100, language=Language.HTML\n", - ")\n", - "splits = text_splitter.split_documents(docs)\n", - "print(\"Splits:\", len(splits))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "FAFLy4Etaim3" + }, + "source": [ + "![Emb_search](https://github.com/UpstageAI/cookbook/blob/main/Solar-Fullstack-LLM-101/figures/emb_search.png?raw=1)\n", + "\n", + "from https://sreent.medium.com/llms-embeddings-and-vector-search-d4bd9362df56" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "125\n" - ] - } - ], - "source": [ - "from langchain_chroma import Chroma\n", - "\n", - "vectorstore = Chroma(\n", - " persist_directory=\"./chroma_db\",\n", - " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", - ")\n", - "retriever = vectorstore.as_retriever()\n", - "\n", - "\n", - "unique_splits = [\n", - " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n", - "]\n", - "print(len(unique_splits))\n", - "\n", - "# 3. Embed & indexing\n", - "if len(unique_splits) > 0:\n", - " vectorstore = Chroma.from_documents(\n", - " ids=[split.page_content for split in unique_splits],\n", - " persist_directory=\"./chroma_db\",\n", - " documents=unique_splits,\n", - " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uPHGD_OBaim4" + }, + "outputs": [], + "source": [ + "! pip3 install -qU langchain-chroma markdownify langchain-upstage rank_bm25 python-dotenv langchain" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0\n" + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "KyP3R-L1aim4" + }, + "outputs": [], + "source": [ + "# @title set API key\n", + "from pprint import pprint\n", + "import os\n", + "\n", + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "if \"google.colab\" in str(get_ipython()):\n", + " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n", + " from google.colab import userdata\n", + "\n", + " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n", + "else:\n", + " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n", + " from dotenv import load_dotenv\n", + "\n", + " load_dotenv()\n", + "\n", + "assert (\n", + " \"UPSTAGE_API_KEY\" in os.environ\n", + "), \"Please set the UPSTAGE_API_KEY environment variable\"" ] - } - ], - "source": [ - "from langchain_chroma import Chroma\n", - "\n", - "vectorstore = Chroma(\n", - " persist_directory=\"./chroma_db\",\n", - " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", - ")\n", - "retriever = vectorstore.as_retriever()\n", - "\n", - "unique_splits = [\n", - " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n", - "]\n", - "print(len(unique_splits))\n", - "\n", - "# 3. Embed & indexing\n", - "if len(unique_splits) > 0:\n", - " vectorstore = Chroma.from_documents(\n", - " ids=[split.page_content for split in unique_splits],\n", - " persist_directory=\"./chroma_db\",\n", - " documents=unique_splits,\n", - " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "

introduced bugs immediately. Several bug-finding techni-
ques c\n" - ] + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "0DW5Q0Wkaim4", + "outputId": "df098096-582d-4f7e-a90c-affd61a68a77", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[Document(metadata={}, page_content='Korea is a beautiful country to visit in the spring.'), Document(metadata={}, page_content='The best time to visit Korea is in the fall.'), Document(metadata={}, page_content='Best way to find bug is using unit test.'), Document(metadata={}, page_content='Python is a great programming language for beginners.'), Document(metadata={}, page_content='Sung Kim is a great teacher.')]\n" + ] + } + ], + "source": [ + "from langchain_chroma import Chroma\n", + "from langchain_upstage import UpstageEmbeddings\n", + "from langchain.docstore.document import Document\n", + "\n", + "from langchain_text_splitters import (\n", + " Language,\n", + " RecursiveCharacterTextSplitter,\n", + ")\n", + "\n", + "sample_text = [\n", + " \"Korea is a beautiful country to visit in the spring.\",\n", + " \"The best time to visit Korea is in the fall.\",\n", + " \"Best way to find bug is using unit test.\",\n", + " \"Python is a great programming language for beginners.\",\n", + " \"Sung Kim is a great teacher.\",\n", + "]\n", + "\n", + "splits = RecursiveCharacterTextSplitter().create_documents(sample_text)\n", + "\n", + "print(splits)\n", + "\n", + "vectorstore = Chroma.from_documents(\n", + " documents=splits,\n", + " ids=[doc.page_content for doc in splits],\n", + " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "N_k9hEaUaim5" + }, + "outputs": [], + "source": [ + "# check if text is in the vector store\n", + "def is_in_vectorstore(vectorstore, text):\n", + " search_results = vectorstore.get(ids=[text])\n", + " if search_results and search_results[\"ids\"]:\n", + " return True\n", + " else:\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "9FlsMkYkaim5", + "outputId": "8290c2c9-177c-4166-d7f3-368807ea2928", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "is_in_vectorstore(vectorstore, \"Hello, new sentence\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "wxndeR9Naim6", + "outputId": "92e464f3-bee5-4048-96be-b6247c99cc79", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "is_in_vectorstore(vectorstore, splits[0].page_content)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "bmaZFuDWaim6" + }, + "outputs": [], + "source": [ + "from langchain_upstage import UpstageDocumentParseLoader\n", + "\n", + "layzer = UpstageDocumentParseLoader(\"pdfs/kim-tse-2008.pdf\", output_format=\"html\")\n", + "# For improved memory efficiency, consider using the lazy_load method to load documents page by page.\n", + "docs = layzer.load() # or layzer.lazy_load()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "A7KuZZqHaim6", + "outputId": "5c1bbf0a-1844-4ede-e682-eed0f4a3f2b2", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Splits: 132\n" + ] + } + ], + "source": [ + "from langchain_text_splitters import (\n", + " Language,\n", + " RecursiveCharacterTextSplitter,\n", + ")\n", + "\n", + "# 2. Split\n", + "text_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " chunk_size=1000, chunk_overlap=100, language=Language.HTML\n", + ")\n", + "splits = text_splitter.split_documents(docs)\n", + "print(\"Splits:\", len(splits))" + ] + }, + { + "cell_type": "code", + "source": [ + "from langchain_chroma import Chroma\n", + "from langchain_upstage import UpstageEmbeddings\n", + "\n", + "vectorstore = Chroma(\n", + " persist_directory=\"./chroma_db\",\n", + " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", + ")\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "unique_splits = [\n", + " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n", + "]\n", + "print(len(unique_splits))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lEqnFToCkZfH", + "outputId": "a22c8c5e-7d16-4c5e-ca9f-cb739dd42847" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "132\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "X1SWtlmkaim6", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "06f4520e-ba67-4a13-a6d6-6222f67535a8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "132\n" + ] + } + ], + "source": [ + "from langchain_chroma import Chroma\n", + "from langchain_upstage import UpstageEmbeddings\n", + "from langchain.docstore.document import Document\n", + "\n", + "# Simplify metadata by converting complex data to simple types (str, int, float, bool)\n", + "def simplify_metadata(metadata):\n", + " simplified_metadata = {}\n", + " for key, value in metadata.items():\n", + " if isinstance(value, (str, int, float, bool)):\n", + " simplified_metadata[key] = value\n", + " else:\n", + " simplified_metadata[key] = str(value)\n", + " return simplified_metadata\n", + "\n", + "unique_splits = [\n", + " Document(page_content=split.page_content, metadata=simplify_metadata(split.metadata))\n", + " for split in unique_splits\n", + "]\n", + "print(len(unique_splits))\n", + "\n", + "# 3. Embed & indexing\n", + "if len(unique_splits) > 0:\n", + " vectorstore = Chroma.from_documents(\n", + " ids=[split.page_content for split in unique_splits],\n", + " persist_directory=\"./chroma_db\",\n", + " documents=unique_splits,\n", + " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "3kxYmiUcaim7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a38c52fb-555c-4211-ffee-bee3e2c9d663" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0\n" + ] + } + ], + "source": [ + "from langchain_chroma import Chroma\n", + "\n", + "vectorstore = Chroma(\n", + " persist_directory=\"./chroma_db\",\n", + " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", + ")\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "unique_splits = [\n", + " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n", + "]\n", + "print(len(unique_splits))\n", + "\n", + "# 3. Embed & indexing\n", + "if len(unique_splits) > 0:\n", + " vectorstore = Chroma.from_documents(\n", + " ids=[split.page_content for split in unique_splits],\n", + " persist_directory=\"./chroma_db\",\n", + " documents=unique_splits,\n", + " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "z5iN091Eaim7", + "outputId": "341a58d6-fdd4-4b72-8924-19fa6e74e5ed", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "

introduced bugs immediately. Several bug\n" + ] + } + ], + "source": [ + "search_result = retriever.invoke(\"How to find problems in code?\")\n", + "print(search_result[0].page_content[:100])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "colab": { + "provenance": [], + "include_colab_link": true } - ], - "source": [ - "search_result = retriever.invoke(\"How to find problems in code?\")\n", - "print(search_result[0].page_content[:100])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat": 4, + "nbformat_minor": 0 }