diff --git a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
index 31c3408..0c369e2 100644
--- a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
+++ b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
@@ -1,338 +1,439 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 05-1.ChromaDB\n",
- "\n",
- "## Overview \n",
- "In this exercise, we will explore how to utilize ChromaDB to embed documents and construct a vectorspace. Additionally, we will gain insight into the creation of a Retriever object to facilitate efficient query searches within documents. This tutorial will guide you through the process of embedding documents and using a vectorspace for effective information retrieval.\n",
- " \n",
- "## Purpose of the Exercise\n",
- "The purpose of this exercise is to demonstrate the use of the Solar Embedding API to generate embeddings and create a vectorspace. By the end of this tutorial, users will be able to create a Retriever object and conduct efficient searches within the vectorspace, thereby enhancing the ability to retrieve relevant information from embedded documents.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Keyword VS Semantic Search \n",
- "\n",
- "\n",
- "from https://blog.dataiku.com/semantic-search-an-overlooked-nlp-superpower"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "from https://sreent.medium.com/llms-embeddings-and-vector-search-d4bd9362df56"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "! pip3 install -qU markdownify langchain-upstage rank_bm25 python-dotenv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# @title set API key\n",
- "import os\n",
- "import getpass\n",
- "from pprint import pprint\n",
- "import warnings\n",
- "\n",
- "warnings.filterwarnings(\"ignore\")\n",
- "\n",
- "from IPython import get_ipython\n",
- "\n",
- "if \"google.colab\" in str(get_ipython()):\n",
- " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n",
- " from google.colab import userdata\n",
- " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n",
- "else:\n",
- " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n",
- " from dotenv import load_dotenv\n",
- "\n",
- " load_dotenv()\n",
- "\n",
- "if \"UPSTAGE_API_KEY\" not in os.environ:\n",
- " os.environ[\"UPSTAGE_API_KEY\"] = getpass.getpass(\"Enter your Upstage API key: \")\n"
-]
-
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
+ "cells": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[Document(page_content='Korea is a beautiful country to visit in the spring.'), Document(page_content='The best time to visit Korea is in the fall.'), Document(page_content='Best way to find bug is using unit test.'), Document(page_content='Python is a great programming language for beginners.'), Document(page_content='Sung Kim is a great teacher.')]\n"
- ]
- }
- ],
- "source": [
- "from langchain_chroma import Chroma\n",
- "from langchain_upstage import UpstageEmbeddings\n",
- "from langchain.docstore.document import Document\n",
- "\n",
- "from langchain_text_splitters import (\n",
- " Language,\n",
- " RecursiveCharacterTextSplitter,\n",
- ")\n",
- "\n",
- "sample_text = [\n",
- " \"Korea is a beautiful country to visit in the spring.\",\n",
- " \"The best time to visit Korea is in the fall.\",\n",
- " \"Best way to find bug is using unit test.\",\n",
- " \"Python is a great programming language for beginners.\",\n",
- " \"Sung Kim is a great teacher.\",\n",
- "]\n",
- "\n",
- "splits = RecursiveCharacterTextSplitter().create_documents(sample_text)\n",
- "\n",
- "print(splits)\n",
- "\n",
- "vectorstore = Chroma.from_documents(\n",
- " documents=splits,\n",
- " ids=[doc.page_content for doc in splits],\n",
- " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "# check if text is in the vector store\n",
- "def is_in_vectorstore(vectorstore, text):\n",
- " search_results = vectorstore.get(ids=[text])\n",
- " if search_results and search_results[\"ids\"]:\n",
- " return True\n",
- " else:\n",
- " return False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bHQUCbh2aim1"
+ },
+ "source": [
+ "\n",
+ "
\n",
+ ""
+ ]
+ },
{
- "data": {
- "text/plain": [
- "False"
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yRnbgQpzaim2"
+ },
+ "source": [
+ "# 05-1.ChromaDB\n",
+ "\n",
+ "## Overview \n",
+ "In this exercise, we will use ChromaDB to embed documents and build a vector space from them. We will also create a Retriever object that runs efficient query searches over those documents. This tutorial guides you through embedding documents and using the resulting vector space for effective information retrieval.\n",
+ "\n",
+ "## Purpose of the Exercise\n",
+ "The purpose of this exercise is to demonstrate how to use the Solar Embedding API to generate embeddings and create a vector space. By the end of this tutorial, you will be able to create a Retriever object and run efficient searches within the vector space, making it easier to retrieve relevant information from embedded documents.\n",
+ "\n"
]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "is_in_vectorstore(vectorstore, \"Hello, new sentence\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
+ },
{
- "data": {
- "text/plain": [
- "True"
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lDjVZoaxaim3"
+ },
+ "source": [
+ "## Keyword vs. Semantic Search\n",
+ "\n",
+ "\n",
+ "from https://blog.dataiku.com/semantic-search-an-overlooked-nlp-superpower"
]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "is_in_vectorstore(vectorstore, splits[0].page_content)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain_upstage import UpstageLayoutAnalysisLoader\n",
- "\n",
- "\n",
- "layzer = UpstageLayoutAnalysisLoader(\"pdfs/kim-tse-2008.pdf\", output_type=\"html\")\n",
- "# For improved memory efficiency, consider using the lazy_load method to load documents page by page.\n",
- "docs = layzer.load() # or layzer.lazy_load()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Splits: 125\n"
- ]
- }
- ],
- "source": [
- "from langchain_text_splitters import (\n",
- " Language,\n",
- " RecursiveCharacterTextSplitter,\n",
- ")\n",
- "\n",
- "# 2. Split\n",
- "text_splitter = RecursiveCharacterTextSplitter.from_language(\n",
- " chunk_size=1000, chunk_overlap=100, language=Language.HTML\n",
- ")\n",
- "splits = text_splitter.split_documents(docs)\n",
- "print(\"Splits:\", len(splits))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "FAFLy4Etaim3"
+ },
+ "source": [
+ "\n",
+ "\n",
+ "from https://sreent.medium.com/llms-embeddings-and-vector-search-d4bd9362df56"
+ ]
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "125\n"
- ]
- }
- ],
- "source": [
- "from langchain_chroma import Chroma\n",
- "\n",
- "vectorstore = Chroma(\n",
- " persist_directory=\"./chroma_db\",\n",
- " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- ")\n",
- "retriever = vectorstore.as_retriever()\n",
- "\n",
- "\n",
- "unique_splits = [\n",
- " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
- "]\n",
- "print(len(unique_splits))\n",
- "\n",
- "# 3. Embed & indexing\n",
- "if len(unique_splits) > 0:\n",
- " vectorstore = Chroma.from_documents(\n",
- " ids=[split.page_content for split in unique_splits],\n",
- " persist_directory=\"./chroma_db\",\n",
- " documents=unique_splits,\n",
- " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "uPHGD_OBaim4"
+ },
+ "outputs": [],
+ "source": [
+ "# Use the %pip magic so packages are installed into the environment of the running kernel.\n",
+ "%pip install -qU langchain-chroma markdownify langchain-upstage rank_bm25 python-dotenv langchain"
+ ]
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0\n"
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "KyP3R-L1aim4"
+ },
+ "outputs": [],
+ "source": [
+ "# @title set API key\n",
+ "from pprint import pprint\n",
+ "import os\n",
+ "\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "if \"google.colab\" in str(get_ipython()):\n",
+ " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n",
+ " from google.colab import userdata\n",
+ "\n",
+ " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n",
+ "else:\n",
+ " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n",
+ " from dotenv import load_dotenv\n",
+ "\n",
+ " load_dotenv()\n",
+ "\n",
+ "# An empty value (e.g. `UPSTAGE_API_KEY=` in .env) is as unusable as a missing one, so require a non-empty key.\n",
+ "assert os.environ.get(\n",
+ "    \"UPSTAGE_API_KEY\"\n",
+ "), \"Please set the UPSTAGE_API_KEY environment variable\""
]
- }
- ],
- "source": [
- "from langchain_chroma import Chroma\n",
- "\n",
- "vectorstore = Chroma(\n",
- " persist_directory=\"./chroma_db\",\n",
- " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- ")\n",
- "retriever = vectorstore.as_retriever()\n",
- "\n",
- "unique_splits = [\n",
- " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
- "]\n",
- "print(len(unique_splits))\n",
- "\n",
- "# 3. Embed & indexing\n",
- "if len(unique_splits) > 0:\n",
- " vectorstore = Chroma.from_documents(\n",
- " ids=[split.page_content for split in unique_splits],\n",
- " persist_directory=\"./chroma_db\",\n",
- " documents=unique_splits,\n",
- " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "
introduced bugs immediately. Several bug-finding techni-
ques c\n"
- ]
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "0DW5Q0Wkaim4",
+ "outputId": "df098096-582d-4f7e-a90c-affd61a68a77",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[Document(metadata={}, page_content='Korea is a beautiful country to visit in the spring.'), Document(metadata={}, page_content='The best time to visit Korea is in the fall.'), Document(metadata={}, page_content='Best way to find bug is using unit test.'), Document(metadata={}, page_content='Python is a great programming language for beginners.'), Document(metadata={}, page_content='Sung Kim is a great teacher.')]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "from langchain_upstage import UpstageEmbeddings\n",
+ "from langchain.docstore.document import Document\n",
+ "\n",
+ "from langchain_text_splitters import (\n",
+ " Language,\n",
+ " RecursiveCharacterTextSplitter,\n",
+ ")\n",
+ "\n",
+ "sample_text = [\n",
+ " \"Korea is a beautiful country to visit in the spring.\",\n",
+ " \"The best time to visit Korea is in the fall.\",\n",
+ " \"Best way to find bug is using unit test.\",\n",
+ " \"Python is a great programming language for beginners.\",\n",
+ " \"Sung Kim is a great teacher.\",\n",
+ "]\n",
+ "\n",
+ "splits = RecursiveCharacterTextSplitter().create_documents(sample_text)\n",
+ "\n",
+ "print(splits)\n",
+ "\n",
+ "vectorstore = Chroma.from_documents(\n",
+ " documents=splits,\n",
+ " ids=[doc.page_content for doc in splits],\n",
+ " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "N_k9hEaUaim5"
+ },
+ "outputs": [],
+ "source": [
+ "def is_in_vectorstore(vectorstore, text):\n",
+ "    \"\"\"Return True if `text` is already stored as a document id in `vectorstore`.\n",
+ "\n",
+ "    The lookup is by id via `vectorstore.get(ids=[text])`; a missing or empty result yields False.\n",
+ "    \"\"\"\n",
+ "    search_results = vectorstore.get(ids=[text])\n",
+ "    return bool(search_results and search_results[\"ids\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "9FlsMkYkaim5",
+ "outputId": "8290c2c9-177c-4166-d7f3-368807ea2928",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ],
+ "source": [
+ "is_in_vectorstore(vectorstore, \"Hello, new sentence\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "wxndeR9Naim6",
+ "outputId": "92e464f3-bee5-4048-96be-b6247c99cc79",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "is_in_vectorstore(vectorstore, splits[0].page_content)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "bmaZFuDWaim6"
+ },
+ "outputs": [],
+ "source": [
+ "from langchain_upstage import UpstageDocumentParseLoader\n",
+ "\n",
+ "layzer = UpstageDocumentParseLoader(\"pdfs/kim-tse-2008.pdf\", output_format=\"html\")\n",
+ "# For improved memory efficiency, consider using the lazy_load method to load documents page by page.\n",
+ "docs = layzer.load() # or layzer.lazy_load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "A7KuZZqHaim6",
+ "outputId": "5c1bbf0a-1844-4ede-e682-eed0f4a3f2b2",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Splits: 132\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_text_splitters import (\n",
+ " Language,\n",
+ " RecursiveCharacterTextSplitter,\n",
+ ")\n",
+ "\n",
+ "# 2. Split\n",
+ "text_splitter = RecursiveCharacterTextSplitter.from_language(\n",
+ " chunk_size=1000, chunk_overlap=100, language=Language.HTML\n",
+ ")\n",
+ "splits = text_splitter.split_documents(docs)\n",
+ "print(\"Splits:\", len(splits))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "from langchain_upstage import UpstageEmbeddings\n",
+ "\n",
+ "vectorstore = Chroma(\n",
+ " persist_directory=\"./chroma_db\",\n",
+ " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ ")\n",
+ "retriever = vectorstore.as_retriever()\n",
+ "\n",
+ "unique_splits = [\n",
+ " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
+ "]\n",
+ "print(len(unique_splits))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lEqnFToCkZfH",
+ "outputId": "a22c8c5e-7d16-4c5e-ca9f-cb739dd42847"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "132\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "X1SWtlmkaim6",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "06f4520e-ba67-4a13-a6d6-6222f67535a8"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "132\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "from langchain_upstage import UpstageEmbeddings\n",
+ "from langchain.docstore.document import Document\n",
+ "\n",
+ "def simplify_metadata(metadata):\n",
+ "    \"\"\"Return a copy of `metadata` in which every value is a str, int, float or bool.\n",
+ "\n",
+ "    Values of any other type are replaced by their str() representation; the input dict is not modified.\n",
+ "    \"\"\"\n",
+ "    return {\n",
+ "        key: value if isinstance(value, (str, int, float, bool)) else str(value)\n",
+ "        for key, value in metadata.items()\n",
+ "    }\n",
+ "\n",
+ "# Ids are the page_content itself and Chroma requires the ids within one batch to be distinct,\n",
+ "# so keep a single Document per distinct text (with metadata simplified for storage).\n",
+ "unique_splits = list(\n",
+ "    {\n",
+ "        split.page_content: Document(\n",
+ "            page_content=split.page_content,\n",
+ "            metadata=simplify_metadata(split.metadata),\n",
+ "        )\n",
+ "        for split in unique_splits\n",
+ "    }.values()\n",
+ ")\n",
+ "print(len(unique_splits))\n",
+ "\n",
+ "# 3. Embed & indexing\n",
+ "if unique_splits:\n",
+ "    vectorstore = Chroma.from_documents(\n",
+ "        ids=[split.page_content for split in unique_splits],\n",
+ "        persist_directory=\"./chroma_db\",\n",
+ "        documents=unique_splits,\n",
+ "        embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ "    )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "id": "3kxYmiUcaim7",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "a38c52fb-555c-4211-ffee-bee3e2c9d663"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "\n",
+ "vectorstore = Chroma(\n",
+ " persist_directory=\"./chroma_db\",\n",
+ " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ ")\n",
+ "retriever = vectorstore.as_retriever()\n",
+ "\n",
+ "unique_splits = [\n",
+ " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
+ "]\n",
+ "print(len(unique_splits))\n",
+ "\n",
+ "# 3. Embed & indexing\n",
+ "# Nothing is left to index when the store already holds every split (the expected case on a re-run).\n",
+ "if len(unique_splits) > 0:\n",
+ "    # These splits come straight from `splits`, so their metadata still needs the same\n",
+ "    # simplification that is applied before the first indexing pass.\n",
+ "    vectorstore = Chroma.from_documents(\n",
+ "        ids=[split.page_content for split in unique_splits],\n",
+ "        persist_directory=\"./chroma_db\",\n",
+ "        documents=[\n",
+ "            Document(page_content=split.page_content, metadata=simplify_metadata(split.metadata))\n",
+ "            for split in unique_splits\n",
+ "        ],\n",
+ "        embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ "    )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "id": "z5iN091Eaim7",
+ "outputId": "341a58d6-fdd4-4b72-8924-19fa6e74e5ed",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "
introduced bugs immediately. Several bug\n" + ] + } + ], + "source": [ + "search_result = retriever.invoke(\"How to find problems in code?\")\n", + "print(search_result[0].page_content[:100])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "colab": { + "provenance": [], + "include_colab_link": true } - ], - "source": [ - "search_result = retriever.invoke(\"How to find problems in code?\")\n", - "print(search_result[0].page_content[:100])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat": 4, + "nbformat_minor": 0 }