diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 3ab9a4d7d..000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: ci -on: - push: - branches: - - master - - main -permissions: - contents: write -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: 3.x - - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - - uses: actions/cache@v3 - with: - key: mkdocs-material-${{ env.cache_id }} - path: .cache - restore-keys: | - mkdocs-material- - - run: pip install mkdocs-material mkdocstrings==0.27.0 pillow cairosvg mknotebooks - - run: mkdocs gh-deploy --force diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..f1e99bd82 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,31 @@ +name: Documentation +on: + push: + branches: + - master + - main + +permissions: + contents: read + pages: write + id-token: write + +jobs: + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - uses: actions/configure-pages@v5 + - uses: actions/checkout@v5 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - run: pip install zensical + - run: zensical build --clean + - uses: actions/upload-pages-artifact@v4 + with: + path: site + - uses: actions/deploy-pages@v4 + id: deployment \ No newline at end of file diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index ca1872fce..c45518108 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -1,39 +1,48 @@ -# This workflow will upload a Python Package using Twine when a release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -# This workflow 
uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -name: Upload Python Package +name: Publish to PyPI on: + release: + types: [published] workflow_dispatch: - push: - # Pattern matched against refs/tags - tags: - - 'v*' # Push events to every version tag jobs: - deploy: - + build: runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install poetry + poetry install + + - name: Build package + run: poetry build + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + needs: build + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write # Required for trusted publishing steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.10.x' - - name: Install dependencies - run: | - python -m pip install poetry - poetry install - - name: Build package - run: poetry build - - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file diff --git a/.github/workflows/type-checkers.yml b/.github/workflows/type-checkers.yml index 3ca51ccf2..35e0b4197 100644 --- a/.github/workflows/type-checkers.yml +++ b/.github/workflows/type-checkers.yml @@ -28,7 +28,7 @@ jobs: - name: mypy run: | - poetry run mypy fastembed \ + poetry run mypy fastembed_bio \ --disallow-incomplete-defs \ --disallow-untyped-defs \ 
--disable-error-code=import-untyped diff --git a/.gitignore b/.gitignore index 25085a9fe..162fdd95f 100644 --- a/.gitignore +++ b/.gitignore @@ -140,3 +140,4 @@ docs/experimental/*.parquet docs/experimental/*.bin qdrant_storage/* experiments/models/* +converted_models \ No newline at end of file diff --git a/README.md b/README.md index d4c882615..f5f3815b1 100644 --- a/README.md +++ b/README.md @@ -1,281 +1,91 @@ -# โšก๏ธ What is FastEmbed? +# fastembed-bio -FastEmbed is a lightweight, fast, Python library built for embedding generation. We [support popular text models](https://qdrant.github.io/fastembed/examples/Supported_Models/). Please [open a GitHub issue](https://github.com/qdrant/fastembed/issues/new) if you want us to add a new model. +Fast, lightweight biological sequence embeddings using ONNX. Built on [FastEmbed](https://github.com/qdrant/fastembed). -The default text embedding (`TextEmbedding`) model is Flag Embedding, presented in the [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard. It supports "query" and "passage" prefixes for the input text. Here is an example for [Retrieval Embedding Generation](https://qdrant.github.io/fastembed/qdrant/Retrieval_with_FastEmbed/) and how to use [FastEmbed with Qdrant](https://qdrant.github.io/fastembed/qdrant/Usage_With_Qdrant/). +## Why fastembed-bio? -## ๐Ÿ“ˆ Why FastEmbed? +1. **Light**: No GPU required. No PyTorch. Just ONNX Runtime. Perfect for serverless and resource-constrained environments. -1. Light: FastEmbed is a lightweight library with few external dependencies. We don't require a GPU and don't download GBs of PyTorch dependencies, and instead use the ONNX Runtime. This makes it a great candidate for serverless runtimes like AWS Lambda. +2. **Fast**: ONNX Runtime is faster than PyTorch inference. Batch processing and parallelism built-in. -2. Fast: FastEmbed is designed for speed. We use the ONNX Runtime, which is faster than PyTorch. 
We also use data parallelism for encoding large datasets. +3. **Simple**: Same interface patterns as FastEmbed. If you've used FastEmbed for text, you already know how to use this. -3. Accurate: FastEmbed is better than OpenAI Ada-002. We also [support](https://qdrant.github.io/fastembed/examples/Supported_Models/) an ever-expanding set of models, including a few multilingual models. - -## ๐Ÿš€ Installation - -To install the FastEmbed library, pip works best. You can install it with or without GPU support: +## Installation ```bash -pip install fastembed - -# or with GPU support - -pip install fastembed-gpu +pip install fastembed-bio ``` -## ๐Ÿ“– Quickstart +## Quickstart ```python -from fastembed import TextEmbedding - +from fastembed.bio import ProteinEmbedding -# Example list of documents -documents: list[str] = [ - "This is built to be faster and lighter than other embedding libraries e.g. Transformers, Sentence-Transformers, etc.", - "fastembed is supported by and maintained by Qdrant.", +sequences = [ + "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG", + "GKGDPKKPRGKMSSYAFFVQTSREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFEDMAK", ] -# This will trigger the model download and initialization -embedding_model = TextEmbedding() -print("The model BAAI/bge-small-en-v1.5 is ready to use.") - -embeddings_generator = embedding_model.embed(documents) # reminder this is a generator -embeddings_list = list(embedding_model.embed(documents)) - # you can also convert the generator to a list, and that to a numpy array -len(embeddings_list[0]) # Vector of 384 dimensions -``` - -Fastembed supports a variety of models for different tasks and modalities. 
-The list of all the available models can be found [here](https://qdrant.github.io/fastembed/examples/Supported_Models/) -### ๐ŸŽ’ Dense text embeddings - -```python -from fastembed import TextEmbedding - -model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5") -embeddings = list(model.embed(documents)) +model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D") +embeddings = list(model.embed(sequences)) # [ -# array([-0.1115, 0.0097, 0.0052, 0.0195, ...], dtype=float32), -# array([-0.1019, 0.0635, -0.0332, 0.0522, ...], dtype=float32) +# array([-0.0055, -0.0144, 0.0355, -0.0049, ...], dtype=float32), +# array([ 0.0114, 0.0020, -0.0247, 0.0060, ...], dtype=float32) # ] - -``` - -Dense text embedding can also be extended with models which are not in the list of supported models. - -```python -from fastembed import TextEmbedding -from fastembed.common.model_description import PoolingType, ModelSource - -TextEmbedding.add_custom_model( - model="intfloat/multilingual-e5-small", - pooling=PoolingType.MEAN, - normalization=True, - sources=ModelSource(hf="intfloat/multilingual-e5-small"), # can be used with an `url` to load files from a private storage - dim=384, - model_file="onnx/model.onnx", # can be used to load an already supported model with another optimization or quantization, e.g. onnx/model_O4.onnx -) -model = TextEmbedding(model_name="intfloat/multilingual-e5-small") -embeddings = list(model.embed(documents)) ``` +## Supported Models -### ๐Ÿ”ฑ Sparse text embeddings +### Protein Embeddings -* SPLADE++ +| Model | Parameters | Dimensions | Description | +|-------|------------|------------|-------------| +| `facebook/esm2_t12_35M_UR50D` | 35M | 480 | ESM-2 protein language model | ```python -from fastembed import SparseTextEmbedding - -model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1") -embeddings = list(model.embed(documents)) +from fastembed.bio import ProteinEmbedding -# [ -# SparseEmbedding(indices=[ 17, 123, 919, ... 
], values=[0.71, 0.22, 0.39, ...]), -# SparseEmbedding(indices=[ 38, 12, 91, ... ], values=[0.11, 0.22, 0.39, ...]) -# ] +model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D") +embeddings = list(model.embed(["MKTVRQERLKS", "GKGDPKKPRGK"])) ``` - - -### ๐Ÿฆฅ Late interaction models (aka ColBERT) - - -```python -from fastembed import LateInteractionTextEmbedding - -model = LateInteractionTextEmbedding(model_name="colbert-ir/colbertv2.0") -embeddings = list(model.embed(documents)) - -# [ -# array([ -# [-0.1115, 0.0097, 0.0052, 0.0195, ...], -# [-0.1019, 0.0635, -0.0332, 0.0522, ...], -# ]), -# array([ -# [-0.9019, 0.0335, -0.0032, 0.0991, ...], -# [-0.2115, 0.8097, 0.1052, 0.0195, ...], -# ]), -# ] -``` +RNA foundation models for RNA sequence embeddings. -### ๐Ÿ–ผ๏ธ Image embeddings +## GPU Support ```python -from fastembed import ImageEmbedding - -images = [ - "./path/to/image1.jpg", - "./path/to/image2.jpg", -] - -model = ImageEmbedding(model_name="Qdrant/clip-ViT-B-32-vision") -embeddings = list(model.embed(images)) - -# [ -# array([-0.1115, 0.0097, 0.0052, 0.0195, ...], dtype=float32), -# array([-0.1019, 0.0635, -0.0332, 0.0522, ...], dtype=float32) -# ] -``` - -### Late interaction multimodal models (ColPali) +from fastembed.bio import ProteinEmbedding -```python -from fastembed import LateInteractionMultimodalEmbedding - -doc_images = [ - "./path/to/qdrant_pdf_doc_1_screenshot.jpg", - "./path/to/colpali_pdf_doc_2_screenshot.jpg", -] - -query = "What is Qdrant?" 
- -model = LateInteractionMultimodalEmbedding(model_name="Qdrant/colpali-v1.3-fp16") -doc_images_embeddings = list(model.embed_image(doc_images)) -# shape (2, 1030, 128) -# [array([[-0.03353882, -0.02090454, ..., -0.15576172, -0.07678223]], dtype=float32)] -query_embedding = model.embed_text(query) -# shape (1, 20, 128) -# [array([[-0.00218201, 0.14758301, ..., -0.02207947, 0.16833496]], dtype=float32)] -``` - -### ๐Ÿ”„ Rerankers -```python -from fastembed.rerank.cross_encoder import TextCrossEncoder - -query = "Who is maintaining Qdrant?" -documents: list[str] = [ - "This is built to be faster and lighter than other embedding libraries e.g. Transformers, Sentence-Transformers, etc.", - "fastembed is supported by and maintained by Qdrant.", -] -encoder = TextCrossEncoder(model_name="Xenova/ms-marco-MiniLM-L-6-v2") -scores = list(encoder.rerank(query, documents)) - -# [-11.48061752319336, 5.472434997558594] -``` - -Text cross encoders can also be extended with models which are not in the list of supported models. - -```python -from fastembed.rerank.cross_encoder import TextCrossEncoder -from fastembed.common.model_description import ModelSource - -TextCrossEncoder.add_custom_model( - model="Xenova/ms-marco-MiniLM-L-4-v2", - model_file="onnx/model.onnx", - sources=ModelSource(hf="Xenova/ms-marco-MiniLM-L-4-v2"), -) -model = TextCrossEncoder(model_name="Xenova/ms-marco-MiniLM-L-4-v2") -scores = list(model.rerank_pairs( - [("What is AI?", "Artificial intelligence is ..."), ("What is ML?", "Machine learning is ..."),] -)) -``` - -## โšก๏ธ FastEmbed on a GPU - -FastEmbed supports running on GPU devices. -It requires installation of the `fastembed-gpu` package. - -```bash -pip install fastembed-gpu -``` - -Check our [example](https://qdrant.github.io/fastembed/examples/FastEmbed_GPU/) for detailed instructions, CUDA 12.x support and troubleshooting of the common issues. 
- -```python -from fastembed import TextEmbedding - -embedding_model = TextEmbedding( - model_name="BAAI/bge-small-en-v1.5", +model = ProteinEmbedding( + "facebook/esm2_t12_35M_UR50D", providers=["CUDAExecutionProvider"] ) -print("The model BAAI/bge-small-en-v1.5 is ready to use on a GPU.") - -``` - -## Usage with Qdrant - -Installation with Qdrant Client in Python: - -```bash -pip install qdrant-client[fastembed] ``` -or +Requires `onnxruntime-gpu` instead of `onnxruntime`. -```bash -pip install qdrant-client[fastembed-gpu] -``` +## Relationship to FastEmbed -You might have to use quotes ```pip install 'qdrant-client[fastembed]'``` on zsh. +This project is a community-driven fork of [FastEmbed](https://github.com/qdrant/fastembed) focused on biological sequence embeddings. It uses the same core infrastructure (ONNX models, model management, etc.) but is specialized for proteins, DNA, and RNA. -```python -from qdrant_client import QdrantClient, models +The goal is to make biological embeddings as accessible and efficient as text embeddings. -# Initialize the client -client = QdrantClient("localhost", port=6333) # For production -# client = QdrantClient(":memory:") # For experimentation +## Contributing -model_name = "sentence-transformers/all-MiniLM-L6-v2" -payload = [ - {"document": "Qdrant has Langchain integrations", "source": "Langchain-docs", }, - {"document": "Qdrant also has Llama Index integrations", "source": "LlamaIndex-docs"}, -] -docs = [models.Document(text=data["document"], model=model_name) for data in payload] -ids = [42, 2] +Contributions welcome! 
Areas of interest: -client.create_collection( - "demo_collection", - vectors_config=models.VectorParams( - size=client.get_embedding_size(model_name), distance=models.Distance.COSINE) -) +- Additional ESM-2 model sizes +- DNABert and other DNA models +- RNA foundation models +- Performance optimizations -client.upload_collection( - collection_name="demo_collection", - vectors=docs, - ids=ids, - payload=payload, -) +## License -search_result = client.query_points( - collection_name="demo_collection", - query=models.Document(text="This is a query document", model=model_name) -).points -print(search_result) -``` +Apache 2.0 \ No newline at end of file diff --git a/docs/Getting Started.ipynb b/docs/Getting Started.ipynb deleted file mode 100644 index 054d5cdca..000000000 --- a/docs/Getting Started.ipynb +++ /dev/null @@ -1,242 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "3f159fb4", - "metadata": {}, - "source": [ - "# ๐Ÿšถ๐Ÿปโ€โ™‚๏ธ Getting Started\n", - "\n", - "Here you will learn how to use the fastembed package to embed your data into a vector space. The package is designed to be easy to use and fast. It is built on top of the [ONNX](https://onnx.ai/) standard, which allows for fast inference on a variety of hardware (called Runtimes in ONNX). \n", - "\n", - "## Quick Start\n", - "\n", - "The fastembed package is designed to be easy to use. We'll be using `TextEmbedding` class. 
It takes a list of strings as input and returns a generator of vectors.\n", - "\n", - "> ๐Ÿ’ก You can learn more about generators from [Python Wiki](https://wiki.python.org/moin/Generators)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "ada95c6a", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -Uqq fastembed" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b61c6552", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "890cc3b969354eec8d149d143e301a7a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 9 files: 0%| | 0/9 [00:00 ๐Ÿ’ก **Why do we use generators?**\n", - "> \n", - "> We use them to save memory mostly. Instead of loading all the vectors into memory, we can load them one by one. This is useful when you have a large dataset and you don't want to load all the vectors at once." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "8a225cb8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Document: This is built to be faster and lighter than other embedding libraries e.g. 
Transformers, Sentence-Transformers, etc.\n", - "Vector of type: with shape: (384,)\n", - "Document: fastembed is supported by and maintained by Qdrant.\n", - "Vector of type: with shape: (384,)\n" - ] - } - ], - "source": [ - "embeddings_generator = embedding_model.embed(documents)\n", - "\n", - "for doc, vector in zip(documents, embeddings_generator):\n", - " print(\"Document:\", doc)\n", - " print(f\"Vector of type: {type(vector)} with shape: {vector.shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "769a1be9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2, 384)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeddings_list = np.array(list(embedding_model.embed(documents)))\n", - "embeddings_list.shape" - ] - }, - { - "cell_type": "markdown", - "id": "8c49ae50", - "metadata": {}, - "source": [ - "We're using [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) a state of the art Flag Embedding model. The model does better than OpenAI text-embedding-ada-002. We've made it even faster by converting it to ONNX format and quantizing the model for you.\n", - "\n", - "#### Format of the Document List\n", - "\n", - "1. List of Strings: Your documents must be in a list, and each document must be a string\n", - "2. For Retrieval Tasks with our default: If you're working with queries and passages, you can add special labels to them:\n", - "- **Queries**: Add \"query:\" at the beginning of each query string\n", - "- **Passages**: Add \"passage:\" at the beginning of each passage string\n", - "\n", - "## Beyond the default model\n", - "\n", - "The default model is built for speed and efficiency. If you need a more accurate model, you can use the `TextEmbedding` class to load any model from our list of available models. You can find the list of available models using `TextEmbedding.list_supported_models()`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2e9c8766", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9470ec542f3c4400a42452c2489a1abc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 8 files: 0%| | 0/8 [00:00 list[int]:\n", - " \"\"\"\n", - " Compute relevance scores for top-k documents given a query.\n", - "\n", - " :param query_embedding: Numpy array representing the query embedding, shape: [num_query_terms, embedding_dim]\n", - " :param document_embeddings: Numpy array representing embeddings for documents, shape: [num_documents, max_doc_length, embedding_dim]\n", - " :param k: Number of top documents to return\n", - " :return: Indices of the top-k documents based on their relevance scores\n", - " \"\"\"\n", - " # Compute batch dot-product of query_embedding and document_embeddings\n", - " # Resulting shape: [num_documents, num_query_terms, max_doc_length]\n", - " scores = np.matmul(query_embedding, document_embeddings.transpose(0, 2, 1))\n", - "\n", - " # Apply max-pooling across document terms (axis=2) to find the max similarity per query term\n", - " # Shape after max-pool: [num_documents, num_query_terms]\n", - " max_scores_per_query_term = np.max(scores, axis=2)\n", - "\n", - " # Sum the scores across query terms to get the total score for each document\n", - " # Shape after sum: [num_documents]\n", - " total_scores = np.sum(max_scores_per_query_term, axis=1)\n", - "\n", - " # Sort the documents based on their total scores and get the indices of the top-k documents\n", - " sorted_indices = np.argsort(total_scores)[::-1][:k]\n", - "\n", - " return sorted_indices" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "c61d07bed7b60e35", - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-03T17:39:37.053383Z", - "start_time": "2024-06-03T17:39:37.050926Z" - }, - "collapsed": false - }, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "Sorted document indices: [0 1]\n" - ] - } - ], - "source": [ - "sorted_indices = compute_relevance_scores(\n", - " np.array(query_embeddings[0]), np.array(document_embeddings), k=3\n", - ")\n", - "print(\"Sorted document indices:\", sorted_indices)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "b24df2569970d9e8", - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-03T17:40:52.276846Z", - "start_time": "2024-06-03T17:40:52.273789Z" - }, - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Query: Are there any other late interaction text embedding models except ColBERT?\n", - "Document: ColBERT is a late interaction text embedding model, however, there are also other models such as TwinBERT.\n", - "Document: On the contrary to the late interaction models, the early interaction models contains interaction steps at embedding generation process\n" - ] - } - ], - "source": [ - "print(f\"Query: {queries[0]}\")\n", - "for index in sorted_indices:\n", - " print(f\"Document: {documents[index]}\")" - ] - }, - { - "cell_type": "markdown", - "id": "6de537c37aff3927", - "metadata": { - "collapsed": false - }, - "source": [ - "## Use-case recommendation" - ] - }, - { - "cell_type": "markdown", - "id": "37e3525d3259cd2b", - "metadata": { - "collapsed": false - }, - "source": [ - "Despite ColBERT allows to compute embeddings independently and spare some workload offline, it still computes more resources than no interaction models. Due to this, it might be more reasonable to use ColBERT not as a first-stage retriever, but as a re-ranker.\n", - "\n", - "The first-stage retriever would then be a no-interaction model, which e.g. retrieves first 100 or 500 examples, and leave the final ranking to the ColBERT model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfa922793454b4ad", - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/examples/FastEmbed_GPU.ipynb b/docs/examples/FastEmbed_GPU.ipynb deleted file mode 100644 index 80e33d110..000000000 --- a/docs/examples/FastEmbed_GPU.ipynb +++ /dev/null @@ -1,540 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "ntGNDuSCeAR2" - }, - "source": [ - "# FastEmbed on GPU\n", - "\n", - "As of version 0.2.7 FastEmbed supports GPU acceleration.\n", - "\n", - "This notebook covers the installation process and usage of fastembed on GPU.\n", - "\n", - "## Installation\n", - "\n", - "Fastembed depends on `onnxruntime` and inherits its scheme of GPU support.\n", - "\n", - "In order to use GPU with onnx models, you would need to have `onnxruntime-gpu` package, which substitutes all the `onnxruntime` functionality.\n", - "Fastembed mimics this behavior and requires `fastembed-gpu` package to be installed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GK2XADwUeEK7" - }, - "outputs": [], - "source": [ - "!pip install fastembed-gpu" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3aiGPqjCeGzo" - }, - "source": [ - "**NOTE**: `onnxruntime-gpu` and `onnxruntime` can't be installed in the same environment. If you have `onnxruntime` installed, you would need to uninstall it before installing `onnxruntime-gpu`. Same is true for `fastembed` and `fastembed-gpu`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3xx3r-9jgAMi" - }, - "source": [ - "### CUDA 12.x support\n", - "You can check your CUDA version using such commands as `nvidia-smi` or `nvcc --version`\n", - "\n", - "Starting from version 1.19.0, onnxruntime-gpu ships with support for CUDA 12.x by default.\n", - "\n", - "Google Colab notebooks have by default CUDA 12.x and CuDNN 8.x.\n", - "\n", - "Latest version of `onnxruntime-gpu` requires CuDNN 9.x, in order to install it you can run the following command: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!sudo apt install cudnn9\n", - "!pip install fastembed-gpu -qqq" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If it necessary to work with CuDNN 8, you can consider locking `onnxruntime-gpu` to 1.18.0 with CUDA 12.x by this command:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install onnxruntime-gpu==1.18.0 -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ -qq\n", - "!pip install fastembed-gpu -qqq" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CUDA 11.x support\n", - "To use latest version of `onnxruntime-gpu` with CUDA 11.x, you can run the following command:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install onnxruntime-gpu -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-11/pypi/simple/ -qq" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**NOTE**: Ensure that CuDNN 9.x is installed when working with the latest `onnxruntime-gpu`, whether using CUDA 11.x or 12.x." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Igv5RXhSeO68" - }, - "source": [ - "### CUDA drivers\n", - "\n", - "FastEmbed does not include CUDA drivers and CuDNN libraries.\n", - "You would need to take care of the environment setup on your own.\n", - "The dependencies required for the chosen onnxruntime version are listed in the [CUDA Execution Provider requirements](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setting up fastembed-gpu on GCP\n", - "\n", - "#### CUDA drivers\n", - "[CUDA 11.8 toolkit](https://developer.nvidia.com/cuda-11-8-0-download-archive) or [CUDA 12.x toolkit](https://developer.nvidia.com/cuda-downloads) has to be installed if they haven't yet been set up.\n", - "\n", - "#### Example of setting up CUDA 12.x on Ubuntu 22.04\n", - "Make sure to download an archive which has been created for your particular platform, CPU architecture and OS distribution.\n", - "\n", - "For Ubuntu 22.04 with x86_64 CPU architecture the following [archive](https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network) has to be downloaded.\n", - "\n", - "```bash\n", - "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb\n", - "sudo dpkg -i cuda-keyring_1.1-1_all.deb\n", - "sudo apt-get update\n", - "sudo apt-get -y install cuda\n", - "```\n", - "**NOTE**: Specific CUDA libraries can be found in the [meta packages section](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#meta-packages) in the CUDA installation guide.\n", - "\n", - "**NOTE**: When installing CUDA, the environment variable might not be set by default. 
Make sure to add the following line to your environment variables:\n", - "```bash\n", - "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH\n", - "```\n", - "This will ensure that the CUDA libraries are properly linked.\n", - "\n", - "#### CuDNN 9.x\n", - "CuDNN 9.x library can be installed via the following [archive](https://developer.nvidia.com/rdp/cudnn-archive).\n", - "\n", - "#### Example of setting up CuDNN 9.x on Ubuntu 22.04\n", - "CuDNN 9.x for Ubuntu 22.04 x86_64 [archive](https://developer.nvidia.com/cudnn-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network) can be downloaded and installed in the following way:\n", - "```bash\n", - "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb\n", - "sudo dpkg -i cuda-keyring_1.1-1_all.deb\n", - "sudo apt-get update\n", - "sudo apt-get -y install cudnn\n", - "```\n", - "**NOTE**: When installing CuDNN, you can choose specific version, cudnn-cuda-11 or cudnn-cuda-12" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Common issues\n", - "\n", - "The following are some common issues that may arise while using `fastembed-gpu` if not installed properly:\n", - "\n", - "CUDA library is not installed:\n", - "```bash\n", - "FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcublasLt.so.x: cannot open shared object file: No such file or directory\n", - "```\n", - "\n", - "\n", - "CuDNN library is not installed:\n", - "```bash\n", - "FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn.so.x: cannot open shared object file: No such file or directory\n", - "```\n", - "\n", - "\n", - "CUDA library path is not set:\n", - "```bash\n", - "FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcufft.so.x: failed to map segment from shared object\n", - "```\n", - "\n", - "Make sure to add the following line to 
your environment variables:\n", - "```bash\n", - "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Usage" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 334, - "referenced_widgets": [ - "aacf08a7aa444b64a2efad1967d28a53", - "5606aa785de74d65a9928b31c0be8a53", - "d4ec9d3b74ec4412894da2161ed2bddf", - "8edd544c3e074ec1813e5b9d1aef43d9", - "9898890f8a75468ea20e3ce319d0b6e2", - "da3b18abb16241a0a7191ee9afcb0510", - "258a619168824253a6a329efdc51ebe6", - "53c7cdc967d24faba0b5c659c94c50b8", - "e9348d8be28d408e8e760c71b21ab294", - "0ba06e0816714f2fbdec8260f160abc0", - "0b96563334964d449dd34f35b6b3e715", - "11c2eec490e8479b944eec7f30cb1ca2", - "91463da0d1c5466795e06ab586002259", - "30f4f7833406474f89ef0700b00a33aa", - "4302c304ec6a4b5985797e300bd7e353", - "2605640c7b824ed7aa137d404e14b774", - "b02efe3a33d04f06aa8938719ab35671", - "50408e5d052343b1a1b44a0fae0f801d", - "1be01c95d9e84f8ea88367c987a72fdc", - "a109c13bc93a449186424542dc330be8", - "4adce304ce1947b5a01dde10bbb3bb8c", - "a761366a37e44837a25e0f25b18efed2", - "94512b9055e546389471197b76ad5449", - "072dca00bd7b4918a178f90ccabf698a", - "48a856c59ef74cc3834521b1bf616541", - "c020c503aeaa464cad643ade5ee3ae24", - "a4e7e40c0bbd4f878c20a9f65fe3a048", - "e8c0a1c339fd47668d944a9defad79d4", - "cd782d35c6bd40c0a60d57b1828a7251", - "04f638ab08da4d20928644c4ba03f8ef", - "17f20477fc79475f97adf1c1f64a4192", - "96f7b5a2e224462e9fcffd03f906a593", - "755cd32d9fc9407c80a160f45c802d1e", - "a886258e7cd14c048b58391d7b772901", - "bc3e48f826a74840867a6209e622b75e", - "125b2ac0f78043bba7eca53474ca44c4", - "82f186d1ffb4435d94a6c7e9025242ef", - "77000333e5ca4094be291ad82d4a627a", - "7fe64fb53055431488d002c76c8e331e", - "7de59ae9919f4a5bb2b6e601a3c02412", - "97a69423a6644eab87fc636e182f23a4", - "4df936d1065b41f4bf02ed394fdf7b7e", - 
"3918bd1affa3454e8e9044a418a056ea", - "163b27ae0bce41e5b48efcb4b3fd780d", - "94631fd6e0744085bc79c3121de4a9f7", - "31cd98d66bc54418b35e70fbbc0fa3c0", - "6d21627a638b4ddca6fe7bfb80a621b5", - "b37bed9dc4fe45c08b8397288fe5b1a9", - "164fef95d1414177a40d563f5682f6a3", - "1a9a0ea53448413a8e4b360b7bb69e26", - "dd1a4483b4b045c6929e3d2cf1338f63", - "496ddd8e05f949cd8cbba8e677f476ac", - "2813be951d7f48b2aad1dd4a444ce3eb", - "8e9a2c2dd21942edbdfecb3b7dffc70b", - "08a10fe247f1425db044cfc13f2fb384", - "b8786aded92d421592bc7623c5c7899e", - "c91a20a9433d4016ba2db69fa50e0b4d", - "e997820738594c6dadb061908d7afdc1", - "a5fc751f81ae498f9aa55ece0e6853b2", - "2aee4fc8cda64c5eb8722be81e48e0ca", - "3a53e8624dff48b3959875ef58ee99ce", - "50a70044f77542108fe188598e70797e", - "13cf998b35ae4507a63e797f6fa3eada", - "6209eb6a68cf4a378767ef34d0d9216d", - "7395db766b944af9b41d6b56c9ada0b1", - "42122c317ec648688f0164a1adb5df28" - ] - }, - "id": "Ttf4YggPeQQK", - "outputId": "aa75129d-9e2d-4c88-cf03-251dd43a11b1" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", - "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", - "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", - "You will be able to reuse this secret in all of your notebooks.\n", - "Please note that authentication is recommended but still optional to access public models or datasets.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aacf08a7aa444b64a2efad1967d28a53", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 5 files: 0%| | 0/5 [00:00 None:\n", - " self.model = AutoModel.from_pretrained(model_id)\n", - " self.tokenizer = AutoTokenizer.from_pretrained(model_id)\n", - 
"\n", - " def embed(self, texts: list[str]):\n", - " encoded_input = self.tokenizer(\n", - " texts, max_length=512, padding=True, truncation=True, return_tensors=\"pt\"\n", - " )\n", - " model_output = self.model(**encoded_input)\n", - " sentence_embeddings = model_output[0][:, 0]\n", - " sentence_embeddings = F.normalize(sentence_embeddings)\n", - " return sentence_embeddings\n", - "\n", - "\n", - "model_id = \"BAAI/bge-small-en-v1.5\"\n", - "hf = HF(model_id=model_id)\n", - "hf.embed(documents).shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up ⚡️FastEmbed\n", - "\n", - "Sorry, don't have a lot to set up here. We'll be using the default model, which is Flag Embedding, same as the Huggingface model." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:43:35.486719Z", - "start_time": "2024-03-30T00:43:35.416166Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "08da8fd851604028af05b6e83681e904", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 9 files: 0%| | 0/9 [00:00 tuple[float, float, float]:\n", - " times = []\n", - " for _ in range(k):\n", - " # Timing the embed_func call\n", - " start_time = time.time()\n", - " embeddings = embed_func(documents)\n", - " # Force computation if embed_func returns a generator\n", - " if isinstance(embeddings, types.GeneratorType):\n", - " list(embeddings)\n", - "\n", - " end_time = time.time()\n", - " times.append(end_time - start_time)\n", - "\n", - " # Returning mean, max, and min time for the call\n", - " return (sum(times) / k, max(times), min(times))\n", - "\n", - "\n", - "hf_stats = calculate_time_stats(hf.embed, documents, k=100)\n", - "print(f\"Huggingface Transformers (Average, Max, Min): {hf_stats}\")\n", - "fst_stats = calculate_time_stats(embedding_model.embed, documents, k=100)\n", - "print(f\"FastEmbed (Average, Max, Min): {fst_stats}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📈 Results\n", - "\n", - "Let's run the comparison and see the results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:43:35.746781Z", - "start_time": "2024-03-30T00:43:35.698423Z" - } - }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGzCAYAAAAyiiOsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/H5lhTAAAACXBIWXMAAA9hAAAPYQGoP6dpAABq/UlEQVR4nO3dd1QU19sH8C+g9CbSBQFRERR7w14QRDSxJMYSxR4NWBM1JtYYgw2VGGuKmqixxZjYBeyKDcEOViQWwAYroNT7/uHL/BwXdFcXWc33c86ew9z7zJ1nlt3lYebOrI4QQoCIiIiIXkq3tBMgIiIiehewaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaKISpaOjg5CQkNJOgwjAs9fj1KlTSzsNAuDq6op+/fqVdhrvhH79+sHV1bW00yCwaKLXdO3aNXz22WeoVKkSDA0NYW5ujqZNmyI8PBxPnjwp7fTe2J07dzB16lTExcWVdioyU6dOhY6OjvQwNjaGl5cXJk6cCIVCUdrpkQYlJiaif//+cHd3h6GhIezt7dGiRQtMmTKltFN7654+fYr58+ejUaNGsLCwgKGhIapWrYqQkBBcvny5tNOj/5AypZ0AvXu2b9+Ojz/+GAYGBujbty9q1KiBnJwcHD58GGPHjsWFCxewfPny0k7zjdy5cwfTpk2Dq6srateuXdrpKFmyZAlMTU2RkZGBPXv2YMaMGdi7dy+OHDkCHR2d0k6P3tDVq1fRoEEDGBkZYcCAAXB1dcXdu3dx+vRpzJo1C9OmTSvtFN+a+/fvo3379oiJiUHHjh3Rq1cvmJqaIiEhAevWrcPy5cuRk5NT2mmWqJ9++gkFBQWlnQaBRROp6caNG+jRowdcXFywd+9eODg4SH3BwcG4evUqtm/f/lZzyszMhImJyVvd5uvSVK4fffQRrK2tAQBDhw5Ft27dsHnzZhw7dgw+Pj5FrpOVlQVjY+M33jZpxsteC/Pnz0dGRgbi4uLg4uIi60tNTX0b6WmNfv36ITY2Fps2bUK3bt1kfdOnT8c333xTSpmVvMLXSNmyZUs7Ffp/PD1Hapk9ezYyMjLwyy+/yAqmQpUrV8bIkSOV2rds2YIaNWrAwMAA1atXx65du2T9N2/exOeffw4PDw8YGRmhfPny+Pjjj5GYmCiLW7lyJXR0dHDgwAF8/vnnsLW1hZOTk1pjAEBaWhpGjx4NV1dXGBgYwMnJCX379sX9+/exf/9+NGjQAADQv39/6VTYypUrpfWPHz+O9u3bw8LCAsbGxmjZsiWOHDki20bhqbSLFy+iV69eKFeuHJo1awYASE5ORv/+/eHk5AQDAwM4ODjgww8/LDJXVbRp0wbAs6IWAFq1aoUaNWogJiYGLVq0gLGxMb7++msAz/7oDhw4EHZ2djA0NEStWrWwatUqpTELCgoQHh4Ob29vGBoawsbGBu3bt8epU6dkcatXr0a9evVgZGQEKysr9OjRA//++68s5sqVK+jWrRvs7e1haGgIJycn9OjRA+np6VJMREQEmjVrBktLS5iamsLDw0PKuVB2djamTJmCypUrw8DAAM7Ozhg3bhyys7OV4kaPHg0bGxuYmZnhgw8+wK1bt1R6Lvfv3w8dHR2sX78eX3/9Nezt7WFiYoIPPvhAab+AN38tFOXatWtwcnJSKpgAwNbWVqlt586daN68OUxMTGBmZobAwEBcuH
BBKS4+Ph7du3eHjY0NjIyM4OHhoVR0xMbGIiAgAObm5jA1NUXbtm1x7NgxWUzh+/DIkSMYM2YMbGxsYGJigi5duuDevXuyWCEEvvvuOzg5OcHY2BitW7cuMreiHD9+HNu3b8fAgQOVCiYAMDAwwNy5c2Vte/fulZ4LS0tLfPjhh7h06ZIspvD3cfnyZXz66aewsLCAjY0NJk2aBCEE/v33X3z44YcwNzeHvb09wsLCZOur8xo5dOgQPv74Y1SsWFF6zY4ePVppGkO/fv1gamqKa9euoUOHDjAzM0Pv3r2lvhfnNK1btw716tWDmZkZzM3N4e3tjfDwcFnM9evX8fHHH8PKygrGxsZo3Lix0j+1hfuyYcMGzJgxA05OTjA0NETbtm1x9erVYn4z/1080kRq2bp1KypVqoQmTZqovM7hw4exefNmfP755zAzM8MPP/yAbt26ISkpCeXLlwcAnDx5EkePHkWPHj3g5OSExMRELFmyBK1atcLFixeVjpB8/vnnsLGxweTJk5GZmanWGBkZGWjevDkuXbqEAQMGoG7durh//z7++ecf3Lp1C56envj2228xefJkDBkyBM2bNwcAaZ/37t2LgIAA1KtXD1OmTIGuri5WrFiBNm3a4NChQ2jYsKEs148//hhVqlTB999/DyEEAKBbt264cOEChg8fDldXV6SmpiIiIgJJSUmvNeHz2rVrACA9nwDw4MEDBAQEoEePHvj0009hZ2eHJ0+eoFWrVrh69SpCQkLg5uaGjRs3ol+/fkhLS5MVvAMHDsTKlSsREBCAQYMGIS8vD4cOHcKxY8dQv359AMCMGTMwadIkdO/eHYMGDcK9e/ewcOFCtGjRArGxsbC0tEROTg78/f2RnZ2N4cOHw97eHrdv38a2bduQlpYGCwsLXLhwAR07dkTNmjXx7bffwsDAAFevXpUVHwUFBfjggw9w+PBhDBkyBJ6enjh37hzmz5+Py5cvY8uWLVLsoEGDsHr1avTq1QtNmjTB3r17ERgYqNZzOmPGDOjo6GD8+PFITU3FggUL4Ovri7i4OBgZGQHQzGuhKC4uLoiMjMTevXulgrg4v//+O4KCguDv749Zs2YhKysLS5YsQbNmzRAbGyu9ns6ePYvmzZujbNmyGDJkCFxdXXHt2jVs3boVM2bMAABcuHABzZs3h7m5OcaNG4eyZcti2bJlaNWqFQ4cOIBGjRrJtj18+HCUK1cOU6ZMQWJiIhYsWICQkBCsX79eipk8eTK+++47dOjQAR06dMDp06fh5+en0im1f/75BwDQp0+fV8YCQGRkJAICAlCpUiVMnToVT548wcKFC9G0aVOcPn1a6b31ySefwNPTEzNnzsT27dvx3XffwcrKCsuWLUObNm0wa9YsrFmzBl9++SUaNGiAFi1ayNZX5TWyceNGZGVlYdiwYShfvjxOnDiBhQsX4tatW9i4caNsvLy8PPj7+6NZs2aYO3dusUeGIyIi0LNnT7Rt2xazZs0CAFy6dAlHjhyR3sMpKSlo0qQJsrKyMGLECJQvXx6rVq3CBx98gE2bNqFLly6yMWfOnAldXV18+eWXSE9Px+zZs9G7d28cP35cpef+P0MQqSg9PV0AEB9++KHK6wAQ+vr64urVq1LbmTNnBACxcOFCqS0rK0tp3ejoaAFA/Pbbb1LbihUrBADRrFkzkZeXJ4tXdYzJkycLAGLz5s1K8QUFBUIIIU6ePCkAiBUrVij1V6lSRfj7+0uxhdt2c3MT7dq1k9qmTJkiAIiePXvKxnj06JEAIObMmaO0/VcpHDMhIUHcu3dP3LhxQyxbtkwYGBgIOzs7kZmZKYQQomXLlgKAWLp0qWz9BQsWCABi9erVUltOTo7w8fERpqamQqFQCCGE2Lt3rwAgRowYUexzlJiYKPT09MSMGTNk/efOnRNlypSR2mNjYwUAsXHjxmL3a/78+QKAuHfvXrExv//+u9DV1RWHDh2StS9dulQAEEeOHBFCCBEXFy
cAiM8//1wW16tXLwFATJkypdhtCCHEvn37BABRoUIF6fkQQogNGzYIACI8PFx6Ht70tVCc8+fPCyMjIwFA1K5dW4wcOVJs2bJF+v0Wevz4sbC0tBSDBw+WtScnJwsLCwtZe4sWLYSZmZm4efOmLPb53Dt37iz09fXFtWvXpLY7d+4IMzMz0aJFC6mt8H3o6+srW3/06NFCT09PpKWlCSGESE1NFfr6+iIwMFAW9/XXXwsAIigo6KXPQ5cuXQQA8ejRo5fGFapdu7awtbUVDx48kNrOnDkjdHV1Rd++faW2wt/HkCFDpLa8vDzh5OQkdHR0xMyZM6X2R48eCSMjI1muqr5GhCj6cyk0NFTo6OjIfhdBQUECgPjqq6+U4oOCgoSLi4u0PHLkSGFubq70Gfi8UaNGCQCy98vjx4+Fm5ubcHV1Ffn5+bJ98fT0FNnZ2VJseHi4ACDOnTtX7Db+i3h6jlRWeHWWmZmZWuv5+vrC3d1dWq5ZsybMzc1x/fp1qa3wvzIAyM3NxYMHD1C5cmVYWlri9OnTSmMOHjwYenp6sjZVx/jzzz9Rq1Ytpf+0ALxyEnVcXByuXLmCXr164cGDB7h//z7u37+PzMxMtG3bFgcPHlSasDl06FClPPX19bF//348evTopdsrjoeHB2xsbODm5obPPvsMlStXxvbt22X/mRoYGKB///6y9Xbs2AF7e3v07NlTaitbtixGjBiBjIwMHDhwAMCz50hHR6fIK7UKn6PNmzejoKAA3bt3l56H+/fvw97eHlWqVMG+ffsAABYWFgCA3bt3Iysrq8j9sbS0BAD8/fffxU543bhxIzw9PVGtWjXZ9gqPxBRub8eOHQCAESNGyNYfNWpUkeMWp2/fvrLX+kcffQQHBwdpfE28FopTvXp1xMXF4dNPP0ViYiLCw8PRuXNn2NnZ4aeffpLiIiIikJaWhp49e8qeEz09PTRq1Eh6Tu7du4eDBw9iwIABqFixomxbhb/P/Px87NmzB507d0alSpWkfgcHB/Tq1QuHDx9WukJzyJAhsvdM8+bNkZ+fj5s3bwJ4duQnJycHw4cPl8Wp+rtQ5zPn7t27iIuLQ79+/WBlZSW116xZE+3atZN+b88bNGiQ9LOenh7q168PIQQGDhwotVtaWsLDw0P2eVXoVa8RQP65lJmZifv376NJkyYQQiA2NlZpzGHDhr1yXy0tLZGZmYmIiIhiY3bs2IGGDRvKTgObmppiyJAhSExMxMWLF2Xx/fv3h76+vrRceIS9qP3+L+PpOVKZubk5AODx48dqrffihzQAlCtXTlYwPHnyBKGhoVixYgVu374tO3Xx/LyXQm5ubkptqo5x7dq1IudHqOLKlSsAgKCgoGJj0tPTUa5cuWJzNTAwwKxZs/DFF1/Azs4OjRs3RseOHdG3b1/Y29urlMeff/4Jc3NzlC1bFk5OTrKitFCFChVkH4LAs3lfVapUga6u/P8lT09PqR949hw5OjrK/vi86MqVKxBCoEqVKkX2F05edXNzw5gxYzBv3jysWbMGzZs3xwcffCDNJQGenSb5+eefMWjQIHz11Vdo27Ytunbtio8++kjK9cqVK7h06RJsbGyK3F7hBOmbN29CV1dX6Tnx8PAodl+K8uJ+6ejooHLlytK8M028Fl6matWq+P3335Gfn4+LFy9i27ZtmD17NoYMGQI3Nzf4+vpKORR3Cq/wPVv4h69GjRrFbu/evXvIysoq8nny9PREQUEB/v33X1SvXl1qf/G9Xbivhe/twtfTi8+ljY2N7HkpzvOfOYWFdXEKt1Vc/rt371aafP9i/oW3Myi8yOL59gcPHiiN+6rXCAAkJSVh8uTJ+Oeff5T+SXrxs61MmTLSHM2X+fzzz7FhwwYEBASgQoUK8PPzQ/fu3dG+fXsp5ubNm0qnUwH5e/3518Orfpf0DIsmUpm5uTkcHR1x/vx5tdZ78YhQoeeLmuHDh2PFihUYNWoUfHx8YGFhAR0dHfTo0a
PIIw/P//f2umO8jsJx5syZU+ytCExNTV+Z66hRo9CpUyds2bIFu3fvxqRJkxAaGoq9e/eiTp06r8yjRYsWSh/sLypqu5pUUFAAHR0d7Ny5s8jf8fPPQ1hYGPr164e///4be/bswYgRIxAaGopjx47ByckJRkZGOHjwIPbt24ft27dj165dWL9+Pdq0aYM9e/ZAT08PBQUF8Pb2xrx584rMx9nZucT2tSiaei28ip6eHry9veHt7Q0fHx+0bt0aa9asga+vr5TD77//XmTBXaZMyX7Eq/LefhPVqlUDAJw7d0468qFJReWvyX3Kz89Hu3bt8PDhQ4wfPx7VqlWDiYkJbt++jX79+il9LhkYGCj9Q1MUW1tbxMXFYffu3di5cyd27tyJFStWoG/fvkVe1KGKkv5dvi9YNJFaOnbsiOXLlyM6OrrYS9tfx6ZNmxAUFCS7SuXp06dIS0vT+Bju7u6vLPyKO01XePTC3Nwcvr6+KudW3FhffPEFvvjiC1y5cgW1a9dGWFgYVq9e/UbjvoyLiwvOnj2LgoIC2YdzfHy81F+Y2+7du/Hw4cNijza5u7tDCAE3NzdUrVr1ldsu/MM/ceJEHD16FE2bNsXSpUvx3XffAQB0dXXRtm1btG3bFvPmzcP333+Pb775Bvv27ZNO8Z45cwZt27Z96WlUFxcXFBQU4Nq1a7KjDgkJCa9+gp5TeBSnkBACV69eRc2aNaX9BzTzWlBV4QT8u3fvynKwtbV9aQ6Fp9te9rq3sbGBsbFxkc9TfHw8dHV11S5MC19PV65ckZ3yu3fvnkpHMDp16oTQ0FCsXr36lUVT4baKy9/a2lrjtyZ51Wvk3LlzuHz5MlatWoW+fftKcS87raYqfX19dOrUCZ06dUJBQQE+//xzLFu2DJMmTULlypXh4uJS7HMBoMgrM+nVOKeJ1DJu3DiYmJhg0KBBSElJUeq/du2a0mWvqtDT01P6j2bhwoXIz8/X+BjdunXDmTNn8NdffymNUbh+4YfriwVXvXr14O7ujrlz5yIjI0Np/Rcvty5KVlYWnj59Kmtzd3eHmZmZ0qXzmtahQwckJyfLrm7Ky8vDwoULYWpqipYtWwJ49hwJIYq8iWLhc9S1a1fo6elh2rRpSs+7EEI6naFQKJCXlyfr9/b2hq6urrS/Dx8+VNpO4dGbwpju3bvj9u3bsjk9hZ48eSJdRRkQEAAA+OGHH2QxCxYsKOIZKd5vv/0mOxW9adMm3L17VxpfE6+F4hw6dAi5ublK7YVzZQqLQX9/f5ibm+P7778vMr4wBxsbG7Ro0QK//vorkpKSZDGFvzs9PT34+fnh77//lp1eSklJwdq1a9GsWTPpdJmqfH19UbZsWSxcuFD2GlH1d+Hj44P27dvj559/ll0dWSgnJwdffvklgGdzr2rXro1Vq1bJ3rfnz5/Hnj170KFDB7VyV8WrXiOFR2+e33chxGt9Rj7vxVOFurq6UqFW+H7p0KEDTpw4gejoaCkuMzMTy5cvh6urK7y8vN4oh/8qHmkitbi7u2Pt2rXSpbrP3xH86NGj0uXr6urYsSN+//13WFhYwMvLC9HR0YiMjJRdQq+pMcaOHYtNmzbh448/xoABA1CvXj08fPgQ//zzD5YuXYpatWrB3d0dlpaWWLp0KczMzGBiYoJGjRrBzc0NP//8MwICAlC9enX0798fFSpUwO3bt7Fv3z6Ym5tj69atL83z8uXLaNu2Lbp37w4vLy+UKVMGf/31F1JSUtCjRw+1nzt1DBkyBMuWLUO/fv0QExMDV1dXbNq0CUeOHMGCBQukSa2tW7dGnz598MMPP+DKlSto3749CgoKcOjQIbRu3RohISFwd3fHd999hwkTJiAxMRGdO3eGmZkZbty4gb/++gtDhgzBl19+ib179yIkJAQff/wxqlatiry8PPz+++/Q09OT5pZ9++23OHjwIAIDA+Hi4oLU1FQsXrwYTk5O0kTWPn36YMOGDRg6dCj27duHpk
2bIj8/H/Hx8diwYQN2796N+vXro3bt2ujZsycWL16M9PR0NGnSBFFRUWrfc8bKygrNmjVD//79kZKSggULFqBy5coYPHgwgGd/qN70tVCcWbNmISYmBl27dpX+GJ4+fRq//fYbrKyspInU5ubmWLJkCfr06YO6deuiR48esLGxQVJSErZv346mTZvixx9/BPCsiGzWrBnq1q0rzYtKTEzE9u3bpa8L+u6776T7ZX3++ecoU6YMli1bhuzsbMyePVvt/bCxscGXX36J0NBQdOzYER06dEBsbCx27tz5ytPLhX777Tf4+fmha9eu6NSpE9q2bQsTExNcuXIF69atw927d6V7Nc2ZMwcBAQHw8fHBwIEDpVsOWFhYlMh3Dr7qNVKtWjW4u7vjyy+/xO3bt2Fubo4///zzjecJDRo0CA8fPkSbNm3g5OSEmzdvYuHChahdu7Y0Z+mrr77CH3/8gYCAAIwYMQJWVlZYtWoVbty4gT///FOl04BUhLd4pR69Ry5fviwGDx4sXF1dhb6+vjAzMxNNmzYVCxcuFE+fPpXiAIjg4GCl9V1cXGSX8D569Ej0799fWFtbC1NTU+Hv7y/i4+OV4govdT558qTSmKqOIYQQDx48ECEhIaJChQpCX19fODk5iaCgIHH//n0p5u+//xZeXl6iTJkySrcfiI2NFV27dhXly5cXBgYGwsXFRXTv3l1ERUVJMYWXNb94Gf39+/dFcHCwqFatmjAxMREWFhaiUaNGYsOGDa962osd80UtW7YU1atXL7IvJSVFep709fWFt7e30q0VhHh2CfacOXNEtWrVhL6+vrCxsREBAQEiJiZGFvfnn3+KZs2aCRMTE2FiYiKqVasmgoODRUJCghBCiOvXr4sBAwYId3d3YWhoKKysrETr1q1FZGSkNEZUVJT48MMPhaOjo9DX1xeOjo6iZ8+e4vLly7Jt5eTkiFmzZonq1asLAwMDUa5cOVGvXj0xbdo0kZ6eLsU9efJEjBgxQpQvX16YmJiITp06iX///VetWw788ccfYsKECcLW1lYYGRmJwMBApcv1hXiz10Jxjhw5IoKDg0WNGjWEhYWFKFu2rKhYsaLo16+f7HYAz+fs7+8vLCwshKGhoXB3dxf9+vUTp06dksWdP39edOnSRVhaWgpDQ0Ph4eEhJk2aJIs5ffq08Pf3F6ampsLY2Fi0bt1aHD16VBZT3Puw8Lnbt2+f1Jafny+mTZsmHBwchJGRkWjVqpU4f/58ke/L4mRlZYm5c+eKBg0aCFNTU6Gvry+qVKkihg8fLrudiRBCREZGiqZNmwojIyNhbm4uOnXqJC5evCiLKe73ERQUJExMTJS2/+L7SZ3XyMWLF4Wvr68wNTUV1tbWYvDgwdJtV55/3xW37cK+5285sGnTJuHn5ydsbW2Fvr6+qFixovjss8/E3bt3Zetdu3ZNfPTRR9Lvu2HDhmLbtm2ymMJ9efGWIDdu3Cjytiv/dTpCcJYXEVGh/fv3o3Xr1ti4cSM++uij0k6HtBBfI/9dPD5HREREpAIWTUREREQqYNFEREREpALOaSIiIiJSAY80EREREamARRMRERGRCnhzSw0pKCjAnTt3YGZm9tKveCAiIiLtIYTA48eP4ejo+MqbfrJo0pA7d+689S8MJSIiIs34999/4eTk9NIYFk0aUvj1E//++6/a389ERERy5+Nv4Z+I0zgZdwO3Ux7B0swYNT2dETKgHVyd5F/Bcv1mKmYv2YHY8zdRtqweWjTywJdDO8DK8n9f0Hs7+RECPp1b5LZmffMJAlo/+7qagoICbI2IQ+ThC4i/ehfpj7NQwb4cAlrVRFD3ZjDQLytb93HGU/y0dj/2HrmIlHvpsLI0ReO67hjapw0c7CxfuZ85OXlYtCoS2yLjoHj8BFUq2WN4/3bwqVdZzWeMXpdCoYCzs7P0d/xlePWchigUClhYWCA9PZ1FExHRGxr21S84deY6AtvWQbXKjrj3QIFVGw8i60
k2/vr1C3i4OwIA7qY8QmCf2TAzNUS/T1oiKysby9fshaNdOfy98kvol312bODfOw/QvPNUfOBXD62bVpdtq0Ftdzg5WAEAMrOyUb3Vl6hTwxVtm9VAeStTnD6XiD+3H0fDOpXxx+Lh0hSMgoICdBkQhis3ktGnW3O4VbRF4q17WP3nYZiaGCJy/TcwNTF86X4On7gCO6PiMKBna7g622DTtuM4e/Em/lgyAg1qu2v6aaUiqPX3uxS/wuW9kp6eLgDIvv+KiIhez6kz10R2Tq6s7frNFFGl6SgxctJKqe2bmeuER7PR4tbdB1LboeOXhEuDELFm82GpLen2feHSIEQs+z1SvEx2Tq44dUb5+/0W/LRDuDQIEYeOX5Ll6NIgRKzacEAWu/6faOHSIETs3Bv30m3Fnr+hlNOTpzmiRZeposuAsJeuS5qjzt9vXj1HRERap17NStJRokJuFW1RtZIDriamSG279p1B22Y1UMHeSmpr1rAaKlW0xfbI00WOnfUkGzm5eUX26Zctg3o1Kym1+7eqBQC4euN/236c+RQAYG0lP61ja/3saIWhofxU3ot2RsVBT08XPTs3kdoMDcqi+wc+OH3uBu6kPHrp+vT2sWgiIqJ3ghAC9x8+RjmLZ3OVklPTcP/hY3h7VlSKrVXdBRcSbim1h/+8E14tv4RHszH4IGgODh67pNK27z1QAADKPTdPqqZnRRgb6SNs2XYcPZmA5NQ0HDt9BTMX/o1aXhXRrIHHS8e8cPkW3CrawszUSNZe28sFAHDxsnL+VLo4EZyIiN4JW3adQnJqGsYM6QAASL3/rJApPLLzPNvy5khTZCE7JxcG+mWhq6uD5o2qwb9VLdjbWiDp9gP8snYv+o1agp/nDkGbZjVeuu1lv0fCzMQQrZp4SW1Wlqb4cUZ/fPX9H+gV/KPU3qKxJ5bMHIgyZfReOmbqfQVsyxeR+//vT8q99JeuT28fiyYiItJ6VxOTMXn2BtT1dkO3wEYAgKfZOQCgdBoPAAwMyv5/zLOiqYK9FX5fGCyL6RrQAL6fzMB34X+9tGhatGI3Dp9IwPRx3WFhZizrsypniuoeTgj6uBKqVHLAxcu3sOz3KIz9djUWzxz40n16mp0Lff2X507ahafniIhIq6XeV2DA6GUwMzXCkpkDoaf37E+XoYE+ABQ5Pyn7/wsOQ4Pi5xVZWpjg406Ncf1mKu4WM39oa0QM5i7djk8+8EGfj5rL+pJu30fPYQvRvZMPgvv7w69lTYwa3AHTx3XHjr1x2Hf0wkv3y9CgLHJyXi93Kh0smoiISGspMp6g36glUDzOwqrwYbCzsZD6Ck9jFZ6me17qAwUszY2V7qv0Isf/v5dSmiJLqe/Q8Xh8MXU12jStjhlffaLUv2nbcWTn5KJNM/ktDNq18AYAxJy58dJt21qbI/VBEbn///48v6+kHVg0ERGRVnqanYtBY5bhRlIqfpk3FFUqOcj67W0tUb6cKc5dSlJa98yFm/Cq+vK7OwNA0u0HAIDy5Uxl7bHnE/HZuJ/g7emMRd/3L3J+0r2HjyEEUFAgv91hbl4+ACAvP/+l2/aq6oQbSal4nPFE1h53IVHqJ+3CoomIiLROfn4BQr5ZgdPnbmBx6ADUq+lWZFz71rURdfi87PL8IycScD0pFR3a1pHaHjx6rLRucmoaNmw9hmqVHWFr/b+jOldvJGPA6KVwciiPX+cNhaGhfpHbrlTRBkIIbHvh1gb/7IkBAFT3+F/R8zAtA1cTk/HkaY7UFtCmNvLzC/DHlqNSW3ZOLjZuO47aNVzhaFeuyO1S6eFEcCIi0jrfhf+FyIPn4Nu8BtIUWfhr50lZf5eABgCA4P5+2BEVi57DfkD/T1oh80k2lq+OQrXKjvi4UyMpPnTh30i6dR9NGlSFnY0Fbt15iLV/HcGTJzmY8sVHUlxG5lP0HbEY6Y+zMOTTtth7RD4vqWIFa6mA+yiwMZav3otvQtfjQsItVK3kgPPx/2L9P9
GoWslBurcTAKzacBDhP+/EH0tGwKdeFQBAnRquCGxbB7MX/YMHDx/DxdkGf24/jlt3HmDWN700+4SSRrBoIiIirVN4j6LIQ+cReei8Un9h0eRoVw7rl47E9AWbMWvRPyhbVg9tmlbHNyO7yOYzNW9UDWtuH8Hvmw4hXZEFczNjNKzjjuED2qNGtf992fqj9EzpqNWsRf8obbdbYEOpaCpnaYKtq8Zi3rLtiDp0Hms3H4GlhTG6d2qMsZ93KvKqvheFTe2DCsussHnnSaQ/zoJnZUf8Mm8oGtXld89pI373nIbwu+eIiIjePer8/eacJiIiIiIVsGgiIiIiUgHnNBERaYnrrh1LOwUirVYpcVupbp9HmoiIiIhUwKKJiIiISAUsmoiIiIhUwKKJiIiISAWlWjSFhoaiQYMGMDMzg62tLTp37oyEhARZzNOnTxEcHIzy5cvD1NQU3bp1Q0pKiiwmKSkJgYGBMDY2hq2tLcaOHYu8PPk3R+/fvx9169aFgYEBKleujJUrVyrls2jRIri6usLQ0BCNGjXCiRMnNL7PRERE9G4q1aLpwIEDCA4OxrFjxxAREYHc3Fz4+fkhMzNTihk9ejS2bt2KjRs34sCBA7hz5w66du0q9efn5yMwMBA5OTk4evQoVq1ahZUrV2Ly5MlSzI0bNxAYGIjWrVsjLi4Oo0aNwqBBg7B7924pZv369RgzZgymTJmC06dPo1atWvD390dqaurbeTKIiIhIq2nVHcHv3bsHW1tbHDhwAC1atEB6ejpsbGywdu1afPTRs+8Gio+Ph6enJ6Kjo9G4cWPs3LkTHTt2xJ07d2BnZwcAWLp0KcaPH4979+5BX18f48ePx/bt23H+/P9uxd+jRw+kpaVh165dAIBGjRqhQYMG+PHHHwEABQUFcHZ2xvDhw/HVV1+9MnfeEZyI3hRvOUD0ciVxywF1/n5r1X2a0tPTAQBWVlYAgJiYGOTm5sLX11eKqVatGipWrCgVTdHR0fD29pYKJgDw9/fHsGHDcOHCBdSpUwfR0dGyMQpjRo0aBQDIyclBTEwMJkyYIPXr6urC19cX0dHRReaanZ2N7OxsaVmhULzZzpPKMrOysWx1JOLO38SZizeRrsjCnMm98XHHxkqxqzYcwG+bDuHf2w9QztIEHX3r4ouhgTA2MpDF/fjrbsRdSETchZu4//AxRg4KwOghHYrcfnJqGqbP34yDx+MhhEDjelUweXRXVKxgLcU8fZqDyXM2Iu7CTdxNeYT8ggJUrGCN7h/4oM9HzVG2jN4r91OdnIiIqORpzUTwgoICjBo1Ck2bNkWNGjUAAMnJydDX14elpaUs1s7ODsnJyVLM8wVTYX9h38tiFAoFnjx5gvv37yM/P7/ImMIxXhQaGgoLCwvp4ezsXGQcad7DtAz88PMuXEtMhmeVCsXGhS78G1PmboJHJQdMHtMNAa1rY9WGA/hs3M9KsXOXbsOZi0nwqur00m1nZmWj57AfcDz2KoL7+WHU4ABcTLiFTz4Lx6O0/51Wfpqdi8s3ktG6iRfGBXfC1yO6wKtKBUyfvxlfTP1dpf1UNSciIno7tOZIU3BwMM6fP4/Dhw+XdioqmTBhAsaMGSMtKxQKFk5via21OU7smAFba3OcvZiED/rNUYpJvZ+OX9buRdeABpg3ra/U7lbRBlPmbkLkoXPwbe4ttR/aMhXOjuXxMC0Ddf0mKI1X6PdNh3Dj33v4e+WXqOXlAgBo1cQL/j1D8dPaKIz7/AMAgKWFCbb8+oVs3U+7NYOZqRFWbTyIiaO6wtb65YeBVc2JiIjeDq040hQSEoJt27Zh3759cHL633/V9vb2yMnJQVpamiw+JSUF9vb2UsyLV9MVLr8qxtzcHEZGRrC2toaenl6RMYVjvMjAwADm5uayB70dBvplX1lwnD53A3n5BejkV0/WXri8dc9pWbuzY3mVtr1zbyxqeVWUCiYAqOxqjyb1q2J7ZOwr13dyfHbqWZGR9cpYVX
MiIqK3o1SLJiEEQkJC8Ndff2Hv3r1wc3OT9derVw9ly5ZFVFSU1JaQkICkpCT4+PgAAHx8fHDu3DnZVW4REREwNzeHl5eXFPP8GIUxhWPo6+ujXr16spiCggJERUVJMfRuyc55dssJA4OysnYjQ30AwLn4JLXHLCgowKWrd+DtWVGpr3Z1F9y8dR8ZmU9l7Tm5eXiYloE7KY+wa98ZLF+9FxUcrODqZKP29omIqHSV6um54OBgrF27Fn///TfMzMyk+UMWFhYwMjKChYUFBg4ciDFjxsDKygrm5uYYPnw4fHx80Ljxs0m/fn5+8PLyQp8+fTB79mwkJydj4sSJCA4OhoHBs8m+Q4cOxY8//ohx48ZhwIAB2Lt3LzZs2IDt27dLuYwZMwZBQUGoX78+GjZsiAULFiAzMxP9+/d/+08MvTF3l2fz02LOXEeT+lWl9hOx1wAAKffS1R4zTZGFnJw82FpbKPUVHvlKuZ8OUxNDqX3XvjMYMXGltFzTsyJmT+qNMipMBCciIu1SqkXTkiVLAACtWrWSta9YsQL9+vUDAMyfPx+6urro1q0bsrOz4e/vj8WLF0uxenp62LZtG4YNGwYfHx+YmJggKCgI3377rRTj5uaG7du3Y/To0QgPD4eTkxN+/vln+Pv7SzGffPIJ7t27h8mTJyM5ORm1a9fGrl27lCaH07uhRjVn1K7hiqW/R8LO1hI+9argamIyJs7agLJl9PA0O1ftMQvX0S+r/LYx0C8riynkU68KVv8YDMXjJzhy8jIuXbmNrCfZSusTEZH2K9WiSZVbRBkaGmLRokVYtGhRsTEuLi7YsWPHS8dp1aoVYmNfPuckJCQEISEhr8yJ3g1LZw5EyDcrMG76GgCAnp4uBvVsjWOxV3H9pvo3LTX8/1N9Obl5Sn3ZObmymEI25c1hU/7ZUagObetg0Yrd6DN8EfZtmvzKeVlERKRdtObqOSJNs7e1xKafRuNGUiruPVDA1dkWttbmaNjhG1SqqP6cIktzY+jrl0HqfeVTe6n3n92ny66IU3fPC2hTB3OWbEPEwbPo3bWZ2jkQEVHpYdFE7z23irZwq2gLALhy/S5S7yvwUcdGao+jq6uLau6OOHdJeRJ53IWbqFjBWjafqShPs3MAAI8znr40joiItI9W3HKA6G0oKChA6MK/YWSo/9pHeQLa1MaZi0k4e/F/hdO1myk4euoyOrStLbU9TMso8vTz+r+f3WG+puf/7umlyHiCq4nJUGQ8ea2ciIjo7eCRJnonrdpwAIrHT5Dy/6fKog6dR3JKGgAg6JOWMDc1wtSwTcjOyYNXlQrIy8/H37tjcObCTYRN+RQV7K1k423ecQK37z7Ek/8/EnQi9ioW/vLsewm7dGgIJ4dn8X0+ao51fx/FgDFLMbh3G5Qpo4df1u6DtZUZBvduI433186TWLP5MPxa1kTFCtbIzHqKg8ficeh4PHyb10CTBh5S7O79ZzD22zVKXwWjak5ERPR2sGiid9LyNXtx++5DaXnXvjPYte8MAKBzQAOYmxqhuocTfv1jP/7edRK6urqo5VURaxaFyG5BUGj9P9E4fvqqtBwdcwXRMVcAAPVru0sFiqmJIdYtGYHp8zfjx193o0AINK5bBZNGd0X5cmbS+g1queP02RvYuicG9x4+Rhk9XVRyscPEUV3Qr3tLlfZR1ZyIiOjt0BGqXMJGr6TOtyQTERXlumvH0k6BSKtVStym8THV+fvNOU1EREREKmDRRERERKQCFk1EREREKuBE8HeEa8PhpZ0CkdZKPLGwtFMgov8AHmkiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsGgiIiIiUgGLJiIiIiIVsG
giIiIiUgGLJiIiIiIVlGrRdPDgQXTq1AmOjo7Q0dHBli1bZP06OjpFPubMmSPFuLq6KvXPnDlTNs7Zs2fRvHlzGBoawtnZGbNnz1bKZePGjahWrRoMDQ3h7e2NHTt2lMg+ExER0bupVIumzMxM1KpVC4sWLSqy/+7du7LHr7/+Ch0dHXTr1k0W9+2338rihg8fLvUpFAr4+fnBxcUFMTExmDNnDqZOnYrly5dLMUePHkXPnj0xcOBAxMbGonPnzujcuTPOnz9fMjtORERE75wypbnxgIAABAQEFNtvb28vW/7777/RunVrVKpUSdZuZmamFFtozZo1yMnJwa+//gp9fX1Ur14dcXFxmDdvHoYMGQIACA8PR/v27TF27FgAwPTp0xEREYEff/wRS5cufZNdJCIiovfEOzOnKSUlBdu3b8fAgQOV+mbOnIny5cujTp06mDNnDvLy8qS+6OhotGjRAvr6+lKbv78/EhIS8OjRIynG19dXNqa/vz+io6OLzSc7OxsKhUL2ICIiovdXqR5pUseqVatgZmaGrl27ytpHjBiBunXrwsrKCkePHsWECRNw9+5dzJs3DwCQnJwMNzc32Tp2dnZSX7ly5ZCcnCy1PR+TnJxcbD6hoaGYNm2aJnaNiIiI3gHvTNH066+/onfv3jA0NJS1jxkzRvq5Zs2a0NfXx2effYbQ0FAYGBiUWD4TJkyQbVuhUMDZ2bnEtkdERESl650omg4dOoSEhASsX7/+lbGNGjVCXl4eEhMT4eHhAXt7e6SkpMhiCpcL50EVF1PcPCkAMDAwKNGijIiIiLTLOzGn6ZdffkG9evVQq1atV8bGxcVBV1cXtra2AAAfHx8cPHgQubm5UkxERAQ8PDxQrlw5KSYqKko2TkREBHx8fDS4F0RERPQuK9WiKSMjA3FxcYiLiwMA3LhxA3FxcUhKSpJiFAoFNm7ciEGDBimtHx0djQULFuDMmTO4fv061qxZg9GjR+PTTz+VCqJevXpBX18fAwcOxIULF7B+/XqEh4fLTq2NHDkSu3btQlhYGOLj4zF16lScOnUKISEhJfsEEBER0TujVE/PnTp1Cq1bt5aWCwuZoKAgrFy5EgCwbt06CCHQs2dPpfUNDAywbt06TJ06FdnZ2XBzc8Po0aNlBZGFhQX27NmD4OBg1KtXD9bW1pg8ebJ0uwEAaNKkCdauXYuJEyfi66+/RpUqVbBlyxbUqFGjhPaciIiI3jU6QghR2km8DxQKBSwsLJCeng5zc3ONj+/acPirg4j+oxJPLCztFDTiumvH0k6BSKtVStym8THV+fv9TsxpIiIiIiptLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVMCiiYiIiEgFLJqIiIiIVFBGlaAxY8aoPOC8efNUjj148CDmzJmDmJgY3L17F3/99Rc6d+4s9ffr1w+rVq2SrePv749du3ZJyw8fPsTw4cOxdetW6Orqolu3bggPD4epqakUc/bsWQQHB+PkyZOwsbHB8OHDMW7cONm4GzduxKRJk5CYmIgqVapg1qxZ6NChg8r7QkRERO83lYqm2NhY2fLp06eRl5cHDw8PAMDly5ehp6eHevXqqbXxzMxM1KpVCwMGDEDXrl2LjGnfvj1WrFghLRsYGMj6e/fujbt37yIiIgK5ubno378/hgwZgrVr1wIAFAoF/Pz84Ovri6VLl+LcuXMYMGAALC0tMWTIEADA0aNH0bNnT4SGhqJjx45Yu3
YtOnfujNOnT6NGjRpq7RMRERG9n1Qqmvbt2yf9PG/ePJiZmWHVqlUoV64cAODRo0fo378/mjdvrtbGAwICEBAQ8NIYAwMD2NvbF9l36dIl7Nq1CydPnkT9+vUBAAsXLkSHDh0wd+5cODo6Ys2aNcjJycGvv/4KfX19VK9eHXFxcZg3b55UNIWHh6N9+/YYO3YsAGD69OmIiIjAjz/+iKVLl6q1T0RERPR+UntOU1hYGEJDQ6WCCQDKlSuH7777DmFhYRpNDgD2798PW1tbeHh4YNiwYXjw4IHUFx0dDUtLS6lgAgBfX1/o6uri+PHjUkyLFi2gr68vxfj7+yMhIQGPHj2SYnx9fWXb9ff3R3R0dLF5ZWdnQ6FQyB5ERET0/lK7aFIoFLh3755S+7179/D48WONJFWoffv2+O233xAVFYVZs2bhwIEDCAgIQH5+PgAgOTkZtra2snXKlCkDKysrJCcnSzF2dnaymMLlV8UU9hclNDQUFhYW0sPZ2fnNdpaIiIi0mkqn557XpUsX9O/fH2FhYWjYsCEA4Pjx4xg7dmyx85JeV48ePaSfvb29UbNmTbi7u2P//v1o27atRrelrgkTJsgmyCsUChZORERE7zG1i6alS5fiyy+/RK9evZCbm/tskDJlMHDgQMyZM0fjCT6vUqVKsLa2xtWrV9G2bVvY29sjNTVVFpOXl4eHDx9K86Ds7e2RkpIiiylcflVMcXOpgGdzrV6clE5ERETvL7VPzxkbG2Px4sV48OABYmNjERsbi4cPH2Lx4sUwMTEpiRwlt27dwoMHD+Dg4AAA8PHxQVpaGmJiYqSYvXv3oqCgAI0aNZJiDh48KBV4ABAREQEPDw9pXpaPjw+ioqJk24qIiICPj0+J7g8RERG9O1775pYmJiaoWbMmatas+drFUkZGBuLi4hAXFwcAuHHjBuLi4pCUlISMjAyMHTsWx44dQ2JiIqKiovDhhx+icuXK8Pf3BwB4enqiffv2GDx4ME6cOIEjR44gJCQEPXr0gKOjIwCgV69e0NfXx8CBA3HhwgWsX78e4eHhslNrI0eOxK5duxAWFob4+HhMnToVp06dQkhIyOs+PURERPSeUfv0XGZmJmbOnImoqCikpqaioKBA1n/9+nWVxzp16hRat24tLRcWMkFBQViyZAnOnj2LVatWIS0tDY6OjvDz88P06dNlp8XWrFmDkJAQtG3bVrq55Q8//CD1W1hYYM+ePQgODka9evVgbW2NyZMnS7cbAIAmTZpg7dq1mDhxIr7++mtUqVIFW7Zs4T2aiIiISKIjhBDqrNCzZ08cOHAAffr0gYODA3R0dGT9I0eO1GiC7wqFQgELCwukp6fD3Nxc4+O7Nhyu8TGJ3heJJxaWdgoacd21Y2mnQKTVKiVu0/iY6vz9VvtI086dO7F9+3Y0bdr0tRMkIiIieteoPaepXLlysLKyKolciIiIiLSW2kXT9OnTMXnyZGRlZZVEPkRERERaSe3Tc2FhYbh27Rrs7Ozg6uqKsmXLyvpPnz6tseSIiIiItIXaRVPnzp1LIA0iIiIi7aZ20TRlypSSyIOIiIhIq6ldNBWKiYnBpUuXAADVq1dHnTp1NJYUERERkbZRu2hKTU1Fjx49sH//flhaWgIA0tLS0Lp1a6xbtw42NjaazpGIiIio1Kl99dzw4cPx+PFjXLhwAQ8fPsTDhw9x/vx5KBQKjBgxoiRyJCIiIip1ah9p2rVrFyIjI+Hp6Sm1eXl5YdGiRfDz89NockRERETaQu0jTQUFBUq3GQCAsmXLKn0PHREREdH7Qu2iqU2bNhg5ciTu3Lkjtd2+fRujR49G27ZtNZocERERkbZQu2j68ccfoVAo4OrqCnd3d7i7u8PNzQ0KhQILF74fX5pJRERE9CK15zQ5Ozvj9OnTiIyMRHx8PADA09MTvr6+Gk+OiIiISFu81n2adHR00K5dO7Rr107T+RARERFpJbVPz40YMQI//PCDUvuPP/6IUa
NGaSInIiIiIq2jdtH0559/omnTpkrtTZo0waZNmzSSFBEREZG2UbtoevDgASwsLJTazc3Ncf/+fY0kRURERKRt1C6aKleujF27dim179y5E5UqVdJIUkRERETaRu2J4GPGjEFISAju3buHNm3aAACioqIQFhaGBQsWaDo/IiIiIq2gdtE0YMAAZGdnY8aMGZg+fToAwNXVFUuWLEHfvn01niARERGRNnitWw4MGzYMw4YNw71792BkZARTU1NN50VERESkVdSe0wQAeXl5iIyMxObNmyGEAADcuXMHGRkZGk2OiIiISFuofaTp5s2baN++PZKSkpCdnY127drBzMwMs2bNQnZ2NpYuXVoSeRIRERGVKrWPNI0cORL169fHo0ePYGRkJLV36dIFUVFRGk2OiIiISFuofaTp0KFDOHr0KPT19WXtrq6uuH37tsYSIyIiItImah9pKigoQH5+vlL7rVu3YGZmppGkiIiIiLSN2kWTn5+f7H5MOjo6yMjIwJQpU9ChQwdN5kZERESkNdQ+PRcWFgZ/f394eXnh6dOn6NWrF65cuQJra2v88ccfJZEjERERUalTu2hycnLCmTNnsH79epw5cwYZGRkYOHAgevfuLZsYTkRERPQ+ea2bW5YpUwa9e/dG7969NZ0PERERkVZSeU7T5cuXceLECVlbVFQUWrdujYYNG+L777/XeHJERERE2kLlomn8+PHYtm2btHzjxg106tQJ+vr68PHxQWhoqNpf2Hvw4EF06tQJjo6O0NHRwZYtW6S+3NxcjB8/Ht7e3jAxMYGjoyP69u2LO3fuyMZwdXWFjo6O7DFz5kxZzNmzZ9G8eXMYGhrC2dkZs2fPVspl48aNqFatGgwNDeHt7Y0dO3aotS9ERET0flO5aDp16hQCAgKk5TVr1qBq1arYvXs3wsPDsWDBAqxcuVKtjWdmZqJWrVpYtGiRUl9WVhZOnz6NSZMm4fTp09i8eTMSEhLwwQcfKMV+++23uHv3rvQYPny41KdQKODn5wcXFxfExMRgzpw5mDp1KpYvXy7FHD16FD179sTAgQMRGxuLzp07o3Pnzjh//rxa+0NERETvL5XnNN2/fx9OTk7S8r59+9CpUydpuVWrVvjiiy/U2nhAQICsEHuehYUFIiIiZG0//vgjGjZsiKSkJFSsWFFqNzMzg729fZHjrFmzBjk5Ofj111+hr6+P6tWrIy4uDvPmzcOQIUMAAOHh4Wjfvj3Gjh0LAJg+fToiIiLw448/8mthiIiICIAaR5qsrKxw9+5dAM9ucHnq1Ck0btxY6s/JyZG+vLekpKenQ0dHB5aWlrL2mTNnonz58qhTpw7mzJmDvLw8qS86OhotWrSQ3cHc398fCQkJePTokRTj6+srG9Pf3x/R0dHF5pKdnQ2FQiF7EBER0ftL5aKpVatWmD59Ov79918sWLAABQUFaNWqldR/8eJFuLq6lkCKzzx9+hTjx49Hz549YW5uLrWPGDEC69atw759+/DZZ5/h+++/x7hx46T+5ORk2NnZycYqXE5OTn5pTGF/UUJDQ2FhYSE9nJ2d33gfiYiISHupfHpuxowZaNeuHVxcXKCnp4cffvgBJiYmUv/vv/+ONm3alEiSubm56N69O4QQWLJkiaxvzJgx0s81a9aEvr4+PvvsM4SGhsLAwKBE8gGACRMmyLatUChYOBEREb3HVC6aXF1dcenSJVy4cAE2NjZwdHSU9U+bNk0250lTCgummzdvYu/evbKjTEVp1KgR8vLykJiYCA8PD9jb2yMlJUUWU7hcOA+quJji5kkBgIGBQYkWZURERKRd1PruuTJlyqBWrVpKBRMA1KpVC+XLl9dYYsD/CqYrV64gMjJSpfHj4uKgq6sLW1tbAICPjw8OHjyI3NxcKSYiIgIeHh4oV66cFBMVFSUbJyIiAj4+PhrcGyIiInqXvdYdwTUlIyMDV69elZZv3LiBuLg4WFlZwcHBAR999BFOnz6Nbdu2IT
8/X5pjZGVlBX19fURHR+P48eNo3bo1zMzMEB0djdGjR+PTTz+VCqJevXph2rRpGDhwIMaPH4/z588jPDwc8+fPl7Y7cuRItGzZEmFhYQgMDMS6detw6tQp2W0JiIiI6L9NR5T0JW8vsX//frRu3VqpPSgoCFOnToWbm1uR6+3btw+tWrXC6dOn8fnnnyM+Ph7Z2dlwc3NDnz59MGbMGNmps7NnzyI4OBgnT56EtbU1hg8fjvHjx8vG3LhxIyZOnIjExERUqVIFs2fPRocOHVTeF4VCAQsLC6Snp7/yFOLrcG04/NVBRP9RiScWlnYKGnHdtWNpp0Ck1Solbnt1kJrU+ftdqkXT+4RFE1HpYdFE9N9Q2kWTWnOa8vLy8O233+LWrVtvlCARERHRu0btieAv3jySiIiI6L9AraIJANq0aYMDBw6URC5EREREWkvtq+cCAgLw1Vdf4dy5c6hXr57sBpcAivxCXSIiIqJ3ndpF0+effw4AmDdvnlKfjo4O8vPz3zwrIiIiIi2jdtFUUFBQEnkQERERaTW15zQ97+nTp5rKg4iIiEirqV005efnY/r06ahQoQJMTU1x/fp1AMCkSZPwyy+/aDxBIiIiIm2gdtE0Y8YMrFy5ErNnz4a+vr7UXqNGDfz8888aTY6IiIhIW6hdNP32229Yvnw5evfuDT09Pam9Vq1aiI+P12hyRERERNpC7aLp9u3bqFy5slJ7QUEBcnNzNZIUERERkbZRu2jy8vLCoUOHlNo3bdqEOnXqaCQpIiIiIm2j9i0HJk+ejKCgINy+fRsFBQXYvHkzEhIS8Ntvv2HbNs1/kR4RERGRNlD7SNOHH36IrVu3IjIyEiYmJpg8eTIuXbqErVu3ol27diWRIxEREVGpU/tIEwA0b94cERERms6FiIiISGu9VtEEAKdOncKlS5cAPJvnVK9ePY0lRURERKRt1C6abt26hZ49e+LIkSOwtLQEAKSlpaFJkyZYt24dnJycNJ0jERERUalTe07ToEGDkJubi0uXLuHhw4d4+PAhLl26hIKCAgwaNKgkciQiIiIqdWofaTpw4ACOHj0KDw8Pqc3DwwMLFy5E8+bNNZocERERkbZQ+0iTs7NzkTexzM/Ph6Ojo0aSIiIiItI2ahdNc+bMwfDhw3Hq1Cmp7dSpUxg5ciTmzp2r0eSIiIiItIXap+f69euHrKwsNGrUCGXKPFs9Ly8PZcqUwYABAzBgwAAp9uHDh5rLlIiIiKgUqV00LViwoATSICIiItJuahdNQUFBJZEHERERkVZTe04TERER0X8RiyYiIiIiFbBoIiIiIlIBiyYiIiIiFbxx0aRQKLBlyxbpy3uJiIiI3kdqF03du3fHjz/+CAB48uQJ6tevj+7du6NmzZr4888/NZ4gERERkTZQu2g6ePCg9B1zf/31F4QQSEtLww8//IDvvvtO4wkSERERaQO1i6b09HRYWVkBAHbt2oVu3brB2NgYgYGBuHLlisYTJCIiItIGr/WFvdHR0cjMzMSuXbvg5+cHAHj06BEMDQ3VGuvgwYPo1KkTHB0doaOjgy1btsj6hRCYPHkyHBwcYGRkBF9fX6XC7OHDh+jduzfMzc1haWmJgQMHIiMjQxZz9uxZNG/eHIaGhnB2dsbs2bOVctm4cSOqVasGQ0NDeHt7Y8eOHWrtCxEREb3f1C6aRo0ahd69e8PJyQmOjo5o1aoVgGcFkLe3t1pjZWZmolatWli0aFGR/bNnz8YPP/yApUuX4vjx4zAxMYG/vz+ePn0qxfTu3RsXLlxAREQEtm3bhoMHD2LIkCFSv0KhgJ+fH1xcXBATE4M5c+Zg6tSpWL58uRRz9OhR9OzZEwMHDkRsbCw6d+6Mzp074/z582rtDxEREb2/dIQQQt2VYmJikJSUhHbt2sHU1BQAsH37dlhaWqJp06avl4iODv766y907twZwLOjTI6Ojvjiiy/w5ZdfAnh2atDOzg4rV65Ejx49cOnSJXh5ee
HkyZOoX78+gGenDDt06IBbt27B0dERS5YswTfffIPk5GTo6+sDAL766its2bIF8fHxAIBPPvkEmZmZ2LZtm5RP48aNUbt2bSxdulSl/BUKBSwsLJCeng5zc/PXeg5exrXhcI2PSfS+SDyxsLRT0Ijrrh1LOwUirVYpcdurg9Skzt9vtY405ebmwt3dHcbGxujSpYtUMAFAYGDgaxdMRblx4waSk5Ph6+srtVlYWKBRo0aIjo4GAERHR8PS0lIqmADA19cXurq6OH78uBTTokULqWACAH9/fyQkJODRo0dSzPPbKYwp3E5RsrOzoVAoZA8iIiJ6f6lVNJUtW1Z2aqwkJScnAwDs7Oxk7XZ2dlJfcnIybG1tZf1lypSBlZWVLKaoMZ7fRnExhf1FCQ0NhYWFhfRwdnZWdxeJiIjoHaL2nKbg4GDMmjULeXl5JZHPO2PChAlIT0+XHv/++29pp0REREQlqIy6K5w8eRJRUVHYs2cPvL29YWJiIuvfvHmzRhKzt7cHAKSkpMDBwUFqT0lJQe3ataWY1NRU2Xp5eXl4+PChtL69vT1SUlJkMYXLr4op7C+KgYEBDAwMXmPPiIiI6F2k9pEmS0tLdOvWDf7+/nB0dJSdorKwsNBYYm5ubrC3t0dUVJTUplAocPz4cfj4+AAAfHx8kJaWhpiYGClm7969KCgoQKNGjaSYgwcPIjc3V4qJiIiAh4cHypUrJ8U8v53CmMLtEBEREal9pGnFihUa23hGRgauXr0qLd+4cQNxcXGwsrJCxYoVMWrUKHz33XeoUqUK3NzcMGnSJDg6OkpX2Hl6eqJ9+/YYPHgwli5ditzcXISEhKBHjx5wdHQEAPTq1QvTpk3DwIEDMX78eJw/fx7h4eGYP3++tN2RI0eiZcuWCAsLQ2BgINatW4dTp07JbktARERE/21qF03As1Ng+/fvx7Vr19CrVy+YmZnhzp07MDc3l11R9yqnTp1C69atpeUxY8YAAIKCgrBy5UqMGzcOmZmZGDJkCNLS0tCsWTPs2rVLdhPNNWvWICQkBG3btoWuri66deuGH374Qeq3sLDAnj17EBwcjHr16sHa2hqTJ0+W3cupSZMmWLt2LSZOnIivv/4aVapUwZYtW1CjRo3XeXqIiIjoPaT2fZpu3ryJ9u3bIykpCdnZ2bh8+TIqVaqEkSNHIjs7W+X7Gr1veJ8motLD+zQR/Te8U/dpAp6dyqpfvz4ePXoEIyMjqb1Lly5K84KIiIiI3hdqn547dOgQjh49KrtZJAC4urri9u3bGkuMiIiISJuofaSpoKAA+fn5Su23bt2CmZmZRpIiIiIi0jZqF01+fn5YsGCBtKyjo4OMjAxMmTIFHTp00GRuRERERFpD7dNzYWFh8Pf3h5eXF54+fYpevXrhypUrsLa2xh9//FESORIRERGVOrWLJicnJ5w5cwbr16/HmTNnkJGRgYEDB6J3796yieFERERE7xO1i6aDBw+iSZMm6N27N3r37i215+Xl4eDBg2jRooVGEyQiIiLSBmrPaWrdujUePnyo1J6eni67USURERHR+0TtokkIAR0dHaX2Bw8eKH15LxEREdH7QuXTc127dgXw7Gq5fv36wcDAQOrLz8/H2bNn0aRJE81nSERERKQFVC6aLCwsADw70mRmZiab9K2vr4/GjRtj8ODBms+QiIiISAuoXDStWLECwLM7f48dOxbGxsYllhQRERGRtlF7TlPfvn2L/LqUK1euIDExURM5EREREWkdtYumfv364ejRo0rtx48fR79+/TSRExEREZHWUbtoio2NRdOmTZXaGzdujLi4OE3kRERERKR11C6adHR08PjxY6X29PT0Ir/Il4iIiOh9oHbR1KJFC4SGhsoKpPz8fISGhqJZs2YaTY6IiIhIW6j9NSqzZs1CixYt4OHhgebNmwMADh06BIVCgb1792o8QSIiIiJtoPaRJi8vL5w9exbdu3dHamoqHj9+jL59+yI+Ph41at
QoiRyJiIiISp3aR5oAwNHREd9//72mcyEiIiLSWq9VNAFAVlYWkpKSkJOTI2uvWbPmGydFREREpG3ULpru3buH/v37Y+fOnUX28wo6IiIieh+pPadp1KhRSEtLw/Hjx2FkZIRdu3Zh1apVqFKlCv7555+SyJGIiIio1Kl9pGnv3r34+++/Ub9+fejq6sLFxQXt2rWDubk5QkNDERgYWBJ5EhEREZUqtY80ZWZmwtbWFgBQrlw53Lt3DwDg7e2N06dPazY7IiIiIi2hdtHk4eGBhIQEAECtWrWwbNky3L59G0uXLoWDg4PGEyQiIiLSBmqfnhs5ciTu3r0LAJgyZQrat2+PNWvWQF9fHytXrtR0fkRERERaQe2i6dNPP5V+rlevHm7evIn4+HhUrFgR1tbWGk2OiIiISFuodXouNzcX7u7uuHTpktRmbGyMunXrsmAiIiKi95paRVPZsmXx9OnTksqFiIiISGupPRE8ODgYs2bNQl5eXknkQ0RERKSV1J7TdPLkSURFRWHPnj3w9vaGiYmJrH/z5s0aS46IiIhIW6h9pMnS0hLdunWDv78/HB0dYWFhIXtomqurK3R0dJQewcHBAIBWrVop9Q0dOlQ2RlJSEgIDA2FsbAxbW1uMHTtW6UjZ/v37UbduXRgYGKBy5cq8EpCIiIhk1D7StGLFipLIo1gnT56UfZ/d+fPn0a5dO3z88cdS2+DBg/Htt99Ky8bGxtLP+fn5CAwMhL29PY4ePYq7d++ib9++KFu2LL7//nsAwI0bNxAYGIihQ4dizZo1iIqKwqBBg+Dg4AB/f/+3sJdERESk7dQumt42Gxsb2fLMmTPh7u6Oli1bSm3Gxsawt7cvcv09e/bg4sWLiIyMhJ2dHWrXro3p06dj/PjxmDp1KvT19bF06VK4ubkhLCwMAODp6YnDhw9j/vz5LJqIiIgIwGucngOATZs2oXv37mjcuDHq1q0re5SknJwcrF69GgMGDICOjo7UvmbNGlhbW6NGjRqYMGECsrKypL7o6Gh4e3vDzs5OavP394dCocCFCxekGF9fX9m2/P39ER0dXWwu2dnZUCgUsgcRERG9v9Qumn744Qf0798fdnZ2iI2NRcOGDVG+fHlcv34dAQEBJZGjZMuWLUhLS0O/fv2ktl69emH16tXYt28fJkyYgN9//112A87k5GRZwQRAWk5OTn5pjEKhwJMnT4rMJTQ0VDaXy9nZWRO7SERERFpK7dNzixcvxvLly9GzZ0+sXLkS48aNQ6VKlTB58mQ8fPiwJHKU/PLLLwgICICjo6PUNmTIEOlnb29vODg4oG3btrh27Rrc3d1LLJcJEyZgzJgx0rJCoWDhRERE9B5T+0hTUlISmjRpAgAwMjLC48ePAQB9+vTBH3/8odnsnnPz5k1ERkZi0KBBL41r1KgRAODq1asAAHt7e6SkpMhiCpcL50EVF2Nubg4jI6Mit2NgYABzc3PZg4iIiN5fahdN9vb20hGlihUr4tixYwCeXYEmhNBsds9ZsWIFbG1tERgY+NK4uLg4AICDgwMAwMfHB+fOnUNqaqoUExERAXNzc3h5eUkxUVFRsnEiIiLg4+OjwT0gIiKid5naRVObNm3wzz//AAD69++P0aNHo127dvjkk0/QpUsXjScIAAUFBVixYgWCgoJQpsz/ziheu3YN06dPR0xMDBITE/HPP/+gb9++aNGiBWrWrAkA8PPzg5eXF/r06YMzZ85g9+7dmDhxIoKDg2FgYAAAGDp0KK5fv45x48YhPj4eixcvxoYNGzB69OgS2R8iIiJ696g9p2n58uUoKCgA8OwrVcqXL4+jR4/igw8+wGeffabxBAEgMjISSUlJGDBggKxdX18fkZGRWLBgATIzM+Hs7Ixu3bph4sSJUoyenh62bduGYcOGwcfHByYmJggKCpLd18nNzQ3bt2/H6NGjER4eDicnJ/z888+83QARERFJdERJnlP7D1EoFLCwsEB6enqJzG9ybT
hc42MSvS8STyws7RQ04rprx9JOgUirVUrcpvEx1fn7/Vo3t0xLS8OJEyeQmpoqHXUq1Ldv39cZkoiIiEirqV00bd26Fb1790ZGRgbMzc1lN5nU0dFh0URERETvJbUngn/xxRcYMGAAMjIykJaWhkePHkmPkr5PExEREVFpUbtoun37NkaMGCH7UlwiIiKi953aRZO/vz9OnTpVErkQERERaS2V5jQV3pcJAAIDAzF27FhcvHgR3t7eKFu2rCz2gw8+0GyGRERERFpApaKpc+fOSm3P3+eokI6ODvLz8984KSIiIiJto1LR9OJtBYiIiIj+a9Se00RERET0X6Ry0bR37154eXlBoVAo9aWnp6N69eo4ePCgRpMjIiIi0hYqF00LFizA4MGDi7zFuIWFBT777DPMnz9fo8kRERERaQuVi6YzZ86gffv2xfb7+fkhJiZGI0kRERERaRuVi6aUlBSl2ws8r0yZMrh3755GkiIiIiLSNioXTRUqVMD58+eL7T979iwcHBw0khQRERGRtlG5aOrQoQMmTZqEp0+fKvU9efIEU6ZMQceOHTWaHBEREZG2UOk+TQAwceJEbN68GVWrVkVISAg8PDwAAPHx8Vi0aBHy8/PxzTfflFiiRERERKVJ5aLJzs4OR48exbBhwzBhwgQIIQA8uwu4v78/Fi1aBDs7uxJLlIiIiKg0qVw0AYCLiwt27NiBR48e4erVqxBCoEqVKihXrlxJ5UdERESkFdQqmgqVK1cODRo00HQuRERERFqLX6NCREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAKtLpqmTp0KHR0d2aNatWpS/9OnTxEcHIzy5cvD1NQU3bp1Q0pKimyMpKQkBAYGwtjYGLa2thg7dizy8vJkMfv370fdunVhYGCAypUrY+XKlW9j94iIiOgdotVFEwBUr14dd+/elR6HDx+W+kaPHo2tW7di48aNOHDgAO7cuYOuXbtK/fn5+QgMDEROTg6OHj2KVatWYeXKlZg8ebIUc+PGDQQGBqJ169aIi4vDqFGjMGjQIOzevfut7icRERFptzKlncCrlClTBvb29krt6enp+OWXX7B27Vq0adMGALBixQp4enri2LFjaNy4Mfbs2YOLFy8iMjISdnZ2qF27NqZPn47x48dj6tSp0NfXx9KlS+Hm5oawsDAAgKenJw4fPoz58+fD39//re4rERERaS+tP9J05coVODo6olKlSujduzeSkpIAADExMcjNzYWvr68UW61aNVSsWBHR0dEAgOjoaHh7e8POzk6K8ff3h0KhwIULF6SY58cojCkcozjZ2dlQKBSyBxEREb2/tLpoatSoEVauXIldu3ZhyZIluHHjBpo3b47Hjx8jOTkZ+vr6sLS0lK1jZ2eH5ORkAEBycrKsYCrsL+x7WYxCocCTJ0+KzS00NBQWFhbSw9nZ+U13l4iIiLSYVp+eCwgIkH6uWbMmGjVqBBcXF2zYsAFGRkalmBkwYcIEjBkzRlpWKBQsnIiIiN5jWn2k6UWWlpaoWrUqrl69Cnt7e+Tk5CAtLU0Wk5KSIs2Bsre3V7qarnD5VTHm5uYvLcwMDAxgbm4uexAREdH7650qmjIyMnDt2jU4ODigXr16KFu2LKKioqT+hIQEJCUlwcfHBwDg4+ODc+fOITU1VYqJiIiAubk5vLy8pJjnxyiMKRyDiIiICNDyounLL7/EgQMHkJiYiKNHj6JLly7Q09NDz549YWFhgYEDB2LMmDHYt28fYmJi0L9/f/j4+KBx48YAAD8/P3h5eaFPnz44c+YMdu/ejYkTJyI4OBgGBgYAgKFDh+L69esYN24c4uPjsXjxYmzYsAGjR48uzV0nIiIiLaPVc5pu3bqFnj174sGDB7CxsUGzZs1w7Ngx2NjYAA
Dmz58PXV1ddOvWDdnZ2fD398fixYul9fX09LBt2zYMGzYMPj4+MDExQVBQEL799lspxs3NDdu3b8fo0aMRHh4OJycn/Pzzz7zdABEREcnoCCFEaSfxPlAoFLCwsEB6enqJzG9ybThc42MSvS8STyws7RQ04rprx9JOgUirVUrcpvEx1fn7rdWn54iIiIi0BYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhWwaCIiIiJSAYsmIiIiIhVoddEUGhqKBg0awMzMDLa2tujcuTMSEhJkMa1atYKOjo7sMXToUFlMUlISAgMDYWxsDFtbW4wdOxZ5eXmymP3796Nu3bowMDBA5cqVsXLlypLePSIiInqHaHXRdODAAQQHB+PYsWOIiIhAbm4u/Pz8kJmZKYsbPHgw7t69Kz1mz54t9eXn5yMwMBA5OTk4evQoVq1ahZUrV2Ly5MlSzI0bNxAYGIjWrVsjLi4Oo0aNwqBBg7B79+63tq9ERESk3cqUdgIvs2vXLtnyypUrYWtri5iYGLRo0UJqNzY2hr29fZFj7NmzBxcvXkRkZCTs7OxQu3ZtTJ8+HePHj8fUqVOhr6+PpUuXws3NDWFhYQAAT09PHD58GPPnz4e/v3+R42ZnZyM7O1taVigUb7q7REREpMW0+kjTi9LT0wEAVlZWsvY1a9bA2toaNWrUwIQJE5CVlSX1RUdHw9vbG3Z2dlKbv78/FAoFLly4IMX4+vrKxvT390d0dHSxuYSGhsLCwkJ6ODs7v/H+ERERkfbS6iNNzysoKMCoUaPQtGlT1KhRQ2rv1asXXFxc4OjoiLNnz2L8+PFISEjA5s2bAQDJycmyggmAtJycnPzSGIVCgSdPnsDIyEgpnwkTJmDMmDHSskKhYOFERET0Hntniqbg4GCcP38ehw8flrUPGTJE+tnb2xsODg5o27Ytrl27Bnd39xLLx8DAAAYGBiU2PhEREWmXd+L0XEhICLZt24Z9+/bBycnppbGNGjUCAFy9ehUAYG9vj5SUFFlM4XLhPKjiYszNzYs8ykRERET/PVpdNAkhEBISgr/++gt79+6Fm5vbK9eJi4sDADg4OAAAfHx8cO7cOaSmpkoxERERMDc3h5eXlxQTFRUlGyciIgI+Pj4a2hMiIiJ612l10RQcHIzVq1dj7dq1MDMzQ3JyMpKTk/HkyRMAwLVr1zB9+nTExMQgMTER//zzD/r27YsWLVqgZs2aAAA/Pz94eXmhT58+OHPmDHbv3o2JEyciODhYOr02dOhQXL9+HePGjUN8fDwWL16MDRs2YPTo0aW270RERKRdtLpoWrJkCdLT09GqVSs4ODhIj/Xr1wMA9PX1ERkZCT8/P1SrVg1ffPEFunXrhq1bt0pj6OnpYdu2bdDT04OPjw8+/fRT9O3bF99++60U4+bmhu3btyMiIgK1atVCWFgYfv7552JvN0BERET/PVo9EVwI8dJ+Z2dnHDhw4JXjuLi4YMeOHS+NadWqFWJjY9XKj4iIiP47tPpIExEREZG2YNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYN
FEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNFEREREpAIWTUREREQqYNH0gkWLFsHV1RWGhoZo1KgRTpw4UdopERERkRZg0fSc9evXY8yYMZgyZQpOnz6NWrVqwd/fH6mpqaWdGhEREZUyFk3PmTdvHgYPHoz+/fvDy8sLS5cuhbGxMX799dfSTo2IiIhKWZnSTkBb5OTkICYmBhMmTJDadHV14evri+joaKX47OxsZGdnS8vp6ekAAIVCUSL5FeTnlMi4RO+DknrfvW2PC3JLOwUirVYS7/XCMYUQr4xl0fT/7t+/j/z8fNjZ2cna7ezsEB8frxQfGhqKadOmKbU7OzuXWI5EVDQLi+WlnQIRvQ0WFiU29OPHj2HxivFZNL2mCRMmYMyYMdJyQUEBHj58iPLly0NHR6cUM6OSplAo4OzsjH///Rfm5ualnQ4RlQC+z/87hBB4/PgxHB0dXxnLoun/WVtbQ09PDykpKbL2lJQU2NvbK8UbGBjAwMBA1mZpaVmSKZKWMTc354cp0XuO7/P/hlcdYSrEieD/T19fH/Xq1UNUVJTUVlBQgKioKPj4+JRiZkRERKQNeKTpOWPGjEFQUBDq16+Phg0bYsGCBcjMzET//v1LOzUiIiIqZSyanvPJJ5/g3r17mDx5MpKTk1G7dm3s2rVLaXI4/bcZGBhgypQpSqdniej9wfc5FUVHqHKNHREREdF/HOc0EREREamARRMRERGRClg0EREREamARRMRERGRClg00X9Sq1atMGrUKI2Pu3LlSpVucvrLL7/Az89P49tXlaurKxYsWFBs//3792Fra4tbt269vaSI3gNTp05F7dq1NT5uYmIidHR0EBcXp/GxSXUsmuiN9evXD507d1Zq379/P3R0dJCWliZbfvExceLEYsd2dXUtcp2ZM2eW0N6UvKdPn2LSpEmYMmUKgOL3sfDRr1+/t56jtbU1+vbtK+VI9Db069evyPfA1atXX3vMFz+HXrWt9u3bv+Fe0PuM92mity4hIUH2tQSmpqYvjf/2228xePBgWZuZmVmJ5PY2bNq0Cebm5mjatCkA4OTJk8jPzwcAHD16FN26dZM9R0ZGRmqNn5ubi7Jly75xnv3790e9evUwZ84cWFlZvfF4RKpo3749VqxYIWuzsbF5a9vifZnoZXikid46W1tb2NvbS49XFU1mZmayeHt7e5iYmAD433+Ru3fvRp06dWBkZIQ2bdogNTUVO3fuhKenJ8zNzdGrVy9kZWXJxs3Ly0NISAgsLCxgbW2NSZMm4fnblmVnZ+PLL79EhQoVYGJigkaNGmH//v2yMVauXImKFSvC2NgYXbp0wYMHD165/+vWrUOnTp2kZRsbG2m/CouT55+jtWvXwt3dHfr6+vDw8MDvv/8uG09HRwdLlizBBx98ABMTE8yYMQMAsHXrVjRo0ACGhoawtrZGly5dZOtlZWVhwIABMDMzQ8WKFbF8+XJZf/Xq1eHo6Ii//vrrlftEpCkGBgZK7/fw8HB4e3vDxMQEzs7O+Pzzz5GRkSGtc/PmTXTq1AnlypWDiYkJqlevjh07diAxMRGtW7cGAJQrV07pyG1R2ypXrpzUr6Ojg2XLlqFjx44wNjaGp6cnoqOjcfXqVbRq1QomJiZo0qQJrl27prQfy5Ytg7OzM4yNjdG9e3ekp6fL+n/++Wd4enrC0NAQ1apVw+LFi2X9J06cQJ06dWBoaIj69esjNjZWE08vvSlB9IaCgoLEhx9+qNS+b98+AUA8evSoyGVVuLi4iPnz5xfbXzhm48aNxeHDh8Xp06dF5cqVRcuWLYWfn584ffq0OHjwoChfvryYOXOmtF7Lli2FqampGDlypIiPjxerV68WxsbGYvny5VLMoEGDRJMmTcTBgwfF1atXxZw5c4SBgYG4fPmyEEKIY8eOCV1dXTFr1iyRkJAgwsPDhaWlpbCwsHjpPllYWIh169a9dH8Kn6
PNmzeLsmXLikWLFomEhAQRFhYm9PT0xN69e6V1AAhbW1vx66+/imvXrombN2+Kbdu2CT09PTF58mRx8eJFERcXJ77//nvZ82plZSUWLVokrly5IkJDQ4Wurq6Ij4+X5fPJJ5+IoKCgl+4PkaYU91kyf/58sXfvXnHjxg0RFRUlPDw8xLBhw6T+wMBA0a5dO3H27Flx7do1sXXrVnHgwAGRl5cn/vzzTwFAJCQkiLt374q0tLSXbut5AESFChXE+vXrRUJCgujcubNwdXUVbdq0Ebt27RIXL14UjRs3Fu3bt5fWmTJlijAxMRFt2rQRsbGx4sCBA6Jy5cqiV69eUszq1auFg4OD+PPPP8X169fFn3/+KaysrMTKlSuFEEI8fvxY2NjYiF69eonz58+LrVu3ikqVKgkAIjY29vWfYHpjLJrojQUFBQk9PT1hYmIiexgaGhZZNL0Yd//+/WLHdnFxEfr6+krrHDx4UDZmZGSktE5oaKgAIK5duya1ffbZZ8Lf319abtmypfD09BQFBQVS2/jx44Wnp6cQQoibN28KPT09cfv2bVk+bdu2FRMmTBBCCNGzZ0/RoUMHWf8nn3zy0qLp0aNHAoCU/4teLJqaNGkiBg8eLIv5+OOPZdsFIEaNGiWL8fHxEb179y42DxcXF/Hpp59KywUFBcLW1lYsWbJEFjd69GjRqlWrYsch0qSiPks++ugjpbiNGzeK8uXLS8ve3t5i6tSpRY5Z3D9rxX1uzZgxQ4oBICZOnCgtR0dHCwDil19+kdr++OMPYWhoKC1PmTJF6OnpiVu3bkltO3fuFLq6uuLu3btCCCHc3d3F2rVrZflMnz5d+Pj4CCGEWLZsmShfvrx48uSJ1L9kyRIWTVqAc5pII1q3bo0lS5bI2o4fP45PP/1UKfbQoUOyOUnPHw4vytixY5UmQ1eoUEG2XLNmTelnOzs7GBsbo1KlSrK2EydOyNZp3LgxdHR0pGUfHx+EhYUhPz8f586dQ35+PqpWrSpbJzs7G+XLlwcAXLp0SemUl4+PD3bt2lXsvjx58gQAYGhoWGzM8y5duoQhQ4bI2po2bYrw8HBZW/369WXLcXFxSvPAXvT8c6ajowN7e3ukpqbKYoyMjJROaxKVpBc/S0xMTBAZGYnQ0FDEx8dDoVAgLy8PT58+RVZWFoyNjTFixAgMGzYMe/bsga+vL7p16yZ7fau6LQBK8/de/GwBAG9vb1nb06dPoVAopHmIFStWlH1G+fj4oKCgAAkJCTAzM8O1a9cwcOBA2Xs0Ly8PFhYWAJ6972vWrCn7nPDx8Xnl/lDJY9FEGmFiYoLKlSvL2oq7XN3NzU2ly/ILWVtbK439oucnPuvo6ChNhNbR0UFBQYHK28zIyICenh5iYmKgp6cn63vVHKyXKV++PHR0dPDo0aPXHqMohXO8CqkyeVyV5+jhw4clNgmXqCgvfpYkJiaiY8eOGDZsGGbMmAErKyscPnwYAwcORE5ODoyNjTFo0CD4+/tj+/bt2LNnD0JDQxEWFobhw4erta2ivPjZUlybqp8vhXOxfvrpJzRq1EjW9+JnDWkfTgSn/6zjx4/Llo8dO4YqVapAT08PderUQX5+PlJTU1G5cmXZw97eHgDg6elZ5Bgvo6+vDy8vL1y8eFGlHD09PXHkyBFZ25EjR+Dl5fXS9WrWrImoqCiVtvEy58+fR506dd54HKLXFRMTg4KCAoSFhaFx48aoWrUq7ty5oxTn7OyMoUOHYvPmzfjiiy/w008/AXj2ngMgXaH6NiQlJclyPHbsGHR1deHh4QE7Ozs4Ojri+vXrSp8tbm5uAJ6978+ePYunT5/KxqDSxyNNpPUeP36M5ORkWZuxsbHstgWvIykpCWPGjMFnn32G06dPY+HChQgLCwMAVK1aFb1790bfvn0RFhaGOnXq4N69e4iKikLNmjURGBiIESNGoGnTppg7dy4+/PBD7N69+6Wn5gr5+/vj8OHDKt1cc+zYsejevT
vq1KkDX19fbN26FZs3b0ZkZORL15syZQratm0Ld3d39OjRA3l5edixYwfGjx+v0nMDPLu6LiYmBt9//73K6xBpWuXKlZGbm4uFCxeiU6dOOHLkCJYuXSqLGTVqFAICAlC1alU8evQI+/btg6enJwDAxcUFOjo62LZtGzp06AAjIyPpaHF2drbSZ0uZMmVgbW39RjkbGhoiKCgIc+fOhUKhwIgRI9C9e3fpH65p06ZhxIgRsLCwQPv27ZGdnY1Tp07h0aNHGDNmDHr16oVvvvkGgwcPxoQJE5CYmIi5c+e+UU6kGTzSRFpv8uTJcHBwkD3GjRv3xuP27dsXT548QcOGDREcHIyRI0fK5g+tWLECffv2xRdffAEPDw907twZJ0+eRMWKFQE8mxP1008/ITw8HLVq1cKePXteeqPOQgMHDsSOHTuULkEuSufOnREeHo65c+eievXqWLZsGVasWIFWrVq9dL1WrVph48aN+Oeff1C7dm20adNGaU7Xq/z999+oWLEimjdvrtZ6RJpUq1YtzJs3D7NmzUKNGjWwZs0ahIaGymLy8/MRHBwMT09PtG/fHlWrVpUu4a9QoQKmTZuGr776CnZ2dggJCZHW27Vrl9JnS7Nmzd4458qVK6Nr167o0KED/Pz8ULNmTdktBQYNGoSff/4ZK1asgLe3N1q2bImVK1dKR5pMTU2xdetWnDt3DnXq1ME333yDWbNmvXFe9OZ0hHjuxjRE9FZ8/PHHqFu3LiZMmFDaqRSrcePGGDFiBHr16lXaqRARaQUeaSIqBXPmzHmjCeUl7f79++jatSt69uxZ2qkQEWkNHmkiIiIiUgGPNBERERGpgEUTERERkQpYNBERERGpgEUTERERkQpYNBERERGpgEUTERERkQpYNBERERGpgEUTERERkQpYNBERERGp4P8ABYEtwr9ussAAAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def plot_character_per_second_comparison(\n", - " hf_stats: tuple[float, float, float], fst_stats: tuple[float, float, float], documents: list\n", - "):\n", - " # Calculating total characters in documents\n", - " total_characters = sum(len(doc) for doc in documents)\n", - "\n", - " # Calculating characters per second for each model\n", - " hf_chars_per_sec = total_characters / hf_stats[0] # Mean time is at index 0\n", - " fst_chars_per_sec = total_characters / fst_stats[0]\n", - "\n", - " # Plotting the bar chart\n", - " models = [\"HF Embed (Torch)\", \"FastEmbed\"]\n", - " chars_per_sec = [hf_chars_per_sec, fst_chars_per_sec]\n", - "\n", - " bars = plt.bar(models, chars_per_sec, color=[\"#1f356c\", \"#dd1f4b\"])\n", - " plt.ylabel(\"Characters per Second\")\n", - " plt.title(\"Characters Processed per Second Comparison\")\n", - "\n", - " # Adding the number at the top of each bar\n", - " for bar, chars in zip(bars, chars_per_sec):\n", - " plt.text(\n", - " bar.get_x() + bar.get_width() / 2,\n", - " bar.get_height(),\n", - " f\"{chars:.1f}\",\n", - " ha=\"center\",\n", - " va=\"bottom\",\n", - " color=\"#1f356c\",\n", - " fontsize=12,\n", - " )\n", - "\n", - " plt.show()\n", - "\n", - "\n", - "plot_character_per_second_comparison(hf_stats, fst_stats, documents)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Are the Embeddings the same?\n", - "\n", - "This is a very important question. Let's see if the embeddings are the same." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:43:25.537072Z", - "start_time": "2024-03-30T00:43:25.419184Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/b4/grpbcmrd36gc7q5_11whbn540000gn/T/ipykernel_14307/1958479940.py:8: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_new.cpp:278.)\n", - " calculate_cosine_similarity(hf.embed(documents), Tensor(list(embedding_model.embed(documents))))\n" - ] - }, - { - "data": { - "text/plain": [ - "0.9999992847442627" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def calculate_cosine_similarity(embeddings1: Tensor, embeddings2: Tensor) -> float:\n", - " \"\"\"\n", - " Calculate cosine similarity between two sets of embeddings\n", - " \"\"\"\n", - " return F.cosine_similarity(embeddings1, embeddings2).mean().item()\n", - "\n", - "\n", - "calculate_cosine_similarity(hf.embed(documents), Tensor(list(embedding_model.embed(documents))))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This indicates the embeddings are quite close to each with a cosine similarity of 0.99 for BAAI/bge-small-en and 0.92 for BAAI/bge-small-en-v1.5. This gives us confidence that the embeddings are the same and we are not sacrificing accuracy for speed." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fst", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/examples/Hindi_Tamil_RAG_with_Navarasa7B.ipynb b/docs/examples/Hindi_Tamil_RAG_with_Navarasa7B.ipynb deleted file mode 100644 index ef3cc2d78..000000000 --- a/docs/examples/Hindi_Tamil_RAG_with_Navarasa7B.ipynb +++ /dev/null @@ -1,1170 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "eNbQXYWNj2U9" - }, - "source": [ - "# Hindi and Tamil Question Answer / RAG\n", - "\n", - "In this notebook, we use new Navarasa LLMs from TeluguLLM to create a Hindi and Tamil Question Answering system. Since we're using a 7B model with PEFT, this notebook is run on Google Colab with an A100. If you're working with a smaller machine, I'd encourage to try the 2B model instead." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| Time: 25 min | Level: Beginner | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-CPpToBFPTqIYohxERhRFEU5FNP_vgbQ?usp=sharing) |\n", - "| --- | ----------- | ----------- |\n", - "| Author | [Nirant Kasliwal](https://nirantk.com/about/) | " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "rOTVBRFAj2U-" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - } - ], - "source": [ - "!pip install -U fastembed datasets qdrant-client peft transformers accelerate bitsandbytes -qq" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:45:24.814968Z", - "start_time": "2024-03-30T00:45:24.811138Z" - }, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "aAfdPG15j2U_", - "outputId": "c457a5dd-bccb-4b56-8c7f-878dfc94884d" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from datasets import load_dataset\n", - "from peft import AutoPeftModelForCausalLM\n", - "from qdrant_client import QdrantClient\n", - "from qdrant_client.models import PointStruct, VectorParams, Distance\n", - "from transformers import AutoTokenizer\n", - "\n", - "from fastembed import TextEmbedding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "hf_token = \"\" # Get your token from https://huggingface.co/settings/token, needed for Gemma weights" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T4f74a_gj2U_" - }, - "source": [ - "## Setting Up\n", - "\n", - "We'll download the dataset, our LLM model weights and embedding model weights next" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1xh1P9z5kAxd" - }, - "outputs": [], - "source": [ - "embedding_model = \"sentence-transformers/paraphrase-multilingual-mpnet-base-v2\"\n", - "model_id = \"Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CpCX7dOlj2VA" - }, - "outputs": [], - "source": [ - "ds = 
load_dataset(\"nirantk/chaii-hindi-and-tamil-question-answering\", split=\"train\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "x-AkWw420GgW", - "outputId": "460de1e2-aa4d-4dde-cf1d-abb38288bc97" - }, - "outputs": [], - "source": [ - "ds" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wx1yWuc9zHwg" - }, - "source": [ - "This dataset has questions and contexts which have corresponding answers. The answers must be found by the LLM. This is an extractive Question Answering problem.\n", - "\n", - "In order to do this, we'll setup an embedding model from FastEmbed. And then add it to Qdrant in memory mode, which is powered by Numpy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49, - "referenced_widgets": [ - "8dbaa1a6fdcf4b4d805a94c8cce564eb", - "4b89c0d412a64dfd80d82fd818977702", - "a2a21109220d407095b8a3e916424365", - "1d5b6d1ec1154c3ebaf98d62be90f967", - "16d89c278bc34208ab76983db54e6465", - "324c73c2bb464f919314f5a40a1e89e7", - "b304cbb2fa554d909441b51c3a4682ae", - "67ba7e82e98c4408b96bba4fe07e39a3", - "9a63c76313274dd1b8aabf6470f9fa32", - "11e7c0fb108d41cd938dada314157342", - "f4539e2f491a47ae937b59911ea69d8e" - ] - }, - "id": "e1FzPDS6j2VA", - "outputId": "3d69a566-0c67-45f9-b768-e0c498863d95" - }, - "outputs": [], - "source": [ - "embedding_model = TextEmbedding(model_name=embedding_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vg2WgC7_zpfv" - }, - "source": [ - "We'll use the 7B model here, the 2B model isn't great and was suffering from reading comprehension challenges." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Downloading the Navarasa LLM\n", - "\n", - "We'll download the Navarasa LLM from TeluguLLM-Labs. This is a 7B model with PEFT." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49, - "referenced_widgets": [ - "d5a4726255f24bef914841e541a03a30", - "f1371ea81d6e4930ad3712e370bf1299", - "6e5278269f3c4723be26a03baf12d613", - "27cc073f21404a38b58db04974458b5e", - "503d80fa1c3d4e1db0fbc8b2ab3004a5", - "9e4a0a7cb9bc4132b562691e657dc5ab", - "7a255a55ad3b4cc49470285073addaba", - "cefda21f046e444a8168129e3dc869af", - "15548e2974c749659bc2b8502906a8e4", - "296e600afb144a298d3e2223973e58f9", - "7cfa648b10c2430d974ecc053d823958" - ] - }, - "id": "zst575Z6j2VA", - "outputId": "9237546c-7ca0-479b-9cf9-4396ccc15dc7" - }, - "outputs": [], - "source": [ - "model = AutoPeftModelForCausalLM.from_pretrained(\n", - " model_id,\n", - " load_in_4bit=False,\n", - " token=hf_token,\n", - ")\n", - "tokenizer = AutoTokenizer.from_pretrained(model_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LWVHPXIFzzgW" - }, - "source": [ - "## Embed the Context into Vectors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "questions, contexts = list(ds[\"question\"]), list(ds[\"context\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5yjBVH3Yj2VA" - }, - "outputs": [], - "source": [ - "context_embeddings: list[np.ndarray] = list(\n", - " embedding_model.embed(contexts)\n", - ") # Note the list() call - this is a generator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "X3qnhlyll3dT", - "outputId": "b496f5b4-219a-45fe-8df4-8869d9c6c4e0" - }, - "outputs": [], - "source": [ - "len(context_embeddings[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Sbe7HN6Qmnu7" - }, - "outputs": [], - "source": [ - "def embed_text(text: str) -> np.array:\n", - " return 
list(embedding_model.embed(text))[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lIxvYWiQj2VA" - }, - "outputs": [], - "source": [ - "context_points = [\n", - " PointStruct(id=idx, vector=emb, payload={\"text\": text})\n", - " for idx, (emb, text) in enumerate(zip(context_embeddings, contexts))\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QvuRKZCSmJTc", - "outputId": "bd528f49-b3af-4da5-9a80-124b8ce93b80" - }, - "outputs": [], - "source": [ - "len(context_points[0].vector)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nzwBvJ59zxAc" - }, - "source": [ - "## Insert into Qdrant" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TkfZxG1gkQ_k", - "outputId": "5c190152-ee40-487c-8aef-858cd5bdccf9" - }, - "outputs": [], - "source": [ - "search_client = QdrantClient(\":memory:\")\n", - "\n", - "search_client.create_collection(\n", - " collection_name=\"hindi_tamil_contexts\",\n", - " vectors_config=VectorParams(size=len(context_points[0].vector), distance=Distance.COSINE),\n", - ")\n", - "search_client.upsert(collection_name=\"hindi_tamil_contexts\", points=context_points)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Selecting a Question\n", - "\n", - "I've randomly selected a question here, with a specific and we then find the answer to it. We have the correct answer for it too -- so we can compare the two when you run the code." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ORNuYzuOm2tH", - "outputId": "bddb668c-434f-43ad-f773-9bbdc5383612" - }, - "outputs": [], - "source": [ - "idx = 997\n", - "\n", - "question = questions[idx]\n", - "print(question)\n", - "search_context = search_client.search(\n", - " query_vector=embed_text(question), collection_name=\"hindi_tamil_contexts\", limit=2\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "F5TDUjpTvtnX", - "outputId": "1105b324-b698-4b07-9af5-12394e69fb6e" - }, - "outputs": [], - "source": [ - "search_context_text = search_context[0].payload[\"text\"]\n", - "len(search_context_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the Model with a Question & Context" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FwT2njapj2VA" - }, - "outputs": [], - "source": [ - "input_prompt = \"\"\"\n", - "Answer the following question based on the context given after it in the same language as the question:\n", - "### Question:\n", - "{}\n", - "\n", - "### Context:\n", - "{}\n", - "\n", - "### Answer:\n", - "{}\"\"\"\n", - "\n", - "input_text = input_prompt.format(\n", - " questions[idx], # question\n", - " search_context_text[:2000], # context\n", - " \"\", # output - leave this blank for generation!\n", - ")\n", - "\n", - "inputs = tokenizer([input_text], return_tensors=\"pt\")\n", - "\n", - "outputs = model.generate(**inputs, max_new_tokens=50, use_cache=True)\n", - "response = tokenizer.batch_decode(outputs)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "jXw3aILgvMdp", - "outputId": "fe73260c-26c4-49c9-819b-24aa50c090ad" - }, - "outputs": [], - "source": 
[ - "response.split(sep=\"### Answer:\")[-1].strip(\"\").strip()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "DKeqAZEm0C0u", - "outputId": "44780197-c7fa-496e-e166-3512321f527d" - }, - "outputs": [], - "source": [ - "ds[idx][\"answer_text\"]" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "A100", - "machine_shape": "hm", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "11e7c0fb108d41cd938dada314157342": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - 
"order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "15548e2974c749659bc2b8502906a8e4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "16d89c278bc34208ab76983db54e6465": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1d5b6d1ec1154c3ebaf98d62be90f967": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_11e7c0fb108d41cd938dada314157342", - "placeholder": "โ€‹", - "style": "IPY_MODEL_f4539e2f491a47ae937b59911ea69d8e", - "value": "โ€‡9/9โ€‡[00:00<00:00,โ€‡757.75it/s]" - } - }, - "27cc073f21404a38b58db04974458b5e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_296e600afb144a298d3e2223973e58f9", - "placeholder": "โ€‹", - "style": "IPY_MODEL_7cfa648b10c2430d974ecc053d823958", - "value": "โ€‡4/4โ€‡[00:50<00:00,โ€‡10.44s/it]" - } - }, - "296e600afb144a298d3e2223973e58f9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - 
"grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "324c73c2bb464f919314f5a40a1e89e7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4b89c0d412a64dfd80d82fd818977702": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_324c73c2bb464f919314f5a40a1e89e7", - "placeholder": "โ€‹", - "style": "IPY_MODEL_b304cbb2fa554d909441b51c3a4682ae", - "value": "Fetchingโ€‡9โ€‡files:โ€‡100%" - } - }, - "503d80fa1c3d4e1db0fbc8b2ab3004a5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "67ba7e82e98c4408b96bba4fe07e39a3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6e5278269f3c4723be26a03baf12d613": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_cefda21f046e444a8168129e3dc869af", - "max": 4, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_15548e2974c749659bc2b8502906a8e4", - "value": 4 - } - }, - "7a255a55ad3b4cc49470285073addaba": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7cfa648b10c2430d974ecc053d823958": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8dbaa1a6fdcf4b4d805a94c8cce564eb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_4b89c0d412a64dfd80d82fd818977702", - "IPY_MODEL_a2a21109220d407095b8a3e916424365", - "IPY_MODEL_1d5b6d1ec1154c3ebaf98d62be90f967" - ], - "layout": "IPY_MODEL_16d89c278bc34208ab76983db54e6465" - } - }, - "9a63c76313274dd1b8aabf6470f9fa32": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9e4a0a7cb9bc4132b562691e657dc5ab": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", 
- "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a2a21109220d407095b8a3e916424365": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_67ba7e82e98c4408b96bba4fe07e39a3", - "max": 9, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9a63c76313274dd1b8aabf6470f9fa32", - "value": 9 - } - }, - "b304cbb2fa554d909441b51c3a4682ae": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "cefda21f046e444a8168129e3dc869af": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d5a4726255f24bef914841e541a03a30": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - 
"IPY_MODEL_f1371ea81d6e4930ad3712e370bf1299", - "IPY_MODEL_6e5278269f3c4723be26a03baf12d613", - "IPY_MODEL_27cc073f21404a38b58db04974458b5e" - ], - "layout": "IPY_MODEL_503d80fa1c3d4e1db0fbc8b2ab3004a5" - } - }, - "f1371ea81d6e4930ad3712e370bf1299": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9e4a0a7cb9bc4132b562691e657dc5ab", - "placeholder": "โ€‹", - "style": "IPY_MODEL_7a255a55ad3b4cc49470285073addaba", - "value": "Loadingโ€‡checkpointโ€‡shards:โ€‡100%" - } - }, - "f4539e2f491a47ae937b59911ea69d8e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/examples/Hybrid_Search.ipynb b/docs/examples/Hybrid_Search.ipynb deleted file mode 100644 index 95182d816..000000000 --- a/docs/examples/Hybrid_Search.ipynb +++ /dev/null @@ -1,1257 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Hybrid Search with FastEmbed & Qdrant\n", - "\n", - "Author: [Nirant Kasliwal](https://twitter.com/nirantk)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## What will we do?\n", - "This notebook demonstrates the usage of Hybrid Search with FastEmbed & Qdrant. 
\n", - "\n", - "1. Setup: Download and install the required dependencies\n", - "2. Preview data: Load and preview the data\n", - "3. Create Sparse Embeddings: Create SPLADE++ embeddings for the data\n", - "4. Create Dense Embeddings: Create BGE-Large-en-v1.5 embeddings for the data\n", - "5. Indexing: Index the embeddings using Qdrant\n", - "6. Search: Perform Hybrid Search using FastEmbed & Qdrant\n", - "7. Ranking: Rank the search results with Reciprocal Rank Fusion (RRF)\n", - "\n", - "## Setup\n", - "\n", - "In order to get started, you need a few dependencies, and we'll install them next:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -qU qdrant-client fastembed datasets transformers" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:46:59.293555Z", - "start_time": "2024-03-30T00:46:59.285419Z" - } - }, - "outputs": [], - "source": [ - "import json\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "from datasets import load_dataset\n", - "from qdrant_client import QdrantClient\n", - "from qdrant_client.models import (\n", - " Distance,\n", - " NamedSparseVector,\n", - " NamedVector,\n", - " SparseVector,\n", - " PointStruct,\n", - " SearchRequest,\n", - " SparseIndexParams,\n", - " SparseVectorParams,\n", - " VectorParams,\n", - " ScoredPoint,\n", - ")\n", - "from transformers import AutoTokenizer\n", - "\n", - "import fastembed\n", - "from fastembed import SparseEmbedding, SparseTextEmbedding, TextEmbedding" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.2.5'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fastembed.__version__ # 0.2.5" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": 
"2024-03-30T00:47:06.815264Z", - "start_time": "2024-03-30T00:47:00.149649Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset({\n", - " features: ['example_id', 'query', 'query_id', 'product_id', 'product_locale', 'esci_label', 'small_version', 'large_version', 'product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color', 'product_text'],\n", - " num_rows: 919\n", - "})" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset = load_dataset(\"tasksource/esci\", split=\"train\")\n", - "# We'll select the first 1000 examples for this demo\n", - "dataset = dataset.select(range(1000))\n", - "dataset = dataset.filter(lambda x: x[\"product_locale\"] == \"us\")\n", - "dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preview Data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:47:06.831216Z", - "start_time": "2024-03-30T00:47:06.810740Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
example_idqueryquery_idproduct_idproduct_localeesci_labelsmall_versionlarge_versionproduct_titleproduct_descriptionproduct_bullet_pointproduct_brandproduct_colorproduct_text
00revent 80 cfm0B000MOO21WusIrrelevant01Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...NoneWhisperCeiling fans feature a totally enclosed...PanasonicWhitePanasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...
21revent 80 cfm0B07X3Y6B1VusExact01Homewerks 7141-80 Bathroom Fan Integrated LED ...NoneOUTSTANDING PERFORMANCE: This Homewerk's bath ...Homewerks80 CFMHomewerks 7141-80 Bathroom Fan Integrated LED ...
32revent 80 cfm0B07WDM7MQQusExact01Homewerks 7140-80 Bathroom Fan Ceiling Mount E...NoneOUTSTANDING PERFORMANCE: This Homewerk's bath ...HomewerksWhiteHomewerks 7140-80 Bathroom Fan Ceiling Mount E...
43revent 80 cfm0B07RH6Z8KWusExact01Delta Electronics RAD80L BreezRadiance 80 CFM ...This pre-owned or refurbished product has been...Quiet operation at 1.5 sones\\nBuilt-in thermos...DELTA ELECTRONICS (AMERICAS) LTD.WhiteDelta Electronics RAD80L BreezRadiance 80 CFM ...
54revent 80 cfm0B07QJ7WYFQusExact01Panasonic FV-08VRE2 Ventilation Fan with Reces...NoneThe design solution for Fan/light combinations...PanasonicWhitePanasonic FV-08VRE2 Ventilation Fan with Reces...
\n", - "
" - ], - "text/plain": [ - " example_id query query_id product_id product_locale \\\n", - "0 0 revent 80 cfm 0 B000MOO21W us \n", - "2 1 revent 80 cfm 0 B07X3Y6B1V us \n", - "3 2 revent 80 cfm 0 B07WDM7MQQ us \n", - "4 3 revent 80 cfm 0 B07RH6Z8KW us \n", - "5 4 revent 80 cfm 0 B07QJ7WYFQ us \n", - "\n", - " esci_label small_version large_version \\\n", - "0 Irrelevant 0 1 \n", - "2 Exact 0 1 \n", - "3 Exact 0 1 \n", - "4 Exact 0 1 \n", - "5 Exact 0 1 \n", - "\n", - " product_title \\\n", - "0 Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil... \n", - "2 Homewerks 7141-80 Bathroom Fan Integrated LED ... \n", - "3 Homewerks 7140-80 Bathroom Fan Ceiling Mount E... \n", - "4 Delta Electronics RAD80L BreezRadiance 80 CFM ... \n", - "5 Panasonic FV-08VRE2 Ventilation Fan with Reces... \n", - "\n", - " product_description \\\n", - "0 None \n", - "2 None \n", - "3 None \n", - "4 This pre-owned or refurbished product has been... \n", - "5 None \n", - "\n", - " product_bullet_point \\\n", - "0 WhisperCeiling fans feature a totally enclosed... \n", - "2 OUTSTANDING PERFORMANCE: This Homewerk's bath ... \n", - "3 OUTSTANDING PERFORMANCE: This Homewerk's bath ... \n", - "4 Quiet operation at 1.5 sones\\nBuilt-in thermos... \n", - "5 The design solution for Fan/light combinations... \n", - "\n", - " product_brand product_color \\\n", - "0 Panasonic White \n", - "2 Homewerks 80 CFM \n", - "3 Homewerks White \n", - "4 DELTA ELECTRONICS (AMERICAS) LTD. White \n", - "5 Panasonic White \n", - "\n", - " product_text \n", - "0 Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil... \n", - "2 Homewerks 7141-80 Bathroom Fan Integrated LED ... \n", - "3 Homewerks 7140-80 Bathroom Fan Ceiling Mount E... \n", - "4 Delta Electronics RAD80L BreezRadiance 80 CFM ... \n", - "5 Panasonic FV-08VRE2 Ventilation Fan with Reces... 
" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "source_df = dataset.to_pandas()\n", - "df = source_df.drop_duplicates(\n", - " subset=[\"product_text\", \"product_title\", \"product_bullet_point\", \"product_brand\"]\n", - ")\n", - "df = df.dropna(subset=[\"product_text\", \"product_title\", \"product_bullet_point\", \"product_brand\"])\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:47:06.842492Z", - "start_time": "2024-03-30T00:47:06.831564Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Catalog Item Count: 176\n", - "Queries: 919\n" - ] - } - ], - "source": [ - "print(f\"Catalog Item Count: {len(df)}\\nQueries: {len(source_df)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:47:06.843214Z", - "start_time": "2024-03-30T00:47:06.835501Z" - } - }, - "outputs": [], - "source": [ - "df[\"combined_text\"] = (\n", - " df[\"product_title\"] + \"\\n\" + df[\"product_text\"] + \"\\n\" + df[\"product_bullet_point\"]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:47:06.843675Z", - "start_time": "2024-03-30T00:47:06.837385Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "176" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Sparse Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:47:08.983795Z", - "start_time": "2024-03-30T00:47:06.839334Z" - } - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": 
"930a97b272324022a4ce1a2ff7637c53", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 9 files: 0%| | 0/9 [00:00 list[SparseEmbedding]:\n", - " return list(sparse_model.embed(texts, batch_size=32))\n", - "\n", - "\n", - "sparse_embedding: list[SparseEmbedding] = make_sparse_embedding(\n", - " [\"Fastembed is a great library for text embeddings!\"]\n", - ")\n", - "sparse_embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The previous output is a SparseEmbedding object for the first document in our list.\n", - "\n", - "It contains two arrays: values and indices. \n", - "- The 'values' array represents the weights of the features (tokens) in the document.\n", - "- The 'indices' array represents the indices of these features in the model's vocabulary.\n", - "\n", - "Each pair of corresponding values and indices represents a token and its weight in the document." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is still a little abstract, so let's use the tokenizer vocab to make sense of these indices." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:47:12.171596Z", - "start_time": "2024-03-30T00:47:12.166737Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'model': 'prithvida/Splade_PP_en_v1',\n", - " 'vocab_size': 30522,\n", - " 'description': 'Misspelled version of the model. Retained for backward compatibility. 
Independent Implementation of SPLADE++ Model for English',\n", - " 'size_in_GB': 0.532,\n", - " 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'}},\n", - " {'model': 'prithivida/Splade_PP_en_v1',\n", - " 'vocab_size': 30522,\n", - " 'description': 'Independent Implementation of SPLADE++ Model for English',\n", - " 'size_in_GB': 0.532,\n", - " 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'}}]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SparseTextEmbedding.list_supported_models()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"fast\": 2.5722568035125732,\n", - " \"##bed\": 2.1076643466949463,\n", - " \"##em\": 1.9767478704452515,\n", - " \"text\": 1.8717442750930786,\n", - " \"em\": 1.6925323009490967,\n", - " \"library\": 1.4840331077575684,\n", - " \"##ding\": 1.2125115394592285,\n", - " \"bed\": 1.2047640085220337,\n", - " \"good\": 0.9082595109939575,\n", - " \"librarian\": 0.9047954082489014,\n", - " \"is\": 0.8048465847969055,\n", - " \"software\": 0.7071243524551392,\n", - " \"format\": 0.6728256344795227,\n", - " \"great\": 0.613731324672699,\n", - " \"texts\": 0.5162659287452698,\n", - " \"quick\": 0.49152234196662903,\n", - " \"device\": 0.4451214075088501,\n", - " \"file\": 0.44369080662727356,\n", - " \"for\": 0.4135623872280121,\n", - " \"best\": 0.38512513041496277,\n", - " \"technique\": 0.36704862117767334,\n", - " \"facility\": 0.2912774682044983,\n", - " \"method\": 0.26381176710128784,\n", - " \"ideal\": 0.26357391476631165,\n", - " \"perfect\": 0.2541669011116028,\n", - " \"##bing\": 0.24062614142894745,\n", - " \"material\": 0.23214206099510193,\n", - " \"storage\": 0.21676106750965118,\n", - " \"tool\": 0.21145488321781158,\n", - " \"nice\": 0.20749981701374054,\n", - " \"web\": 0.19741220772266388,\n", - " \"architecture\": 0.1926720291376114,\n", - " \"##b\": 
0.18546289205551147,\n", - " \"book\": 0.18313883244991302,\n", - " \"a\": 0.17295610904693604,\n", - " \"speed\": 0.17008088529109955,\n", - " \"##am\": 0.1678074449300766,\n", - " \"##ization\": 0.16315481066703796,\n", - " \"browser\": 0.11985089629888535,\n", - " \"##ogen\": 0.10030396282672882,\n", - " \"database\": 0.09790635108947754,\n", - " \"connection\": 0.09682106971740723,\n", - " \"excellent\": 0.0670650377869606,\n", - " \"computer\": 0.06487759202718735,\n", - " \"java\": 0.055845409631729126,\n", - " \"algorithm\": 0.051508933305740356,\n", - " \"program\": 0.04257776960730553,\n", - " \"wonderful\": 0.00856015458703041\n", - "}\n" - ] - } - ], - "source": [ - "def get_tokens_and_weights(sparse_embedding, model_name) -> dict[str, float]:\n", - " # Find the tokenizer for the model\n", - " tokenizer_source = None\n", - " for model_info in SparseTextEmbedding.list_supported_models():\n", - " if model_info[\"model\"].lower() == model_name.lower():\n", - " tokenizer_source = model_info[\"sources\"][\"hf\"]\n", - " break\n", - " else:\n", - " raise ValueError(f\"Model {model_name} not found in the supported models.\")\n", - "\n", - " tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)\n", - " token_weight_dict: dict[str, float] = {}\n", - " for i in range(len(sparse_embedding.indices)):\n", - " token = tokenizer.decode([sparse_embedding.indices[i]])\n", - " weight = sparse_embedding.values[i]\n", - " token_weight_dict[token] = weight\n", - "\n", - " # Sort the dictionary by weights\n", - " token_weight_dict = dict(\n", - " sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)\n", - " )\n", - " return token_weight_dict\n", - "\n", - "\n", - "# Test the function with the first SparseEmbedding\n", - "print(json.dumps(get_tokens_and_weights(sparse_embedding[0], sparse_model_name), indent=4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Dense Embeddings" - ] - }, - { - "cell_type": "code", - 
"execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:11.018700Z", - "start_time": "2024-03-30T00:48:10.975766Z" - } - }, - "outputs": [], - "source": [ - "def make_dense_embedding(texts: list[str]):\n", - " return list(dense_model.embed(texts))\n", - "\n", - "\n", - "dense_embedding = make_dense_embedding([\"Fastembed is a great library for text embeddings!\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:12.418869Z", - "start_time": "2024-03-30T00:48:12.413593Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(1024,)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dense_embedding[0].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:13.789029Z", - "start_time": "2024-03-30T00:48:13.777529Z" - } - }, - "outputs": [], - "source": [ - "product_texts = df[\"combined_text\"].tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 5min 57s, sys: 22 s, total: 6min 19s\n", - "Wall time: 1min 37s\n" - ] - } - ], - "source": [ - "%%time\n", - "df[\"sparse_embedding\"] = make_sparse_embedding(product_texts)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that FastEmbed uses data parallelism to speed up the embedding generation process. \n", - "\n", - "This improves throughput and reduces the time it takes to generate embeddings for large datasets. \n", - "\n", - "For our small dataset here, on my local machine -- it reduces the time from user's 6 min 15 seconds to a wall time of about 3 min 6 seconds, or about 2x faster. 
This is a function of the number of CPU cores available on the machine, CPU usage and other factors -- so your mileage may vary." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 SparseEmbedding(values=array([0.06509431, 0.57...\n", - "2 SparseEmbedding(values=array([0.10595927, 0.20...\n", - "3 SparseEmbedding(values=array([0.1140037 , 0.02...\n", - "4 SparseEmbedding(values=array([6.13510251e-01, ...\n", - "5 SparseEmbedding(values=array([0.90058267, 0.12...\n", - " ... \n", - "780 SparseEmbedding(values=array([5.56782305e-01, ...\n", - "809 SparseEmbedding(values=array([0.38585788, 0.44...\n", - "828 SparseEmbedding(values=array([3.27695787e-01, ...\n", - "867 SparseEmbedding(values=array([0.36255798, 0.74...\n", - "870 SparseEmbedding(values=array([3.74321818e-01, ...\n", - "Name: sparse_embedding, Length: 176, dtype: object" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"sparse_embedding\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 15min 51s, sys: 31.7 s, total: 16min 23s\n", - "Wall time: 3min\n" - ] - } - ], - "source": [ - "%%time\n", - "df[\"dense_embedding\"] = make_dense_embedding(product_texts)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Indexing" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:19.041998Z", - "start_time": "2024-03-30T00:48:19.036147Z" - } - }, - "outputs": [], - "source": [ - "client = QdrantClient(\":memory:\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### About Qdrant\n", - "\n", - "Qdrant is a vector similarity search engine that allows you to index and search high-dimensional vectors. 
It supports both sparse and dense embeddings, and it's a great tool for building search engines. \n", - "\n", - "Here, we use the memory mode which is Numpy under the hood for demonstration purposes. In production, you can use the Docker or Cloud for full DB support." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:23.670884Z", - "start_time": "2024-03-30T00:48:23.661594Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "collection_name = \"esci\"\n", - "client.create_collection(\n", - " collection_name,\n", - " vectors_config={\n", - " \"text-dense\": VectorParams(\n", - " size=1024, # OpenAI Embeddings\n", - " distance=Distance.COSINE,\n", - " )\n", - " },\n", - " sparse_vectors_config={\n", - " \"text-sparse\": SparseVectorParams(\n", - " index=SparseIndexParams(\n", - " on_disk=False,\n", - " )\n", - " )\n", - " },\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:26.252861Z", - "start_time": "2024-03-30T00:48:25.724035Z" - } - }, - "outputs": [], - "source": [ - "def make_points(df: pd.DataFrame) -> list[PointStruct]:\n", - " sparse_vectors = df[\"sparse_embedding\"].tolist()\n", - " product_texts = df[\"combined_text\"].tolist()\n", - " dense_vectors = df[\"dense_embedding\"].tolist()\n", - " rows = df.to_dict(orient=\"records\")\n", - " points = []\n", - " for idx, (text, sparse_vector, dense_vector) in enumerate(\n", - " zip(product_texts, sparse_vectors, dense_vectors)\n", - " ):\n", - " sparse_vector = SparseVector(\n", - " indices=sparse_vector.indices.tolist(), values=sparse_vector.values.tolist()\n", - " )\n", - " point = PointStruct(\n", - " id=idx,\n", - " payload={\n", - " \"text\": text,\n", - " \"product_id\": rows[idx][\"product_id\"],\n", - " }, # Add 
any additional payload if necessary\n", - " vector={\n", - " \"text-sparse\": sparse_vector,\n", - " \"text-dense\": dense_vector.tolist(),\n", - " },\n", - " )\n", - " points.append(point)\n", - " return points\n", - "\n", - "\n", - "points: list[PointStruct] = make_points(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "UpdateResult(operation_id=0, status=)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "client.upsert(collection_name, points)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Search" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:48.298878Z", - "start_time": "2024-03-30T00:48:48.256591Z" - } - }, - "outputs": [], - "source": [ - "def search(query_text: str):\n", - " # # Compute sparse and dense vectors\n", - " query_sparse_vectors: list[SparseEmbedding] = make_sparse_embedding([query_text])\n", - " query_dense_vector: list[np.ndarray] = make_dense_embedding([query_text])\n", - "\n", - " search_results = client.search_batch(\n", - " collection_name=collection_name,\n", - " requests=[\n", - " SearchRequest(\n", - " vector=NamedVector(\n", - " name=\"text-dense\",\n", - " vector=query_dense_vector[0].tolist(),\n", - " ),\n", - " limit=10,\n", - " with_payload=True,\n", - " ),\n", - " SearchRequest(\n", - " vector=NamedSparseVector(\n", - " name=\"text-sparse\",\n", - " vector=SparseVector(\n", - " indices=query_sparse_vectors[0].indices.tolist(),\n", - " values=query_sparse_vectors[0].values.tolist(),\n", - " ),\n", - " ),\n", - " limit=10,\n", - " with_payload=True,\n", - " ),\n", - " ],\n", - " )\n", - "\n", - " return search_results\n", - "\n", - "\n", - "query_text = \" revent 80 cfm\"\n", - "search_results = search(query_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "## Ranking\n", - "\n", - "We'll combine the results from the two models using Reciprocal Rank Fusion (RRF). You can read more about RRF [here](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf).\n", - "\n", - "We select RRF for this task because:\n", - "1. It is a simple and effective method for combining search results.\n", - "2. It is robust to the differences in the ranking scores of the two or more ranking lists.\n", - "3. It is easy to implement and requires minimal tuning (only one parameter: alpha, which we don't tune here)." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:48:53.075137Z", - "start_time": "2024-03-30T00:48:53.059828Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('A', 0.033465871107430434),\n", - " ('B', 0.033465871107430434),\n", - " ('D', 0.03320985472238179),\n", - " ('C', 0.03294544435749548),\n", - " ('E', 0.01775980832584606)]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def rrf(rank_lists, alpha=60, default_rank=1000):\n", - " \"\"\"\n", - " Optimized Reciprocal Rank Fusion (RRF) using NumPy for large rank lists.\n", - "\n", - " :param rank_lists: A list of rank lists. Each rank list should be a list of (item, rank) tuples.\n", - " :param alpha: The parameter alpha used in the RRF formula. Default is 60.\n", - " :param default_rank: The default rank assigned to items not present in a rank list. 
Default is 1000.\n", - " :return: Sorted list of items based on their RRF scores.\n", - " \"\"\"\n", - " # Consolidate all unique items from all rank lists\n", - " all_items = set(item for rank_list in rank_lists for item, _ in rank_list)\n", - "\n", - " # Create a mapping of items to indices\n", - " item_to_index = {item: idx for idx, item in enumerate(all_items)}\n", - "\n", - " # Initialize a matrix to hold the ranks, filled with the default rank\n", - " rank_matrix = np.full((len(all_items), len(rank_lists)), default_rank)\n", - "\n", - " # Fill in the actual ranks from the rank lists\n", - " for list_idx, rank_list in enumerate(rank_lists):\n", - " for item, rank in rank_list:\n", - " rank_matrix[item_to_index[item], list_idx] = rank\n", - "\n", - " # Calculate RRF scores using NumPy operations\n", - " rrf_scores = np.sum(1.0 / (alpha + rank_matrix), axis=1)\n", - "\n", - " # Sort items based on RRF scores\n", - " sorted_indices = np.argsort(-rrf_scores) # Negative for descending order\n", - "\n", - " # Retrieve sorted items\n", - " sorted_items = [(list(item_to_index.keys())[idx], rrf_scores[idx]) for idx in sorted_indices]\n", - "\n", - " return sorted_items\n", - "\n", - "\n", - "# Example usage\n", - "rank_list1 = [(\"A\", 1), (\"B\", 2), (\"C\", 3)]\n", - "rank_list2 = [(\"B\", 1), (\"C\", 2), (\"D\", 3)]\n", - "rank_list3 = [(\"A\", 2), (\"D\", 1), (\"E\", 3)]\n", - "\n", - "# Combine the rank lists\n", - "sorted_items = rrf([rank_list1, rank_list2, rank_list3])\n", - "sorted_items" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Based on this, let's convert our sparse and dense results into rank lists. And then, we'll use the Reciprocal Rank Fusion (RRF) algorithm to combine them." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "def rank_list(search_result: list[ScoredPoint]):\n", - " return [(point.id, rank + 1) for rank, point in enumerate(search_result)]\n", - "\n", - "\n", - "dense_rank_list, sparse_rank_list = rank_list(search_results[0]), rank_list(search_results[1])" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "rrf_rank_list = rrf([dense_rank_list, sparse_rank_list])" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(3, 0.032018442622950824),\n", - " (8, 0.03149801587301587),\n", - " (1, 0.03131881575727918),\n", - " (13, 0.030834914611005692),\n", - " (15, 0.030536130536130537),\n", - " (9, 0.030309988518943745),\n", - " (12, 0.030158730158730156),\n", - " (14, 0.029437229437229435),\n", - " (11, 0.028985507246376812),\n", - " (2, 0.01707242848447961),\n", - " (4, 0.01564927857935627)]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rrf_rank_list" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Record(id=3, payload={'text': 'Delta Electronics RAD80L BreezRadiance 80 CFM Heater/Fan/Light Combo White (Renewed)\\nDelta Electronics RAD80L BreezRadiance 80 CFM Heater/Fan/Light Combo White (Renewed)\\nDELTA ELECTRONICS (AMERICAS) LTD.\\nWhite\\nThis pre-owned or refurbished product has been professionally inspected and tested to work and look like new. How a product becomes part of Amazon Renewed, your destination for pre-owned, refurbished products: A customer buys a new product and returns it or trades it in for a newer or different model. That product is inspected and tested to work and look like new by Amazon-qualified suppliers. 
Then, the product is sold as an Amazon Renewed product on Amazon. If not satisfied with the purchase, renewed products are eligible for replacement or refund under the Amazon Renewed Guarantee.\\nQuiet operation at 1.5 sones\\nBuilt-in thermostat regulates temperature. Energy efficiency at 7.6 CFM/Watt\\nPrecision engineered with DC brushless motor for extended reliability, this fan will outlast many household appliances\\nGalvanized steel construction resists corrosion\\nDuct: Detachable 4-inch Plastic Duct Adapter\\nQuiet operation at 1.5 sones\\nBuilt-in thermostat regulates temperature. Energy efficiency at 7.6 CFM/Watt\\nPrecision engineered with DC brushless motor for extended reliability, this fan will outlast many household appliances\\nGalvanized steel construction resists corrosion\\nDuct: Detachable 4-inch Plastic Duct Adapter', 'product_id': 'B07RH6Z8KW'}, vector=None, shard_key=None),\n", - " Record(id=8, payload={'text': 'Aero Pure ABF80 L5 W ABF80L5 Ceiling Mount 80 CFM w/LED Light/Nightlight, Energy Star Certified, White Quiet Bathroom Ventilation Fan\\nAero Pure ABF80 L5 W ABF80L5 Ceiling Mount 80 CFM w/LED Light/Nightlight, Energy Star Certified, White Quiet Bathroom Ventilation Fan\\nAero Pure\\nWhite\\nNone\\nQuiet 0.3 Sones, 80 CFM fan with choice of three designer grilles in White, Satin Nickel, or Oil Rubbed Bronze; Full 6 year warranty\\n10W 3000K 800 Lumens LED Light with 0.7W Nightlight included\\nInstallation friendly- Quick-mount adjustable metal bracket for new construction and retrofit; 4โ€, 5: and 6โ€ metal duct adaptor included\\nMeets todayโ€™s demanding building specifications- ETL Listed for wet application, ENERGY STAR certified, CALGreen, JA-8 Compliant for CA Title 24, and ASHRAE 62.2 compliant\\nHousing dimensions- 10 2/5โ€x10 2/5โ€x 7 ยฝโ€; Grille dimensions- 13โ€x13โ€; Fits 2\"x8\" joists\\nQuiet 0.3 Sones, 80 CFM fan with choice of three designer grilles in White, Satin Nickel, or Oil Rubbed Bronze; Full 6 year 
warranty\\n10W 3000K 800 Lumens LED Light with 0.7W Nightlight included\\nInstallation friendly- Quick-mount adjustable metal bracket for new construction and retrofit; 4โ€, 5: and 6โ€ metal duct adaptor included\\nMeets todayโ€™s demanding building specifications- ETL Listed for wet application, ENERGY STAR certified, CALGreen, JA-8 Compliant for CA Title 24, and ASHRAE 62.2 compliant\\nHousing dimensions- 10 2/5โ€x10 2/5โ€x 7 ยฝโ€; Grille dimensions- 13โ€x13โ€; Fits 2\"x8\" joists', 'product_id': 'B07JY1PQNT'}, vector=None, shard_key=None),\n", - " Record(id=1, payload={'text': \"Homewerks 7141-80 Bathroom Fan Integrated LED Light Ceiling Mount Exhaust Ventilation, 1.1 Sones, 80 CFM\\nHomewerks 7141-80 Bathroom Fan Integrated LED Light Ceiling Mount Exhaust Ventilation, 1.1 Sones, 80 CFM\\nHomewerks\\n80 CFM\\nNone\\nOUTSTANDING PERFORMANCE: This Homewerk's bath fan ensures comfort in your home by quietly eliminating moisture and humidity in the bathroom. This exhaust fan is 1.1 sones at 80 CFM which means itโ€™s able to manage spaces up to 80 square feet and is very quiet..\\nBATH FANS HELPS REMOVE HARSH ODOR: When cleaning the bathroom or toilet, harsh chemicals are used and they can leave an obnoxious odor behind. Homewerkโ€™s bathroom fans can help remove this odor with its powerful ventilation\\nBUILD QUALITY: Designed to be corrosion resistant with its galvanized steel construction featuring a modern style round shape and has an 4000K Cool White Light LED Light. AC motor.\\nEASY INSTALLATION: This exhaust bath fan is easy to install with its no-cut design and ceiling mount ventilation. Ceiling Opening (L) 7-1/2 in x Ceiling Opening (W) 7-1/4 x Ceiling Opening (H) 5-3/4 in. 13 in round grill and 4 in round duct connector.\\nHOMEWERKS TRUSTED QUALITY: Be confident in the quality and construction of each and every one of our products. 
We ensure that all of our products are produced and certified to regional, national and international industry standards. We are proud of the products we sell, you will be too. 3 Year Limited\\nOUTSTANDING PERFORMANCE: This Homewerk's bath fan ensures comfort in your home by quietly eliminating moisture and humidity in the bathroom. This exhaust fan is 1.1 sones at 80 CFM which means itโ€™s able to manage spaces up to 80 square feet and is very quiet..\\nBATH FANS HELPS REMOVE HARSH ODOR: When cleaning the bathroom or toilet, harsh chemicals are used and they can leave an obnoxious odor behind. Homewerkโ€™s bathroom fans can help remove this odor with its powerful ventilation\\nBUILD QUALITY: Designed to be corrosion resistant with its galvanized steel construction featuring a modern style round shape and has an 4000K Cool White Light LED Light. AC motor.\\nEASY INSTALLATION: This exhaust bath fan is easy to install with its no-cut design and ceiling mount ventilation. Ceiling Opening (L) 7-1/2 in x Ceiling Opening (W) 7-1/4 x Ceiling Opening (H) 5-3/4 in. 13 in round grill and 4 in round duct connector.\\nHOMEWERKS TRUSTED QUALITY: Be confident in the quality and construction of each and every one of our products. We ensure that all of our products are produced and certified to regional, national and international industry standards. We are proud of the products we sell, you will be too. 
3 Year Limited\", 'product_id': 'B07X3Y6B1V'}, vector=None, shard_key=None),\n", - " Record(id=13, payload={'text': 'Delta BreezSignature VFB25ACH 80 CFM Exhaust Bath Fan with Humidity Sensor\\nDelta BreezSignature VFB25ACH 80 CFM Exhaust Bath Fan with Humidity Sensor\\nDELTA ELECTRONICS (AMERICAS) LTD.\\nWhite\\nNone\\nVirtually silent at less than 0.3 sones\\nPrecision engineered with DC brushless motor for extended reliability\\nEasily switch in and out of humidity sensing mode by toggling wall switch\\nENERGY STAR qualified for efficient cost-saving operation\\nPrecision engineered with DC brushless motor for extended reliability, this fan will outlast many household appliances\\nVirtually silent at less than 0.3 sones\\nPrecision engineered with DC brushless motor for extended reliability\\nEasily switch in and out of humidity sensing mode by toggling wall switch\\nENERGY STAR qualified for efficient cost-saving operation\\nPrecision engineered with DC brushless motor for extended reliability, this fan will outlast many household appliances', 'product_id': 'B003O0MNGC'}, vector=None, shard_key=None),\n", - " Record(id=15, payload={'text': 'Delta Electronics (Americas) Ltd. GBR80HLED Delta BreezGreenBuilder Series 80 CFM Fan/Dimmable H, LED Light, Dual Speed & Humidity Sensor\\nDelta Electronics (Americas) Ltd. GBR80HLED Delta BreezGreenBuilder Series 80 CFM Fan/Dimmable H, LED Light, Dual Speed & Humidity Sensor\\nDELTA ELECTRONICS (AMERICAS) LTD.\\nWith LED Light, Dual Speed & Humidity Sensor\\nNone\\nUltra energy-efficient LED module (11-watt equivalent to 60-watt incandescent light) included. Main light output-850 Lumens, 3000K\\nExtracts air at a rate of 80 CFM to properly ventilate bathrooms up to 80 sq. 
Ft., quiet operation at 0.8 sones\\nPrecision engineered with DC brushless motor for extended reliability, this Fan will outlast many household appliances\\nEnergy Star qualified for efficient cost-saving operation, galvanized steel construction resists corrosion\\nFan impeller Stops If obstructed, for safe worry-free operation, attractive grille gives your bathroom a fresh look\\nUltra energy-efficient LED module (11-watt equivalent to 60-watt incandescent light) included. Main light output-850 Lumens, 3000K\\nExtracts air at a rate of 80 CFM to properly ventilate bathrooms up to 80 sq. Ft., quiet operation at 0.8 sones\\nPrecision engineered with DC brushless motor for extended reliability, this Fan will outlast many household appliances\\nEnergy Star qualified for efficient cost-saving operation, galvanized steel construction resists corrosion\\nFan impeller Stops If obstructed, for safe worry-free operation, attractive grille gives your bathroom a fresh look', 'product_id': 'B01N5Y6002'}, vector=None, shard_key=None),\n", - " Record(id=9, payload={'text': \"Delta Electronics (Americas) Ltd. RAD80 Delta BreezRadiance Series 80 CFM Fan with Heater, 10.5W, 1.5 Sones\\nDelta Electronics (Americas) Ltd. 
RAD80 Delta BreezRadiance Series 80 CFM Fan with Heater, 10.5W, 1.5 Sones\\nDELTA ELECTRONICS (AMERICAS) LTD.\\nWith Heater\\nNone\\nQuiet operation at 1.5 Sones\\nPrecision engineered with DC brushless motor for extended reliability, this Fan will outlast many household appliances\\nGalvanized steel construction resists corrosion, equipped with metal duct adapter\\nFan impeller Stops If obstructed, for safe worry-free operation\\nPeace of mind quality, performance and reliability from the world's largest DC brushless Fan Manufacturer\\nQuiet operation at 1.5 Sones\\nPrecision engineered with DC brushless motor for extended reliability, this Fan will outlast many household appliances\\nGalvanized steel construction resists corrosion, equipped with metal duct adapter\\nFan impeller Stops If obstructed, for safe worry-free operation\\nPeace of mind quality, performance and reliability from the world's largest DC brushless Fan Manufacturer\", 'product_id': 'B01MZIK0PI'}, vector=None, shard_key=None),\n", - " Record(id=12, payload={'text': 'Aero Pure AP80RVLW Super Quiet 80 CFM Recessed Fan/Light Bathroom Ventilation Fan with White Trim Ring\\nAero Pure AP80RVLW Super Quiet 80 CFM Recessed Fan/Light Bathroom Ventilation Fan with White Trim Ring\\nAero Pure\\nWhite\\nNone\\nSuper quiet 80CFM energy efficient fan virtually disappears into the ceiling leaving only a recessed light in view\\nMay be installed over shower when wired to a GFCI breaker and used with a PAR30L 75W (max) CFL\\nBulb not included. Accepts any of the following bulbs: 75W Max. PAR30, 14W Max. BR30 LED, or 75W Max. PAR30L (for use over tub/shower.)\\nSuper quiet 80CFM energy efficient fan virtually disappears into the ceiling leaving only a recessed light in view\\nMay be installed over shower when wired to a GFCI breaker and used with a PAR30L 75W (max) CFL\\nBulb not included. Accepts any of the following bulbs: 75W Max. PAR30, 14W Max. BR30 LED, or 75W Max. 
PAR30L (for use over tub/shower.)', 'product_id': 'B00MARNO5Y'}, vector=None, shard_key=None),\n", - " Record(id=14, payload={'text': 'Broan Very Quiet Ceiling Bathroom Exhaust Fan, ENERGY STAR Certified, 0.3 Sones, 80 CFM\\nBroan Very Quiet Ceiling Bathroom Exhaust Fan, ENERGY STAR Certified, 0.3 Sones, 80 CFM\\nBroan-NuTone\\nWhite\\nNone\\nHIGH-QUALITY FAN: Very quiet, energy efficient exhaust fan runs on 0. 3 Sones and is motor engineered for continuous operation\\nEFFICIENT: Operates at 80 CFM in bathrooms up to 75 sq. ft. for a high-quality performance. Dimmable Capability: Non Dimmable\\nEASY INSTALLATION: Fan is easy to install and/or replace existing product for DIY\\'ers and needs only 2\" x 8\" construction space. Can be used over bathtubs or showers when connected to a GFCI protected branch circuit\\nFEATURES: Includes hanger bar system for fast, flexible installation for all types of construction and a 6\" ducting for superior performance\\nCERTIFIED: ENERGY STAR qualified and HVI Certified to ensure the best quality for your home\\nHIGH-QUALITY FAN: Very quiet, energy efficient exhaust fan runs on 0. 3 Sones and is motor engineered for continuous operation\\nEFFICIENT: Operates at 80 CFM in bathrooms up to 75 sq. ft. for a high-quality performance. Dimmable Capability: Non Dimmable\\nEASY INSTALLATION: Fan is easy to install and/or replace existing product for DIY\\'ers and needs only 2\" x 8\" construction space. 
Can be used over bathtubs or showers when connected to a GFCI protected branch circuit\\nFEATURES: Includes hanger bar system for fast, flexible installation for all types of construction and a 6\" ducting for superior performance\\nCERTIFIED: ENERGY STAR qualified and HVI Certified to ensure the best quality for your home', 'product_id': 'B001E6DMKY'}, vector=None, shard_key=None),\n", - " Record(id=11, payload={'text': 'Panasonic FV-0811VF5 WhisperFit EZ Retrofit Ventilation Fan, 80 or 110 CFM\\nPanasonic FV-0811VF5 WhisperFit EZ Retrofit Ventilation Fan, 80 or 110 CFM\\nPanasonic\\nWhite\\nNone\\nRetrofit Solution: Ideal for residential remodeling, hotel construction or renovations\\nLow Profile: 5-5/8-Inch housing depth fits in a 2 x 6 construction\\nPick-A-Flow Speed Selector: Allows you to pick desired airflow from 80 or 110 CFM\\nFlexible Installation: Comes with Flex-Z Fast bracket for easy, fast and trouble-free installation\\nEnergy Star Rated: Delivers powerful airflow without wasting energy\\nRetrofit Solution: Ideal for residential remodeling, hotel construction or renovations\\nLow Profile: 5-5/8-Inch housing depth fits in a 2 x 6 construction\\nPick-A-Flow Speed Selector: Allows you to pick desired airflow from 80 or 110 CFM\\nFlexible Installation: Comes with Flex-Z Fast bracket for easy, fast and trouble-free installation\\nEnergy Star Rated: Delivers powerful airflow without wasting energy', 'product_id': 'B00XBZFWWM'}, vector=None, shard_key=None),\n", - " Record(id=2, payload={'text': 'Homewerks 7140-80 Bathroom Fan Ceiling Mount Exhaust Ventilation, 1.5 Sones, 80 CFM, White\\nHomewerks 7140-80 Bathroom Fan Ceiling Mount Exhaust Ventilation, 1.5 Sones, 80 CFM, White\\nHomewerks\\nWhite\\nNone\\nOUTSTANDING PERFORMANCE: This Homewerk\\'s bath fan ensures comfort in your home by quietly eliminating moisture and humidity in the bathroom. This exhaust fan is 1. 
5 sone at 110 CFM which means itโ€™s able to manage spaces up to 110 square feet\\nBATH FANS HELPS REMOVE HARSH ODOR: When cleaning the bathroom or toilet, harsh chemicals are used and they can leave an obnoxious odor behind. Homewerkโ€™s bathroom fans can help remove this odor with its powerful ventilation\\nBUILD QUALITY: Designed to be corrosion resistant with its galvanized steel construction featuring a grille modern style.\\nEASY INSTALLATION: This exhaust bath fan is easy to install with its no-cut design and ceiling mount ventilation. Ceiling Opening (L) 7-1/2 in x Ceiling Opening (W) 7-1/4 x Ceiling Opening (H) 5-3/4 in and a 4\" round duct connector.\\nHOMEWERKS TRUSTED QUALITY: Be confident in the quality and construction of each and every one of our products. We ensure that all of our products are produced and certified to regional, national and international industry standards. We are proud of the products we sell, you will be too. 3 Year Limited\\nOUTSTANDING PERFORMANCE: This Homewerk\\'s bath fan ensures comfort in your home by quietly eliminating moisture and humidity in the bathroom. This exhaust fan is 1. 5 sone at 110 CFM which means itโ€™s able to manage spaces up to 110 square feet\\nBATH FANS HELPS REMOVE HARSH ODOR: When cleaning the bathroom or toilet, harsh chemicals are used and they can leave an obnoxious odor behind. Homewerkโ€™s bathroom fans can help remove this odor with its powerful ventilation\\nBUILD QUALITY: Designed to be corrosion resistant with its galvanized steel construction featuring a grille modern style.\\nEASY INSTALLATION: This exhaust bath fan is easy to install with its no-cut design and ceiling mount ventilation. Ceiling Opening (L) 7-1/2 in x Ceiling Opening (W) 7-1/4 x Ceiling Opening (H) 5-3/4 in and a 4\" round duct connector.\\nHOMEWERKS TRUSTED QUALITY: Be confident in the quality and construction of each and every one of our products. 
We ensure that all of our products are produced and certified to regional, national and international industry standards. We are proud of the products we sell, you will be too. 3 Year Limited', 'product_id': 'B07WDM7MQQ'}, vector=None, shard_key=None),\n", - " Record(id=4, payload={'text': 'Panasonic FV-08VRE2 Ventilation Fan with Recessed LED (Renewed)\\nPanasonic FV-08VRE2 Ventilation Fan with Recessed LED (Renewed)\\nPanasonic\\nWhite\\nNone\\nThe design solution for Fan/light combinations\\nEnergy Star rated architectural grade recessed Fan/LED light\\nQuiet, energy efficient and powerful 80 CFM ventilation hidden above the Ceiling\\nLED lamp is dimmable\\nBeautiful Lighting with 6-1/2โ€aperture and advanced luminaire design\\nThe design solution for Fan/light combinations\\nEnergy Star rated architectural grade recessed Fan/LED light\\nQuiet, energy efficient and powerful 80 CFM ventilation hidden above the Ceiling\\nLED lamp is dimmable\\nBeautiful Lighting with 6-1/2โ€aperture and advanced luminaire design', 'product_id': 'B07QJ7WYFQ'}, vector=None, shard_key=None)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def find_point_by_id(\n", - " client: QdrantClient, collection_name: str, rrf_rank_list: list[tuple[int, float]]\n", - "):\n", - " return client.retrieve(\n", - " collection_name=collection_name, ids=[item[0] for item in rrf_rank_list]\n", - " )\n", - "\n", - "\n", - "find_point_by_id(client, collection_name, rrf_rank_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, let's check the ESCI (Exact, Substitute, Compliment, and Irrelvant) label for the results against the source data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n", - "Exact\n" - ] - } - ], - "source": [ - "ids = [item[0] for item in rrf_rank_list]\n", - "df[df[\"query\"] == query_text]\n", - "\n", - "for idx in ids:\n", - " print(df.iloc[idx][\"esci_label\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This was amazing! We pulled only Exact results with k=10. This is a great result for a small dataset like this with out of the box vectors which are not even fine-tuned for e-Commerce." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(rrf_rank_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "\n", - "In this notebook, we demonstrated the usage of Hybrid Search with FastEmbed & Qdrant. We used FastEmbed to create Sparse and Dense embeddings for the data and indexed them using Qdrant. We then performed Hybrid Search using FastEmbed & Qdrant and ranked the search results using Reciprocal Rank Fusion (RRF)." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/examples/Image_Embedding.ipynb b/docs/examples/Image_Embedding.ipynb deleted file mode 100644 index 956a5aaef..000000000 --- a/docs/examples/Image_Embedding.ipynb +++ /dev/null @@ -1,128 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "aa0a86859809102", - "metadata": { - "collapsed": false - }, - "source": [ - "# Image Embedding\n", - "As of version 0.3.0 fastembed supports computation of image embeddings.\n", - "\n", - "The process is as easy and straightforward as with text embeddings. Let's see how it works." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "cea8fd5c019571fe", - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-02T11:35:40.126023Z", - "start_time": "2024-06-02T11:35:39.864701Z" - }, - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching 3 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 3/3 [00:00<00:00, 47482.69it/s]\n" - ] - }, - { - "data": { - "text/plain": "[array([0. , 0. , 0. , ..., 0. , 0.01139933,\n 0. ], dtype=float32),\n array([0.02169187, 0. , 0. , ..., 0. , 0.00848291,\n 0. 
], dtype=float32)]" - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastembed import ImageEmbedding\n", - "\n", - "model = ImageEmbedding(\"Qdrant/resnet50-onnx\")\n", - "\n", - "embeddings_generator = model.embed(\n", - " [\"../../tests/misc/image.jpeg\", \"../../tests/misc/small_image.jpeg\"]\n", - ")\n", - "embeddings_list = list(embeddings_generator)\n", - "embeddings_list" - ] - }, - { - "cell_type": "markdown", - "id": "3f838f18523ad1e0", - "metadata": { - "collapsed": false - }, - "source": [ - "## Preprocessing\n", - "\n", - "Preprocessing is encapsulated in the ImageEmbedding class, applied operations are identical to the ones provided by [Hugging Face Transformers](https://huggingface.co/docs/transformers/en/index).\n", - "You don't need to think about batching, opening/closing files, resizing images, etc., Fastembed will take care of it." - ] - }, - { - "cell_type": "markdown", - "id": "894b33ff9b385d72", - "metadata": { - "collapsed": false - }, - "source": [ - "## Supported models\n", - "\n", - "List of supported image embedding models can either be found [here](https://qdrant.github.io/fastembed/examples/Supported_Models/#supported-image-embedding-models) or by calling the `ImageEmbedding.list_supported_models()` method." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6d6a4cbbd2200d14", - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-02T11:40:19.313226Z", - "start_time": "2024-06-02T11:40:19.309845Z" - }, - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": "[{'model': 'Qdrant/clip-ViT-B-32-vision',\n 'dim': 512,\n 'description': 'CLIP vision encoder based on ViT-B/32',\n 'size_in_GB': 0.34,\n 'sources': {'hf': 'Qdrant/clip-ViT-B-32-vision'},\n 'model_file': 'model.onnx'},\n {'model': 'Qdrant/resnet50-onnx',\n 'dim': 2048,\n 'description': 'ResNet-50 from `Deep Residual Learning for Image Recognition `__.',\n 'size_in_GB': 0.1,\n 'sources': {'hf': 'Qdrant/resnet50-onnx'},\n 'model_file': 'model.onnx'}]" - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ImageEmbedding.list_supported_models()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/examples/SPLADE_with_FastEmbed.ipynb b/docs/examples/SPLADE_with_FastEmbed.ipynb deleted file mode 100644 index c36baf878..000000000 --- a/docs/examples/SPLADE_with_FastEmbed.ipynb +++ /dev/null @@ -1,389 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction to SPLADE with FastEmbed\n", - "\n", - "In this notebook, we will explore how to generate Sparse Vectors -- in particular a variant of the [SPLADE](https://arxiv.org/abs/2107.05720).\n", - "\n", - "> ๐Ÿ’ก The original [naver/SPLADE](https://github.com/naver/splade) models were licensed CC BY-NC-SA 4.0 -- Not for Commercial Use. 
This [SPLADE++](https://huggingface.co/prithivida/Splade_PP_en_v1) model is Apache License and hence, licensed for commercial use. \n", - "\n", - "## Outline:\n", - "1. [What is SPLADE?](#What-is-SPLADE?)\n", - "2. [Setting up the environment](#Setting-up-the-environment)\n", - "3. [Generating SPLADE vectors with FastEmbed](#Generating-SPLADE-vectors-with-FastEmbed)\n", - "4. [Understanding SPLADE vectors](#Understanding-SPLADE-vectors)\n", - "5. [Observations and Design Choices](#Observations-and-Model-Design-Choices)\n", - "\n", - "\n", - "## What is SPLADE?\n", - "\n", - "SPLADE was a novel method for _learning_ sparse vectors for text representation. This model beats BM25 -- the underlying approach for the Elastic/Lucene family of implementations. Thus making it highly effective for tasks such as information retrieval, document classification, and more. \n", - "\n", - "The key advantage of SPLADE is its ability to generate sparse vectors, which are more efficient and interpretable than dense vectors. This makes SPLADE a powerful tool for handling large-scale text data.\n", - "\n", - "## Setting up the environment\n", - "\n", - "This notebook uses few dependencies, which are installed below: " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install -q fastembed" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's get started! 
๐Ÿš€" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:49:20.516644Z", - "start_time": "2024-03-30T00:49:20.188543Z" - } - }, - "outputs": [], - "source": [ - "from fastembed import SparseTextEmbedding, SparseEmbedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> You can find the list of all supported Sparse Embedding models by calling this API: `SparseTextEmbedding.list_supported_models()`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:49:22.366294Z", - "start_time": "2024-03-30T00:49:22.362384Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'model': 'prithvida/Splade_PP_en_v1',\n", - " 'vocab_size': 30522,\n", - " 'description': 'Misspelled version of the model. Retained for backward compatibility. Independent Implementation of SPLADE++ Model for English',\n", - " 'size_in_GB': 0.532,\n", - " 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'}},\n", - " {'model': 'prithivida/Splade_PP_en_v1',\n", - " 'vocab_size': 30522,\n", - " 'description': 'Independent Implementation of SPLADE++ Model for English',\n", - " 'size_in_GB': 0.532,\n", - " 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'}}]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SparseTextEmbedding.list_supported_models()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:49:27.193530Z", - "start_time": "2024-03-30T00:49:26.139248Z" - } - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2aa47b26ab01475e8d3577433037f685", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 9 files: 0%| | 0/9 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modeldimdescriptionlicensesize_in_GB
0BAAI/bge-small-en-v1.5384Text embeddings, Unimodal (text), English, 512...mit0.067
1BAAI/bge-small-zh-v1.5512Text embeddings, Unimodal (text), Chinese, 512...mit0.090
2snowflake/snowflake-arctic-embed-xs384Text embeddings, Unimodal (text), English, 512...apache-2.00.090
3sentence-transformers/all-MiniLM-L6-v2384Text embeddings, Unimodal (text), English, 256...apache-2.00.090
4jinaai/jina-embeddings-v2-small-en512Text embeddings, Unimodal (text), English, 819...apache-2.00.120
5BAAI/bge-small-en384Text embeddings, Unimodal (text), English, 512...mit0.130
6snowflake/snowflake-arctic-embed-s384Text embeddings, Unimodal (text), English, 512...apache-2.00.130
7nomic-ai/nomic-embed-text-v1.5-Q768Text embeddings, Multimodal (text, image), Eng...apache-2.00.130
8BAAI/bge-base-en-v1.5768Text embeddings, Unimodal (text), English, 512...mit0.210
9sentence-transformers/paraphrase-multilingual-...384Text embeddings, Unimodal (text), Multilingual...apache-2.00.220
10Qdrant/clip-ViT-B-32-text512Text embeddings, Multimodal (text&image), Engl...mit0.250
11jinaai/jina-embeddings-v2-base-de768Text embeddings, Unimodal (text), Multilingual...apache-2.00.320
12BAAI/bge-base-en768Text embeddings, Unimodal (text), English, 512...mit0.420
13snowflake/snowflake-arctic-embed-m768Text embeddings, Unimodal (text), English, 512...apache-2.00.430
14nomic-ai/nomic-embed-text-v1.5768Text embeddings, Multimodal (text, image), Eng...apache-2.00.520
15jinaai/jina-embeddings-v2-base-en768Text embeddings, Unimodal (text), English, 819...apache-2.00.520
16nomic-ai/nomic-embed-text-v1768Text embeddings, Multimodal (text, image), Eng...apache-2.00.520
17snowflake/snowflake-arctic-embed-m-long768Text embeddings, Unimodal (text), English, 204...apache-2.00.540
18mixedbread-ai/mxbai-embed-large-v11024Text embeddings, Unimodal (text), English, 512...apache-2.00.640
19jinaai/jina-embeddings-v2-base-code768Text embeddings, Unimodal (text), Multilingual...apache-2.00.640
20sentence-transformers/paraphrase-multilingual-...768Text embeddings, Unimodal (text), Multilingual...apache-2.01.000
21snowflake/snowflake-arctic-embed-l1024Text embeddings, Unimodal (text), English, 512...apache-2.01.020
22thenlper/gte-large1024Text embeddings, Unimodal (text), English, 512...mit1.200
23BAAI/bge-large-en-v1.51024Text embeddings, Unimodal (text), English, 512...mit1.200
24intfloat/multilingual-e5-large1024Text embeddings, Unimodal (text), Multilingual...mit2.240
\n", - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 12 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Supported Sparse Text Embedding Models" - ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-13T09:01:07.038954Z", - "start_time": "2024-11-13T09:01:07.019656Z" - } - }, - "source": [ - "(\n", - " pd.DataFrame(SparseTextEmbedding.list_supported_models())\n", - " .sort_values(\"size_in_GB\")\n", - " .drop(columns=[\"sources\", \"model_file\", \"additional_files\"])\n", - " .reset_index(drop=True)\n", - ")" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " model vocab_size \\\n", - "0 Qdrant/bm25 NaN \n", - "1 Qdrant/bm42-all-minilm-l6-v2-attentions 30522.0 \n", - "2 prithivida/Splade_PP_en_v1 30522.0 \n", - "3 prithvida/Splade_PP_en_v1 30522.0 \n", - "\n", - " description license size_in_GB \\\n", - "0 BM25 as sparse embeddings meant to be used wit... apache-2.0 0.010 \n", - "1 Light sparse embedding model, which assigns an... apache-2.0 0.090 \n", - "2 Independent Implementation of SPLADE++ Model f... apache-2.0 0.532 \n", - "3 Independent Implementation of SPLADE++ Model f... apache-2.0 0.532 \n", - "\n", - " requires_idf \n", - "0 True \n", - "1 True \n", - "2 NaN \n", - "3 NaN " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modelvocab_sizedescriptionlicensesize_in_GBrequires_idf
0Qdrant/bm25NaNBM25 as sparse embeddings meant to be used wit...apache-2.00.010True
1Qdrant/bm42-all-minilm-l6-v2-attentions30522.0Light sparse embedding model, which assigns an...apache-2.00.090True
2prithivida/Splade_PP_en_v130522.0Independent Implementation of SPLADE++ Model f...apache-2.00.532NaN
3prithvida/Splade_PP_en_v130522.0Independent Implementation of SPLADE++ Model f...apache-2.00.532NaN
\n", - "
" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 13 - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Supported Late Interaction Text Embedding Models" - ] - }, - { - "cell_type": "code", - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-11-13T09:01:08.074442Z", - "start_time": "2024-11-13T09:01:08.056138Z" - } - }, - "source": [ - "(\n", - " pd.DataFrame(LateInteractionTextEmbedding.list_supported_models())\n", - " .sort_values(\"size_in_GB\")\n", - " .drop(columns=[\"sources\", \"model_file\"])\n", - " .reset_index(drop=True)\n", - ")" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " model dim \\\n", - "0 answerdotai/answerai-colbert-small-v1 96 \n", - "1 colbert-ir/colbertv2.0 128 \n", - "2 jinaai/jina-colbert-v2 128 \n", - "\n", - " description license \\\n", - "0 Text embeddings, Unimodal (text), Multilingual... apache-2.0 \n", - "1 Late interaction model mit \n", - "2 New model that expands capabilities of colbert... cc-by-nc-4.0 \n", - "\n", - " size_in_GB additional_files \n", - "0 0.13 NaN \n", - "1 0.44 NaN \n", - "2 2.24 [onnx/model.onnx_data] " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modeldimdescriptionlicensesize_in_GBadditional_files
0answerdotai/answerai-colbert-small-v196Text embeddings, Unimodal (text), Multilingual...apache-2.00.13NaN
1colbert-ir/colbertv2.0128Late interaction modelmit0.44NaN
2jinaai/jina-colbert-v2128New model that expands capabilities of colbert...cc-by-nc-4.02.24[onnx/model.onnx_data]
\n", - "
" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 14 - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "## Supported Image Embedding Models" - ] - }, - { - "cell_type": "code", - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-11-13T09:01:09.171647Z", - "start_time": "2024-11-13T09:01:09.150940Z" - } - }, - "source": [ - "(\n", - " pd.DataFrame(ImageEmbedding.list_supported_models())\n", - " .sort_values(\"size_in_GB\")\n", - " .drop(columns=[\"sources\", \"model_file\"])\n", - " .reset_index(drop=True)\n", - ")" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " model dim \\\n", - "0 Qdrant/resnet50-onnx 2048 \n", - "1 Qdrant/clip-ViT-B-32-vision 512 \n", - "2 Qdrant/Unicom-ViT-B-32 512 \n", - "3 Qdrant/Unicom-ViT-B-16 768 \n", - "\n", - " description license size_in_GB \n", - "0 Image embeddings, Unimodal (image), 2016 year apache-2.0 0.10 \n", - "1 Image embeddings, Multimodal (text&image), 202... mit 0.34 \n", - "2 Image embeddings, Multimodal (text&image), 202... apache-2.0 0.48 \n", - "3 Image embeddings (more detailed than Unicom-Vi... apache-2.0 0.82 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modeldimdescriptionlicensesize_in_GB
0Qdrant/resnet50-onnx2048Image embeddings, Unimodal (image), 2016 yearapache-2.00.10
1Qdrant/clip-ViT-B-32-vision512Image embeddings, Multimodal (text&image), 202...mit0.34
2Qdrant/Unicom-ViT-B-32512Image embeddings, Multimodal (text&image), 202...apache-2.00.48
3Qdrant/Unicom-ViT-B-16768Image embeddings (more detailed than Unicom-Vi...apache-2.00.82
\n", - "
" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 15 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Supported Rerank Cross Encoder Models" - ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-13T09:01:10.313943Z", - "start_time": "2024-11-13T09:01:10.298428Z" - } - }, - "source": [ - "(\n", - " pd.DataFrame(TextCrossEncoder.list_supported_models())\n", - " .sort_values(\"size_in_GB\")\n", - " .drop(columns=[\"sources\", \"model_file\"])\n", - " .reset_index(drop=True)\n", - ")" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " model size_in_GB \\\n", - "0 Xenova/ms-marco-MiniLM-L-6-v2 0.08 \n", - "1 Xenova/ms-marco-MiniLM-L-12-v2 0.12 \n", - "2 jinaai/jina-reranker-v1-tiny-en 0.13 \n", - "3 jinaai/jina-reranker-v1-turbo-en 0.15 \n", - "4 BAAI/bge-reranker-base 1.04 \n", - "5 jinaai/jina-reranker-v2-base-multilingual 1.11 \n", - "\n", - " description license \n", - "0 MiniLM-L-6-v2 model optimized for re-ranking t... apache-2.0 \n", - "1 MiniLM-L-12-v2 model optimized for re-ranking ... apache-2.0 \n", - "2 Designed for blazing-fast re-ranking with 8K c... apache-2.0 \n", - "3 Designed for blazing-fast re-ranking with 8K c... apache-2.0 \n", - "4 BGE reranker base model for cross-encoder re-r... mit \n", - "5 A multi-lingual reranker model for cross-encod... cc-by-nc-4.0 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modelsize_in_GBdescriptionlicense
0Xenova/ms-marco-MiniLM-L-6-v20.08MiniLM-L-6-v2 model optimized for re-ranking t...apache-2.0
1Xenova/ms-marco-MiniLM-L-12-v20.12MiniLM-L-12-v2 model optimized for re-ranking ...apache-2.0
2jinaai/jina-reranker-v1-tiny-en0.13Designed for blazing-fast re-ranking with 8K c...apache-2.0
3jinaai/jina-reranker-v1-turbo-en0.15Designed for blazing-fast re-ranking with 8K c...apache-2.0
4BAAI/bge-reranker-base1.04BGE reranker base model for cross-encoder re-r...mit
5jinaai/jina-reranker-v2-base-multilingual1.11A multi-lingual reranker model for cross-encod...cc-by-nc-4.0
\n", - "
" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 16 - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.18 ('base')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "c4a27af61e455bc18dcf16f5867a2ff0402fa12b01dd0f6ce3a79ae73ad15e91" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/experimental/Accuracy_vs_SamplingRate.png b/docs/experimental/Accuracy_vs_SamplingRate.png deleted file mode 100644 index 7e27e1ef6..000000000 Binary files a/docs/experimental/Accuracy_vs_SamplingRate.png and /dev/null differ diff --git a/docs/experimental/Binary Quantization from Scratch.ipynb b/docs/experimental/Binary Quantization from Scratch.ipynb deleted file mode 100644 index 1463c4969..000000000 --- a/docs/experimental/Binary Quantization from Scratch.ipynb +++ /dev/null @@ -1,409 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup: Install Dependencies, Imports & Download Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T17:00:06.460001Z", - "start_time": "2024-06-06T17:00:04.214098Z" - } - }, - "outputs": [], - "source": [ - "!pip install matplotlib tqdm pandas numpy datasets --quiet --upgrade" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T17:00:07.041784Z", - "start_time": 
"2024-06-06T17:00:06.461658Z" - }, - "id": "WBVTItUX4yyr" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from datasets import load_dataset\n", - "from tqdm import tqdm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ‘จ๐Ÿพโ€๐Ÿ’ป Code Walkthrough\n", - "Here's an explanation of the code structure provided:\n", - "\n", - "1. **Loading Data**: OpenAI embeddings are loaded from a parquet files (we can load upto 1M embedding) and concatenated into one array.\n", - "2. **Binary Conversion**: A new array with the same shape is initialized with zeros, and the positive values in the original vectors are set to 1.\n", - "3. **Accuracy Function**: The accuracy function compares original vectors with binary vectors for a given index, limit, and oversampling rate. The comparison is done using dot products and logical XOR, sorting the results, and measuring the intersection.\n", - "4. **Testing**: The accuracy is tested for different oversampling rates (1, 2, 4), revealing a correctness of ~0.96 for an oversampling of 4.\n", - "\n", - "\n", - "## ๐Ÿ’ฟ Loading Data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T17:01:09.343230Z", - "start_time": "2024-06-06T17:00:07.042526Z" - }, - "colab": { - "base_uri": "https://localhost:8080/", - "height": 250 - }, - "id": "REJpFqkG7EG2", - "outputId": "7a43c0ae-fbcc-45fe-fd58-bfe691297b22" - }, - "outputs": [], - "source": [ - "# Download from Huggingface Hub\n", - "ds = load_dataset(\n", - " \"Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-100K\", split=\"train\"\n", - ")\n", - "openai_vectors = np.array(ds[\"text-embedding-3-large-3072-embedding\"])\n", - "del ds" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T17:01:10.900963Z", - "start_time": "2024-06-06T17:01:09.344842Z" - } - }, - "outputs": [], - 
"source": [ - "openai_bin = np.zeros_like(openai_vectors, dtype=np.int8)\n", - "openai_bin[openai_vectors > 0] = 1" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T17:01:10.906827Z", - "start_time": "2024-06-06T17:01:10.901820Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "3072" - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "n_dim = openai_vectors.shape[1]\n", - "n_dim" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐ŸŽฏ Accuracy Function\n", - "\n", - "We will use the accuracy function to compare the original vectors with the binary vectors for a given index, limit, and oversampling rate. The comparison is done using dot products and logical XOR, sorting the results, and measuring the intersection." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T17:01:10.909730Z", - "start_time": "2024-06-06T17:01:10.908166Z" - }, - "id": "FqshI-GlIERd" - }, - "outputs": [], - "source": [ - "def accuracy(idx, limit: int, oversampling: int):\n", - " scores = np.dot(openai_vectors, openai_vectors[idx])\n", - " dot_results = np.argsort(scores)[-limit:][::-1]\n", - "\n", - " bin_scores = n_dim - np.logical_xor(openai_bin, openai_bin[idx]).sum(axis=1)\n", - " bin_results = np.argsort(bin_scores)[-(limit * oversampling) :][::-1]\n", - "\n", - " return len(set(dot_results).intersection(set(bin_results))) / limit" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“Š Results" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T17:01:25.206592Z", - "start_time": "2024-06-06T17:01:10.911971Z" - }, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qtzUlq_sFTRf", - "outputId": "17fe04ea-4f73-4a57-990b-180f1c04b472" - }, - "outputs": [ 
- { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/4 [00:00\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sampling_ratelimitmean_acc
0130.90
11100.83
2231.00
32100.97
4331.00
53100.98
6531.00
75100.99
\n", - "text/plain": " sampling_rate limit mean_acc\n0 1 3 0.90\n1 1 10 0.83\n2 2 3 1.00\n3 2 10 0.97\n4 3 3 1.00\n5 3 10 0.98\n6 5 3 1.00\n7 5 10 0.99" - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results = pd.DataFrame(results)\n", - "results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "machine_shape": "hm", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/index.md b/docs/index.md index 15ea646a3..c8af29452 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,75 +1,77 @@ -# โšก๏ธ What is FastEmbed? +--- +icon: lucide/dna +--- -FastEmbed is a lightweight, fast, Python library built for embedding generation. We [support popular text models](https://qdrant.github.io/fastembed/examples/Supported_Models/). Please [open a Github issue](https://github.com/qdrant/fastembed/issues/new) if you want us to add a new model. +# fastembed-bio -1. Light & Fast - - Quantized model weights - - ONNX Runtime for inference +Fast, lightweight biological sequence embeddings using ONNX. No PyTorch or GPU required. -2. 
Accuracy/Recall - - Better than OpenAI Ada-002 - - Default is Flag Embedding, which has shown good results on the [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard - - List of [supported models](https://qdrant.github.io/fastembed/examples/Supported_Models/) - including multilingual models - -Here is an example for [Retrieval Embedding Generation](https://qdrant.github.io/fastembed/examples/Retrieval%20with%20FastEmbed/) and how to use [FastEmbed with Qdrant](https://qdrant.github.io/fastembed/examples/Usage_With_Qdrant/). - -## ๐Ÿš€ Installation - -To install the FastEmbed library, pip works: +## Installation ```bash -pip install fastembed +pip install fastembed-bio ``` -## ๐Ÿ“– Usage +## Quick Example ```python -from fastembed import TextEmbedding - -documents: list[str] = [ - "passage: Hello, World!", - "query: Hello, World!", - "passage: This is an example passage.", - "fastembed is supported by and maintained by Qdrant." -] -embedding_model = TextEmbedding() -embeddings: list[np.ndarray] = embedding_model.embed(documents) +from fastembed_bio import DNAEmbedding, ProteinEmbedding + +# DNA embeddings +dna_model = DNAEmbedding("PoetschLab/GROVER") +dna_embeddings = list(dna_model.embed(["ATCGATCGATCG", "GCTAGCTAGCTA"])) + +# Protein embeddings +protein_model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D") +protein_embeddings = list(protein_model.embed(["MKTVRQERLKS", "GKGDPKKPRGKM"])) ``` -## Usage with Qdrant +## Why fastembed-bio? -Installation with Qdrant Client in Python: +- **Fast**: ONNX runtime for CPU inference, no GPU needed +- **Lightweight**: Minimal dependencies, small model files +- **Simple**: Clean API inspired by [fastembed](https://github.com/qdrant/fastembed) +- **Biological focus**: DNA, protein, and (coming soon) single-cell embeddings -```bash -pip install qdrant-client[fastembed] -``` +## Supported Models -Might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh. 
+### DNA Embeddings -```python -from qdrant_client import QdrantClient - -# Initialize the client -client = QdrantClient(":memory:") # Using an in-process Qdrant - -# Prepare your documents, metadata, and IDs -docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"] -metadata = [ - {"source": "Langchain-docs"}, - {"source": "Llama-index-docs"}, -] -ids = [42, 2] - -client.add( - collection_name="demo_collection", - documents=docs, - metadata=metadata, - ids=ids -) - -search_result = client.query( - collection_name="demo_collection", - query_text="This is a query document" -) -print(search_result) -``` +| Model | Dimensions | Description | +|-------|------------|-------------| +| `PoetschLab/GROVER` | 768 | GROVER DNA foundation model | + +### Protein Embeddings + +| Model | Dimensions | Description | +|-------|------------|-------------| +| `facebook/esm2_t12_35M_UR50D` | 480 | ESM-2 35M parameters | + +## Next Steps + +- [DNA Embedding Quickstart](quickstart/dna.md) - Get started with DNA sequence embeddings +- [Protein Embedding Quickstart](quickstart/protein.md) - Get started with protein sequence embeddings + +## Roadmap + +I'm actively expanding fastembed-bio. Here's where my mind is at for future model support: + +### DNA/RNA Models +- [ ] Nucleotide Transformer v3 (species-conditioned embeddings) +- [ ] Hyena DNA +- [ ] Additional GROVER variants + +### Protein Models +- [ ] ESM-2 larger variants (150M, 650M) +- [ ] ESMFold embeddings + +### Single-Cell Models +- [ ] Geneformer (scRNA-seq) +- [ ] scGPT +- [ ] Tahoe-x1 + +### Other Modalities +- [ ] ATAC-seq embeddings (Atacformer) +- [ ] Multi-modal embeddings + +Want to contribute? Check out our [GitHub repository](https://github.com/nleroy917/fastembed-bio)! 
\ No newline at end of file diff --git a/docs/overrides/main.html b/docs/overrides/main.html deleted file mode 100644 index 1c0fd5b42..000000000 --- a/docs/overrides/main.html +++ /dev/null @@ -1,27 +0,0 @@ -{% extends "base.html" %} - -{% block content %} -{% if page.nb_url %} - -{% endif %} - -{{ super() }} - -{% endblock content %} - -{% block announce %} -
- If you're using FastEmbed from Qdrant, join the - - - {% include ".icons/fontawesome/brands/discord.svg" %} - - Qdrant Discord server - - to get help and share your work! Or check out Qdrant Cloud to - get started with vector search! -
-{% endblock %} diff --git a/docs/qdrant/Binary_Quantization_with_Qdrant.ipynb b/docs/qdrant/Binary_Quantization_with_Qdrant.ipynb deleted file mode 100644 index cf1f5defb..000000000 --- a/docs/qdrant/Binary_Quantization_with_Qdrant.ipynb +++ /dev/null @@ -1,418 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Binary Quantization with Qdrant & OpenAI Embedding\n", - "\n", - "---\n", - "In the world of large-scale data retrieval and processing, efficiency is crucial. With the exponential growth of data, the ability to retrieve information quickly and accurately can significantly affect system performance. This blog post explores a technique known as binary quantization applied to OpenAI embeddings, demonstrating how it can enhance **retrieval latency by 20x** or more.\n", - "\n", - "## What Are OpenAI Embeddings?\n", - "OpenAI embeddings are numerical representations of textual information. They transform text into a vector space where semantically similar texts are mapped close together. This mathematical representation enables computers to understand and process human language more effectively.\n", - "\n", - "## Binary Quantization\n", - "Binary quantization is a method which converts continuous numerical values into binary values (0 or 1). It simplifies the data structure, allowing faster computations. Here's a brief overview of the binary quantization process applied to OpenAI embeddings:\n", - "\n", - "1. **Load Embeddings**: OpenAI embeddings are loaded from parquet files.\n", - "2. **Binary Transformation**: The continuous valued vectors are converted into binary form. Here, values greater than 0 are set to 1, and others remain 0.\n", - "3. **Comparison & Retrieval**: Binary vectors are used for comparison using logical XOR operations and other efficient algorithms.\n", - "\n", - "Binary Quantization is a promising approach to improve retrieval speeds and reduce memory footprint of vector search engines. 
In this notebook we will show how to use Qdrant to perform binary quantization of vectors and perform fast similarity search on the resulting index.\n", - "\n", - "## Table of Contents\n", - "1. Imports\n", - "2. Download and Slice Dataset\n", - "3. Create Qdrant Collection\n", - "4. Indexing\n", - "5. Search\n", - "\n", - "## 1. Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:31:54.357040Z", - "start_time": "2024-06-06T18:31:52.672431Z" - } - }, - "outputs": [], - "source": [ - "!pip install qdrant-client pandas dataset --quiet --upgrade" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:31:55.482034Z", - "start_time": "2024-06-06T18:31:54.357645Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "import random\n", - "import time\n", - "\n", - "import datasets\n", - "import numpy as np\n", - "import pandas as pd\n", - "from qdrant_client import QdrantClient, models\n", - "\n", - "random.seed(37)\n", - "np.random.seed(37)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Download and Slice Dataset\n", - "\n", - "We will be using the [dbpedia-entities](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K) dataset from the [HuggingFace Datasets](https://huggingface.co/datasets) library. 
This contains 100K vectors of 1536 dimensions each" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:31:58.405581Z", - "start_time": "2024-06-06T18:31:55.471027Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "100000" - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset = datasets.load_dataset(\n", - " \"Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K\", split=\"train\"\n", - ")\n", - "len(dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:32:24.456211Z", - "start_time": "2024-06-06T18:31:58.396924Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "1536" - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "n_dim = len(dataset[\"text-embedding-3-small-1536-embedding\"][0])\n", - "n_dim" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:32:24.536891Z", - "start_time": "2024-06-06T18:32:24.455087Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "True" - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "client = QdrantClient( # assumes Qdrant is launched at localhost:6333\n", - " prefer_grpc=True,\n", - ")\n", - "\n", - "collection_name = \"binary-quantization\"\n", - "\n", - "client.create_collection(\n", - " collection_name=collection_name,\n", - " vectors_config=models.VectorParams(\n", - " size=n_dim,\n", - " distance=models.Distance.DOT,\n", - " on_disk=True,\n", - " ),\n", - " quantization_config=models.BinaryQuantization(\n", - " binary=models.BinaryQuantizationConfig(always_ram=True),\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": 
"2024-06-06T18:33:22.583349Z", - "start_time": "2024-06-06T18:32:24.815155Z" - } - }, - "outputs": [], - "source": [ - "def iter_dataset(dataset):\n", - " for point in dataset:\n", - " yield point[\"openai\"], {\"text\": point[\"text\"]}\n", - "\n", - "\n", - "vectors, payload = zip(*iter_dataset(dataset))\n", - "client.upload_collection(\n", - " collection_name=collection_name,\n", - " vectors=vectors,\n", - " payload=payload,\n", - " parallel=max(1, (os.cpu_count() // 2)),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:35:47.748898Z", - "start_time": "2024-06-06T18:35:47.740719Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "{'status': ,\n 'optimizer_status': ,\n 'vectors_count': None,\n 'indexed_vectors_count': 97760,\n 'points_count': 100000,\n 'segments_count': 7,\n 'config': {'params': {'vectors': {'size': 1536,\n 'distance': ,\n 'hnsw_config': None,\n 'quantization_config': None,\n 'on_disk': True,\n 'datatype': None},\n 'shard_number': 1,\n 'sharding_method': None,\n 'replication_factor': 1,\n 'write_consistency_factor': 1,\n 'read_fan_out_factor': None,\n 'on_disk_payload': True,\n 'sparse_vectors': None},\n 'hnsw_config': {'m': 16,\n 'ef_construct': 100,\n 'full_scan_threshold': 10000,\n 'max_indexing_threads': 0,\n 'on_disk': False,\n 'payload_m': None},\n 'optimizer_config': {'deleted_threshold': 0.2,\n 'vacuum_min_vector_number': 1000,\n 'default_segment_number': 0,\n 'max_segment_size': None,\n 'memmap_threshold': None,\n 'indexing_threshold': 20000,\n 'flush_interval_sec': 5,\n 'max_optimization_threads': None},\n 'wal_config': {'wal_capacity_mb': 32, 'wal_segments_ahead': 0},\n 'quantization_config': {'binary': {'always_ram': True}}},\n 'payload_schema': {}}" - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "collection_info = client.get_collection(collection_name=f\"{collection_name}\")\n", - 
"collection_info.dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Oversampling vs Recall\n", - "\n", - "### Preparing a query dataset\n", - "\n", - "For the purpose of this illustration, we'll take a few vectors which we know are already in the index and query them. We should get the same vectors back as results from the Qdrant index. " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:33:22.643845Z", - "start_time": "2024-06-06T18:33:22.589346Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "[89391,\n 79659,\n 12006,\n 80978,\n 87219,\n 97885,\n 83155,\n 67504,\n 4645,\n 82711,\n 48395,\n 57375,\n 69208,\n 14136,\n 89515,\n 59880,\n 78730,\n 36952,\n 49620,\n 96486,\n 55473,\n 58179,\n 18926,\n 6489,\n 11931,\n 54146,\n 9850,\n 71259,\n 37825,\n 47331,\n 84964,\n 92399,\n 56669,\n 77042,\n 73744,\n 47993,\n 83780,\n 92429,\n 75114,\n 4463,\n 69030,\n 81185,\n 27950,\n 66217,\n 54652,\n 8260,\n 1151,\n 993,\n 85954,\n 66863,\n 47303,\n 8992,\n 92688,\n 76030,\n 29472,\n 3077,\n 42454,\n 46120,\n 69140,\n 20877,\n 2844,\n 95423,\n 1770,\n 28568,\n 96448,\n 94227,\n 40837,\n 91684,\n 29785,\n 66936,\n 85121,\n 39546,\n 81910,\n 5514,\n 37068,\n 35731,\n 93990,\n 26685,\n 63076,\n 18762,\n 27922,\n 34916,\n 80976,\n 83189,\n 6328,\n 57508,\n 58860,\n 13758,\n 72976,\n 85030,\n 332,\n 34963,\n 85009,\n 31344,\n 11560,\n 58108,\n 85163,\n 17064,\n 44712,\n 45962]" - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query_indices = random.sample(range(len(dataset)), 100)\n", - "query_dataset = dataset[query_indices]\n", - "query_indices" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:33:22.644389Z", - "start_time": "2024-06-06T18:33:22.642909Z" - } - }, - "outputs": [], - "source": [ - "## Add Gaussian noise to any 
vector\n", - "\n", - "\n", - "def add_noise(vector, noise=0.05):\n", - " return vector + noise * np.random.randn(*vector.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:33:22.647634Z", - "start_time": "2024-06-06T18:33:22.646107Z" - } - }, - "outputs": [], - "source": [ - "def correct(results, text):\n", - " return text in [x.payload[\"text\"] for x in results]\n", - "\n", - "\n", - "def count_correct(query_dataset, limit=1, oversampling=1, rescore=False):\n", - " correct_results = 0\n", - " for query_vector, text in zip(query_dataset[\"openai\"], query_dataset[\"text\"]):\n", - " results = client.search(\n", - " collection_name=collection_name,\n", - " query_vector=add_noise(np.array(query_vector)),\n", - " limit=limit,\n", - " search_params=models.SearchParams(\n", - " quantization=models.QuantizationSearchParams(\n", - " rescore=rescore,\n", - " oversampling=oversampling,\n", - " )\n", - " ),\n", - " )\n", - " correct_results += correct(results, text)\n", - " return correct_results" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:36:09.037565Z", - "start_time": "2024-06-06T18:35:58.069484Z" - }, - "collapsed": false - }, - "outputs": [], - "source": [ - "limit_grid = [1, 3, 10, 20, 50]\n", - "oversampling_grid = [1.0, 3.0, 5.0]\n", - "rescore_grid = [True, False]\n", - "results = []\n", - "\n", - "for limit in limit_grid:\n", - " for oversampling in oversampling_grid:\n", - " for rescore in rescore_grid:\n", - " start = time.perf_counter()\n", - " correct_results = count_correct(\n", - " query_dataset, limit=limit, oversampling=oversampling, rescore=rescore\n", - " )\n", - " end = time.perf_counter()\n", - " results.append(\n", - " {\n", - " \"limit\": limit,\n", - " \"oversampling\": oversampling,\n", - " \"bq_candidates\": int(oversampling * limit),\n", - " \"rescore\": rescore,\n", - " \"accuracy\": 
correct_results / 100,\n", - " \"total queries\": len(query_dataset[\"text\"]),\n", - " \"time\": end - start,\n", - " }\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2024-06-06T18:36:09.054593Z", - "start_time": "2024-06-06T18:36:09.048053Z" - } - }, - "outputs": [ - { - "data": { - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
limitoversamplingrescoreaccuracybq_candidatestime
011.0True0.9510.300152
111.0False0.8510.244668
213.0True0.9530.124406
313.0False0.8330.171471
415.0True0.9850.118219
515.0False0.8750.111914
631.0True0.9530.121328
731.0False0.9230.267725
833.0True0.9690.416834
933.0False0.9090.410730
1035.0True0.97150.231671
1135.0False0.93150.252269
12101.0True0.96100.133462
13101.0False0.92100.285158
14103.0True0.95300.320695
15103.0False0.98300.457904
16105.0True0.96500.453204
17105.0False0.94500.450944
18201.0True0.97200.361066
19201.0False0.95200.585992
20203.0True0.96600.550389
21203.0False0.96600.618630
22205.0True1.001000.458241
23205.0False0.951000.441106
24501.0True0.98500.603967
25501.0False0.96500.514531
26503.0True1.001500.548153
27503.0False0.981500.608930
28505.0True1.002500.487522
29505.0False0.992500.313810
\n
", - "text/plain": " limit oversampling rescore accuracy bq_candidates time\n0 1 1.0 True 0.95 1 0.300152\n1 1 1.0 False 0.85 1 0.244668\n2 1 3.0 True 0.95 3 0.124406\n3 1 3.0 False 0.83 3 0.171471\n4 1 5.0 True 0.98 5 0.118219\n5 1 5.0 False 0.87 5 0.111914\n6 3 1.0 True 0.95 3 0.121328\n7 3 1.0 False 0.92 3 0.267725\n8 3 3.0 True 0.96 9 0.416834\n9 3 3.0 False 0.90 9 0.410730\n10 3 5.0 True 0.97 15 0.231671\n11 3 5.0 False 0.93 15 0.252269\n12 10 1.0 True 0.96 10 0.133462\n13 10 1.0 False 0.92 10 0.285158\n14 10 3.0 True 0.95 30 0.320695\n15 10 3.0 False 0.98 30 0.457904\n16 10 5.0 True 0.96 50 0.453204\n17 10 5.0 False 0.94 50 0.450944\n18 20 1.0 True 0.97 20 0.361066\n19 20 1.0 False 0.95 20 0.585992\n20 20 3.0 True 0.96 60 0.550389\n21 20 3.0 False 0.96 60 0.618630\n22 20 5.0 True 1.00 100 0.458241\n23 20 5.0 False 0.95 100 0.441106\n24 50 1.0 True 0.98 50 0.603967\n25 50 1.0 False 0.96 50 0.514531\n26 50 3.0 True 1.00 150 0.548153\n27 50 3.0 False 0.98 150 0.608930\n28 50 5.0 True 1.00 250 0.487522\n29 50 5.0 False 0.99 250 0.313810" - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame(results)\n", - "\n", - "df[[\"limit\", \"oversampling\", \"rescore\", \"accuracy\", \"bq_candidates\", \"time\"]]\n", - "# df.to_csv(\"candidates-rescore-time.csv\", index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "#### Why results for oversampling=1.0 and limit=1 with rescore=True are better than with rescore=False? \n", - "\n", - "It might seem that with oversampling=1.0 and limit=1 Qdrant retrieves only 1 point, and it does not matter whether we rescore it or not, it should stay the same, but with a different score (from original vectors).\n", - "\n", - "But in fact, there are 2 reasons why results are different:\n", - "1) HNSW is an approximate algorithm, and it might return different results for the same query. 
\n", - "2) Qdrant stores points in segments. When we do a query for 1 point, Qdrant looks for this one point in each segment, and then chooses the best match. \n", - "3) In this example we had 8 segments, Qdrant found 8 points with binary scores, replaced their scores with original vectors scores, and selected the best one from them, which led to a better accuracy. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/qdrant/Retrieval_with_FastEmbed.ipynb b/docs/qdrant/Retrieval_with_FastEmbed.ipynb deleted file mode 100644 index f4fb58989..000000000 --- a/docs/qdrant/Retrieval_with_FastEmbed.ipynb +++ /dev/null @@ -1,183 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# โš“๏ธ Retrieval with FastEmbed\n", - "\n", - "This notebook demonstrates how to use FastEmbed to perform vector search and retrieval. It consists of the following sections:\n", - "\n", - "1. Setup: Installing the necessary packages.\n", - "2. Importing Libraries: Importing FastEmbed and other libraries.\n", - "3. Data Preparation: Example data and embedding generation.\n", - "4. Querying: Defining a function to search documents based on a query.\n", - "5. Running Queries: Running example queries.\n", - "\n", - "## Setup\n", - "\n", - "First, we need to install the dependencies. `fastembed` to create embeddings and perform retrieval." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install fastembed --quiet --upgrade" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Importing the necessary libraries:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from fastembed import TextEmbedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation\n", - "We initialize the embedding model and generate embeddings for the documents.\n", - "\n", - "### ๐Ÿ’ก Tip: Prefer using `query_embed` for queries and `passage_embed` for documents." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(384,) 10\n" - ] - } - ], - "source": [ - "# Example list of documents\n", - "documents: list[str] = [\n", - " \"Maharana Pratap was a Rajput warrior king from Mewar\",\n", - " \"He fought against the Mughal Empire led by Akbar\",\n", - " \"The Battle of Haldighati in 1576 was his most famous battle\",\n", - " \"He refused to submit to Akbar and continued guerrilla warfare\",\n", - " \"His capital was Chittorgarh, which he lost to the Mughals\",\n", - " \"He died in 1597 at the age of 57\",\n", - " \"Maharana Pratap is considered a symbol of Rajput resistance against foreign rule\",\n", - " \"His legacy is celebrated in Rajasthan through festivals and monuments\",\n", - " \"He had 11 wives and 17 sons, including Amar Singh I who succeeded him as ruler of Mewar\",\n", - " \"His life has been depicted in various films, TV shows, and books\",\n", - "]\n", - "# Initialize the DefaultEmbedding class with the desired parameters\n", - "embedding_model = TextEmbedding(model_name=\"BAAI/bge-small-en\")\n", - "\n", - "# We'll use the passage_embed method to get the embeddings for the documents\n", - 
"embeddings: list[np.ndarray] = list(\n", - " embedding_model.passage_embed(documents)\n", - ") # notice that we are casting the generator to a list\n", - "\n", - "print(embeddings[0].shape, len(embeddings))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Querying\n", - "\n", - "We'll define a function to print the top k documents based on a query, and prepare a sample query." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "query = \"Who was Maharana Pratap?\"\n", - "query_embedding = list(embedding_model.query_embed(query))[0]\n", - "plain_query_embedding = list(embedding_model.embed(query))[0]\n", - "\n", - "\n", - "def print_top_k(query_embedding, embeddings, documents, k=5):\n", - " # use numpy to calculate the cosine similarity between the query and the documents\n", - " scores = np.dot(embeddings, query_embedding)\n", - " # sort the scores in descending order\n", - " sorted_scores = np.argsort(scores)[::-1]\n", - " # print the top 5\n", - " for i in range(k):\n", - " print(f\"Rank {i+1}: {documents[sorted_scores[i]]}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([-0.06002192, 0.04322132, -0.00545516, -0.04419701, -0.00542277],\n", - " dtype=float32),\n", - " array([-0.06002192, 0.04322132, -0.00545516, -0.04419701, -0.00542277],\n", - " dtype=float32))" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query_embedding[:5], plain_query_embedding[:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `query_embed` is specifically designed for queries, leading to more relevant and context-aware results. 
The retrieved documents tend to align closely with the query's intent.\n", - "\n", - "In contrast, `embed` is a more general-purpose representation that might not capture the nuances of the query as effectively. The retrieved documents using plain embeddings might be less relevant or ordered differently compared to the results obtained using query embeddings.\n", - "\n", - "Conclusion: Using query and passage embeddings leads to more relevant and context-aware results." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fst", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/qdrant/Usage_With_Qdrant.ipynb b/docs/qdrant/Usage_With_Qdrant.ipynb deleted file mode 100644 index d0dcec97c..000000000 --- a/docs/qdrant/Usage_With_Qdrant.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Usage With Qdrant\n", - "\n", - "This notebook demonstrates how to use FastEmbed and Qdrant to perform vector search and retrieval. Qdrant is an open-source vector similarity search engine that is used to store, organize, and query collections of high-dimensional vectors. \n", - "\n", - "We will use the Qdrant to add a collection of documents to the engine and then query the collection to retrieve the most relevant documents.\n", - "\n", - "It consists of the following sections:\n", - "\n", - "1. Setup: Installing necessary packages, including the Qdrant Client and FastEmbed.\n", - "2. Importing Libraries: Importing FastEmbed and other libraries\n", - "3. Data Preparation: Example data and embedding generation\n", - "4. 
Querying: Defining a function to search documents based on a query\n", - "5. Running Queries: Running example queries\n", - "\n", - "## Setup\n", - "\n", - "First, we need to install the dependencies. `fastembed` to create embeddings and perform retrieval, and `qdrant-client` to interact with the Qdrant database." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install 'qdrant-client[fastembed]' --quiet --upgrade" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Importing the necessary libraries:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from qdrant_client import QdrantClient" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation\n", - "We initialize the embedding model and generate embeddings for the documents.\n", - "\n", - "### ๐Ÿ’ก Tip: Prefer using `query_embed` for queries and `passage_embed` for documents." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Example list of documents\n", - "documents: list[str] = [\n", - " \"Maharana Pratap was a Rajput warrior king from Mewar\",\n", - " \"He fought against the Mughal Empire led by Akbar\",\n", - " \"The Battle of Haldighati in 1576 was his most famous battle\",\n", - " \"He refused to submit to Akbar and continued guerrilla warfare\",\n", - " \"His capital was Chittorgarh, which he lost to the Mughals\",\n", - " \"He died in 1597 at the age of 57\",\n", - " \"Maharana Pratap is considered a symbol of Rajput resistance against foreign rule\",\n", - " \"His legacy is celebrated in Rajasthan through festivals and monuments\",\n", - " \"He had 11 wives and 17 sons, including Amar Singh I who succeeded him as ruler of Mewar\",\n", - " \"His life has been depicted in various films, TV shows, and books\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This tutorial demonstrates how to utilize the QdrantClient to add documents to a collection and query the collection for relevant documents." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## โž• Adding Documents\n", - "\n", - "The `add` creates a collection if it does not already exist. 
Now, we can add the documents to the collection:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 77.7M/77.7M [00:05<00:00, 14.6MiB/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "['4fa8b10c78da4b18ba0830ba8a57367a',\n", - " '2eae04b515ee4e9185a9a0e6be812bba',\n", - " 'c6039f88486f47f1835ae3b069c5823c',\n", - " 'c2c8c51e305144d1917b373125fb4d95',\n", - " '79fd23b9ec0648cdab38d1947c6b933e',\n", - " '036aa200d8c3492b8a438e4f825f5e7f',\n", - " 'c35c77f3ea37460a9a13723fb77b7367',\n", - " '6ebccbca571b40d0ab6e83e5e0f2f562',\n", - " '38048c2ccc1d4962a4f8f1bd89c8357a',\n", - " 'c6b09308360140c7b4f106af3658a31e']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "client = QdrantClient(\":memory:\")\n", - "client.add(collection_name=\"test_collection\", documents=documents)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are the ids of the documents we just added. We don't have a use for them in this tutorial, but they can be used to update or delete documents." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“ Running Queries\n", - "We'll define a function to print the top k documents based on a query, and prepare a sample query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[42, 2]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Prepare your documents, metadata, and IDs\n", - "docs = [\"Qdrant has Langchain integrations\", \"Qdrant also has Llama Index integrations\"]\n", - "metadata = [\n", - " {\"source\": \"Langchain-docs\"},\n", - " {\"source\": \"Linkedin-docs\"},\n", - "]\n", - "ids = [42, 2]\n", - "\n", - "# Use the new add method\n", - "client.add(collection_name=\"demo_collection\", documents=docs, metadata=metadata, ids=ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Behind the scenes, Qdrant Client uses the FastEmbed library to make a passage embedding and then uses the Qdrant API to upsert the documents with metadata, put together as a Points into the collection." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[QueryResponse(id=42, embedding=None, metadata={'document': 'Qdrant has Langchain integrations', 'source': 'Langchain-docs'}, document='Qdrant has Langchain integrations', score=0.8276550115796268), QueryResponse(id=2, embedding=None, metadata={'document': 'Qdrant also has Llama Index integrations', 'source': 'Linkedin-docs'}, document='Qdrant also has Llama Index integrations', score=0.8265536935180283)]\n" - ] - } - ], - "source": [ - "search_result = client.query(\n", - " collection_name=\"demo_collection\", query_text=\"This is a query document\"\n", - ")\n", - "print(search_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐ŸŽฌ Conclusion\n", - "\n", - "This tutorial demonstrates the basics of working with the QdrantClient to add and query documents. 
By following this guide, you can easily integrate Qdrant into your projects for vector similarity search and retrieval.\n", - "\n", - "Remember to properly handle the closing of the client connection and further customization of the query parameters according to your specific needs.\n", - "\n", - "The official Qdrant Python client documentation can be found [here](https://github.com/qdrant/qdrant-client) for more details on customization and advanced features." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fst", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/quickstart/dna.md b/docs/quickstart/dna.md new file mode 100644 index 000000000..eb3904839 --- /dev/null +++ b/docs/quickstart/dna.md @@ -0,0 +1,89 @@ +--- +icon: lucide/dna +--- + +# DNA Embedding Quickstart + +Generate embeddings for DNA sequences using the GROVER foundation model. 
+ +## Basic Usage + +```python +from fastembed_bio import DNAEmbedding + +# Initialize the model (downloads on first use) +model = DNAEmbedding("PoetschLab/GROVER") + +# Embed sequences +sequences = ["ATCGATCGATCGATCG", "GCTAGCTAGCTAGCTA"] +embeddings = list(model.embed(sequences)) + +print(f"Shape: {embeddings[0].shape}") # (768,) +``` + +## Single Sequence + +You can pass a single sequence as a string: + +```python +embedding = list(model.embed("ATCGATCGATCGATCG")) +print(len(embedding)) # 1 +``` + +## Batch Processing + +Control batch size for memory efficiency: + +```python +# Large dataset +sequences = ["ATCG" * 100] * 1000 + +# Process in batches of 16 +embeddings = list(model.embed(sequences, batch_size=16)) +``` + +## Lazy Loading + +Defer model loading until first use: + +```python +# Model files are downloaded but not loaded into memory +model = DNAEmbedding("PoetschLab/GROVER", lazy_load=True) + +# Model loads here on first embed call +embeddings = list(model.embed(["ATCGATCG"])) +``` + +## List Available Models + +```python +models = DNAEmbedding.list_supported_models() +for m in models: + print(f"{m['model']}: {m['dim']} dimensions") +``` + +## Get Embedding Dimensions + +```python +# Without loading the model +dim = DNAEmbedding.get_embedding_size("PoetschLab/GROVER") +print(dim) # 768 + +# From an instance +model = DNAEmbedding("PoetschLab/GROVER") +print(model.embedding_size) # 768 +``` + +## Embeddings are Normalized + +All embeddings are L2-normalized (unit length): + +```python +import numpy as np + +embeddings = list(model.embed(["ATCGATCG"])) +norm = np.linalg.norm(embeddings[0]) +print(f"L2 norm: {norm:.4f}") # ~1.0000 +``` + +This makes them ready for cosine similarity comparisons or use with vector databases. 
\ No newline at end of file diff --git a/docs/quickstart/index.md b/docs/quickstart/index.md new file mode 100644 index 000000000..ea7f5706b --- /dev/null +++ b/docs/quickstart/index.md @@ -0,0 +1,29 @@ +--- +icon: lucide/rocket +--- + +# Quickstart Guides + +Get started with biological sequence embeddings in minutes. + +## Available Guides + +
+ +- :lucide-dna: **DNA Embeddings** + + --- + + Generate embeddings for DNA sequences using the GROVER foundation model. + + [:octicons-arrow-right-24: DNA Quickstart](dna.md) + +- :lucide-flask-conical: **Protein Embeddings** + + --- + + Generate embeddings for protein sequences using ESM-2. + + [:octicons-arrow-right-24: Protein Quickstart](protein.md) + +
\ No newline at end of file diff --git a/docs/quickstart/protein.md b/docs/quickstart/protein.md new file mode 100644 index 000000000..f4c810cb8 --- /dev/null +++ b/docs/quickstart/protein.md @@ -0,0 +1,92 @@ +--- +icon: lucide/flask-conical +--- + +# Protein Embedding Quickstart + +Generate embeddings for protein sequences using ESM-2 models. + +## Basic Usage + +```python +from fastembed_bio import ProteinEmbedding + +# Initialize the model (downloads on first use) +model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D") + +# Embed sequences (amino acid strings) +sequences = [ + "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG", + "GKGDPKKPRGKMSSYAFFVQTSREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFEDMAKADKARYEREMKTY", +] +embeddings = list(model.embed(sequences)) + +print(f"Shape: {embeddings[0].shape}") # (480,) +``` + +## Single Sequence + +You can pass a single sequence as a string: + +```python +embedding = list(model.embed("MKTVRQERLKS")) +print(len(embedding)) # 1 +``` + +## Batch Processing + +Control batch size for memory efficiency: + +```python +# Many sequences +sequences = ["MKTVRQERLKS"] * 100 + +# Process in batches +embeddings = list(model.embed(sequences, batch_size=32)) +``` + +## Lazy Loading + +Defer model loading until first use: + +```python +# Model files are downloaded but not loaded into memory +model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D", lazy_load=True) + +# Model loads here on first embed call +embeddings = list(model.embed(["MKTVRQERLKS"])) +``` + +## List Available Models + +```python +models = ProteinEmbedding.list_supported_models() +for m in models: + print(f"{m['model']}: {m['dim']} dimensions") +``` + +## Get Embedding Dimensions + +```python +# Without loading the model +dim = ProteinEmbedding.get_embedding_size("facebook/esm2_t12_35M_UR50D") +print(dim) # 480 + +# From an instance +model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D") +print(model.embedding_size) # 480 +``` + +## Embeddings are Normalized 
+ +All embeddings are L2-normalized (unit length): + +```python +import numpy as np + +embeddings = list(model.embed(["MKTVRQERLKS"])) +norm = np.linalg.norm(embeddings[0]) +print(f"L2 norm: {norm:.4f}") # ~1.0000 +``` + +This makes them ready for cosine similarity comparisons or use with vector databases. \ No newline at end of file diff --git a/experiments/01_ONNX_Port.ipynb b/experiments/01_ONNX_Port.ipynb deleted file mode 100644 index ba7d247a9..000000000 --- a/experiments/01_ONNX_Port.ipynb +++ /dev/null @@ -1,363 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0e9dbcde", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c37e1fda-c7f1-46e7-a5d4-19fa05c36ac1", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from typing import Any\n", - "\n", - "import numpy as np\n", - "import time\n", - "from torch import Tensor\n", - "from transformers import AutoTokenizer, AutoModel\n", - "\n", - "from optimum.onnxruntime import AutoOptimizationConfig, ORTModelForFeatureExtraction, ORTOptimizer\n", - "from optimum.pipelines import pipeline\n", - "import torch.nn.functional as F" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78a65856", - "metadata": {}, - "outputs": [], - "source": [ - "# Load the tokenizer and export the model to the ONNX format\n", - "# model_id = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "# model_id = \"thenlper/gte-base\"\n", - "# model_id = \"intfloat/multilingual-e5-large\"\n", - "model_id = \"BAAI/bge-small-en-v1.5\"\n", - "save_dir = f\"fast-{model_id.split('/')[1]}\"\n", - "print(save_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1ecf0b6-db81-4da3-b47f-e31460ccfbf1", - "metadata": {}, - "outputs": [], - "source": [ - "hf_model = AutoModel.from_pretrained(model_id)\n", - "hf_tokenizer = 
AutoTokenizer.from_pretrained(model_id)\n", - "\n", - "# The input texts can be in any language, not just English.\n", - "# Each input text should start with \"query: \" or \"passage: \", even for non-English texts.\n", - "# For tasks other than retrieval, you can simply use the \"query: \" prefix.\n", - "input_texts = [\n", - " \"query: how much protein should a female eat\",\n", - " \"query: ๅ—็“œ็š„ๅฎถๅธธๅšๆณ•\",\n", - " \"query: เคญเคพเคฐเคค เค•เคพ เคฐเคพเคทเฅเคŸเฅเคฐเฅ€เคฏ เค–เฅ‡เคฒ เค•เฅŒเคจ-เคธเคพ เคนเฅˆ?\", # Hindi text\n", - " \"query: เฐญเฐพเฐฐเฐคเฑ เฐฆเฑ‡เฐถเฐ‚เฐฒเฑ‹ เฐฐเฐพเฐทเฑเฐŸเฑเฐฐเฐชเฐคเฐฟ เฐŽเฐตเฐฐเฑ?\", # Telugu text\n", - " \"query: เฎ‡เฎจเฏเฎคเฎฟเฎฏเฎพเฎตเฎฟเฎฉเฏ เฎคเฏ‡เฎšเฎฟเฎฏ เฎ•เฏ‹เฎชเฏเฎชเฏˆ เฎŽเฎคเฏ?\", # Tamil text\n", - " \"query: เฒญเฒพเฒฐเฒคเฒฆเฒฒเณเฒฒเฒฟ เฒฐเฒพเฒทเณเฒŸเณเฒฐเฒชเฒคเฒฟ เฒฏเฒพเฒฐเณ?\", # Kannada text\n", - " \"query: เด‡เดจเตเดคเตเดฏเดฏเตเดŸเต† เดฐเดพเดทเตเดŸเตเดฐเต€เดฏ เด—เดพเดจเด‚ เดŽเดจเตเดคเดพเดฃเต?\", # Malayalam text\n", - "]\n", - "\n", - "english_texts = [\n", - " \"India: Where the Taj Mahal meets spicy curry.\",\n", - " \"Machine Learning: Turning data into knowledge, one algorithm at a time.\",\n", - " \"Python: The language that makes programming a piece of cake.\",\n", - " \"fastembed: Accelerating embeddings for lightning-fast similarity search.\",\n", - " \"Qdrant: The ultimate tool for high-dimensional indexing and search.\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f8c761c", - "metadata": {}, - "outputs": [], - "source": [ - "def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:\n", - " last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)\n", - " return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]\n", - "\n", - "\n", - "def hf_embed(model_id: str, inputs: list[str]):\n", - " # Tokenize the input texts\n", - " batch_dict = hf_tokenizer(\n", - " inputs, max_length=512, 
padding=True, truncation=True, return_tensors=\"pt\"\n", - " )\n", - "\n", - " outputs = hf_model(**batch_dict)\n", - " embeddings = average_pool(outputs.last_hidden_state, batch_dict[\"attention_mask\"])\n", - "\n", - " # normalize embeddings\n", - " embeddings = F.normalize(embeddings, p=2, dim=1)\n", - " return embeddings.detach().numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69bb4501", - "metadata": {}, - "outputs": [], - "source": [ - "hf_embed(inputs=english_texts, model_id=model_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "451dbd16", - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", - "model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)\n", - "\n", - "# Remove all existing files in the save_dir using Path.unlink()\n", - "save_dir = Path(save_dir)\n", - "save_dir.mkdir(parents=True, exist_ok=True)\n", - "for p in save_dir.iterdir():\n", - " p.unlink()\n", - "\n", - "# Load the optimization configuration detailing the optimization we wish to apply\n", - "optimization_config = AutoOptimizationConfig.O4()\n", - "optimizer = ORTOptimizer.from_pretrained(model)\n", - "\n", - "optimizer.optimize(\n", - " save_dir=save_dir, optimization_config=optimization_config, use_external_data_format=True\n", - ")\n", - "model = ORTModelForFeatureExtraction.from_pretrained(save_dir)\n", - "\n", - "tokenizer.save_pretrained(save_dir)\n", - "# model.save_pretrained(save_dir)\n", - "# model.push_to_hub(\"new_path_for_directory\", repository_id=\"my-onnx-repo\", use_auth_token=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8422cddd", - "metadata": {}, - "outputs": [], - "source": [ - "onnx_quant_embed = pipeline(\n", - " \"feature-extraction\", model=model, accelerator=\"ort\", tokenizer=tokenizer, return_tensors=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"51fa5775", - "metadata": {}, - "outputs": [], - "source": [ - "embeddings = onnx_quant_embed(inputs=english_texts)\n", - "F.normalize(embeddings[4])[:, 0], english_texts[4], len(embeddings), len(english_texts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df405d70", - "metadata": {}, - "outputs": [], - "source": [ - "def measure_pipeline_time(\n", - " pipeline, input_texts: list[str], num_runs=10, **kwargs: Any\n", - ") -> tuple[float, float]:\n", - " \"\"\"Measures the time it takes to run the pipeline on the input texts.\"\"\"\n", - " times = []\n", - " total_chars = sum(len(text) for text in input_texts)\n", - " for _ in range(num_runs):\n", - " start_time = time.time()\n", - " _ = pipeline(inputs=input_texts, **kwargs)\n", - " end_time = time.time()\n", - " times.append(end_time - start_time)\n", - "\n", - " mean_time = np.mean(times)\n", - " std_dev = np.std(times)\n", - " chars_per_second = total_chars / mean_time\n", - " return mean_time, std_dev, chars_per_second" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2d72aba5", - "metadata": {}, - "source": [ - "# Ours" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b881152", - "metadata": {}, - "outputs": [], - "source": [ - "_, _, chars_per_sec = measure_pipeline_time(onnx_quant_embed, input_texts)\n", - "print(f\"Multilingual Speed: {chars_per_sec:.2f} chars/sec\")\n", - "_, _, chars_per_sec = measure_pipeline_time(onnx_quant_embed, english_texts)\n", - "print(f\"English Speed: {chars_per_sec:.2f} chars/sec\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "49e1daf8", - "metadata": {}, - "source": [ - "# Original" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61b3bf53", - "metadata": {}, - "outputs": [], - "source": [ - "_, _, chars_per_sec = measure_pipeline_time(hf_embed, input_texts=input_texts, model_id=model_id)\n", - "print(f\"Multilingual Speed: {chars_per_sec:.2f} 
chars/sec\")\n", - "_, _, chars_per_sec = measure_pipeline_time(hf_embed, input_texts=english_texts, model_id=model_id)\n", - "print(f\"English Speed: {chars_per_sec:.2f} chars/sec\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f0b7da8f-ffe7-4f58-95dd-7e9836f19328", - "metadata": {}, - "source": [ - "# Compress & Upload\n", - "\n", - "## Compress" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "578b1d74", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pathlib import Path\n", - "import tarfile\n", - "\n", - "save_dir = Path(\"../local_cache/fast-bge-small-en-v1.5\")\n", - "\n", - "\n", - "def compress(directory_path):\n", - " directory_path = Path(directory_path)\n", - " assert directory_path.exists(), f\"{directory_path} does not exist\"\n", - " output_filename = directory_path.name + \".tar.gz\"\n", - " if Path(output_filename).exists():\n", - " print(\"We've an output file already? Manually delete that first\")\n", - " return output_filename\n", - "\n", - " with tarfile.open(output_filename, \"w:gz\") as tar:\n", - " tar.add(directory_path, arcname=os.path.basename(directory_path))\n", - " return output_filename\n", - "\n", - "\n", - "compressed_file_name = compress(save_dir)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "96cdf140-eca8-4778-9ebe-947988b4cfcb", - "metadata": {}, - "source": [ - "## Upload to Qdrant Google Cloud Storage" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1dab9595", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniconda/base/envs/fst/lib/python3.9/site-packages/google/auth/_default.py:76: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. 
See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File fast-bge-small-en-v1.5.tar.gz uploaded to qdrant-fastembed.\n" - ] - } - ], - "source": [ - "from google.cloud import storage\n", - "\n", - "\n", - "def upload(bucket_name, source_file_path):\n", - " storage_client = storage.Client(project=\"main\")\n", - " bucket = storage_client.bucket(bucket_name)\n", - " blob = bucket.blob(os.path.basename(source_file_path))\n", - "\n", - " blob.upload_from_filename(source_file_path)\n", - "\n", - " print(f\"File {source_file_path} uploaded to {bucket_name}.\")\n", - "\n", - "\n", - "upload(\"qdrant-fastembed\", source_file_path=compressed_file_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "731554f0", - "metadata": {}, - "outputs": [], - "source": [ - "# Remove the directory and compressed file\n", - "!rm -rvf {save_dir}\n", - "!rm -vf {save_dir}.tar.gz" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/experiments/02_SPLADE_to_ONNX.ipynb b/experiments/02_SPLADE_to_ONNX.ipynb deleted file mode 100644 index aa57e18f8..000000000 --- a/experiments/02_SPLADE_to_ONNX.ipynb +++ /dev/null @@ -1,371 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import torch\n", - "from transformers import AutoModelForMaskedLM, AutoTokenizer" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the model with Transformers and Torch" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "sentences = [\n", - " \"Hello World\",\n", - " \"Built by Nirant Kasliwal\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PyTorch Code from the [SPLADERunner](https://github.com/PrithivirajDamodaran/SPLADERunner) library" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "hf_token = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Output Logits shape: torch.Size([2, 10, 30522])\n", - "Output Attention mask shape: torch.Size([2, 10])\n", - "Sparse Vector shape: torch.Size([2, 30522])\n", - "SPLADE BOW rep for sentence:\tBuilt by Nirant Kasliwal\n", - "[('##rant', 2.02), ('built', 1.94), ('##wal', 1.79), ('##sl', 1.69), ('build', 1.57), ('ka', 1.4), ('ni', 1.26), ('made', 0.93), ('architect', 0.76), ('was', 0.69), ('who', 0.61), ('his', 0.5), ('wrote', 0.47), ('india', 0.45), ('company', 0.41), ('##i', 0.41), ('he', 0.37), ('manufacturer', 0.36), ('by', 0.35), ('engineer', 0.33), ('architecture', 0.33), ('ko', 0.23), ('him', 0.22), ('invented', 0.19), ('said', 0.14), ('k', 0.11), ('man', 0.11), ('statue', 0.11), ('bomb', 0.1), ('##wa', 0.1), ('builder', 0.09), ('.', 0.07), ('started', 0.06), (',', 0.04), ('ku', 0.03)]\n" - ] - } - ], - "source": [ - "# Download the model and tokenizer\n", - "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", - "tokenizer = AutoTokenizer.from_pretrained(\"prithivida/Splade_PP_en_v1\", token=hf_token)\n", - "reverse_voc = {v: k for k, v in tokenizer.vocab.items()}\n", - "model = AutoModelForMaskedLM.from_pretrained(\"prithivida/Splade_PP_en_v1\", token=hf_token)\n", - 
"model.to(device)\n", - "\n", - "# Tokenize the input\n", - "inputs = tokenizer(sentences, return_tensors=\"pt\", padding=True, truncation=True, max_length=512)\n", - "inputs = {key: val.to(device) for key, val in inputs.items()}\n", - "input_ids = inputs[\"input_ids\"]\n", - "attention_mask = inputs[\"attention_mask\"]\n", - "token_type_ids = inputs[\"token_type_ids\"]\n", - "\n", - "# Run model and prepare sparse vector\n", - "outputs = model(**inputs)\n", - "logits = outputs.logits\n", - "print(\"Output Logits shape: \", logits.shape)\n", - "print(\"Output Attention mask shape: \", attention_mask.shape)\n", - "relu_log = torch.log(1 + torch.relu(logits))\n", - "weighted_log = relu_log * attention_mask.unsqueeze(-1)\n", - "max_val, _ = torch.max(weighted_log, dim=1)\n", - "vector = max_val.squeeze()\n", - "print(\"Sparse Vector shape: \", vector.shape)\n", - "# print(\"Number of Actual Dimensions: \", len(cols))\n", - "cols = [vec.nonzero().squeeze().cpu().tolist() for vec in vector]\n", - "weights = [vec[col].cpu().tolist() for vec, col in zip(vector, cols)]\n", - "\n", - "idx = 1\n", - "cols, weights = cols[idx], weights[idx]\n", - "# Print the BOW representation\n", - "d = {k: v for k, v in zip(cols, weights)}\n", - "sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}\n", - "bow_rep = []\n", - "for k, v in sorted_d.items():\n", - " bow_rep.append((reverse_voc[k], round(v, 2)))\n", - "print(f\"SPLADE BOW rep for sentence:\\t{sentences[idx]}\\n{bow_rep}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Export with output_attentions and logits" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Exporting model to models/nirantk_SPLADE_PP_en_v1\n" - ] - }, - { - "data": { - "text/plain": [ - "('models/nirantk_SPLADE_PP_en_v1/tokenizer_config.json',\n", - " 
'models/nirantk_SPLADE_PP_en_v1/special_tokens_map.json',\n", - " 'models/nirantk_SPLADE_PP_en_v1/vocab.txt',\n", - " 'models/nirantk_SPLADE_PP_en_v1/added_tokens.json',\n", - " 'models/nirantk_SPLADE_PP_en_v1/tokenizer.json')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from transformers import AutoTokenizer\n", - "\n", - "model_id = \"nirantk/SPLADE_PP_en_v1\"\n", - "output_dir = f\"models/{model_id.replace('/', '_')}\"\n", - "model_kwargs = {\"output_attentions\": True, \"return_dict\": True}\n", - "\n", - "print(f\"Exporting model to {output_dir}\")\n", - "tokenizer.save_pretrained(output_dir)\n", - "# main_export(\n", - "# model_id,\n", - "# output=output_dir,\n", - "# no_post_process=True,\n", - "# model_kwargs=model_kwargs,\n", - "# token=hf_token,\n", - "# )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the model with ONNX" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from optimum.onnxruntime import ORTModelForMaskedLM\n", - "\n", - "model = ORTModelForMaskedLM.from_pretrained(\"nirantk/SPLADE_PP_en_v1\")\n", - "tokenizer = AutoTokenizer.from_pretrained(\"nirantk/SPLADE_PP_en_v1\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "inputs = tokenizer(sentences, return_tensors=\"pt\", padding=True, truncation=True, max_length=512)\n", - "inputs = {key: val.to(device) for key, val in inputs.items()}\n", - "input_ids = inputs[\"input_ids\"]\n", - "attention_mask = inputs[\"attention_mask\"]\n", - "token_type_ids = inputs[\"token_type_ids\"]\n", - "\n", - "onnx_input = {\n", - " \"input_ids\": input_ids.cpu().numpy(),\n", - " \"attention_mask\": attention_mask.cpu().numpy(),\n", - " \"token_type_ids\": token_type_ids.cpu().numpy(),\n", - "}\n", - "\n", - "logits = model(**onnx_input).logits" - ] - }, - { - "cell_type": "code", - 
"execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2, 10, 30522)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logits.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Output Logits shape: (2, 10, 30522)\n", - "Sparse Vector shape: (2, 30522)\n", - "SPLADE BOW rep for sentence:\tBuilt by Nirant Kasliwal\n", - "[('##rant', 2.02), ('built', 1.94), ('##wal', 1.79), ('##sl', 1.69), ('build', 1.57), ('ka', 1.4), ('ni', 1.26), ('made', 0.93), ('architect', 0.76), ('was', 0.69), ('who', 0.61), ('his', 0.5), ('wrote', 0.47), ('india', 0.45), ('company', 0.41), ('##i', 0.41), ('he', 0.37), ('manufacturer', 0.36), ('by', 0.35), ('engineer', 0.33), ('architecture', 0.33), ('ko', 0.23), ('him', 0.22), ('invented', 0.19), ('said', 0.14), ('k', 0.11), ('man', 0.11), ('statue', 0.11), ('bomb', 0.1), ('##wa', 0.1), ('builder', 0.09), ('.', 0.07), ('started', 0.06), (',', 0.04), ('ku', 0.03)]\n" - ] - } - ], - "source": [ - "print(\"Output Logits shape: \", logits.shape)\n", - "\n", - "relu_log = np.log(1 + np.maximum(logits, 0))\n", - "\n", - "# Equivalent to relu_log * attention_mask.unsqueeze(-1)\n", - "# For NumPy, you might need to explicitly expand dimensions if 'attention_mask' is not already 2D\n", - "weighted_log = relu_log * np.expand_dims(attention_mask, axis=-1)\n", - "\n", - "# Equivalent to torch.max(weighted_log, dim=1)\n", - "# NumPy's max function returns only the max values, not the indices, so we don't need to unpack two values\n", - "max_val = np.max(weighted_log, axis=1)\n", - "\n", - "# Equivalent to max_val.squeeze()\n", - "# This step may be unnecessary in NumPy if max_val doesn't have unnecessary dimensions\n", - "vector = np.squeeze(max_val)\n", - "print(\"Sparse Vector shape: \", vector.shape)\n", - "\n", - "# 
print(vector[0].nonzero())\n", - "\n", - "cols = [vec.nonzero()[0].squeeze().tolist() for vec in vector]\n", - "weights = [vec[col].tolist() for vec, col in zip(vector, cols)]\n", - "\n", - "idx = 1\n", - "cols, weights = cols[idx], weights[idx]\n", - "# Print the BOW representation\n", - "d = {k: v for k, v in zip(cols, weights)}\n", - "sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}\n", - "bow_rep = []\n", - "for k, v in sorted_d.items():\n", - " bow_rep.append((reverse_voc[k], round(v, 2)))\n", - "print(f\"SPLADE BOW rep for sentence:\\t{sentences[idx]}\\n{bow_rep}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "35" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(cols)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1010,\n", - " 1012,\n", - " 1047,\n", - " 2001,\n", - " 2002,\n", - " 2010,\n", - " 2011,\n", - " 2032,\n", - " 2040,\n", - " 2056,\n", - " 2072,\n", - " 2081,\n", - " 2158,\n", - " 2194,\n", - " 2318,\n", - " 2328,\n", - " 2626,\n", - " 2634,\n", - " 3857,\n", - " 3992,\n", - " 4213,\n", - " 4294,\n", - " 4944,\n", - " 5968,\n", - " 6231,\n", - " 7751,\n", - " 8826,\n", - " 9152,\n", - " 10556,\n", - " 12508,\n", - " 12849,\n", - " 13476,\n", - " 13970,\n", - " 14540,\n", - " 17884]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cols" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fst", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/experiments/Example. Convert Resnet50 to ONNX.ipynb b/experiments/Example. Convert Resnet50 to ONNX.ipynb deleted file mode 100644 index 84da59e79..000000000 --- a/experiments/Example. Convert Resnet50 to ONNX.ipynb +++ /dev/null @@ -1,122 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "4bdb2a91-fa2a-4cee-ad5a-176cc957394d", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-23T12:15:28.171586Z", - "start_time": "2024-05-23T12:15:28.076314Z" - } - }, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'torch'", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[1], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01monnx\u001B[39;00m\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorchvision\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmodels\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mmodels\u001B[39;00m\n", - "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'torch'" - ] - } - ], - "source": [ - "import torch\n", - "import torch.onnx\n", - "import torchvision.models as models\n", - "import torchvision.transforms as transforms\n", - "from PIL import Image\n", - "import numpy as np\n", - "from tests.config import TEST_MISC_DIR\n", - "\n", - "# Load pre-trained ResNet-50 model\n", - "resnet = models.resnet50(pretrained=True)\n", - "resnet = 
torch.nn.Sequential(*(list(resnet.children())[:-1])) # Remove the last fully connected layer\n", - "resnet.eval()\n", - "\n", - "# Define preprocessing transform\n", - "preprocess = transforms.Compose([\n", - " transforms.Resize(256),\n", - " transforms.CenterCrop(224),\n", - " transforms.ToTensor(),\n", - " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", - "])\n", - "\n", - "# Load and preprocess the image\n", - "def preprocess_image(image_path):\n", - " input_image = Image.open(image_path)\n", - " input_tensor = preprocess(input_image)\n", - " input_batch = input_tensor.unsqueeze(0) # Add batch dimension\n", - " return input_batch\n", - "\n", - "# Example input for exporting\n", - "input_image = preprocess_image('example.jpg')\n", - "\n", - "# Export the model to ONNX with dynamic axes\n", - "torch.onnx.export(\n", - " resnet, \n", - " input_image, \n", - " \"model.onnx\", \n", - " export_params=True, \n", - " opset_version=9, \n", - " input_names=['input'], \n", - " output_names=['output'],\n", - " dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}\n", - ")\n", - "\n", - "# Load ONNX model\n", - "import onnx\n", - "import onnxruntime as ort\n", - "\n", - "onnx_model = onnx.load(\"model.onnx\")\n", - "ort_session = ort.InferenceSession(\"model.onnx\")\n", - "\n", - "# Run inference and extract feature vectors\n", - "def extract_feature_vectors(image_paths):\n", - " input_images = [preprocess_image(image_path) for image_path in image_paths]\n", - " input_batch = torch.cat(input_images, dim=0) # Combine images into a single batch\n", - " ort_inputs = {ort_session.get_inputs()[0].name: input_batch.numpy()}\n", - " ort_outs = ort_session.run(None, ort_inputs)\n", - " return ort_outs[0]\n", - "\n", - "# Example usage\n", - "images = [TEST_MISC_DIR / \"image.jpeg\", str(TEST_MISC_DIR / \"small_image.jpeg\")] # Replace with your image paths\n", - "feature_vectors = extract_feature_vectors(images)\n", - 
"print(\"Feature vector shape:\", feature_vectors.shape)\n" - ] - }, - { - "cell_type": "code", - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "id": "baa650c4cb3e0e6d" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/experiments/Throughput_Across_Models.ipynb b/experiments/Throughput_Across_Models.ipynb deleted file mode 100644 index 1befa732c..000000000 --- a/experiments/Throughput_Across_Models.ipynb +++ /dev/null @@ -1,349 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐Ÿค— Huggingface vs โšก FastEmbed๏ธ\n", - "\n", - "Comparing the performance of Huggingface's ๐Ÿค— Transformers and โšก FastEmbed๏ธ on a simple task on the following machine: Apple M2 Max, 32 GB RAM\n", - "\n", - "## ๐Ÿ“ฆ Imports\n", - "\n", - "Importing the necessary libraries for this comparison." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:33:35.753669Z", - "start_time": "2024-03-30T00:33:34.371658Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/joein/work/qdrant/fastembed/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import time\n", - "from typing import Callable\n", - "\n", - "import torch.nn.functional as F\n", - "from fastembed import TextEmbedding\n", - "import matplotlib.pyplot as plt\n", - "from transformers import AutoModel, AutoTokenizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“– Data\n", - "\n", - "data is a list of strings, each string is a document." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:33:35.766679Z", - "start_time": "2024-03-30T00:33:35.755112Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "12" - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "documents: list[str] = [\n", - " \"Chandrayaan-3 is India's third lunar mission\",\n", - " \"It aimed to land a rover on the Moon's surface - joining the US, China and Russia\",\n", - " \"The mission is a follow-up to Chandrayaan-2, which had partial success\",\n", - " \"Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)\",\n", - " \"The estimated cost of the mission is around $35 million\",\n", - " \"It will carry instruments to study the lunar surface and atmosphere\",\n", - " \"Chandrayaan-3 landed on the Moon's surface on 23rd August 2023\",\n", - " \"It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. 
Its propulsion module would act like an orbiter.\",\n", - " \"The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit\",\n", - " \"The mission used GSLV Mk III rocket for its launch\",\n", - " \"Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota\",\n", - " \"Chandrayaan-3 was launched earlier in the year 2023\",\n", - "]\n", - "len(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:33:35.766791Z", - "start_time": "2024-03-30T00:33:35.756803Z" - } - }, - "outputs": [], - "source": [ - "model_id = \"BAAI/bge-small-en\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up ๐Ÿค— Huggingface\n", - "\n", - "We'll be using the [Huggingface Transformers](https://huggingface.co/transformers/) with PyTorch library to generate embeddings. We'll be using the same model across both libraries for a fair(er?) comparison." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:34:03.988Z", - "start_time": "2024-03-30T00:33:37.460865Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "config.json: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 684/684 [00:00<00:00, 491kB/s]\n", - "model.safetensors: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 133M/133M [00:21<00:00, 6.24MB/s] \n", - "tokenizer_config.json: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 366/366 [00:00<00:00, 4.06MB/s]\n", - "vocab.txt: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 232k/232k [00:00<00:00, 1.12MB/s]\n", - "tokenizer.json: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 711k/711k [00:00<00:00, 1.59MB/s]\n", - "special_tokens_map.json: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 125/125 [00:00<00:00, 399kB/s]\n" - ] - }, - { - "data": { - "text/plain": "torch.Size([12, 384])" - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "class HF:\n", - " \"\"\"\n", - " HuggingFace Transformer implementation of FlagEmbedding\n", - " Based on https://huggingface.co/BAAI/bge-base-en\n", - " \"\"\"\n", - "\n", - " def __init__(self, model_id: str):\n", - " self.model = AutoModel.from_pretrained(model_id)\n", - " self.tokenizer = AutoTokenizer.from_pretrained(model_id)\n", - "\n", - " def embed(self, texts: list[str]):\n", - " encoded_input = self.tokenizer(\n", - " texts, max_length=512, padding=True, truncation=True, return_tensors=\"pt\"\n", - " )\n", - " model_output = self.model(**encoded_input)\n", - " sentence_embeddings = model_output[0][:, 0]\n", - " sentence_embeddings = F.normalize(sentence_embeddings)\n", - " return sentence_embeddings\n", - "\n", - "\n", - "hf = HF(model_id=model_id)\n", - "hf.embed(documents).shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up โšก๏ธFastEmbed\n", - "\n", - "Sorry, don't have a lot to set up here. 
We'll be using the default model, which is Flag Embedding, same as the Huggingface model." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:34:04.076422Z", - "start_time": "2024-03-30T00:34:03.987162Z" - } - }, - "outputs": [], - "source": [ - "embedding_model = TextEmbedding(model_name=model_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“Š Comparison\n", - "\n", - "We'll be comparing the following metrics: Minimum, Maximum, Mean, across k runs. Let's write a function to do that:\n", - "\n", - "### ๐Ÿš€ Calculating Stats" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:34:06.543782Z", - "start_time": "2024-03-30T00:34:06.357816Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Huggingface Transformers (Average, Max, Min): (0.05358994007110596, 0.0568850040435791, 0.05029487609863281)\n", - "FastEmbed (Average, Max, Min): (0.035953521728515625, 0.03631591796875, 0.03559112548828125)\n" - ] - } - ], - "source": [ - "import types\n", - "\n", - "\n", - "def calculate_time_stats(\n", - " embed_func: Callable, documents: list, k: int\n", - ") -> tuple[float, float, float]:\n", - " times = []\n", - " for _ in range(k):\n", - " # Timing the embed_func call\n", - " start_time = time.time()\n", - " embeddings = embed_func(documents)\n", - " # Force computation if embed_func returns a generator\n", - " if isinstance(embeddings, types.GeneratorType):\n", - " list(embeddings)\n", - "\n", - " end_time = time.time()\n", - " times.append(end_time - start_time)\n", - "\n", - " # Returning mean, max, and min time for the call\n", - " return (sum(times) / k, max(times), min(times))\n", - "\n", - "\n", - "hf_stats = calculate_time_stats(hf.embed, documents, k=2)\n", - "print(f\"Huggingface Transformers (Average, Max, Min): {hf_stats}\")\n", - "fst_stats = 
calculate_time_stats(embedding_model.embed, documents, k=2)\n", - "print(f\"FastEmbed (Average, Max, Min): {fst_stats}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“ˆ Results\n", - "\n", - "Let's run the comparison and see the results." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2024-03-30T00:34:11.032206Z", - "start_time": "2024-03-30T00:34:10.828410Z" - } - }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGzCAYAAAAyiiOsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/H5lhTAAAACXBIWXMAAA9hAAAPYQGoP6dpAABihklEQVR4nO3dd1gUV9sG8HtB6U2QIoKgqAiKJdiwFwTsLTEqiaCoiYI1RuMbW4ohaowaNZYkrxqjid1EjVjA3kWxY0ERG0UFVkBA2PP94ce8rgs6q4tszP27rr10zzxz5plhd3mYOXNWIYQQICIiIqIXMijrBIiIiIj+CVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkYNFEREREJAOLJiIiIiIZWDRRqVIoFIiIiCjrNIgAPH09Tps2razTIADu7u4IDQ0t6zT+EUJDQ+Hu7l7WaRBYNNErSkhIwEcffYRq1arBxMQEVlZWaN68OebNm4fHjx+XdXqv7e7du5g2bRri4uLKOhU106ZNg0KhkB5mZmbw9vbGpEmToFQqyzo90qHExEQMHDgQHh4eMDExgZOTE1q1aoWpU6eWdWpvXG5uLubMmYMmTZrA2toaJiYmqFmzJiIiInDlypWyTo/+RcqVdQL0z7Nt2za89957MDY2xoABA1CnTh3k5+fj4MGD+PTTT3HhwgUsXbq0rNN8LXfv3sUXX3wBd3d31K9fv6zT0bBo0SJYWFggKysLO3fuxPTp0xETE4NDhw5BoVCUdXr0mq5du4ZGjRrB1NQUgwYNgru7O+7du4dTp05hxowZ+OKLL8o6xTfm/v37CAoKQmxsLLp06YL+/fvDwsICly9fxh9//IGlS5ciPz+/rNMsVT/99BNUKlVZp0Fg0URaunHjBvr27Qs3NzfExMSgUqVK0rLw8HBcu3YN27Zte6M5ZWdnw9zc/I1u81XpKtd3330XFStWBAB8/PHH6N27NzZu3IijR4/Cz8+v2HVycnJgZmb22tsm3XjRa2HOnDnIyspCXFwc3Nzc1Jalpqa+ifT0RmhoKE6fPo3169ejd+/easu++uorfP7552WUWekreo2UL1++rFOh/8fLc6SVmTNnIisrC7/88otawVSkevXqGDVqlEb75s2bUadOHRgbG6N27dqIiopSW37z5k0MHz4cnp6eMDU1hZ2dHd577z0kJiaqxS1fvhwKhQL79u3D8OHD4eDgABcXF636AICMjAyMGTMG7u7uMDY2houLCwYMGID79+9j7969aNSoEQBg4MCB0qWw5cuXS+sfO3YMQUFBsLa2hpmZGVq3bo1Dhw6pbaPoUtrFixfRv39/VKhQAS1atAAAJCcnY+DAgXBxcYGxsTEqVaqE7t27F5urHO3atQPwtKgFgDZt2qBOnTqIjY1Fq1atYGZmhv/85z8Anv7SDQsLg6OjI0xMTFCvXj2sWLFCo0+VSo
V58+bBx8cHJiYmsLe3R1BQEE6ePKkW99tvv8HX1xempqawtbVF3759cevWLbWYq1evonfv3nBycoKJiQlcXFzQt29fZGZmSjG7du1CixYtYGNjAwsLC3h6eko5F8nLy8PUqVNRvXp1GBsbw9XVFePHj0deXp5G3JgxY2Bvbw9LS0t069YNt2/flnUs9+7dC4VCgTVr1uA///kPnJycYG5ujm7dumnsF/D6r4XiJCQkwMXFRaNgAgAHBweNtu3bt6Nly5YwNzeHpaUlOnfujAsXLmjExcfHo0+fPrC3t4epqSk8PT01io7Tp0+jY8eOsLKygoWFBdq3b4+jR4+qxRS9Dw8dOoSxY8fC3t4e5ubm6NmzJ9LS0tRihRD4+uuv4eLiAjMzM7Rt27bY3Ipz7NgxbNu2DWFhYRoFEwAYGxvju+++U2uLiYmRjoWNjQ26d++OS5cuqcUU/TyuXLmCDz74ANbW1rC3t8fkyZMhhMCtW7fQvXt3WFlZwcnJCbNnz1ZbX5vXyIEDB/Dee++hSpUq0mt2zJgxGsMYQkNDYWFhgYSEBHTq1AmWlpYIDg6Wlj0/pumPP/6Ar68vLC0tYWVlBR8fH8ybN08t5vr163jvvfdga2sLMzMzNG3aVOOP2qJ9Wbt2LaZPnw4XFxeYmJigffv2uHbtWgk/mX8vnmkirWzZsgXVqlVDs2bNZK9z8OBBbNy4EcOHD4elpSV++OEH9O7dG0lJSbCzswMAnDhxAocPH0bfvn3h4uKCxMRELFq0CG3atMHFixc1zpAMHz4c9vb2mDJlCrKzs7XqIysrCy1btsSlS5cwaNAgvPPOO7h//z7++usv3L59G15eXvjyyy8xZcoUDB06FC1btgQAaZ9jYmLQsWNH+Pr6YurUqTAwMMCyZcvQrl07HDhwAI0bN1bL9b333kONGjXwzTffQAgBAOjduzcuXLiAESNGwN3dHampqdi1axeSkpJeacBnQkICAEjHEwAePHiAjh07om/fvvjggw/g6OiIx48fo02bNrh27RoiIiJQtWpVrFu3DqGhocjIyFAreMPCwrB8+XJ07NgRgwcPRkFBAQ4cOICjR4+iYcOGAIDp06dj8uTJ6NOnDwYPHoy0tDTMnz8frVq1wunTp2FjY4P8/HwEBgYiLy8PI0aMgJOTE+7cuYOtW7ciIyMD1tbWuHDhArp06YK6deviyy+/hLGxMa5du6ZWfKhUKnTr1g0HDx7E0KFD4eXlhXPnzmHOnDm4cuUKNm/eLMUOHjwYv/32G/r3749mzZohJiYGnTt31uqYTp8+HQqFAhMmTEBqairmzp0Lf39/xMXFwdTUFIBuXgvFcXNzw+7duxETEyMVxCVZuXIlQkJCEBgYiBkzZiAnJweLFi1CixYtcPr0aen1dPbsWbRs2RLly5fH0KFD4e7ujoSEBGzZsgXTp08HAFy4cAEtW7aElZUVxo8fj/Lly2PJkiVo06YN9u3bhyZNmqhte8SIEahQoQKmTp2KxMREzJ07FxEREVizZo0UM2XKFHz99dfo1KkTOnXqhFOnTiEgIEDWJbW//voLAPDhhx++NBYAdu/ejY4dO6JatWqYNm0aHj9+jPnz56N58+Y4deqUxnvr/fffh5eXF7799lts27YNX3/9NWxtbbFkyRK0a9cOM2bMwKpVqzBu3Dg0atQIrVq1Ultfzmtk3bp1yMnJwbBhw2BnZ4fjx49j/vz5uH37NtatW6fWX0FBAQIDA9GiRQt89913JZ4Z3rVrF/r164f27dtjxowZAIBLly7h0KFD0ns4JSUFzZo1Q05ODkaOHAk7OzusWLEC3bp1w/r169GzZ0+1Pr/99lsYGBhg3LhxyMzMxMyZMxEcHIxjx47JOvb/GoJIpszMTAFAdO/eXfY6AISRkZG4du2a1HbmzBkBQMyfP19qy8nJ0Vj3yJEjAoD49ddfpbZly5YJAKJFixaioKBALV5uH1OmTBEAxMaNGzXiVSqVEEKIEydOCA
Bi2bJlGstr1KghAgMDpdiibVetWlV06NBBaps6daoAIPr166fWR3p6ugAgZs2apbH9lynq8/LlyyItLU3cuHFDLFmyRBgbGwtHR0eRnZ0thBCidevWAoBYvHix2vpz584VAMRvv/0mteXn5ws/Pz9hYWEhlEqlEEKImJgYAUCMHDmyxGOUmJgoDA0NxfTp09WWnzt3TpQrV05qP336tAAg1q1bV+J+zZkzRwAQaWlpJcasXLlSGBgYiAMHDqi1L168WAAQhw4dEkIIERcXJwCI4cOHq8X1799fABBTp04tcRtCCLFnzx4BQFSuXFk6HkIIsXbtWgFAzJs3TzoOr/taKMn58+eFqampACDq168vRo0aJTZv3iz9fIs8evRI2NjYiCFDhqi1JycnC2tra7X2Vq1aCUtLS3Hz5k212Gdz79GjhzAyMhIJCQlS2927d4WlpaVo1aqV1Fb0PvT391dbf8yYMcLQ0FBkZGQIIYRITU0VRkZGonPnzmpx//nPfwQAERIS8sLj0LNnTwFApKenvzCuSP369YWDg4N48OCB1HbmzBlhYGAgBgwYILUV/TyGDh0qtRUUFAgXFxehUCjEt99+K7Wnp6cLU1NTtVzlvkaEKP5zKTIyUigUCrWfRUhIiAAgPvvsM434kJAQ4ebmJj0fNWqUsLKy0vgMfNbo0aMFALX3y6NHj0TVqlWFu7u7KCwsVNsXLy8vkZeXJ8XOmzdPABDnzp0rcRv/Rrw8R7IV3Z1laWmp1Xr+/v7w8PCQntetWxdWVla4fv261Fb0VxkAPHnyBA8ePED16tVhY2ODU6dOafQ5ZMgQGBoaqrXJ7WPDhg2oV6+exl9aAF46iDouLg5Xr15F//798eDBA9y/fx/3799HdnY22rdvj/3792sM2Pz444818jQyMsLevXuRnp7+wu2VxNPTE/b29qhatSo++ugjVK9eHdu2bVP7y9TY2BgDBw5UW+/vv/+Gk5MT+vXrJ7WVL18eI0eORFZWFvbt2wfg6TFSKBTF3qlVdIw2btwIlUqFPn36SMfh/v37cHJyQo0aNbBnzx4AgLW1NQBgx44dyMnJKXZ/bGxsAAB//vlniQNe161bBy8vL9SqVUtte0VnYoq29/fffwMARo4cqbb+6NGji+23JAMGDFB7rb/77ruoVKmS1L8uXgslqV27NuLi4vDBBx8gMTER8+bNQ48ePeDo6IiffvpJitu1axcyMjLQr18/tWNiaGiIJk2aSMckLS0N+/fvx6BBg1ClShW1bRX9PAsLC7Fz50706NED1apVk5ZXqlQJ/fv3x8GDBzXu0Bw6dKjae6Zly5YoLCzEzZs3ATw985Ofn48RI0aoxcn9WWjzmXPv3j3ExcUhNDQUtra2UnvdunXRoUMH6ef2rMGDB0v/NzQ0RMOGDSGEQFhYmNRuY2MDT09Ptc+rIi97jQDqn0vZ2dm4f/8+mjVrBiEETp8+rdHnsGHDXrqvNjY2yM7Oxq5du0qM+fvvv9G4cWO1y8AWFhYYOnQoEhMTcfHiRbX4gQMHwsjISHpedIa9uP3+N+PlOZLNysoKAPDo0SOt1nv+QxoAKlSooFYwPH78GJGRkVi2bBnu3Lmjduni2XEvRapWrarRJrePhISEYsdHyHH16lUAQEhISIkxmZmZqFChQom5GhsbY8aMGfjkk0/g6OiIpk2bokuXLhgwYACcnJxk5bFhwwZYWVmhfPnycHFxUStKi1SuXFntQxB4Ou6rRo0aMDBQ/3vJy8tLWg48PUbOzs5qv3yed/XqVQghUKNGjWKXFw1erVq1KsaOHYvvv/8eq1atQsuWLdGtWzdpLAnw9DLJzz//jMGDB+Ozzz5D+/bt0atXL7z77rtSrlevXsWlS5dgb29f7PaKBkjfvHkTBgYGGsfE09OzxH0pzvP7pVAoUL16dWncmS5eCy9Ss2ZNrFy5EoWFhbh48SK2bt2KmTNnYujQoahatSr8/f2lHEq6hFf0ni36xVenTp
0St5eWloacnJxij5OXlxdUKhVu3bqF2rVrS+3Pv7eL9rXovV30enr+WNrb26sdl5I8+5lTVFiXpGhbJeW/Y8cOjcH3z+dfNJ1B0U0Wz7Y/ePBAo9+XvUYAICkpCVOmTMFff/2l8UfS859t5cqVk8Zovsjw4cOxdu1adOzYEZUrV0ZAQAD69OmDoKAgKebmzZsal1MB9ff6s6+Hl/0s6SkWTSSblZUVnJ2dcf78ea3We/6MUJFni5oRI0Zg2bJlGD16NPz8/GBtbQ2FQoG+ffsWe+bh2b/eXrWPV1HUz6xZs0qcisDCwuKluY4ePRpdu3bF5s2bsWPHDkyePBmRkZGIiYlBgwYNXppHq1atND7Yn1fcdnVJpVJBoVBg+/btxf6Mnz0Os2fPRmhoKP7880/s3LkTI0eORGRkJI4ePQoXFxeYmppi//792LNnD7Zt24aoqCisWbMG7dq1w86dO2FoaAiVSgUfHx98//33xebj6upaavtaHF29Fl7G0NAQPj4+8PHxgZ+fH9q2bYtVq1bB399fymHlypXFFtzlypXuR7yc9/brqFWrFgDg3Llz0pkPXSouf13uU2FhITp06ICHDx9iwoQJqFWrFszNzXHnzh2EhoZqfC4ZGxtr/EFTHAcHB8TFxWHHjh3Yvn07tm/fjmXLlmHAgAHF3tQhR2n/LN8WLJpIK126dMHSpUtx5MiREm9tfxXr169HSEiI2l0qubm5yMjI0HkfHh4eLy38SrpMV3T2wsrKCv7+/rJzK6mvTz75BJ988gmuXr2K+vXrY/bs2fjtt99eq98XcXNzw9mzZ6FSqdQ+nOPj46XlRbnt2LEDDx8+LPFsk4eHB4QQqFq1KmrWrPnSbRf94p80aRIOHz6M5s2bY/Hixfj6668BAAYGBmjfvj3at2+P77//Ht988w0+//xz7NmzR7rEe+bMGbRv3/6Fl1Hd3NygUqmQkJCgdtbh8uXLLz9Azyg6i1NECIFr166hbt260v4DunktyFU0AP/evXtqOTg4OLwwh6LLbS963dvb28PMzKzY4xQfHw8DAwOtC9Oi19PVq1fVLvmlpaXJOoPRtWtXREZG4rfffntp0VS0rZLyr1ixos6nJnnZa+TcuXO4cuUKVqxYgQEDBkhxL7qsJpeRkRG6du2Krl27QqVSYfjw4ViyZAkmT56M6tWrw83NrcRjAaDYOzPp5TimibQyfvx4mJubY/DgwUhJSdFYnpCQoHHbqxyGhoYaf9HMnz8fhYWFOu+jd+/eOHPmDDZt2qTRR9H6RR+uzxdcvr6+8PDwwHfffYesrCyN9Z+/3bo4OTk5yM3NVWvz8PCApaWlxq3zutapUyckJyer3d1UUFCA+fPnw8LCAq1btwbw9BgJIYqdRLHoGPXq1QuGhob44osvNI67EEK6nKFUKlFQUKC23MfHBwYGBtL+Pnz4UGM7RWdvimL69OmDO3fuqI3pKfL48WPpLsqOHTsCAH744Qe1mLlz5xZzREr266+/ql2KXr9+Pe7duyf1r4vXQkkOHDiAJ0+eaLQXjZUpKgYDAwNhZWWFb775ptj4ohzs7e3RqlUr/Pe//0VSUpJaTNHPztDQEAEBAfjzzz/VLi+lpKRg9erVaNGihXS5TC5/f3+UL18e8+fPV3uNyP1Z+Pn5ISgoCD///LPa3ZFF8vPzMW7cOABPx17Vr18fK1asUHvfnj9/Hjt37kSnTp20yl2Ol71Gis7ePLvvQohX+ox81vOXCg0MDKRCrej90qlTJxw/fhxHjhyR4rKzs7F06VK4u7vD29v7tXL4t+KZJtKKh4cHVq9eLd2q++yM4IcPH5ZuX9dWly5dsHLlSlhbW8Pb2xtHjhzB7t271W6h11Ufn376KdavX4/33nsPgwYNgq+vLx4+fIi//voLixcvRr169eDh4QEbGxssXrwYlpaWMDc3R5MmTVC1alX8/PPP6NixI2rXro2BAweicuXKuHPnDvbs2QMrKyts2bLlhXleuX
IF7du3R58+feDt7Y1y5cph06ZNSElJQd++fbU+dtoYOnQolixZgtDQUMTGxsLd3R3r16/HoUOHMHfuXGlQa9u2bfHhhx/ihx9+wNWrVxEUFASVSoUDBw6gbdu2iIiIgIeHB77++mtMnDgRiYmJ6NGjBywtLXHjxg1s2rQJQ4cOxbhx4xATE4OIiAi89957qFmzJgoKCrBy5UoYGhpKY8u+/PJL7N+/H507d4abmxtSU1Px448/wsXFRRrI+uGHH2Lt2rX4+OOPsWfPHjRv3hyFhYWIj4/H2rVrsWPHDjRs2BD169dHv3798OOPPyIzMxPNmjVDdHS01nPO2NraokWLFhg4cCBSUlIwd+5cVK9eHUOGDAHw9BfV674WSjJjxgzExsaiV69e0i/DU6dO4ddff4Wtra00kNrKygqLFi3Chx9+iHfeeQd9+/aFvb09kpKSsG3bNjRv3hwLFiwA8LSIbNGiBd555x1pXFRiYiK2bdsmfV3Q119/Lc2XNXz4cJQrVw5LlixBXl4eZs6cqfV+2NvbY9y4cYiMjESXLl3QqVMnnD59Gtu3b3/p5eUiv/76KwICAtCrVy907doV7du3h7m5Oa5evYo//vgD9+7dk+ZqmjVrFjp27Ag/Pz+EhYVJUw5YW1uXyncOvuw1UqtWLXh4eGDcuHG4c+cOrKyssGHDhtceJzR48GA8fPgQ7dq1g4uLC27evIn58+ejfv360pilzz77DL///js6duyIkSNHwtbWFitWrMCNGzewYcMGWZcBqRhv8E49eotcuXJFDBkyRLi7uwsjIyNhaWkpmjdvLubPny9yc3OlOAAiPDxcY303Nze1W3jT09PFwIEDRcWKFYWFhYUIDAwU8fHxGnFFtzqfOHFCo0+5fQghxIMHD0RERISoXLmyMDIyEi4uLiIkJETcv39fivnzzz+Ft7e3KFeunMb0A6dPnxa9evUSdnZ2wtjYWLi5uYk+ffqI6OhoKabotubnb6O/f/++CA8PF7Vq1RLm5ubC2tpaNGnSRKxdu/Zlh73EPp/XunVrUbt27WKXpaSkSMfJyMhI+Pj4aEytIMTTW7BnzZolatWqJYyMjIS9vb3o2LGjiI2NVYvbsGGDaNGihTA3Nxfm5uaiVq1aIjw8XFy+fFkIIcT169fFoEGDhIeHhzAxMRG2traibdu2Yvfu3VIf0dHRonv37sLZ2VkYGRkJZ2dn0a9fP3HlyhW1beXn54sZM2aI2rVrC2NjY1GhQgXh6+srvvjiC5GZmSnFPX78WIwcOVLY2dkJc3Nz0bVrV3Hr1i2tphz4/fffxcSJE4WDg4MwNTUVnTt31rhdX4jXey2U5NChQyI8PFzUqVNHWFtbi/Lly4sqVaqI0NBQtekAns05MDBQWFtbCxMTE+Hh4SFCQ0PFyZMn1eLOnz8vevbsKWxsbISJiYnw9PQUkydPVos5deqUCAwMFBYWFsLMzEy0bdtWHD58WC2mpPdh0bHbs2eP1FZYWCi++OILUalSJWFqairatGkjzp8/X+z7siQ5OTniu+++E40aNRIWFhbCyMhI1KhRQ4wYMUJtOhMhhNi9e7do3ry5MDU1FVZWVqJr167i4sWLajEl/TxCQkKEubm5xvaffz9p8xq5ePGi8Pf3FxYWFqJixYpiyJAh0rQrz77vStp20bJnpxxYv369CAgIEA4ODsLIyEhUqVJFfPTRR+LevXtq6yUkJIh3331X+nk3btxYbN26VS2maF+enxLkxo0bxU678m+nEIKjvIiIiuzduxdt27bFunXr8O6775Z1OqSH+Br59+L5OSIiIiIZWDQRERERycCiiYiIiEgGjmkiIiIikoFnmoiIiIhkYNFEREREJAMnt9QRlUqFu3fvwtLS8oVf8UBERET6QwiBR48ewdnZ+aWTfrJo0pG7d+++8S8MJSIiIt24desWXFxcXhjDoklHir5+4tatW1p/PxMREak7H38bf+06hRNxN3AnJR02lmao6+WKiE
Ed4O7yv69gmTRzPf7aeVpjfXfXivhr2Ri1NpVKheXrDmLtluO4/+AR3FzsENavNTq1q6cWt37bCWyLjsONpDQ8ys6FvZ0VGtWrio8/bIfKThWkuOTUDGyKisWBY5dx884DGBoYoLq7I4YGt0FT3+ov3cfU+0rM+SkK5y/fQdoDJQwNDODmYof3uzdFtw4NeNXiDVEqlXB1dZV+j78IiyYdKXpxW1lZsWgiInpNKzcewckz19G5fQPUqu6MtAdKrFi3H32HLcSm/34CTw9nAED58uVhZFQOMz7vr7a+pYWJxmfxjIV/YdGKXejXoxnqelfBrn3n8Nk3a2FmZoZuAb5S3PWk+3B3dUBQ2/qwtjTDrbsP8Pvmwzhw7Aq2r/oMjvbWAIBNUaexbM0BBLSuiz7dmqGgUIWNfx/H0AnLMHNyMPp0bfrCfbyT8gj307PRxf8dODtVQEFBIQ4cv4zJMzfgXqoS44d308WhJJnkFKmcckBHlEolrK2tkZmZyaKJiOg1xZ69Dh+vKjAq/7+/7W8kpSKwfyQ6tauPuV+GAAA++WIltsfE4eK+2S/sLzk1Ay17TEO/ns3w5ad9ADwdy/L+R/Nw6+4DHPzzCxgaljye5dylJHQNmYXx4V0xPCQAAHAl4R4q2lnC1sZCisvLf4JOH8xATk4ejmz96pX2PWzsEhyJvYJzMbNemBPphja/v/nTICIiveNbt5pawQQAVas4oGa1SriWmKIRX1iowqOsxyX2t2v/WTwpKMSHvVtKbQqFAsG9W+BeagZOnbvxwnxcKtkBAJSP/reNmh6V1AomADA2Ko+2zbxxLzUDWdm5L+yz5G3Z4nHuEzx5UvBK61Pp4eU5IiL6RxBC4P7DR6hR1Umt/XHuE9Rp+yke5+bD2urppbbPIrrD3MxYirlw+TbMTI1Q/bl169d2k5Y3qu+htiw9IxuFKhXuJqfjh1+2AwCaN/J8aZ5pDx7B1MQIpiZGsvYrNzcfObn5yM7Jw7FT17Bu61G84+MOE5nr05vDoomIiP4RNkedRHJqBsYO7SS1OVS0xkcftkcdT1eohMC+Ixexcv0BXLp6B38sGoly5QwBPB10XdHWSmPcikPFp+OTUu5namyvSZdJyM9/erangrU5pn3yLlo2qfXCHBNvpSFq7xl0bldf9qW1/67Zi5kLt0jPmzeqiVlTPpC1Lr1ZLJqIiEjvXUtMxpSZa/GOT1X07txEap8Qrj5YuluAL6pVccCsRVvxd0ycNMA7N+8JjIw0f+UZ/39bbt4TjWXL5w5DXv4TJNxIwaaoE8jJzXthjo9z8zF84n9hYlweEyK6y963bgENUderCh6kZyHm4AXcf6hEbq5mPlT2OKaJiIj0Wup9JQaNWQJLC1Ms+jbspWdwwvq1hYGBAoeOX5baTIzLS2eNnpX3/20mxuU1ljVrWBNtm9XG4OB2+DFyEOb9HIUVa/cVu83CQhVGfL4M124kY1HkIOkOOzlcKtmiReNa6B7YEPO+CkGVyhXxQcQC5Obmy+6D3gwWTUREpLeUWY8ROnoRlI9ysGLeMFnFiImJESpYmyNDmS21OVS0QtoDJZ6/YTz1/y/LOVZ8cb9uLvaoXdMFm6NOFrv8s29+R/TBC5g1JRjNZIx7epGO7erjbko6jp1OeK1+SPdYNBERkV7KzXuCwWOX4EZSKn75/mPUqFZJ1npZ2bl4mJEN2wr/m6zQu6YLHufm49qNZLXYuPM3/395ZVn5PCrmjrhvftiMdVuOYvKYXuge2FBWji/bDoAX3g1IZYNFExER6Z3CQhUiPl+GU+du4MfIQfCtW1UjJjfvSbG39c//JQpCCLRu6iW1dWjlg/LlDLFywwGpTQiBVRsPwsnBBr51qwEACgoKkanM0egz7kIiLifcRd1a6l+XtWTlbiz9LRrhoQEY1LdNifujzHqMa4nJUD5TCD1If1Rs7Nq/jkChUKBOLX41l77hQHAiItI7X8/bhN37z8G/ZR1kKHOwafsJte
U9OzZC2gMlOn84A90CfOHh7ggA2H/0EvYcuojWfl4IaO0jxVdyrIBBfdtgyW/ReFKgQj2vKti57yyOxyVg3pch0jip7Md58Os6GV3830HNapVgamqEywl3sW7LMViam2BEWJDUZ9SeM4ic/yequtqjelUnjRxbNPaEvd3TyRJ37D2DT79chVlTgvFel6czhS9YthOxZ66jtZ8XnJ0qICMzB1F74nDmYhJC+7SGu6u97g8svRYWTUREpHcuXrkNANh94Dx2Hzivsbxnx0awsjRF+xZ1cPD4ZWzYdhyFKhXcXezx6fCuGPpBe41vrJ8Q0Q1WVmZYvekQNmw9BndXe8z9cgC6B/3vkpqpiRHe794MR05ewfaYOOTmPYGDvTW6BfgiYlAgXJ3tpNhLV+8AAG7cSsOYqb9q5Pj7opFS0VScds1rI+n2fazdchQP07NgbFwetao7Y9aUYLz7zB2CpEdEGfrmm29Ew4YNhYWFhbC3txfdu3cX8fHxajGtW7cWANQeH330kVrMzZs3RadOnYSpqamwt7cX48aNE0+ePFGL2bNnj2jQoIEwMjISHh4eYtmyZRr5LFiwQLi5uQljY2PRuHFjcezYMdn7kpmZKQCIzMxM+QeAiIiIypQ2v7/LdEzTvn37EB4ejqNHj2LXrl148uQJAgICkJ2drRY3ZMgQ3Lt3T3rMnDlTWlZYWIjOnTsjPz8fhw8fxooVK7B8+XJMmTJFirlx4wY6d+6Mtm3bIi4uDqNHj8bgwYOxY8cOKWbNmjUYO3Yspk6dilOnTqFevXoIDAxEampq6R8IIiIi0nt69YW9aWlpcHBwwL59+9CqVSsAQJs2bVC/fn3MnTu32HW2b9+OLl264O7du3B0fHpNe/HixZgwYQLS0tJgZGSECRMmYNu2bTh//n+nePv27YuMjAxERUUBAJo0aYJGjRphwYIFAACVSgVXV1eMGDECn3322Utz5xf2EhER/fNo8/tbr8Y0ZWY+nS/D1tZWrX3VqlX47bff4OTkhK5du2Ly5MkwMzMDABw5cgQ+Pj5SwQQAgYGBGDZsGC5cuIAGDRrgyJEj8Pf3V+szMDAQo0ePBgDk5+cjNjYWEydOlJYbGBjA398fR44cKTbXvLw85OX9b3ZYpVL56jtORATgunuXsk6BSK9VS9xaptvXm6JJpVJh9OjRaN68OerUqSO19+/fH25ubnB2dsbZs2cxYcIEXL58GRs3bgQAJCcnqxVMAKTnycnJL4xRKpV4/Pgx0tPTUVhYWGxMfHx8sflGRkbiiy++eL2dJiIion8MvSmawsPDcf78eRw8eFCtfejQodL/fXx8UKlSJbRv3x4JCQnw8PB4vps3ZuLEiRg7dqz0XKlUwtWVc2oQERG9rfSiaIqIiMDWrVuxf/9+uLi4vDC2SZOnt2Feu3YNHh4ecHJywvHjx9ViUlJSAABOTk7Sv0Vtz8ZYWVnB1NQUhoaGMDQ0LDamqI/nGRsbw9jYWP5OEhER0T9amd49J4RAREQENm3ahJiYGFStqjnj6/Pi4uIAAJUqPZ1O38/PD+fOnVO7y23Xrl2wsrKCt7e3FBMdHa3Wz65du+Dn5wcAMDIygq+vr1qMSqVCdHS0FENERET/bmV6pik8PByrV6/Gn3/+CUtLS2kMkrW1NUxNTZGQkIDVq1ejU6dOsLOzw9mzZzFmzBi0atUKdevWBQAEBATA29sbH374IWbOnInk5GRMmjQJ4eHh0pmgjz/+GAsWLMD48eMxaNAgxMTEYO3atdi2bZuUy9ixYxESEoKGDRuicePGmDt3LrKzszFw4MA3f2CIiIhI75TplAMKhaLY9mXLliE0NBS3bt3CBx98gPPnzyM7Oxuurq7o2bMnJk2apHZb4M2bNzFs2DDs3bsX5ubmCAkJwbfffoty5f5XE+7duxdjxozBxYsX4eLigsmTJyM0NFRtuwsWLMCsWbOQnJyM+vXr44cffpAuB74MpxwgotfFu+eIXqw07p7T5v
e3Xs3T9E/GoomIXheLJqIXK+uiqUzHNBERERH9U7BoIiIiIpKBRRMRERGRDCyaiIiIiGRg0UREREQkA4smIiIiIhlYNBERERHJwKKJiIiISAYWTUREREQysGgiIiIikoFFExEREZEMLJqIiIiIZGDRRERERCQDiyYiIiIiGVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkYNFEREREJAOLJiIiIiIZWDQRERERycCiiYiIiEgGFk1EREREMrBoIiIiIpKBRRMRERGRDCyaiIiIiGRg0UREREQkA4smIiIiIhlYNBERERHJwKKJiIiISAYWTUREREQysGgiIiIikoFFExEREZEMLJqIiIiIZGDRRERERCQDiyYiIiIiGVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkYNFEREREJAOLJiIiIiIZWDQRERERycCiiYiIiEgGFk1EREREMrBoIiIiIpKBRRMRERGRDCyaiIiIiGRg0UREREQkA4smIiIiIhlYNBERERHJwKKJiIiISAYWTUREREQysGgiIiIikoFFExEREZEMZVo0RUZGolGjRrC0tISDgwN69OiBy5cvq8Xk5uYiPDwcdnZ2sLCwQO/evZGSkqIWk5SUhM6dO8PMzAwODg749NNPUVBQoBazd+9evPPOOzA2Nkb16tWxfPlyjXwWLlwId3d3mJiYoEmTJjh+/LjO95mIiIj+mcq0aNq3bx/Cw8Nx9OhR7Nq1C0+ePEFAQACys7OlmDFjxmDLli1Yt24d9u3bh7t376JXr17S8sLCQnTu3Bn5+fk4fPgwVqxYgeXLl2PKlClSzI0bN9C5c2e0bdsWcXFxGD16NAYPHowdO3ZIMWvWrMHYsWMxdepUnDp1CvXq1UNgYCBSU1PfzMEgIiIivaYQQoiyTqJIWloaHBwcsG/fPrRq1QqZmZmwt7fH6tWr8e677wIA4uPj4eXlhSNHjqBp06bYvn07unTpgrt378LR0REAsHjxYkyYMAFpaWkwMjLChAkTsG3bNpw/f17aVt++fZGRkYGoqCgAQJMmTdCoUSMsWLAAAKBSqeDq6ooRI0bgs88+e2nuSqUS1tbWyMzMhJWVla4PDRH9C1x371LWKRDptWqJW3Xepza/v/VqTFNmZiYAwNbWFgAQGxuLJ0+ewN/fX4qpVasWqlSpgiNHjgAAjhw5Ah8fH6lgAoDAwEAolUpcuHBBinm2j6KYoj7y8/MRGxurFmNgYAB/f38p5nl5eXlQKpVqDyIiInp76U3RpFKpMHr0aDRv3hx16tQBACQnJ8PIyAg2NjZqsY6OjkhOTpZini2YipYXLXtRjFKpxOPHj3H//n0UFhYWG1PUx/MiIyNhbW0tPVxdXV9tx4mIiOgfQW+KpvDwcJw/fx5//PFHWaciy8SJE5GZmSk9bt26VdYpERERUSkqV9YJAEBERAS2bt2K/fv3w8XFRWp3cnJCfn4+MjIy1M42paSkwMnJSYp5/i63orvrno15/o67lJQUWFlZwdTUFIaGhjA0NCw2pqiP5xkbG8PY2PjVdpiIiIj+ccr0TJMQAhEREdi0aRNiYmJQtWpVteW+vr4oX748oqOjpbbLly8jKSkJfn5+AAA/Pz+cO3dO7S63Xbt2wcrKCt7e3lLMs30UxRT1YWRkBF9fX7UYlUqF6OhoKYaIiIj+3cr0TFN4eDhWr16NP//8E5aWltL4IWtra5iamsLa2hphYWEYO3YsbG1tYWVlhREjRsDPzw9NmzYFAAQEBMDb2xsffvghZs6cieTkZEyaNAnh4eHSmaCPP/4YCxYswPjx4zFo0CDExMRg7dq12LZtm5TL2LFjERISgoYNG6Jx48aYO3cusrOzMXDgwDd/YIiIiEjvlGnRtGjRIgBAmzZt1NqXLVuG0NBQAMCcOXNgYGCA3r17Iy8vD4GBgfjxxx+lWENDQ2zduh
XDhg2Dn58fzM3NERISgi+//FKKqVq1KrZt24YxY8Zg3rx5cHFxwc8//4zAwEAp5v3330daWhqmTJmC5ORk1K9fH1FRURqDw4mIiOjfSa/mafon4zxNRPS6OE8T0YtxniYiIiKifwAWTUREREQysGgiIiIikoFFExEREZEMLJqIiIiIZGDRRERERCQDiyYiIiIiGVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkKCcnaOzYsbI7/P777185GSIiIiJ9JatoOn36tNrzU6dOoaCgAJ6engCAK1euwNDQEL6+vrrPkIiIiEgPyCqa9uzZI/3/+++/h6WlJVasWIEKFSoAANLT0zFw4EC0bNmydLIkIiIiKmMKIYTQZoXKlStj586dqF27tlr7+fPnERAQgLt37+o0wX8KpVIJa2trZGZmwsrKqqzTIaJ/oOvuXco6BSK9Vi1xq8771Ob3t9YDwZVKJdLS0jTa09LS8OjRI227IyIiIvpH0Lpo6tmzJwYOHIiNGzfi9u3buH37NjZs2ICwsDD06tWrNHIkIiIiKnOyxjQ9a/HixRg3bhz69++PJ0+ePO2kXDmEhYVh1qxZOk+QiIiISB9oPaapSHZ2NhISEgAAHh4eMDc312li/zQc00REr4tjmoherKzHNGl9pqmIubk56tat+6qrExEREf2jaF00ZWdn49tvv0V0dDRSU1OhUqnUll+/fl1nyRGVJDsnD0t+24248zdx5uJNZCpzMGtKMN7r0lQjVqVSYdXGQ1i96RCuJ6XC1Lg8vGpUxuQxveBd0wUAMGfp35j38/YSt7f+pzFoWK+a9HzF2n34df0B3LrzABVszNHF/x188nFnmJkal9jH5qgTGD3lV5iZGuHivtkv3cdDxy9j846TOBmXgHupGbC3s0KzhjXxyced4VDR+qXrExGRbmldNA0ePBj79u3Dhx9+iEqVKkGhUJRGXkQv9DAjCz/8HIXKThXgVaMyjsZeLTH2069W4c+ok+jVqTFC3muFnNx8XLh8Cw/Ss6SYoLb14O5qr7HurB+3IPtxHup6V5HaIuf/iSUrd6NTu/oY+H4bXLuRjBVr9+HK9XtYOT+82Byyc/IQOf9PmJkayd7Hbxf8iQxlDjq1b4CqrvZIunsfv647gOiD5/H3b5/BoSIvAxMRvUlaF03bt2/Htm3b0Lx589LIh0gWh4pWOP73dDhUtMLZi0noFlr8TQhbd53Chm3HsXjGYAS1rVdif141KsOrRmW1trsp6biXmoG+3f1gVP7pWyX1fiZ+WR2DXh0b4fsvBkixVavYY+p367H7wDn4t/TR6H/+f6NgbmYMP98a2LnvrKx9nDS6FxrVrwYDg//d5Nq6qTfe/3gefl23H+OGcfwLEdGbpPWUAxUqVICtrW1p5EIkm7FReVlnWn7+fQ/q1XZDUNt6UKlUyHmcJ3sbf+2IhRAC3YMaSm2nzt1AQaEKXQPUvzKo6PmWnac0+rmRlIr//r4Xk0f3gqGh/Ldck3eqqxVMRW02Vma4lpgsux8iItINrYumr776ClOmTEFOTk5p5EOkM4+yHuPMhZuo510FM3/8Cz7txsO79Ti07DENW3dpFjfP2xx1As6OFdCkQXWpLS+/AABgbFxeLdbU5Ollt3PxSRr9fDlnA5r61kDb5rU1lmkrOycPOY/zUcHG4rX7IiIi7Wh9eW727NlISEiAo6Mj3N3dUb68+i+PU6de/suI6E24eec+hBDYsvMUDA0N8FlEd1hZmOK/a/ZixKTlsLAwQRs/72LXvZJwD/HX7uKjD/3Vxu15uDkCAGLPXEezhjWl9uOnn06/kZKWqdZPzMHzOHA0HttXfaaTffrv73uQ/6QAXf0b6KQ/IiKST+uiqUePHqWQBpHu5eQ8vRSXnpmNTf/9BA3quAMA/Fv5oGWPaVjw3x0lFk2bd5wAAPR45tIcANSp5Yr6ddyxeOVuODrYwM+3Bq4lJmPSjL
UoX84QuXlPpNj8JwX4as5GBPdqgRrVKr32/hw7dQ3zft6Ozv4N0KyR52v3R0RE2tG6aJo6dWpp5EGkcybGTy+ZuTrbSQUTAJibGaN9yzrYvP0ECgoKUa6codp6Qgj8uSMWnh6VNAaHA8Dib8MQ8fkyjP9qFQDA0NAAg/u1xdHT13D9ZqoU98vqPXiYmY0xQzu99r5cS0zGR+N/Qk0PZ8z4vP9r90dERNp75cktY2NjcenSJQBA7dq10aABLxeQfnG0fzqXUUVbS41ldhUs8KSgEDm5+bCyMFVbdvLMddy59xDjw7sW26+Tgw3W/zQGN5JSkfZACXdXBzhUtELjTp+jWpWn0xYosx5jwbId+KB3SzzKzsWj7FwAQM7jfAgB3Lr7AKYmRsXm9ry7KekYMOJHWFqYYvncj2FhbqLVcSAiIt3QumhKTU1F3759sXfvXtjY2AAAMjIy0LZtW/zxxx+wt9ec64aoLDjaW8PezkpjnBEApKZlwti4PCzMNCej3Bx1EgqFAt0DG2ose1bVKg6oWsUBAHD1+j2k3lfi3S5NAACZypynE3Cu3I0lK3drrNuyxzR0aOWDn74b+sJtpGdk48MRC5H/pADrFo7mpJZERGVI66JpxIgRePToES5cuAAvLy8AwMWLFxESEoKRI0fi999/13mSRK+qS4d3sOyPvThwLB4tm9QC8HRizF37z6FZwxoat/Q/KSjE39Gn0aheNVR2kje1hkqlQuT8P2FqYoTgXi0APD27tWTmYI3Y5Wv24dT5RPzwVYhaAZR6PxPKrMdwc7FH+f+/XJjzOA+hYxYhJS0Tv/84QirQiIiobGhdNEVFRWH37t1SwQQA3t7eWLhwIQICAnSaHNGLrFi7D8pHj5Fy/+mZpOgD55GckgEACHm/NawsTDE8pAO27T6FYZ/9grB+bWFpYYLVGw/hSUEhPh2meflt/5FLSM/MVpub6XnTZq9HXn4BvGtURkFhIf7cEYszF25i9tQPpELL1MQIgW00J9Pcue8szly8qbFsxsK/sGHbcRzYPA2uznYAgFGTV+DMhZvo07UpriWm4FpiihRvZlp8/0REVHq0LppUKpXGNAMAUL58eY3voSMqTUtXxeDOvYfS86g9ZxC15wwAoEfHRrCyMIW9nRXW/zQG0+dtwn9/34MnBYV4x6cq5nw5QPreuWdt3nEC5csZonP7ksfo1fZ0wX9/34s/o07AwMAA9byrYNXCCLUpCHTh4tU7AIC1W45i7ZajassqV7Jl0URE9IYphBBCmxW6d++OjIwM/P7773B2dgYA3LlzB8HBwahQoQI2bdpUKonqO6VSCWtra2RmZsLKit8JRkTau+7Or8YhepFqiVt13qc2v7+1nhF8wYIFUCqVcHd3h4eHBzw8PFC1alUolUrMnz//lZMmIiIi0mdaX55zdXXFqVOnsHv3bsTHxwMAvLy84O/vr/PkiIiIiPTFK83TpFAo0KFDB3To0EHX+VAJ3BuPKOsUiPRW4nGe5Sai0qf15bmRI0fihx9+0GhfsGABRo8erYuciIiIiPSO1kXThg0b0Lx5c432Zs2aYf369TpJioiIiEjfaF00PXjwANbWmrMSW1lZ4f79+zpJioiIiEjfaF00Va9eHVFRURrt27dvR7Vq1XSSFBEREZG+0Xog+NixYxEREYG0tDS0a9cOABAdHY3Zs2dj7ty5us6PiIiISC9oXTQNGjQIeXl5mD59Or766isAgLu7OxYtWoQBAwboPEEiIiIiffBKUw4MGzYMw4YNQ1paGkxNTWFhYaHrvIiIiIj0itZjmgCgoKAAu3fvxsaNG1H0LSx3795FVlaWTpMjIiIi0hdan2m6efMmgoKCkJSUhLy8PHTo0AGWlpaYMWMG8vLysHjx4tLIk4iIiKhMaX2madSoUWjYsCHS09Nhamoqtffs2RPR0dE6TY6IiIhIX2h9punAgQM4fPgwjIyM1Nrd3d1x584dnSVGREREpE+0PtOkUqlQWF
io0X779m1YWlrqJCkiIiIifaN10RQQEKA2H5NCoUBWVhamTp2KTp066TI3IiIiIr2h9eW52bNnIzAwEN7e3sjNzUX//v1x9epVVKxYEb///ntp5EhERERU5rQumlxcXHDmzBmsWbMGZ86cQVZWFsLCwhAcHKw2MJyIiIjobfJKk1uWK1cOwcHBCA4O1nU+RERERHpJ9pimK1eu4Pjx42pt0dHRaNu2LRo3boxvvvlG58kRERER6QvZRdOECROwdetW6fmNGzfQtWtXGBkZwc/PD5GRkfzCXiIiInpryb48d/LkSYwfP156vmrVKtSsWRM7duwAANStWxfz58/H6NGjdZ4kERERUVmTfabp/v37cHFxkZ7v2bMHXbt2lZ63adMGiYmJOk2OiIiISF/ILppsbW1x7949AE8nuDx58iSaNm0qLc/Pz5e+vJeIiIjobSO7aGrTpg2++uor3Lp1C3PnzoVKpUKbNm2k5RcvXoS7u7tWG9+/fz+6du0KZ2dnKBQKbN68WW15aGgoFAqF2iMoKEgt5uHDhwgODoaVlRVsbGwQFhaGrKwstZizZ8+iZcuWMDExgaurK2bOnKmRy7p161CrVi2YmJjAx8cHf//9t1b7QkRERG832UXT9OnTER8fDzc3N0yYMAEzZ86Eubm5tHzlypVo166dVhvPzs5GvXr1sHDhwhJjgoKCcO/ePenx/ASawcHBuHDhAnbt2oWtW7di//79GDp0qLRcqVQiICAAbm5uiI2NxaxZszBt2jQsXbpUijl8+DD69euHsLAwnD59Gj169ECPHj1w/vx5rfaHiIiI3l4KocU1tYKCAly4cAH29vZwdnZWW3bmzBm4uLjAzs7u1RJRKLBp0yb06NFDagsNDUVGRobGGagily5dgre3N06cOIGGDRsCAKKiotCpUyfcvn0bzs7OWLRoET7//HMkJydLXzL82WefYfPmzYiPjwcAvP/++8jOzla7O7Bp06aoX78+Fi9eXOy28/LykJeXJz1XKpVwdXVFZmYmrKysXukYvIh74xE675PobZF4fH5Zp6AT1927lHUKRHqtWuLWlwdpSalUwtraWtbvb62+e65cuXKoV6+eRsEEAPXq1XvlgulF9u7dCwcHB3h6emLYsGF48OCBtOzIkSOwsbGRCiYA8Pf3h4GBAY4dOybFtGrVSiqYACAwMBCXL19Genq6FOPv76+23cDAQBw5cqTEvCIjI2FtbS09XF1ddbK/REREpJ+0/sLeNykoKAi//voroqOjMWPGDOzbtw8dO3ZEYWEhACA5ORkODg5q65QrVw62trZITk6WYhwdHdViip6/LKZoeXEmTpyIzMxM6XHr1q3X21kiIiLSa6/0NSpvSt++faX/+/j4oG7duvDw8MDevXvRvn37MswMMDY2hrGxcZnmQERERG+OXp9pel61atVQsWJFXLt2DQDg5OSE1NRUtZiCggI8fPgQTk5OUkxKSopaTNHzl8UULSciIiLSqmgqKCjAl19+idu3b5dWPi90+/ZtPHjwAJUqVQIA+Pn5ISMjA7GxsVJMTEwMVCoVmjRpIsXs378fT548kWJ27doFT09PVKhQQYqJjo5W29auXbvg5+dX2rtERERE/xBaDwSfNWsWCgoKdLLxrKwsxMXFIS4uDsDT77OLi4tDUlISsrKy8Omnn+Lo0aNITExEdHQ0unfvjurVqyMwMBAA4OXlhaCgIAwZMgTHjx/HoUOHEBERgb59+0qD1fv37w8jIyOEhYXhwoULWLNmDebNm4exY8dKeYwaNQpRUVGYPXs24uPjMW3aNJw8eRIRERE62U8iIiL659P68ly7du2wb98+nWz85MmTaNCgARo0aAAAGDt2LBo0aIApU6bA0NAQZ8+eRbdu3VCzZk2EhYXB19cXBw4cUBtLtGrVKtSqVQvt27dHp06d0KJFC7U5mKytrbFz507cuHEDvr6++OSTTzBlyhS1uZyaNWuG1atXY+nSpahXrx7Wr1
+PzZs3o06dOjrZTyIiIvrn02qeJgBYvHgxvvjiCwQHB8PX11dtgksA6Natm04T/KfQZp6HV8F5mohKxnmaiP4dynqeJq3vnhs+fDgA4Pvvv9dYplAopOkAiIiIiN4mWhdNKpWqNPIgIiIi0muvNeVAbm6urvIgIiIi0mtaF02FhYX46quvULlyZVhYWOD69esAgMmTJ+OXX37ReYJERERE+kDromn69OlYvnw5Zs6cqfZ9bnXq1MHPP/+s0+SIiIiI9IXWRdOvv/6KpUuXIjg4GIaGhlJ7vXr1EB8fr9PkiIiIiPSF1kXTnTt3UL16dY12lUqlNus2ERER0dtE66LJ29sbBw4c0Ghfv369NEklERER0dtG6ykHpkyZgpCQENy5cwcqlQobN27E5cuX8euvv2LrVt1POkVERESkD7Q+09S9e3ds2bIFu3fvhrm5OaZMmYJLly5hy5Yt6NChQ2nkSERERFTmtD7TBAAtW7bErl27dJ0LERERkd56paIJePplu5cuXQLwdJyTr6+vzpIiIiIi0jdaF023b99Gv379cOjQIdjY2AAAMjIy0KxZM/zxxx9wcXHRdY5EREREZU7rMU2DBw/GkydPcOnSJTx8+BAPHz7EpUuXoFKpMHjw4NLIkYiIiKjMaX2mad++fTh8+DA8PT2lNk9PT8yfPx8tW7bUaXJERERE+kLrM02urq7FTmJZWFgIZ2dnnSRFREREpG+0LppmzZqFESNG4OTJk1LbyZMnMWrUKHz33Xc6TY6IiIhIX2h9eS40NBQ5OTlo0qQJypV7unpBQQHKlSuHQYMGYdCgQVLsw4cPdZcpERERURnSumiaO3duKaRBREREpN+0LppCQkJKIw8iIiIivab1mCYiIiKifyMWTUREREQysGgiIiIikoFFExEREZEMr100KZVKbN68WfryXiIiIqK3kdZFU58+fbBgwQIAwOPHj9GwYUP06dMHdevWxYYNG3SeIBEREZE+0Lpo2r9/v/Qdc5s2bYIQAhkZGfjhhx/w9ddf6zxBIiIiIn2gddGUmZkJW1tbAEBUVBR69+4NMzMzdO7cGVevXtV5gkRERET64JW+sPfIkSPIzs5GVFQUAgICAADp6ekwMTHReYJERERE+kDrGcFHjx6N4OBgWFhYwM3NDW3atAHw9LKdj4+PrvMjIiIi0gtaF03Dhw9HkyZNkJSUhA4dOsDA4OnJqmrVqnFMExEREb21tLo89+TJE3h4eMDMzAw9e/aEhYWFtKxz585o3ry5zhMkIiIi0gdaFU3ly5dHbm5uaeVCREREpLe0HggeHh6OGTNmoKCgoDTyISIiItJLWo9pOnHiBKKjo7Fz5074+PjA3NxcbfnGjRt1lhwRERGRvtC6aLKxsUHv3r1LIxciIiIivaV10bRs2bLSyIOIiIhIr73SF/YWFBRg9+7dWLJkCR49egQAuHv3LrKysnSaHBEREZG+0PpM082bNxEUFISkpCTk5eWhQ4cOsLS0xIwZM5CXl4fFixeXRp5EREREZUrrM02jRo1Cw4YNkZ6eDlNTU6m9Z8+eiI6O1mlyRERERPpC6zNNBw4cwOHDh2FkZKTW7u7ujjt37ugsMSIiIiJ9ovWZJpVKhcLCQo3227dvw9LSUidJEREREekbrYumgIAAzJ07V3quUCiQlZWFqVOnolOnTrrMjYiIiEhvaH15bvbs2QgMDIS3tzdyc3PRv39/XL16FRUrVsTvv/9eGjkSERERlTmtiyYXFxecOXMGa9aswZkzZ5CVlYWwsDAEBwerDQwnIiIieptoXTTt378fzZo1Q3BwMIKDg6X2goIC7N+/H61atdJpgkRERET6QOsxTW3btsXDhw812jMzM9G2bVudJEVERESkb7QumoQQUCgUGu0PHjzQ+PJeIiIioreF7MtzvXr1AvD0brnQ0FAYGxtLywoLC3H27Fk0a9ZM9xkSERER6QHZRZO1tTWAp2eaLC0t1QZ9GxkZoWnTphgyZI
juMyQiIiLSA7KLpmXLlgF4OvP3p59+CjMzs1JLioiIiEjfaD2macCAAcV+XcrVq1eRmJioi5yIiIiI9I7WRVNoaCgOHz6s0X7s2DGEhobqIiciIiIivaN10XT69Gk0b95co71p06aIi4vTRU5EREREekfrokmhUODRo0ca7ZmZmcV+kS8RERHR20DroqlVq1aIjIxUK5AKCwsRGRmJFi1a6DQ5IiIiIn2h9deozJgxA61atYKnpydatmwJADhw4ACUSiViYmJ0niARERGRPtD6TJO3tzfOnj2LPn36IDU1FY8ePcKAAQMQHx+POnXqlEaORERERGVO6zNNAODs7IxvvvlG17kQERER6S2tzzQVycnJQXx8PM6ePav20Mb+/fvRtWtXODs7Q6FQYPPmzWrLhRCYMmUKKlWqBFNTU/j7++Pq1atqMQ8fPkRwcDCsrKxgY2ODsLAwZGVlqcWcPXsWLVu2hImJCVxdXTFz5kyNXNatW4datWrBxMQEPj4++Pvvv7XaFyIiInq7aV00paWloUuXLrC0tETt2rXRoEEDtYc2srOzUa9ePSxcuLDY5TNnzsQPP/yAxYsX49ixYzA3N0dgYCByc3OlmODgYFy4cAG7du3C1q1bsX//fgwdOlRarlQqERAQADc3N8TGxmLWrFmYNm0ali5dKsUcPnwY/fr1Q1hYGE6fPo0ePXqgR48eOH/+vJZHh4iIiN5WCiGE0GaF4OBg3Lx5E3PnzkWbNm2wadMmpKSk4Ouvv8bs2bPRuXPnV0tEocCmTZvQo0cPAE/PMjk7O+OTTz7BuHHjADyd1sDR0RHLly9H3759cenSJXh7e+PEiRNo2LAhACAqKgqdOnXC7du34ezsjEWLFuHzzz9HcnIyjIyMAACfffYZNm/ejPj4eADA+++/j+zsbGzdulXKp2nTpqhfvz4WL14sK3+lUglra2tkZmbCysrqlY7Bi7g3HqHzPoneFonH55d1Cjpx3b1LWadApNeqJW59eZCWtPn9rfWZppiYGHz//fdo2LAhDAwM4Obmhg8++AAzZ85EZGTkKyf9vBs3biA5ORn+/v5Sm7W1NZo0aYIjR44AAI4cOQIbGxupYAIAf39/GBgY4NixY1JMq1atpIIJAAIDA3H58mWkp6dLMc9upyimaDvFycvLg1KpVHsQERHR20vroik7OxsODg4AgAoVKiAtLQ0A4OPjg1OnTuksseTkZACAo6OjWrujo6O0LDk5WcqlSLly5WBra6sWU1wfz26jpJii5cWJjIyEtbW19HB1ddV2F4mIiOgfROuiydPTE5cvXwYA1KtXD0uWLMGdO3ewePFiVKpUSecJ6quJEyciMzNTety6dausUyIiIqJSpPWUA6NGjcK9e/cAAFOnTkVQUBBWrVoFIyMjLF++XGeJOTk5AQBSUlLUirGUlBTUr19fiklNTVVbr6CgAA8fPpTWd3JyQkpKilpM0fOXxRQtL46xsTGMjY1fYc+IiIjon0jrM00ffPABQkNDAQC+vr64efMmTpw4gVu3buH999/XWWJVq1aFk5MToqOjpTalUoljx47Bz88PAODn54eMjAzExsZKMTExMVCpVGjSpIkUs3//fjx58kSK2bVrFzw9PVGhQgUp5tntFMUUbYeIiIhIq6LpyZMn8PDwwKVLl6Q2MzMzvPPOO6hYsaLWG8/KykJcXBzi4uIAPB38HRcXh6SkJCgUCowePRpff/01/vrrL5w7dw4DBgyAs7OzdIedl5cXgoKCMGTIEBw/fhyHDh1CREQE+vbtC2dnZwBA//79YWRkhLCwMFy4cAFr1qzBvHnzMHbsWCmPUaNGISoqCrNnz0Z8fDymTZuGkydPIiIiQut9IiIioreTVpfnypcvrzZH0us6efIk2rZtKz0vKmRCQkKwfPlyjB8/HtnZ2Rg6dCgyMjLQokULREVFwcTERFpn1apViIiIQPv27WFgYIDevXvjhx9+kJZbW1tj586dCA8Ph6+vLy
pWrIgpU6aozeXUrFkzrF69GpMmTcJ//vMf1KhRA5s3b+bXwhAREZFE63mavvnmG1y5cgU///wzypV7pW9heStxniaissN5moj+Hcp6niatq54TJ04gOjoaO3fuhI+PD8zNzdWWb9y4UdsuiYiIiPSe1kWTjY0NevfuXRq5EBEREektrYumZcuWlUYeRERERHpN6ykHiIiIiP6NXmkk9/r167F27VokJSUhPz9fbZkuv0qFiIiISF9ofabphx9+wMCBA+Ho6IjTp0+jcePGsLOzw/Xr19GxY8fSyJGIiIiozGldNP34449YunQp5s+fDyMjI4wfPx67du3CyJEjkZmZWRo5EhEREZU5rYumpKQkNGvWDABgamqKR48eAQA+/PBD/P7777rNjoiIiEhPaF00OTk54eHDhwCAKlWq4OjRowCefgWKlvNkEhEREf1jaF00tWvXDn/99RcAYODAgRgzZgw6dOiA999/Hz179tR5gkRERET6QOu755YuXQqVSgUACA8Ph52dHQ4fPoxu3brho48+0nmCRERERPpA66LJwMAABgb/O0HVt29f9O3bV6dJEREREembV5qnKSMjA8ePH0dqaqp01qnIgAEDdJIYERERkT7RumjasmULgoODkZWVBSsrKygUCmmZQqFg0URERERvJa0Hgn/yyScYNGgQsrKykJGRgfT0dOlRdFcdERER0dtG66Lpzp07GDlyJMzMzEojHyIiIiK9pHXRFBgYiJMnT5ZGLkRERER6S9aYpqJ5mQCgc+fO+PTTT3Hx4kX4+PigfPnyarHdunXTbYZEREREekBW0dSjRw+Nti+//FKjTaFQoLCw8LWTIiIiItI3soqm56cVICIiIvq30XpMExEREdG/keyiKSYmBt7e3lAqlRrLMjMzUbt2bezfv1+nyRERERHpC9lF09y5czFkyBBYWVlpLLO2tsZHH32EOXPm6DQ5IiIiIn0hu2g6c+YMgoKCSlweEBCA2NhYnSRFREREpG9kF00pKSka0ws8q1y5ckhLS9NJUkRERET6RnbRVLlyZZw/f77E5WfPnkWlSpV0khQRERGRvpFdNHXq1AmTJ09Gbm6uxrLHjx9j6tSp6NKli06TIyIiItIXsuZpAoBJkyZh48aNqFmzJiIiIuDp6QkAiI+Px8KFC1FYWIjPP/+81BIlIiIiKkuyiyZHR0ccPnwYw4YNw8SJEyGEAPB0FvDAwEAsXLgQjo6OpZYoERERUVmSXTQBgJubG/7++2+kp6fj2rVrEEKgRo0aqFChQmnlR0RERKQXtCqailSoUAGNGjXSdS5EREREeotfo0JEREQkA4smIiIiIhlYNBERERHJwKKJiIiISAYWTUREREQysGgiIiIikoFFExEREZEMLJqIiIiIZGDRRERERCQDiyYiIiIiGVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkYNFEREREJAOLJiIiIiIZWDQRERERycCiiYiIiEgGFk1EREREMrBoIiIiIpKBRRMRERGRDCyaiIiIiGRg0UREREQkA4smIiIiIhlYNBERERHJwKKJiIiISAa9LpqmTZsGhUKh9qhVq5a0PDc3F+Hh4bCzs4OFhQV69+6NlJQUtT6SkpLQuXNnmJmZwcHBAZ9++ikKCgrUYvbu3Yt33nkHxsbGqF69OpYvX/4mdo+IiIj+QfS6aAKA2rVr4969e9Lj4MGD0rIxY8Zgy5YtWLduHfbt24e7d++iV69e0vLCwkJ07twZ+fn5OHz4MFasWIHly5djypQpUsyNGzfQuXNntG3bFnFxcRg9ejQGDx6MHTt2vNH9JCIiIv1WrqwTeJly5crByclJoz0zMxO//PILVq9ejXbt2gEAli1bBi8vLxw9ehRNmzbFzp07cfHiRezevRuOjo6oX78+vvrqK0yYMAHTpk2DkZERFi9ejKpVq2L27NkAAC8vLxw8eBBz5sxBYGDgG91XIi
Ii0l96f6bp6tWrcHZ2RrVq1RAcHIykpCQAQGxsLJ48eQJ/f38ptlatWqhSpQqOHDkCADhy5Ah8fHzg6OgoxQQGBkKpVOLChQtSzLN9FMUU9VGSvLw8KJVKtQcRERG9vfS6aGrSpAmWL1+OqKgoLFq0CDdu3EDLli3x6NEjJCcnw8jICDY2NmrrODo6Ijk5GQCQnJysVjAVLS9a9qIYpVKJx48fl5hbZGQkrK2tpYerq+vr7i4RERHpMb2+PNexY0fp/3Xr1kWTJk3g5uaGtWvXwtTUtAwzAyZOnIixY8dKz5VKJQsnIiKit5hen2l6no2NDWrWrIlr167ByckJ+fn5yMjIUItJSUmRxkA5OTlp3E1X9PxlMVZWVi8szIyNjWFlZaX2ICIiorfXP6poysrKQkJCAipVqgRfX1+UL18e0dHR0vLLly8jKSkJfn5+AAA/Pz+cO3cOqampUsyuXbtgZWUFb29vKebZPopiivogIiIiAvS8aBo3bhz27duHxMREHD58GD179oShoSH69esHa2trhIWFYezYsdizZw9iY2MxcOBA+Pn5oWnTpgCAgIAAeHt748MPP8SZM2ewY8cOTJo0CeHh4TA2NgYAfPzxx7h+/TrGjx+P+Ph4/Pjjj1i7di3GjBlTlrtOREREekavxzTdvn0b/fr1w4MHD2Bvb48WLVrg6NGjsLe3BwDMmTMHBgYG6N27N/Ly8hAYGIgff/xRWt/Q0BBbt27FsGHD4OfnB3Nzc4SEhODLL7+UYqpWrYpt27ZhzJgxmDdvHlxcXPDzzz9zugEiIiJSoxBCiLJO4m2gVCphbW2NzMzMUhnf5N54hM77JHpbJB6fX9Yp6MR19y5lnQKRXquWuFXnfWrz+1uvL88RERER6QsWTUREREQysGgiIiIikoFFExEREZEMLJqIiIiIZGDRRERERCQDiyYiIiIiGVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkYNFEREREJAOLJiIiIiIZWDQRERERycCiiYiIiEgGFk1EREREMrBoIiIiIpKBRRMRERGRDCyaiIiIiGRg0UREREQkA4smIiIiIhlYNBERERHJwKKJiIiISAYWTUREREQysGgiIiIikoFFExEREZEMLJqIiIiIZGDRRERERCQDiyYiIiIiGVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkYNFEREREJAOLJiIiIiIZWDQRERERycCiiYiIiEgGFk1EREREMrBoIiIiIpKBRRMRERGRDCyaiIiIiGRg0UREREQkA4smIiIiIhlYNBERERHJwKKJiIiISAYWTUREREQysGgiIiIikoFFExEREZEMLJqIiIiIZGDRRERERCQDiyYiIiIiGVg0EREREcnAoomIiIhIBhZNRERERDKwaCIiIiKSgUXTcxYuXAh3d3eYmJigSZMmOH78eFmnRERERHqARdMz1qxZg7Fjx2Lq1Kk4deoU6tWrh8DAQKSmppZ1akRERFTGWDQ94/vvv8eQIUMwcOBAeHt7Y/HixTAzM8N///vfsk6NiIiIyli5sk5AX+Tn5yM2NhYTJ06U2gwMDODv748jR45oxOfl5SEvL096npmZCQBQKpWlkp+qML9U+iV6G5TW++5Ne6R6UtYpEOm10nivF/UphHhpLIum/3f//n0UFhbC0dFRrd3R0RHx8fEa8ZGRkfjiiy802l1dXUstRyIqnrX10rJOgYjeBGvrUuv60aNHsH5J/yyaXtHEiRMxduxY6blKpcLDhw9hZ2cHhUJRhplRaVMqlXB1dcWtW7dgZWVV1ukQUSng+/zfQwiBR48ewdnZ+aWxLJr+X8WKFWFoaIiUlBS19pSUFDg5OWnEGxsbw9jYWK3NxsamNFMkPWNlZcUPU6K3HN/n/w4vO8NUhAPB/5+RkRF8fX0RHR0ttalUKkRHR8PPz68MMyMiIiJ9wD
NNzxg7dixCQkLQsGFDNG7cGHPnzkV2djYGDhxY1qkRERFRGWPR9Iz3338faWlpmDJlCpKTk1G/fn1ERUVpDA6nfzdjY2NMnTpV4/IsEb09+D6n4iiEnHvsiIiIiP7lOKaJiIiISAYWTUREREQysGgiIiIikoFFExEREZEMLJroX6lNmzYYPXq0zvtdvny5rElOf/nlFwQEBOh8+3K5u7tj7ty5JS6/f/8+HBwccPv27TeXFNFbYNq0aahfv77O+01MTIRCoUBcXJzO+yb5WDTRawsNDUWPHj002vfu3QuFQoGMjAy1588/Jk2aVGLf7u7uxa7z7bffltLelL7c3FxMnjwZU6dOBVDyPhY9QkND33iOFStWxIABA6Qcid6E0NDQYt8D165de+U+n/8cetm2goKCXnMv6G3GeZrojbt8+bLa1xJYWFi8MP7LL7/EkCFD1NosLS1LJbc3Yf369bCyskLz5s0BACdOnEBhYSEA4PDhw+jdu7faMTI1NdWq/ydPnqB8+fKvnefAgQPh6+uLWbNmwdbW9rX7I5IjKCgIy5YtU2uzt7d/Y9vivEz0IjzTRG+cg4MDnJycpMfLiiZLS0u1eCcnJ5ibmwP431+RO3bsQIMGDWBqaop27dohNTUV27dvh5eXF6ysrNC/f3/k5OSo9VtQUICIiAhYW1ujYsWKmDx5Mp6dtiwvLw/jxo1D5cqVYW5ujiZNmmDv3r1qfSxfvhxVqlSBmZkZevbsiQcPHrx0///44w907dpVem5vby/tV1Fx8uwxWr16NTw8PGBkZARPT0+sXLlSrT+FQoFFixahW7duMDc3x/Tp0wEAW7ZsQaNGjWBiYoKKFSuiZ8+eauvl5ORg0KBBsLS0RJUqVbB06VK15bVr14azszM2bdr00n0i0hVjY2ON9/u8efPg4+MDc3NzuLq6Yvjw4cjKypLWuXnzJrp27YoKFSrA3NwctWvXxt9//43ExES0bdsWAFChQgWNM7fFbatChQrScoVCgSVLlqBLly4wMzODl5cXjhw5gmvXrqFNmzYwNzdHs2bNkJCQoLEfS5YsgaurK8zMzNCnTx9kZmaqLf/555/h5eUFExMT1KpVCz/++KPa8uPHj6NBgwYwMTFBw4YNcfr0aV0cXnpdgug1hYSEiO7du2u079mzRwAQ6enpxT6Xw83NTcyZM6fE5UV9Nm3aVBw8eFCcOnVKVK9eXbRu3VoEBASIU6dOif379ws7Ozvx7bffSuu1bt1aWFhYiFGjRon4+Hjx22+/CTMzM7F06VIpZvDgwaJZs2Zi//794tq1a2LWrFnC2NhYXLlyRQghxNGjR4WBgYGYMWOGuHz5spg3b56wsbER1tbWL9wna2tr8ccff7xwf4qO0caNG0X58uXFwoULxeXLl8Xs2bOFoaGhiImJkdYBIBwcHMR///tfkZCQIG7evCm2bt0qDA0NxZQpU8TFixdFXFyc+Oabb9SOq62trVi4cKG4evWqiIyMFAYGBiI+Pl4tn/fff1+EhIS8cH+IdKWkz5I5c+aImJgYcePGDREdHS08PT3FsGHDpOWdO3cWHTp0EGfPnhUJCQliy5YtYt++faKgoEBs2LBBABCXL18W9+7dExkZGS/c1rMAiMqVK4s1a9aIy5cvix49egh3d3fRrl07ERUVJS5evCiaNm0qgoKCpHWmTp0qzM3NRbt27cTp06fFvn37RPXq1UX//v2lmN9++01UqlRJbNiwQVy/fl1s2LBB2NraiuXLlwshhHj06JGwt7cX/fv3F+fPnxdbtmwR1apVEwDE6dOnX/0A02tj0USvLSQkRBgaGgpzc3O1h4mJSbFF0/Nx9+/fL7FvNzc3YWRkpLHO/v371frcvXu3tE5kZKQAIBISEqS2jz76SAQGBkrPW7duLby8vIRKpZLaJkyYILy8vIQQQty8eVMYGhqKO3fuqOXTvn17MXHiRCGEEP369ROdOnVSW/7++++/sGhKT08XAKT8n/d80d
SsWTMxZMgQtZj33ntPbbsAxOjRo9Vi/Pz8RHBwcIl5uLm5iQ8++EB6rlKphIODg1i0aJFa3JgxY0SbNm1K7IdIl4r7LHn33Xc14tatWyfs7Oyk5z4+PmLatGnF9lnSH2slfW5Nnz5digEgJk2aJD0/cuSIACB++eUXqe33338XJiYm0vOpU6cKQ0NDcfv2balt+/btwsDAQNy7d08IIYSHh4dYvXq1Wj5fffWV8PPzE0IIsWTJEmFnZyceP34sLV+0aBGLJj3AMU2kE23btsWiRYvU2o4dO4YPPvhAI/bAgQNqY5KePR1enE8//VRjMHTlypXVntetW1f6v6OjI8zMzFCtWjW1tuPHj6ut07RpUygUCum5n58fZs+ejcLCQpw7dw6FhYWoWbOm2jp5eXmws7MDAFy6dEnjkpefnx+ioqJK3JfHjx8DAExMTEqMedalS5cwdOhQtbbmzZtj3rx5am0NGzZUex4XF6cxDux5zx4zhUIBJycnpKamqsWYmppqXNYkKk3Pf5aYm5tj9+7diIyMRHx8PJRKJQoKCpCbm4ucnByYmZlh5MiRGDZsGHbu3Al/f3/07t1b7fUtd1sANMbvPf/ZAgA+Pj5qbbm5uVAqldI4xCpVqqh9Rvn5+UGlUuHy5cuwtLREQkICwsLC1N6jBQUFsLa2BvD0fV+3bl21zwk/P7+X7g+VPhZNpBPm5uaoXr26WltJt6tXrVpV1m35RSpWrKjR9/OeHfisUCg0BkIrFAqoVCrZ28zKyoKhoSFiY2NhaGiotuxlY7BexM7ODgqFAunp6a/cR3GKxngVkTN4XM4xevjwYakNwiUqzvOfJYmJiejSpQuGDRuG6dOnw9bWFgcPHkRYWBjy8/NhZmaGwYMHIzAwENu2bcPOnTsRGRmJ2bNnY8SIEVptqzjPf7aU1Cb386VoLNZPP/2EJk2aqC17/rOG9A8HgtO/1rFjx9SeHz16FDVq1IChoSEaNGiAwsJCpKamonr16moPJycnAICXl1exfbyIkZERvL29cfHiRVk5enl54dChQ2pthw4dgre39wvXq1u3LqKjo2Vt40XOnz+PBg0avHY/RK8qNjYWKpUKs2fPRtOmTVGzZk3cvXtXI87V1RUff/wxNm7ciE8++QQ//fQTgKfvOQDSHapvQlJSklqOR48ehYGBATw9PeHo6AhnZ2dcv35d47OlatWqAJ6+78+ePYvc3Fy1Pqjs8UwT6b1Hjx4hOTlZrc3MzExt2oJXkZSUhLFjx+Kjjz7CqVOnMH/+fMyePRsAULNmTQQHB2PAgAGYPXs2GjRogLS0NERHR6Nu3bro3LkzRo4ciebNm+O7775D9+7dsWPHjhdemisSGBiIgwcPyppc89NPP0WfPn3QoEED+Pv7Y8uWLdi4cSN27979wvWmTp2K9u3bw8PDA3379kVBQQH+/vtvTJgwQdaxAZ7eXRcbG4tvvvlG9jpEula9enU8efIE8+fPR9euXXHo0CEsXrxYLWb06NHo2LEjatasifT0dOzZswdeXl4AADc3NygUCmzduhWdOnWCqampdLY4Ly9P47OlXLlyqFix4mvlbGJigpCQEHz33XdQKpUYOXIk+vTpI/3B9cUXX2DkyJGwtrZGUFAQ8vLycPLkSaSnp2Ps2LHo378/Pv/8cwwZMgQTJ05EYmIivvvuu9fKiXSDZ5pI702ZMgWVKlVSe4wfP/61+x0wYAAeP36Mxo0bIzw8HKNGjVIbP7Rs2TIMGDAAn3zyCTw9PdGjRw+cOHECVapUAfB0TNRPP/2EefPmoV69eti5c+cLJ+osEhYWhr///lvjFuTi9OjRA/PmzcN3332H2rVrY8mSJVi2bBnatGnzwvXatGmDdevW4a+//kL9+vXRrl07jTFdL/Pnn3+iSpUqaNmypVbrEelSvXr18P3332PGjBmoU6cOVq1ahcjISLWYwsJChIeHw8vLC0FBQahZs6Z0C3/lypXxxRdf4LPPPoOjoyMiIi
Kk9aKiojQ+W1q0aPHaOVevXh29evVCp06dEBAQgLp166pNKTB48GD8/PPPWLZsGXx8fNC6dWssX75cOtNkYWGBLVu24Ny5c2jQoAE+//xzzJgx47XzotenEOKZiWmI6I1477338M4772DixIllnUqJmjZtipEjR6J///5lnQoRkV7gmSaiMjBr1qzXGlBe2u7fv49evXqhX79+ZZ0KEZHe4JkmIiIiIhl4pomIiIhIBhZNRERERDKwaCIiIiKSgUUTERERkQwsmoiIiIhkYNFEREREJAOLJiIiIiIZWDQRERERycCiiYiIiEiG/wN2+xR3bKUzTgAAAABJRU5ErkJggg==", - "text/plain": "
" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def plot_character_per_second_comparison(\n", - " hf_stats: tuple[float, float, float], fst_stats: tuple[float, float, float], documents: list\n", - "):\n", - " # Calculating total characters in documents\n", - " total_characters = sum(len(doc) for doc in documents)\n", - "\n", - " # Calculating characters per second for each model\n", - " hf_chars_per_sec = total_characters / hf_stats[0] # Mean time is at index 0\n", - " fst_chars_per_sec = total_characters / fst_stats[0]\n", - "\n", - " # Plotting the bar chart\n", - " models = [\"HF Embed (Torch)\", \"FastEmbed\"]\n", - " chars_per_sec = [hf_chars_per_sec, fst_chars_per_sec]\n", - "\n", - " bars = plt.bar(models, chars_per_sec, color=[\"#1f356c\", \"#dd1f4b\"])\n", - " plt.ylabel(\"Characters per Second\")\n", - " plt.title(\"Characters Processed per Second Comparison\")\n", - "\n", - " # Adding the number at the top of each bar\n", - " for bar, chars in zip(bars, chars_per_sec):\n", - " plt.text(\n", - " bar.get_x() + bar.get_width() / 2,\n", - " bar.get_height(),\n", - " f\"{chars:.1f}\",\n", - " ha=\"center\",\n", - " va=\"bottom\",\n", - " color=\"#1f356c\",\n", - " fontsize=12,\n", - " )\n", - "\n", - " plt.show()\n", - "\n", - "\n", - "plot_character_per_second_comparison(hf_stats, fst_stats, documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fst", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/experiments/attention_export.py 
b/experiments/attention_export.py deleted file mode 100644 index 4507904ad..000000000 --- a/experiments/attention_export.py +++ /dev/null @@ -1,17 +0,0 @@ -from optimum.exporters.onnx import main_export -from transformers import AutoTokenizer - -model_id = "sentence-transformers/paraphrase-MiniLM-L6-v2" -output_dir = f"models/{model_id.replace('/', '_')}" -model_kwargs = {"output_attentions": True, "return_dict": True} -tokenizer = AutoTokenizer.from_pretrained(model_id) - -# export if the output model does not exist -# try: -# sess = onnxruntime.InferenceSession(f"{output_dir}/model.onnx") -# print("Model already exported") -# except FileNotFoundError: -print(f"Exporting model to {output_dir}") -main_export( - model_id, output=output_dir, no_post_process=True, model_kwargs=model_kwargs -) diff --git a/experiments/try_attention_export.py b/experiments/try_attention_export.py deleted file mode 100644 index e0301aa48..000000000 --- a/experiments/try_attention_export.py +++ /dev/null @@ -1,33 +0,0 @@ -import numpy as np -import onnx -import onnxruntime -from transformers import AutoTokenizer - -model_id = "sentence-transformers/paraphrase-MiniLM-L6-v2" -output_dir = f"models/{model_id.replace('/', '_')}" -model_kwargs = {"output_attentions": True, "return_dict": True} -tokenizer = AutoTokenizer.from_pretrained(model_id) - -model_path = f"{output_dir}/model.onnx" -onnx_model = onnx.load(model_path) -ort_session = onnxruntime.InferenceSession(model_path) -text = "This is a test sentence" -tokenizer_output = tokenizer(text, return_tensors="np") -input_ids = tokenizer_output["input_ids"] -attention_mask = tokenizer_output["attention_mask"] -print(attention_mask) -# Prepare the input -input_ids = np.array(input_ids).astype( - np.int64 -) # Replace your_input_ids with actual input data - -# Run the ONNX model -outputs = ort_session.run( - None, {"input_ids": input_ids, "attention_mask": attention_mask} -) - -# Get the attention weights -attentions = outputs[-1] - -# Print 
the attention weights for the first layer and first head -print(attentions[0][0]) diff --git a/fastembed/__init__.py b/fastembed/__init__.py deleted file mode 100644 index 7a2e41340..000000000 --- a/fastembed/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -import importlib.metadata - -from fastembed.image import ImageEmbedding -from fastembed.late_interaction import LateInteractionTextEmbedding -from fastembed.late_interaction_multimodal import LateInteractionMultimodalEmbedding -from fastembed.sparse import SparseEmbedding, SparseTextEmbedding -from fastembed.text import TextEmbedding - -try: - version = importlib.metadata.version("fastembed") -except importlib.metadata.PackageNotFoundError as _: - version = importlib.metadata.version("fastembed-gpu") - -__version__ = version -__all__ = [ - "TextEmbedding", - "SparseTextEmbedding", - "SparseEmbedding", - "ImageEmbedding", - "LateInteractionTextEmbedding", - "LateInteractionMultimodalEmbedding", -] diff --git a/fastembed/common/__init__.py b/fastembed/common/__init__.py deleted file mode 100644 index 5caf45f3e..000000000 --- a/fastembed/common/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from fastembed.common.types import ImageInput, OnnxProvider, PathInput - -__all__ = ["OnnxProvider", "ImageInput", "PathInput"] diff --git a/fastembed/embedding.py b/fastembed/embedding.py deleted file mode 100644 index 375666916..000000000 --- a/fastembed/embedding.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Any - -from loguru import logger - -from fastembed import TextEmbedding - -logger.warning( - "DefaultEmbedding, FlagEmbedding, JinaEmbedding are deprecated." - "Use from fastembed import TextEmbedding instead." 
-) - -DefaultEmbedding = TextEmbedding -FlagEmbedding = TextEmbedding - - -class JinaEmbedding(TextEmbedding): - def __init__( - self, - model_name: str = "jinaai/jina-embeddings-v2-base-en", - cache_dir: str | None = None, - threads: int | None = None, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, threads, **kwargs) diff --git a/fastembed/image/__init__.py b/fastembed/image/__init__.py deleted file mode 100644 index cc4f140b3..000000000 --- a/fastembed/image/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from fastembed.image.image_embedding import ImageEmbedding - -__all__ = ["ImageEmbedding"] diff --git a/fastembed/image/image_embedding.py b/fastembed/image/image_embedding.py deleted file mode 100644 index a36303456..000000000 --- a/fastembed/image/image_embedding.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import Any, Iterable, Sequence, Type -from dataclasses import asdict - -from fastembed.common.types import NumpyArray, Device -from fastembed.common import ImageInput, OnnxProvider -from fastembed.image.image_embedding_base import ImageEmbeddingBase -from fastembed.image.onnx_embedding import OnnxImageEmbedding -from fastembed.common.model_description import DenseModelDescription - - -class ImageEmbedding(ImageEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[ImageEmbeddingBase]] = [OnnxImageEmbedding] - - @classmethod - def list_supported_models(cls) -> list[dict[str, Any]]: - """ - Lists the supported models. - - Returns: - list[dict[str, Any]]: A list of dictionaries containing the model information. 
- - Example: - ``` - [ - { - "model": "Qdrant/clip-ViT-B-32-vision", - "dim": 512, - "description": "CLIP vision encoder based on ViT-B/32", - "license": "mit", - "size_in_GB": 0.33, - "sources": { - "hf": "Qdrant/clip-ViT-B-32-vision", - }, - "model_file": "model.onnx", - } - ] - ``` - """ - return [asdict(model) for model in cls._list_supported_models()] - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - result: list[DenseModelDescription] = [] - for embedding in cls.EMBEDDINGS_REGISTRY: - result.extend(embedding._list_supported_models()) - return result - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, threads, **kwargs) - for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: - supported_models = EMBEDDING_MODEL_TYPE._list_supported_models() - if any(model_name.lower() == model.model.lower() for model in supported_models): - self.model = EMBEDDING_MODEL_TYPE( - model_name, - cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - **kwargs, - ) - return - - raise ValueError( - f"Model {model_name} is not supported in ImageEmbedding." - "Please check the supported models using `ImageEmbedding.list_supported_models()`" - ) - - @property - def embedding_size(self) -> int: - """Get the embedding size of the current model""" - if self._embedding_size is None: - self._embedding_size = self.get_embedding_size(self.model_name) - return self._embedding_size - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Get the embedding size of the passed model - - Args: - model_name (str): The name of the model to get embedding size for. - - Returns: - int: The size of the embedding. 
- - Raises: - ValueError: If the model name is not found in the supported models. - """ - descriptions = cls._list_supported_models() - embedding_size: int | None = None - for description in descriptions: - if description.model.lower() == model_name.lower(): - embedding_size = description.dim - break - if embedding_size is None: - model_names = [description.model for description in descriptions] - raise ValueError( - f"Embedding size for model {model_name} was None. " - f"Available model names: {model_names}" - ) - return embedding_size - - def embed( - self, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 16, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of images into list of embeddings. - - Args: - images: Iterator of image paths or single image path to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. 
- - Returns: - List of embeddings, one per document - """ - yield from self.model.embed(images, batch_size, parallel, **kwargs) diff --git a/fastembed/image/image_embedding_base.py b/fastembed/image/image_embedding_base.py deleted file mode 100644 index 3601d1e57..000000000 --- a/fastembed/image/image_embedding_base.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Iterable, Any - -from fastembed.common.model_description import DenseModelDescription -from fastembed.common.types import NumpyArray -from fastembed.common.model_management import ModelManagement -from fastembed.common.types import ImageInput - - -class ImageEmbeddingBase(ModelManagement[DenseModelDescription]): - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - **kwargs: Any, - ): - self.model_name = model_name - self.cache_dir = cache_dir - self.threads = threads - self._local_files_only = kwargs.pop("local_files_only", False) - self._embedding_size: int | None = None - - def embed( - self, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 16, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Embeds a list of images into a list of embeddings. - - Args: - images: The list of image paths to preprocess and embed. - batch_size: Batch size for encoding - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - **kwargs: Additional keyword argument to pass to the embed method. - - Yields: - Iterable[NdArray]: The embeddings. 
- """ - raise NotImplementedError() - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Returns embedding size of the chosen model.""" - raise NotImplementedError("Subclasses must implement this method") - - @property - def embedding_size(self) -> int: - """Returns embedding size for the current model""" - raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed/image/onnx_embedding.py b/fastembed/image/onnx_embedding.py deleted file mode 100644 index 3e1739951..000000000 --- a/fastembed/image/onnx_embedding.py +++ /dev/null @@ -1,217 +0,0 @@ -from typing import Any, Iterable, Sequence, Type - - -from fastembed.common.types import NumpyArray, Device -from fastembed.common import ImageInput, OnnxProvider -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.utils import define_cache_dir, normalize -from fastembed.image.image_embedding_base import ImageEmbeddingBase -from fastembed.image.onnx_image_model import ImageEmbeddingWorker, OnnxImageModel - -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_onnx_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="Qdrant/clip-ViT-B-32-vision", - dim=512, - description="Image embeddings, Multimodal (text&image), 2021 year", - license="mit", - size_in_GB=0.34, - sources=ModelSource(hf="Qdrant/clip-ViT-B-32-vision"), - model_file="model.onnx", - ), - DenseModelDescription( - model="Qdrant/resnet50-onnx", - dim=2048, - description="Image embeddings, Unimodal (image), 2016 year", - license="apache-2.0", - size_in_GB=0.1, - sources=ModelSource(hf="Qdrant/resnet50-onnx"), - model_file="model.onnx", - ), - DenseModelDescription( - model="Qdrant/Unicom-ViT-B-16", - dim=768, - description="Image embeddings (more detailed than Unicom-ViT-B-32), Multimodal (text&image), 2023 year", - license="apache-2.0", - size_in_GB=0.82, - sources=ModelSource(hf="Qdrant/Unicom-ViT-B-16"), - 
model_file="model.onnx", - ), - DenseModelDescription( - model="Qdrant/Unicom-ViT-B-32", - dim=512, - description="Image embeddings, Multimodal (text&image), 2023 year", - license="apache-2.0", - size_in_GB=0.48, - sources=ModelSource(hf="Qdrant/Unicom-ViT-B-32"), - model_file="model.onnx", - ), - DenseModelDescription( - model="jinaai/jina-clip-v1", - dim=768, - description="Image embeddings, Multimodal (text&image), 2024 year", - license="apache-2.0", - size_in_GB=0.34, - sources=ModelSource(hf="jinaai/jina-clip-v1"), - model_file="onnx/vision_model.onnx", - ), -] - - -class OnnxImageEmbedding(ImageEmbeddingBase, OnnxImageModel[NumpyArray]): - def __init__( - self, - model_name: str, - - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. - Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device.AUTO. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. 
- lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. - device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. - """ - - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - # This device_id will be used if we need to load model in current process - self.device_id: int | None = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - if not self.lazy_load: - self.load_onnx_model() - - def load_onnx_model(self) -> None: - """ - Load the onnx model. - """ - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """ - Lists the supported models. 
- - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_onnx_models - - def embed( - self, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 16, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of images into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - images: Iterator of image paths or single image path to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - - yield from self._embed_images( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - images=images, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - @classmethod - def _get_worker_class(cls) -> Type["ImageEmbeddingWorker[NumpyArray]"]: - return OnnxImageEmbeddingWorker - - def _preprocess_onnx_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Preprocess the onnx input. 
- """ - - return onnx_input - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[NumpyArray]: - return normalize(output.model_output) - - -class OnnxImageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> OnnxImageEmbedding: - return OnnxImageEmbedding( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py deleted file mode 100644 index deddcf73c..000000000 --- a/fastembed/image/onnx_image_model.py +++ /dev/null @@ -1,156 +0,0 @@ -import contextlib -import os -from multiprocessing import get_all_start_methods -from pathlib import Path -from typing import Any, Iterable, Sequence, Type - -import numpy as np -from PIL import Image - -from fastembed.image.transform.operators import Compose -from fastembed.common.types import NumpyArray, Device -from fastembed.common import ImageInput, OnnxProvider -from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T -from fastembed.common.preprocessor_utils import load_preprocessor -from fastembed.common.utils import iter_batch -from fastembed.parallel_processor import ParallelWorkerPool - -# Holds type of the embedding result - - -class OnnxImageModel(OnnxModel[T]): - @classmethod - def _get_worker_class(cls) -> Type["ImageEmbeddingWorker[T]"]: - raise NotImplementedError("Subclasses must implement this method") - - def _post_process_onnx_output(self, output: OnnxOutputContext, **kwargs: Any) -> Iterable[T]: - """Post-process the ONNX model output to convert it into a usable format. - - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - **kwargs: Additional keyword arguments that may be needed by specific implementations. - - Returns: - Iterable[T]: Post-processed output as an iterable of type T. 
- """ - raise NotImplementedError("Subclasses must implement this method") - - def __init__(self) -> None: - super().__init__() - self.processor: Compose | None = None - - def _preprocess_onnx_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Preprocess the onnx input. - """ - return onnx_input - - def _load_onnx_model( - self, - model_dir: Path, - model_file: str, - threads: int | None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_id: int | None = None, - extra_session_options: dict[str, Any] | None = None, - ) -> None: - super()._load_onnx_model( - model_dir=model_dir, - model_file=model_file, - threads=threads, - providers=providers, - cuda=cuda, - device_id=device_id, - extra_session_options=extra_session_options, - ) - self.processor = load_preprocessor(model_dir=model_dir) - - def load_onnx_model(self) -> None: - raise NotImplementedError("Subclasses must implement this method") - - def _build_onnx_input(self, encoded: NumpyArray) -> dict[str, NumpyArray]: - input_name = self.model.get_inputs()[0].name # type: ignore[union-attr] - return {input_name: encoded} - - def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: - with contextlib.ExitStack() as stack: - image_files = [ - stack.enter_context(Image.open(image)) - if not isinstance(image, Image.Image) - else image - for image in images - ] - assert self.processor is not None, "Processor is not initialized" - encoded = np.array(self.processor(image_files)) - onnx_input = self._build_onnx_input(encoded) - onnx_input = self._preprocess_onnx_input(onnx_input) - model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] - embeddings = model_output[0].reshape(len(images), -1) - return OnnxOutputContext(model_output=embeddings) - - def _embed_images( - self, - model_name: str, - cache_dir: str, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 256, - 
parallel: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - local_files_only: bool = False, - specific_model_path: str | None = None, - extra_session_options: dict[str, Any] | None = None, - **kwargs: Any, - ) -> Iterable[T]: - is_small = False - - if isinstance(images, (str, Path, Image.Image)): - images = [images] - is_small = True - - if isinstance(images, list) and len(images) < batch_size: - is_small = True - - if parallel is None or is_small: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - - for batch in iter_batch(images, batch_size): - yield from self._post_process_onnx_output(self.onnx_embed(batch), **kwargs) - else: - if parallel == 0: - parallel = os.cpu_count() - - start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" - params = { - "model_name": model_name, - "cache_dir": cache_dir, - "providers": providers, - "local_files_only": local_files_only, - "specific_model_path": specific_model_path, - **kwargs, - } - - if extra_session_options is not None: - params.update(extra_session_options) - - pool = ParallelWorkerPool( - num_workers=parallel or 1, - worker=self._get_worker_class(), - cuda=cuda, - device_ids=device_ids, - start_method=start_method, - ) - for batch in pool.ordered_map(iter_batch(images, batch_size), **params): - yield from self._post_process_onnx_output(batch, **kwargs) # type: ignore - - -class ImageEmbeddingWorker(EmbeddingWorker[T]): - def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]: - for idx, batch in items: - embeddings = self.model.onnx_embed(batch) - yield idx, embeddings diff --git a/fastembed/image/transform/functional.py b/fastembed/image/transform/functional.py deleted file mode 100644 index 9d9e2197e..000000000 --- a/fastembed/image/transform/functional.py +++ /dev/null @@ -1,221 +0,0 @@ -import numpy as np -from PIL import Image - 
-from fastembed.common.types import NumpyArray - - -def convert_to_rgb(image: Image.Image) -> Image.Image: - if image.mode == "RGB": - return image - - image = image.convert("RGB") - return image - - -def center_crop( - image: Image.Image | NumpyArray, - size: tuple[int, int], -) -> NumpyArray: - if isinstance(image, np.ndarray): - _, orig_height, orig_width = image.shape - else: - orig_height, orig_width = image.height, image.width - # (H, W, C) -> (C, H, W) - image = np.array(image).transpose((2, 0, 1)) - - crop_height, crop_width = size - - # left upper corner (0, 0) - top = (orig_height - crop_height) // 2 - bottom = top + crop_height - left = (orig_width - crop_width) // 2 - right = left + crop_width - - # Check if cropped area is within image boundaries - if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width: - image = image[..., top:bottom, left:right] - return image - - # Padding with zeros - new_height = max(crop_height, orig_height) - new_width = max(crop_width, orig_width) - new_shape = image.shape[:-2] + (new_height, new_width) - new_image = np.zeros_like(image, shape=new_shape, dtype=np.float32) - - top_pad = (new_height - orig_height) // 2 - bottom_pad = top_pad + orig_height - left_pad = (new_width - orig_width) // 2 - right_pad = left_pad + orig_width - new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image - - top += top_pad - bottom += top_pad - left += left_pad - right += left_pad - - new_image = new_image[ - ..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right) - ] - - return new_image - - -def normalize( - image: NumpyArray, - mean: float | list[float], - std: float | list[float], -) -> NumpyArray: - num_channels = image.shape[1] if len(image.shape) == 4 else image.shape[0] - - if not np.issubdtype(image.dtype, np.floating): - image = image.astype(np.float32) - - mean_list = mean if isinstance(mean, list) else [mean] * num_channels - - if len(mean_list) != num_channels: - raise 
ValueError( - f"mean must have the same number of channels as the image, image has {num_channels} channels, got " - f"{len(mean_list)}" - ) - - mean_arr = np.array(mean_list, dtype=np.float32) - - std_list = std if isinstance(std, list) else [std] * num_channels - if len(std_list) != num_channels: - raise ValueError( - f"std must have the same number of channels as the image, image has {num_channels} channels, got {len(std_list)}" - ) - - std_arr = np.array(std_list, dtype=np.float32) - - image_upd = ((image.T - mean_arr) / std_arr).T - return image_upd - - -def resize( - image: Image.Image, - size: int | tuple[int, int], - resample: int | Image.Resampling = Image.Resampling.BILINEAR, -) -> Image.Image: - if isinstance(size, tuple): - return image.resize(size, resample) - - height, width = image.height, image.width - short, long = (width, height) if width <= height else (height, width) - - new_short, new_long = size, int(size * long / short) - if width <= height: - new_size = (new_short, new_long) - else: - new_size = (new_long, new_short) - return image.resize(new_size, resample) - - -def rescale(image: NumpyArray, scale: float, dtype: type = np.float32) -> NumpyArray: - return (image * scale).astype(dtype) - - -def pil2ndarray(image: Image.Image | NumpyArray) -> NumpyArray: - if isinstance(image, Image.Image): - return np.asarray(image).transpose((2, 0, 1)) - return image - - -def pad2square( - image: Image.Image, - size: int, - fill_color: str | int | tuple[int, ...] 
= 0, -) -> Image.Image: - height, width = image.height, image.width - - left, right = 0, width - top, bottom = 0, height - - crop_required = False - if width > size: - left = (width - size) // 2 - right = left + size - crop_required = True - - if height > size: - top = (height - size) // 2 - bottom = top + size - crop_required = True - - new_image = Image.new(mode="RGB", size=(size, size), color=fill_color) - new_image.paste(image.crop((left, top, right, bottom)) if crop_required else image) - return new_image - - -def resize_longest_edge( - image: Image.Image, - max_size: int, - resample: int | Image.Resampling = Image.Resampling.LANCZOS, -) -> Image.Image: - height, width = image.height, image.width - aspect_ratio = width / height - - if width >= height: - # Width is longer - new_width = max_size - new_height = int(new_width / aspect_ratio) - else: - # Height is longer - new_height = max_size - new_width = int(new_height * aspect_ratio) - - # Ensure even dimensions - if new_height % 2 != 0: - new_height += 1 - if new_width % 2 != 0: - new_width += 1 - - return image.resize((new_width, new_height), resample) - - -def crop_ndarray( - image: NumpyArray, - x1: int, - y1: int, - x2: int, - y2: int, - channel_first: bool = True, -) -> NumpyArray: - if channel_first: - # (C, H, W) format - return image[:, y1:y2, x1:x2] - else: - # (H, W, C) format - return image[y1:y2, x1:x2, :] - - -def resize_ndarray( - image: NumpyArray, - size: tuple[int, int], - resample: int | Image.Resampling = Image.Resampling.LANCZOS, - channel_first: bool = True, -) -> NumpyArray: - # Convert to PIL-friendly format (H, W, C) - if channel_first: - img_hwc = image.transpose((1, 2, 0)) - else: - img_hwc = image - - # Handle different dtypes - if img_hwc.dtype == np.float32 or img_hwc.dtype == np.float64: - # Assume normalized, scale to 0-255 for PIL - img_hwc_scaled = (img_hwc * 255).astype(np.uint8) - pil_img = Image.fromarray(img_hwc_scaled, mode="RGB") - resized = pil_img.resize(size, 
resample) - result = np.array(resized).astype(np.float32) / 255.0 - else: - # uint8 or similar - pil_img = Image.fromarray(img_hwc.astype(np.uint8), mode="RGB") - resized = pil_img.resize(size, resample) - result = np.array(resized) - - # Convert back to original format - if channel_first: - result = result.transpose((2, 0, 1)) - - return result diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py deleted file mode 100644 index e6ba4d95f..000000000 --- a/fastembed/image/transform/operators.py +++ /dev/null @@ -1,499 +0,0 @@ -from typing import Any -import math - -from PIL import Image - -from fastembed.common.types import NumpyArray -from fastembed.image.transform.functional import ( - center_crop, - convert_to_rgb, - crop_ndarray, - normalize, - pil2ndarray, - rescale, - resize, - resize_longest_edge, - resize_ndarray, - pad2square, -) - - -class Transform: - def __call__(self, images: list[Any]) -> list[Image.Image] | list[NumpyArray]: - raise NotImplementedError("Subclasses must implement this method") - - -class ConvertToRGB(Transform): - def __call__(self, images: list[Image.Image]) -> list[Image.Image]: - return [convert_to_rgb(image=image) for image in images] - - -class CenterCrop(Transform): - def __init__(self, size: tuple[int, int]): - self.size = size - - def __call__(self, images: list[Image.Image]) -> list[NumpyArray]: - return [center_crop(image=image, size=self.size) for image in images] - - -class Normalize(Transform): - def __init__(self, mean: float | list[float], std: float | list[float]): - self.mean = mean - self.std = std - - def __call__( # type: ignore[override] - self, images: list[NumpyArray] | list[list[NumpyArray]] - ) -> list[NumpyArray] | list[list[NumpyArray]]: - if images and isinstance(images[0], list): - # Nested structure from ImageSplitter - return [ - [normalize(image, mean=self.mean, std=self.std) for image in img_patches] # type: ignore[arg-type] - for img_patches in images - ] - else: 
- # Flat structure (backward compatibility) - return [normalize(image, mean=self.mean, std=self.std) for image in images] # type: ignore[arg-type] - - -class Resize(Transform): - def __init__( - self, - size: int | tuple[int, int], - resample: Image.Resampling = Image.Resampling.BICUBIC, - ): - self.size = size - self.resample = resample - - def __call__(self, images: list[Image.Image]) -> list[Image.Image]: - return [resize(image, size=self.size, resample=self.resample) for image in images] - - -class Rescale(Transform): - def __init__(self, scale: float = 1 / 255): - self.scale = scale - - def __call__( # type: ignore[override] - self, images: list[NumpyArray] | list[list[NumpyArray]] - ) -> list[NumpyArray] | list[list[NumpyArray]]: - if images and isinstance(images[0], list): - # Nested structure from ImageSplitter - return [ - [rescale(image, scale=self.scale) for image in img_patches] # type: ignore[arg-type] - for img_patches in images - ] - else: - # Flat structure (backward compatibility) - return [rescale(image, scale=self.scale) for image in images] # type: ignore[arg-type] - - -class PILtoNDarray(Transform): - def __call__(self, images: list[Image.Image | NumpyArray]) -> list[NumpyArray]: - return [pil2ndarray(image) for image in images] - - -class PadtoSquare(Transform): - def __init__( - self, - size: int, - fill_color: str | int | tuple[int, ...], - ): - self.size = size - self.fill_color = fill_color - - def __call__(self, images: list[Image.Image]) -> list[Image.Image]: - return [ - pad2square(image=image, size=self.size, fill_color=self.fill_color) for image in images - ] - - -class ResizeLongestEdge(Transform): - """Resize images so the longest edge equals target size, preserving aspect ratio.""" - - def __init__( - self, - size: int, - resample: Image.Resampling = Image.Resampling.LANCZOS, - ): - self.size = size - self.resample = resample - - def __call__(self, images: list[Image.Image]) -> list[Image.Image]: - return 
[resize_longest_edge(image, self.size, self.resample) for image in images] - - -class ResizeForVisionEncoder(Transform): - """ - Resize both dimensions to be multiples of vision_encoder_max_size. - Preserves aspect ratio approximately. - Works on numpy arrays in (C, H, W) format. - """ - - def __init__( - self, - max_size: int, - resample: Image.Resampling = Image.Resampling.LANCZOS, - ): - self.max_size = max_size - self.resample = resample - - def __call__(self, images: list[NumpyArray]) -> list[NumpyArray]: - result = [] - for image in images: - # Assume (C, H, W) format - _, height, width = image.shape - - aspect_ratio = width / height - - if width >= height: - # Calculate new width as multiple of max_size - new_width = math.ceil(width / self.max_size) * self.max_size - new_height = int(new_width / aspect_ratio) - new_height = math.ceil(new_height / self.max_size) * self.max_size - else: - # Calculate new height as multiple of max_size - new_height = math.ceil(height / self.max_size) * self.max_size - new_width = int(new_height * aspect_ratio) - new_width = math.ceil(new_width / self.max_size) * self.max_size - - # Resize using the ndarray resize function - resized = resize_ndarray( - image, - size=(new_width, new_height), # PIL expects (width, height) - resample=self.resample, - channel_first=True, - ) - result.append(resized) - - return result - - -class ImageSplitter(Transform): - """ - Split images into grid of patches plus a global view. - - If image dimensions exceed max_size: - - Divide into ceil(H/max_size) x ceil(W/max_size) patches - - Each patch is cropped from the image - - Add a global view (original resized to max_size x max_size) - - If image is smaller than max_size: - - Return single image unchanged - - Works on numpy arrays in (C, H, W) format. 
- """ - - def __init__( - self, - max_size: int, - resample: Image.Resampling = Image.Resampling.LANCZOS, - ): - self.max_size = max_size - self.resample = resample - - def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: # type: ignore[override] - result = [] - - for image in images: - # Assume (C, H, W) format - _, height, width = image.shape - max_height = max_width = self.max_size - - frames = [] - - if height > max_height or width > max_width: - # Calculate the number of splits needed - num_splits_h = math.ceil(height / max_height) - num_splits_w = math.ceil(width / max_width) - - # Calculate optimal patch dimensions - optimal_height = math.ceil(height / num_splits_h) - optimal_width = math.ceil(width / num_splits_w) - - # Generate patches in grid order (row by row) - for r in range(num_splits_h): - for c in range(num_splits_w): - # Calculate crop coordinates - start_x = c * optimal_width - start_y = r * optimal_height - end_x = min(start_x + optimal_width, width) - end_y = min(start_y + optimal_height, height) - - # Crop the patch - cropped = crop_ndarray( - image, x1=start_x, y1=start_y, x2=end_x, y2=end_y, channel_first=True - ) - frames.append(cropped) - - # Add global view (resized to max_size x max_size) - global_view = resize_ndarray( - image, - size=(max_width, max_height), # PIL expects (width, height) - resample=self.resample, - channel_first=True, - ) - frames.append(global_view) - else: - # Image is small enough, no splitting needed - frames.append(image) - - # Append (not extend) to preserve per-image grouping - result.append(frames) - - return result - - -class SquareResize(Transform): - """ - Resize images to square dimensions (max_size x max_size). - Works on numpy arrays in (C, H, W) format. 
- """ - - def __init__( - self, - size: int, - resample: Image.Resampling = Image.Resampling.LANCZOS, - ): - self.size = size - self.resample = resample - - def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: # type: ignore[override] - return [ - [ - resize_ndarray( - image, size=(self.size, self.size), resample=self.resample, channel_first=True - ) - ] - for image in images - ] - - -class Compose: - def __init__(self, transforms: list[Transform]): - self.transforms = transforms - - def __call__( - self, images: list[Image.Image] | list[NumpyArray] - ) -> list[NumpyArray] | list[Image.Image]: - for transform in self.transforms: - images = transform(images) - return images - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "Compose": - """Creates processor from a config dict. - Args: - config (dict[str, Any]): Configuration dictionary. - - Valid keys: - - do_resize - - resize_mode - - size - - fill_color - - do_center_crop - - crop_size - - do_rescale - - rescale_factor - - do_normalize - - image_mean - - mean - - image_std - - std - - resample - - interpolation - Valid size keys (nested): - - {"height", "width"} - - {"shortest_edge"} - - {"longest_edge"} - - Returns: - Compose: Image processor. 
- """ - transforms: list[Transform] = [] - cls._get_convert_to_rgb(transforms, config) - cls._get_resize(transforms, config) - cls._get_pad2square(transforms, config) - cls._get_center_crop(transforms, config) - cls._get_pil2ndarray(transforms, config) - cls._get_image_splitting(transforms, config) - cls._get_rescale(transforms, config) - cls._get_normalize(transforms, config) - return cls(transforms=transforms) - - @staticmethod - def _get_convert_to_rgb(transforms: list[Transform], config: dict[str, Any]) -> None: - transforms.append(ConvertToRGB()) - - @classmethod - def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]) -> None: - mode = config.get("image_processor_type", "CLIPImageProcessor") - if mode in ("CLIPImageProcessor", "SiglipImageProcessor"): - if config.get("do_resize", False): - size = config["size"] - if "shortest_edge" in size: - size = size["shortest_edge"] - elif "height" in size and "width" in size: - size = (size["height"], size["width"]) - else: - raise ValueError( - "Size must contain either 'shortest_edge' or 'height' and 'width'." - ) - transforms.append( - Resize( - size=size, - resample=config.get("resample", Image.Resampling.BICUBIC), - ) - ) - elif mode == "ConvNextFeatureExtractor": - if "size" in config and "shortest_edge" not in config["size"]: - raise ValueError( - f"Size dictionary must contain 'shortest_edge' key. 
Got {config['size'].keys()}" - ) - shortest_edge = config["size"]["shortest_edge"] - crop_pct = config.get("crop_pct", 0.875) - if shortest_edge < 384: - # maintain same ratio, resizing shortest edge to shortest_edge/crop_pct - resize_shortest_edge = int(shortest_edge / crop_pct) - transforms.append( - Resize( - size=resize_shortest_edge, - resample=config.get("resample", Image.Resampling.BICUBIC), - ) - ) - transforms.append(CenterCrop(size=(shortest_edge, shortest_edge))) - else: - transforms.append( - Resize( - size=(shortest_edge, shortest_edge), - resample=config.get("resample", Image.Resampling.BICUBIC), - ) - ) - elif mode == "JinaCLIPImageProcessor": - interpolation = config.get("interpolation") - if isinstance(interpolation, str): - resample = cls._interpolation_resolver(interpolation) - else: - resample = interpolation or Image.Resampling.BICUBIC - - if "size" in config: - resize_mode = config.get("resize_mode", "shortest") - if resize_mode == "shortest": - transforms.append( - Resize( - size=config["size"], - resample=resample, - ) - ) - elif mode == "Idefics3ImageProcessor": - if config.get("do_resize", False): - size = config.get("size", {}) - if "longest_edge" not in size: - raise ValueError( - "Size dictionary must contain 'longest_edge' key for Idefics3ImageProcessor" - ) - - # Handle resample parameter - can be int enum or PIL.Image.Resampling - resample = config.get("resample", Image.Resampling.LANCZOS) - if isinstance(resample, int): - resample = Image.Resampling(resample) - - transforms.append( - ResizeLongestEdge( - size=size["longest_edge"], - resample=resample, - ) - ) - else: - raise ValueError(f"Preprocessor {mode} is not supported") - - @staticmethod - def _get_center_crop(transforms: list[Transform], config: dict[str, Any]) -> None: - mode = config.get("image_processor_type", "CLIPImageProcessor") - if mode in ("CLIPImageProcessor", "SiglipImageProcessor"): - if config.get("do_center_crop", False): - crop_size_raw = config["crop_size"] - 
crop_size: tuple[int, int] - if isinstance(crop_size_raw, int): - crop_size = (crop_size_raw, crop_size_raw) - elif isinstance(crop_size_raw, dict): - crop_size = (crop_size_raw["height"], crop_size_raw["width"]) - else: - raise ValueError(f"Invalid crop size: {crop_size_raw}") - transforms.append(CenterCrop(size=crop_size)) - elif mode == "ConvNextFeatureExtractor": - pass - elif mode == "JinaCLIPImageProcessor": - pass - elif mode == "Idefics3ImageProcessor": - pass - else: - raise ValueError(f"Preprocessor {mode} is not supported") - - @staticmethod - def _get_pil2ndarray(transforms: list[Transform], config: dict[str, Any]) -> None: - transforms.append(PILtoNDarray()) - - @classmethod - def _get_image_splitting(cls, transforms: list[Transform], config: dict[str, Any]) -> None: - """ - Add image splitting transforms for Idefics3. - Handles conditional logic: splitting vs square resize. - Must be called AFTER PILtoNDarray. - """ - mode = config.get("image_processor_type", "CLIPImageProcessor") - - if mode == "Idefics3ImageProcessor": - do_splitting = config.get("do_image_splitting", False) - max_size = config.get("max_image_size", {}).get("longest_edge", 512) - resample = config.get("resample", Image.Resampling.LANCZOS) - if isinstance(resample, int): - resample = Image.Resampling(resample) - - if do_splitting: - transforms.append(ResizeForVisionEncoder(max_size, resample)) - transforms.append(ImageSplitter(max_size, resample)) - else: - transforms.append(SquareResize(max_size, resample)) - - @staticmethod - def _get_rescale(transforms: list[Transform], config: dict[str, Any]) -> None: - if config.get("do_rescale", True): - rescale_factor = config.get("rescale_factor", 1 / 255) - transforms.append(Rescale(scale=rescale_factor)) - - @staticmethod - def _get_normalize(transforms: list[Transform], config: dict[str, Any]) -> None: - if config.get("do_normalize", False): - transforms.append(Normalize(mean=config["image_mean"], std=config["image_std"])) - elif "mean" in 
config and "std" in config: - transforms.append(Normalize(mean=config["mean"], std=config["std"])) - - @staticmethod - def _get_pad2square(transforms: list[Transform], config: dict[str, Any]) -> None: - mode = config.get("image_processor_type", "CLIPImageProcessor") - if mode == "CLIPImageProcessor": - pass - elif mode == "ConvNextFeatureExtractor": - pass - elif mode == "JinaCLIPImageProcessor": - transforms.append( - PadtoSquare( - size=config["size"], - fill_color=config.get("fill_color", 0), - ) - ) - - @staticmethod - def _interpolation_resolver(resample: str | None = None) -> Image.Resampling: - interpolation_map = { - "nearest": Image.Resampling.NEAREST, - "lanczos": Image.Resampling.LANCZOS, - "bilinear": Image.Resampling.BILINEAR, - "bicubic": Image.Resampling.BICUBIC, - "box": Image.Resampling.BOX, - "hamming": Image.Resampling.HAMMING, - } - - if resample and (method := interpolation_map.get(resample.lower())): - return method - - raise ValueError(f"Unknown interpolation method: {resample}") diff --git a/fastembed/late_interaction/__init__.py b/fastembed/late_interaction/__init__.py deleted file mode 100644 index eae5d0354..000000000 --- a/fastembed/late_interaction/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from fastembed.late_interaction.late_interaction_text_embedding import ( - LateInteractionTextEmbedding, -) - -__all__ = ["LateInteractionTextEmbedding"] diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py deleted file mode 100644 index b8b925429..000000000 --- a/fastembed/late_interaction/colbert.py +++ /dev/null @@ -1,301 +0,0 @@ -import string -from typing import Any, Iterable, Sequence, Type - -import numpy as np -from tokenizers import Encoding, Tokenizer - -from fastembed.common.preprocessor_utils import load_tokenizer -from fastembed.common.types import NumpyArray, Device -from fastembed.common import OnnxProvider -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.utils 
import define_cache_dir, iter_batch -from fastembed.late_interaction.late_interaction_embedding_base import ( - LateInteractionTextEmbeddingBase, -) -from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_colbert_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="colbert-ir/colbertv2.0", - dim=128, - description="Text embeddings, Unimodal (text), English, 512 input tokens truncation, 2023 year", - license="mit", - size_in_GB=0.44, - sources=ModelSource(hf="colbert-ir/colbertv2.0"), - model_file="model.onnx", - ), - DenseModelDescription( - model="answerdotai/answerai-colbert-small-v1", - dim=96, - description="Text embeddings, Unimodal (text), English, 512 input tokens truncation, 2024 year", - license="apache-2.0", - size_in_GB=0.13, - sources=ModelSource(hf="answerdotai/answerai-colbert-small-v1"), - model_file="vespa_colbert.onnx", - ), -] - - -class Colbert(LateInteractionTextEmbeddingBase, OnnxTextModel[NumpyArray]): - QUERY_MARKER_TOKEN_ID = 1 - DOCUMENT_MARKER_TOKEN_ID = 2 - MIN_QUERY_LENGTH = 31 # it's 32, we add one additional special token in the beginning - MASK_TOKEN = "[MASK]" - - def _post_process_onnx_output( - self, output: OnnxOutputContext, is_doc: bool = True, **kwargs: Any - ) -> Iterable[NumpyArray]: - if not is_doc: - for embedding in output.model_output: - yield embedding - else: - if output.input_ids is None or output.attention_mask is None: - raise ValueError( - "input_ids and attention_mask must be provided for document post-processing" - ) - - for i, token_sequence in enumerate(output.input_ids): - for j, token_id in enumerate(token_sequence): # type: ignore - if token_id in self.skip_list or token_id == self.pad_token_id: - output.attention_mask[i, j] = 0 - - output.model_output *= np.expand_dims(output.attention_mask, 2) - norm = np.linalg.norm(output.model_output, ord=2, axis=2, keepdims=True) - 
norm_clamped = np.maximum(norm, 1e-12) - output.model_output /= norm_clamped - - for embedding, attention_mask in zip(output.model_output, output.attention_mask): - yield embedding[attention_mask == 1] - - def _preprocess_onnx_input( - self, onnx_input: dict[str, NumpyArray], is_doc: bool = True, **kwargs: Any - ) -> dict[str, NumpyArray]: - marker_token = self.DOCUMENT_MARKER_TOKEN_ID if is_doc else self.QUERY_MARKER_TOKEN_ID - onnx_input["input_ids"] = np.insert( - onnx_input["input_ids"].astype(np.int64), 1, marker_token, axis=1 - ) - onnx_input["attention_mask"] = np.insert( - onnx_input["attention_mask"].astype(np.int64), 1, 1, axis=1 - ) - return onnx_input - - def tokenize(self, documents: list[str], is_doc: bool = True, **kwargs: Any) -> list[Encoding]: - return ( - self._tokenize_documents(documents=documents) - if is_doc - else self._tokenize_query(query=next(iter(documents))) - ) - - def _tokenize_query(self, query: str) -> list[Encoding]: - assert self.query_tokenizer is not None - encoded = self.query_tokenizer.encode_batch([query]) - return encoded - - def _tokenize_documents(self, documents: list[str]) -> list[Encoding]: - encoded = self.tokenizer.encode_batch(documents) # type: ignore[union-attr] - return encoded - - def token_count( - self, - texts: str | Iterable[str], - batch_size: int = 1024, - is_doc: bool = True, - include_extension: bool = False, - **kwargs: Any, - ) -> int: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() # loads the tokenizer as well - token_num = 0 - texts = [texts] if isinstance(texts, str) else texts - tokenizer = self.tokenizer if is_doc else self.query_tokenizer - assert tokenizer is not None - for batch in iter_batch(texts, batch_size): - for tokens in tokenizer.encode_batch(batch): - if is_doc: - token_num += sum(tokens.attention_mask) - else: - attend_count = sum(tokens.attention_mask) - if include_extension: - token_num += max(attend_count, self.MIN_QUERY_LENGTH) - - else: - token_num 
+= attend_count - if include_extension: - token_num += len( - batch - ) # add 1 for each cls.DOC_MARKER_TOKEN_ID or cls.QUERY_MARKER_TOKEN_ID - - return token_num - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_colbert_models - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. - Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device.AUTO. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. - lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. 
- device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. - """ - - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - # This device_id will be used if we need to load model in current process - self.device_id: int | None = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - self.mask_token_id: int | None = None - self.pad_token_id: int | None = None - self.skip_list: set[int] = set() - - self.query_tokenizer: Tokenizer | None = None - - if not self.lazy_load: - self.load_onnx_model() - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - self.query_tokenizer, _ = load_tokenizer(model_dir=self._model_dir) - - assert self.tokenizer is not None - self.mask_token_id = self.special_token_to_id[self.MASK_TOKEN] - self.pad_token_id = 
self.tokenizer.padding["pad_id"] - self.skip_list = { - self.tokenizer.encode(symbol, add_special_tokens=False).ids[0] - for symbol in string.punctuation - } - current_max_length = self.tokenizer.truncation["max_length"] - # ensure not to overflow after adding document-marker - self.tokenizer.enable_truncation(max_length=current_max_length - 1) - self.query_tokenizer.enable_truncation(max_length=current_max_length - 1) - self.query_tokenizer.enable_padding( - pad_token=self.MASK_TOKEN, - pad_id=self.mask_token_id, - length=self.MIN_QUERY_LENGTH, - ) - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. 
- - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - if isinstance(query, str): - query = [query] - - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - - for text in query: - yield from self._post_process_onnx_output( - self.onnx_embed([text], is_doc=False), is_doc=False - ) - - @classmethod - def _get_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: - return ColbertEmbeddingWorker - - -class ColbertEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Colbert: - return Colbert( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/late_interaction/jina_colbert.py b/fastembed/late_interaction/jina_colbert.py deleted file mode 100644 index 3ef89c63c..000000000 --- a/fastembed/late_interaction/jina_colbert.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import Any, Type - -from fastembed.common.types import NumpyArray -from fastembed.late_interaction.colbert import Colbert, ColbertEmbeddingWorker -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_jina_colbert_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="jinaai/jina-colbert-v2", - dim=128, - description="New model that expands capabilities of colbert-v1 with multilingual and context length of 8192, 2024 year", - license="cc-by-nc-4.0", - size_in_GB=2.24, - 
sources=ModelSource(hf="jinaai/jina-colbert-v2"), - model_file="onnx/model.onnx", - additional_files=["onnx/model.onnx_data"], - ) -] - - -class JinaColbert(Colbert): - QUERY_MARKER_TOKEN_ID = 250002 - DOCUMENT_MARKER_TOKEN_ID = 250003 - MIN_QUERY_LENGTH = 31 # it's 32, we add one additional special token in the beginning - MASK_TOKEN = "" - - @classmethod - def _get_worker_class(cls) -> Type[ColbertEmbeddingWorker]: - return JinaColbertEmbeddingWorker - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_jina_colbert_models - - def _preprocess_onnx_input( - self, onnx_input: dict[str, NumpyArray], is_doc: bool = True, **kwargs: Any - ) -> dict[str, NumpyArray]: - onnx_input = super()._preprocess_onnx_input(onnx_input, is_doc) - - # the attention mask for jina-colbert-v2 is always 1 in queries - if not is_doc: - onnx_input["attention_mask"][:] = 1 - return onnx_input - - -class JinaColbertEmbeddingWorker(ColbertEmbeddingWorker): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> JinaColbert: - return JinaColbert( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/late_interaction/late_interaction_embedding_base.py b/fastembed/late_interaction/late_interaction_embedding_base.py deleted file mode 100644 index 1ba7909e5..000000000 --- a/fastembed/late_interaction/late_interaction_embedding_base.py +++ /dev/null @@ -1,80 +0,0 @@ -from typing import Iterable, Any - -from fastembed.common.model_description import DenseModelDescription -from fastembed.common.types import NumpyArray -from fastembed.common.model_management import ModelManagement - - -class LateInteractionTextEmbeddingBase(ModelManagement[DenseModelDescription]): - def __init__( - self, - model_name: str, - cache_dir: str | None = 
None, - threads: int | None = None, - **kwargs: Any, - ): - self.model_name = model_name - self.cache_dir = cache_dir - self.threads = threads - self._local_files_only = kwargs.pop("local_files_only", False) - self._embedding_size: int | None = None - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - raise NotImplementedError() - - def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - """ - Embeds a list of text passages into a list of embeddings. - - Args: - texts (Iterable[str]): The list of texts to embed. - **kwargs: Additional keyword argument to pass to the embed method. - - Yields: - Iterable[NdArray]: The embeddings. - """ - - # This is model-specific, so that different models can have specialized implementations - yield from self.embed(texts, **kwargs) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - """ - Embeds queries - - Args: - query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries. - - Returns: - Iterable[NdArray]: The embeddings. 
- """ - - # This is model-specific, so that different models can have specialized implementations - if isinstance(query, str): - yield from self.embed([query], **kwargs) - else: - yield from self.embed(query, **kwargs) - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Returns embedding size of the chosen model.""" - raise NotImplementedError("Subclasses must implement this method") - - @property - def embedding_size(self) -> int: - """Returns embedding size for the current model""" - raise NotImplementedError("Subclasses must implement this method") - - def token_count( - self, - texts: str | Iterable[str], - batch_size: int = 1024, - **kwargs: Any, - ) -> int: - """Returns the number of tokens in the texts.""" - raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed/late_interaction/late_interaction_text_embedding.py b/fastembed/late_interaction/late_interaction_text_embedding.py deleted file mode 100644 index 30c8b70d6..000000000 --- a/fastembed/late_interaction/late_interaction_text_embedding.py +++ /dev/null @@ -1,180 +0,0 @@ -from typing import Any, Iterable, Sequence, Type -from dataclasses import asdict - -from fastembed.common.model_description import DenseModelDescription -from fastembed.common.types import NumpyArray, Device -from fastembed.common import OnnxProvider -from fastembed.late_interaction.colbert import Colbert -from fastembed.late_interaction.jina_colbert import JinaColbert -from fastembed.late_interaction.late_interaction_embedding_base import ( - LateInteractionTextEmbeddingBase, -) - - -class LateInteractionTextEmbedding(LateInteractionTextEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[LateInteractionTextEmbeddingBase]] = [Colbert, JinaColbert] - - @classmethod - def list_supported_models(cls) -> list[dict[str, Any]]: - """ - Lists the supported models. - - Returns: - list[dict[str, Any]]: A list of dictionaries containing the model information. 
- - Example: - ``` - [ - { - "model": "colbert-ir/colbertv2.0", - "dim": 128, - "description": "Late interaction model", - "license": "mit", - "size_in_GB": 0.44, - "sources": { - "hf": "colbert-ir/colbertv2.0", - }, - "model_file": "model.onnx", - }, - ] - ``` - """ - return [asdict(model) for model in cls._list_supported_models()] - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - result: list[DenseModelDescription] = [] - for embedding in cls.EMBEDDINGS_REGISTRY: - result.extend(embedding._list_supported_models()) - return result - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, threads, **kwargs) - for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: - supported_models = EMBEDDING_MODEL_TYPE._list_supported_models() - if any(model_name.lower() == model.model.lower() for model in supported_models): - self.model = EMBEDDING_MODEL_TYPE( - model_name, - cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - **kwargs, - ) - return - - raise ValueError( - f"Model {model_name} is not supported in LateInteractionTextEmbedding." - "Please check the supported models using `LateInteractionTextEmbedding.list_supported_models()`" - ) - - @property - def embedding_size(self) -> int: - """Get the embedding size of the current model""" - if self._embedding_size is None: - self._embedding_size = self.get_embedding_size(self.model_name) - return self._embedding_size - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Get the embedding size of the passed model - - Args: - model_name (str): The name of the model to get embedding size for. - - Returns: - int: The size of the embedding. 
- - Raises: - ValueError: If the model name is not found in the supported models. - """ - descriptions = cls._list_supported_models() - embedding_size: int | None = None - for description in descriptions: - if description.model.lower() == model_name.lower(): - embedding_size = description.dim - break - if embedding_size is None: - model_names = [description.model for description in descriptions] - raise ValueError( - f"Embedding size for model {model_name} was None. " - f"Available model names: {model_names}" - ) - return embedding_size - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self.model.embed(documents, batch_size, parallel, **kwargs) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - """ - Embeds queries - - Args: - query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries. - - Returns: - Iterable[NdArray]: The embeddings. 
- """ - - # This is model-specific, so that different models can have specialized implementations - yield from self.model.query_embed(query, **kwargs) - - def token_count( - self, - texts: str | Iterable[str], - batch_size: int = 1024, - is_doc: bool = True, - include_extension: bool = False, - **kwargs: Any, - ) -> int: - """Returns the number of tokens in the texts. - - Args: - texts (str | Iterable[str]): The list of texts to embed. - batch_size (int): Batch size for encoding - is_doc (bool): Whether the texts are documents (disable embedding a query with include_mask=True). - include_extension (bool): Turn on to count DOC / QUERY marker tokens, and [MASK] token in query mode. - - Returns: - int: Sum of number of tokens in the texts. - """ - return self.model.token_count( - texts, - batch_size=batch_size, - is_doc=is_doc, - include_extension=include_extension, - **kwargs, - ) diff --git a/fastembed/late_interaction/token_embeddings.py b/fastembed/late_interaction/token_embeddings.py deleted file mode 100644 index 55d3cd94e..000000000 --- a/fastembed/late_interaction/token_embeddings.py +++ /dev/null @@ -1,83 +0,0 @@ -from dataclasses import asdict -from typing import Iterable, Any, Type - -from fastembed.common.model_description import DenseModelDescription, ModelSource -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import NumpyArray -from fastembed.late_interaction.late_interaction_embedding_base import ( - LateInteractionTextEmbeddingBase, -) -from fastembed.text.onnx_embedding import OnnxTextEmbedding -from fastembed.text.onnx_text_model import TextEmbeddingWorker - - -supported_token_embeddings_models = [ - DenseModelDescription( - model="jinaai/jina-embeddings-v2-small-en-tokens", - dim=512, - description="Text embeddings, Unimodal (text), English, 8192 input tokens truncation," - " Prefixes for queries/documents: not necessary, 2023 year.", - license="apache-2.0", - size_in_GB=0.12, - 
sources=ModelSource(hf="xenova/jina-embeddings-v2-small-en"), - model_file="onnx/model.onnx", - ), -] - - -class TokenEmbeddingsModel(OnnxTextEmbedding, LateInteractionTextEmbeddingBase): - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_token_embeddings_models - - @classmethod - def list_supported_models(cls) -> list[dict[str, Any]]: - """Lists the supported models. - - Returns: - list[dict[str, Any]]: A list of dictionaries containing the model information. - """ - return [asdict(model) for model in cls._list_supported_models()] - - @classmethod - def _get_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: - return TokensEmbeddingWorker - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[NumpyArray]: - # Size: (batch_size, sequence_length, hidden_size) - embeddings = output.model_output - # Size: (batch_size, sequence_length) - assert output.attention_mask is not None - masks = output.attention_mask - - # For each document we only select those embeddings that are not masked out - for i in range(embeddings.shape[0]): - yield embeddings[i, masks[i] == 1] - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - yield from super().embed(documents, batch_size=batch_size, parallel=parallel, **kwargs) - - -class TokensEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): - def init_embedding( - self, model_name: str, cache_dir: str, **kwargs: Any - ) -> TokenEmbeddingsModel: - return TokenEmbeddingsModel( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/late_interaction_multimodal/__init__.py b/fastembed/late_interaction_multimodal/__init__.py deleted file mode 
100644 index 50588cdef..000000000 --- a/fastembed/late_interaction_multimodal/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding import ( - LateInteractionMultimodalEmbedding, -) - -__all__ = ["LateInteractionMultimodalEmbedding"] diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py deleted file mode 100644 index 20b8e4f7b..000000000 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ /dev/null @@ -1,532 +0,0 @@ -import contextlib -from typing import Any, Iterable, Type, Optional, Sequence -import json - -import numpy as np -from tokenizers import Encoding -from PIL import Image - -from fastembed.common import ImageInput -from fastembed.common.model_description import DenseModelDescription, ModelSource -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import NumpyArray, OnnxProvider -from fastembed.common.utils import define_cache_dir, iter_batch -from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( - LateInteractionMultimodalEmbeddingBase, -) -from fastembed.late_interaction_multimodal.onnx_multimodal_model import ( - OnnxMultimodalModel, - TextEmbeddingWorker, - ImageEmbeddingWorker, -) - -supported_colmodernvbert_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="Qdrant/colmodernvbert", - dim=128, - description="The late-interaction version of ModernVBERT, CPU friendly, English, 2025.", - license="mit", - size_in_GB=1.0, - sources=ModelSource(hf="Qdrant/colmodernvbert"), - additional_files=["processor_config.json"], - model_file="model.onnx", - ), -] - - -class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]): - """ - The ModernVBERT/colmodernvbert model implementation. This model uses - bidirectional attention, which proves to work better for retrieval. 
- - See: https://huggingface.co/ModernVBERT/colmodernvbert - """ - - VISUAL_PROMPT_PREFIX = ( - "<|begin_of_text|>User:Describe the image.\nAssistant:" - ) - QUERY_AUGMENTATION_TOKEN = "" - - def __init__( - self, - model_name: str, - cache_dir: Optional[str] = None, - threads: Optional[int] = None, - providers: Optional[Sequence[OnnxProvider]] = None, - cuda: bool = False, - device_ids: Optional[list[int]] = None, - lazy_load: bool = False, - device_id: Optional[int] = None, - specific_model_path: Optional[str] = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. - Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. - cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to False. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None. - lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. - device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. 
- """ - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - # This device_id will be used if we need to load model in current process - self.device_id: Optional[int] = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - self.mask_token_id = None - self.pad_token_id = None - self.image_seq_len: Optional[int] = None - self.max_image_size: Optional[int] = None - self.image_size: Optional[int] = None - - if not self.lazy_load: - self.load_onnx_model() - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. 
- """ - return supported_colmodernvbert_models - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - # Load image processing configuration - processor_config_path = self._model_dir / "processor_config.json" - with open(processor_config_path) as f: - processor_config = json.load(f) - self.image_seq_len = processor_config.get("image_seq_len", 64) - - preprocessor_config_path = self._model_dir / "preprocessor_config.json" - with open(preprocessor_config_path) as f: - preprocessor_config = json.load(f) - self.max_image_size = preprocessor_config.get("max_image_size", {}).get( - "longest_edge", 512 - ) - - # Load model configuration - config_path = self._model_dir / "config.json" - with open(config_path) as f: - model_config = json.load(f) - vision_config = model_config.get("vision_config", {}) - self.image_size = vision_config.get("image_size", 512) - - def _preprocess_onnx_text_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Post-process the ONNX model output to convert it into a usable format. - - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - - Returns: - Iterable[NumpyArray]: Post-processed output as NumPy arrays. - """ - batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( - (batch_size, seq_length, 3, self.image_size, self.image_size), - dtype=np.float32, # type: ignore[type-var,arg-type,assignment] - ) - onnx_input["pixel_values"] = empty_image_placeholder - return onnx_input - - def _post_process_onnx_text_output( - self, - output: OnnxOutputContext, - ) -> Iterable[NumpyArray]: - """ - Post-process the ONNX model output to convert it into a usable format. 
- - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - - Returns: - Iterable[NumpyArray]: Post-processed output as NumPy arrays. - """ - return output.model_output - - def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: - # Add query augmentation tokens (matching process_queries logic from colpali-engine) - augmented_queries = [doc + self.QUERY_AUGMENTATION_TOKEN * 10 for doc in documents] - encoded = self.tokenizer.encode_batch(augmented_queries) # type: ignore[union-attr] - return encoded - - def token_count( - self, - texts: str | Iterable[str], - batch_size: int = 1024, - include_extension: bool = False, - **kwargs: Any, - ) -> int: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() # loads the tokenizer as well - token_num = 0 - texts = [texts] if isinstance(texts, str) else texts - assert self.tokenizer is not None - tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch - for batch in iter_batch(texts, batch_size): - token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) - return token_num - - def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: - with contextlib.ExitStack() as stack: - image_files = [ - stack.enter_context(Image.open(image)) - if not isinstance(image, Image.Image) - else image - for image in images - ] - assert self.processor is not None, "Processor is not initialized" - processed = self.processor(image_files) - encoded, attention_mask, metadata = self._process_nested_patches(processed) # type: ignore[arg-type] - - onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} - onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) - model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] - - return OnnxOutputContext( - model_output=model_output[0], - attention_mask=attention_mask, # type: ignore[arg-type] - metadata=metadata, - 
) - - @staticmethod - def _process_nested_patches( - processed: list[list[NumpyArray]], - ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: - """ - Process nested image patches (from ImageSplitter). - - Args: - processed: List of patch lists, one per image [[img1_patches], [img2_patches], ...] - - Returns: - tuple: (encoded array, attention_mask, metadata) - - encoded: (batch_size, max_patches, C, H, W) - - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding - - metadata: Dict with 'patch_counts' key - """ - patch_counts = [len(patches) for patches in processed] - max_patches = max(patch_counts) - - # Get dimensions from first patch - channels, height, width = processed[0][0].shape - batch_size = len(processed) - - # Create padded array - encoded = np.zeros( - (batch_size, max_patches, channels, height, width), dtype=processed[0][0].dtype - ) - - # Create attention mask (1 for real patches, 0 for padding) - attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) - - # Fill in patches and attention mask - for i, patches in enumerate(processed): - for j, patch in enumerate(patches): - encoded[i, j] = patch - attention_mask[i, j] = 1 - - metadata = {"patch_counts": patch_counts} - return encoded, attention_mask, metadata # type: ignore[return-value] - - def _preprocess_onnx_image_input( - self, onnx_input: dict[str, np.ndarray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Add text input placeholders for image data, following Idefics3 processing logic. - - Constructs input_ids dynamically based on the actual number of image patches, - using the same token expansion logic as Idefics3Processor. 
- - Args: - onnx_input: Dict with 'pixel_values' (batch, num_patches, C, H, W) - and 'attention_mask' (batch, num_patches) indicating real patches - **kwargs: Additional arguments - - Returns: - Updated onnx_input with 'input_ids' and updated 'attention_mask' for token sequence - """ - # The attention_mask in onnx_input has a shape of (batch_size, num_patches), - # and should be used to create an attention mask matching the input_ids shape. - patch_attention_mask = onnx_input["attention_mask"] - pixel_values = onnx_input["pixel_values"] - - batch_size = pixel_values.shape[0] - batch_input_ids = [] - - # Build input_ids for each image based on its actual patch count - for i in range(batch_size): - # Count real patches (non-padded) from attention mask - patch_count = int(np.sum(patch_attention_mask[i])) - - # Compute rows/cols from patch count - rows, cols = self._compute_rows_cols_from_patches(patch_count) - - # Build input_ids for this image - input_ids = self._build_input_ids_for_image(rows, cols) - batch_input_ids.append(input_ids) - - # Pad sequences to max length in batch - max_len = max(len(ids) for ids in batch_input_ids) - - # Get padding config from tokenizer - padding_direction = self.tokenizer.padding["direction"] # type: ignore[index,union-attr] - pad_token_id = self.tokenizer.padding["pad_id"] # type: ignore[index,union-attr] - - # Initialize with pad token - padded_input_ids = np.full((batch_size, max_len), pad_token_id, dtype=np.int64) - attention_mask = np.zeros((batch_size, max_len), dtype=np.int64) - - for i, input_ids in enumerate(batch_input_ids): - seq_len = len(input_ids) - if padding_direction == "left": - # Left padding: place tokens at the END of the array - start_idx = max_len - seq_len - padded_input_ids[i, start_idx:] = input_ids - attention_mask[i, start_idx:] = 1 - else: - # Right padding: place tokens at the START of the array - padded_input_ids[i, :seq_len] = input_ids - attention_mask[i, :seq_len] = 1 - - onnx_input["input_ids"] = 
padded_input_ids - # Update attention_mask with token-level data - onnx_input["attention_mask"] = attention_mask - return onnx_input - - @staticmethod - def _compute_rows_cols_from_patches(patch_count: int) -> tuple[int, int]: - if patch_count <= 1: - return 0, 0 - - # Subtract 1 for the global image - grid_patches = patch_count - 1 - - # Find rows and cols (assume square or near-square grid) - rows = int(grid_patches**0.5) - cols = grid_patches // rows - - # Verify the calculation - if rows * cols + 1 != patch_count: - # Handle non-square grids - for r in range(1, grid_patches + 1): - if grid_patches % r == 0: - c = grid_patches // r - if r * c + 1 == patch_count: - return r, c - # Fallback: treat as unsplit - return 0, 0 - - return rows, cols - - def _create_single_image_prompt_string(self) -> str: - return ( - "" - + "" - + "" * self.image_seq_len # type: ignore[operator] - + "" - ) - - def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: - text_split_images = "" - - # Add tokens for each patch in the grid - for n_h in range(rows): - for n_w in range(cols): - text_split_images += ( - "" - + f"" - + "" * self.image_seq_len # type: ignore[operator] - ) - text_split_images += "\n" - - # Add global image at the end - text_split_images += ( - "\n" - + "" - + "" * self.image_seq_len # type: ignore[operator] - + "" - ) - - return text_split_images - - def _build_input_ids_for_image(self, rows: int, cols: int) -> np.ndarray: - # Create the appropriate image prompt string - if rows == 0 and cols == 0: - image_prompt_tokens = self._create_single_image_prompt_string() - else: - image_prompt_tokens = self._create_split_image_prompt_string(rows, cols) - - # Replace in visual prompt with expanded tokens - # The visual prompt is: "<|begin_of_text|>User:Describe the image.\nAssistant:" - expanded_prompt = self.VISUAL_PROMPT_PREFIX.replace("", image_prompt_tokens) - - # Tokenize the complete prompt - encoded = self.tokenizer.encode(expanded_prompt) # type: 
ignore[union-attr] - - # Convert to numpy array - return np.array(encoded.ids, dtype=np.int64) - - def _post_process_onnx_image_output( - self, - output: OnnxOutputContext, - ) -> Iterable[NumpyArray]: - """ - Post-process the ONNX model output to convert it into a usable format. - - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - - Returns: - Iterable[NumpyArray]: Post-processed output as NumPy arrays. - """ - assert self.model_description.dim is not None, "Model dim is not defined" - return output.model_output.reshape( - output.model_output.shape[0], -1, self.model_description.dim - ) - - def embed_text( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: Optional[int] = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of documents into list of embeddings. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - def embed_image( - self, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 16, - parallel: Optional[int] = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of images into list of embeddings. 
- - Args: - images: Iterator of image paths or single image path to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self._embed_images( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - images=images, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - @classmethod - def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: - return ColModernVBERTTextEmbeddingWorker - - @classmethod - def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: - return ColModernVBERTImageEmbeddingWorker - - -class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: - return ColModernVBERT( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) - - -class ColModernVBERTImageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: - return ColModernVBERT( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py deleted file mode 100644 index 85fbcd06b..000000000 --- a/fastembed/late_interaction_multimodal/colpali.py +++ /dev/null @@ -1,327 +0,0 @@ -from typing import Any, Iterable, Sequence, Type - -import numpy 
as np -from tokenizers import Encoding - -from fastembed.common import OnnxProvider, ImageInput -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import NumpyArray, Device -from fastembed.common.utils import define_cache_dir, iter_batch -from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( - LateInteractionMultimodalEmbeddingBase, -) -from fastembed.late_interaction_multimodal.onnx_multimodal_model import ( - OnnxMultimodalModel, - TextEmbeddingWorker, - ImageEmbeddingWorker, -) -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_colpali_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="Qdrant/colpali-v1.3-fp16", - dim=128, - description="Text embeddings, Multimodal (text&image), English, 50 tokens query length truncation, 2024.", - license="mit", - size_in_GB=6.5, - sources=ModelSource(hf="Qdrant/colpali-v1.3-fp16"), - additional_files=["model.onnx_data"], - model_file="model.onnx", - ), -] - - -class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]): - QUERY_PREFIX = "Query: " - BOS_TOKEN = "" - PAD_TOKEN = "" - QUERY_MARKER_TOKEN_ID = [2, 5098] - IMAGE_PLACEHOLDER_SIZE = (3, 448, 448) - EMPTY_TEXT_PLACEHOLDER = np.array( - [257152] * 1024 + [2, 50721, 573, 2416, 235265, 108] - ) # This is a tokenization of '' * 1024 + 'Describe the image.\n' line which is used as placeholder - # while processing an image - EVEN_ATTENTION_MASK = np.array([1] * 1030) - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. 
- cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. - Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device.AUTO. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. - lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. - device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. 
- """ - - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - # This device_id will be used if we need to load model in current process - self.device_id: int | None = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - self.mask_token_id = None - self.pad_token_id = None - - if not self.lazy_load: - self.load_onnx_model() - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_colpali_models - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - def _post_process_onnx_image_output( - self, - output: OnnxOutputContext, - ) -> Iterable[NumpyArray]: - """ - Post-process the ONNX model output to convert it into a usable format. - - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - - Returns: - Iterable[NumpyArray]: Post-processed output as NumPy arrays. 
- """ - assert self.model_description.dim is not None, "Model dim is not defined" - return output.model_output.reshape( - output.model_output.shape[0], -1, self.model_description.dim - ) - - def _post_process_onnx_text_output( - self, - output: OnnxOutputContext, - ) -> Iterable[NumpyArray]: - """ - Post-process the ONNX model output to convert it into a usable format. - - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - - Returns: - Iterable[NumpyArray]: Post-processed output as NumPy arrays. - """ - return output.model_output - - def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: - texts_query: list[str] = [] - for query in documents: - query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10 - query += "\n" - - texts_query.append(query) - encoded = self.tokenizer.encode_batch(texts_query) # type: ignore[union-attr] - return encoded - - def token_count( - self, - texts: str | Iterable[str], - batch_size: int = 1024, - include_extension: bool = False, - **kwargs: Any, - ) -> int: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() # loads the tokenizer as well - token_num = 0 - texts = [texts] if isinstance(texts, str) else texts - assert self.tokenizer is not None - tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch - for batch in iter_batch(texts, batch_size): - token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) - return token_num - - def _preprocess_onnx_text_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - onnx_input["input_ids"] = np.array( - [ - self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist() # type: ignore[index] - for input_ids in onnx_input["input_ids"] - ] - ) - empty_image_placeholder: NumpyArray = np.zeros( - self.IMAGE_PLACEHOLDER_SIZE, dtype=np.float32 - ) - onnx_input["pixel_values"] = np.array( - [empty_image_placeholder for _ in 
onnx_input["input_ids"]], - ) - return onnx_input - - def _preprocess_onnx_image_input( - self, onnx_input: dict[str, np.ndarray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Add placeholders for text input when processing image data for ONNX. - Args: - onnx_input (Dict[str, NumpyArray]): Preprocessed image inputs. - **kwargs: Additional arguments. - Returns: - Dict[str, NumpyArray]: ONNX input with text placeholders. - """ - onnx_input["input_ids"] = np.array( - [self.EMPTY_TEXT_PLACEHOLDER for _ in onnx_input["pixel_values"]] - ) - onnx_input["attention_mask"] = np.array( - [self.EVEN_ATTENTION_MASK for _ in onnx_input["pixel_values"]] - ) - return onnx_input - - def embed_text( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of documents into list of embeddings. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - def embed_image( - self, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 16, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of images into list of embeddings. 
- - Args: - images: Iterator of image paths or single image path to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self._embed_images( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - images=images, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - @classmethod - def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: - return ColPaliTextEmbeddingWorker - - @classmethod - def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: - return ColPaliImageEmbeddingWorker - - -class ColPaliTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: - return ColPali( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) - - -class ColPaliImageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: - return ColPali( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py deleted file mode 100644 index 10d426d0d..000000000 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +++ /dev/null @@ -1,189 +0,0 @@ -from typing import Any, 
Iterable, Sequence, Type -from dataclasses import asdict - -from fastembed.common import OnnxProvider, ImageInput -from fastembed.common.types import NumpyArray, Device -from fastembed.late_interaction_multimodal.colpali import ColPali -from fastembed.late_interaction_multimodal.colmodernvbert import ColModernVBERT - -from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( - LateInteractionMultimodalEmbeddingBase, -) -from fastembed.common.model_description import DenseModelDescription - - -class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ - ColPali, - ColModernVBERT, - ] - - @classmethod - def list_supported_models(cls) -> list[dict[str, Any]]: - """ - Lists the supported models. - - Returns: - list[dict[str, Any]]: A list of dictionaries containing the model information. - - Example: - ``` - [ - { - "model": "Qdrant/colpali-v1.3-fp16", - "dim": 128, - "description": "Text embeddings, Unimodal (text), Aligned to image latent space, ColBERT-compatible, 512 tokens max, 2024.", - "license": "mit", - "size_in_GB": 6.06, - "sources": { - "hf": "Qdrant/colpali-v1.3-fp16", - }, - "additional_files": [ - "model.onnx_data", - ], - "model_file": "model.onnx", - }, - ] - ``` - """ - return [asdict(model) for model in cls._list_supported_models()] - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - result: list[DenseModelDescription] = [] - for embedding in cls.EMBEDDINGS_REGISTRY: - result.extend(embedding._list_supported_models()) - return result - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, threads, **kwargs) - for 
EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: - supported_models = EMBEDDING_MODEL_TYPE._list_supported_models() - if any(model_name.lower() == model.model.lower() for model in supported_models): - self.model = EMBEDDING_MODEL_TYPE( - model_name, - cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - **kwargs, - ) - return - - raise ValueError( - f"Model {model_name} is not supported in LateInteractionMultimodalEmbedding." - "Please check the supported models using `LateInteractionMultimodalEmbedding.list_supported_models()`" - ) - - @property - def embedding_size(self) -> int: - """Get the embedding size of the current model""" - if self._embedding_size is None: - self._embedding_size = self.get_embedding_size(self.model_name) - return self._embedding_size - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Get the embedding size of the passed model - - Args: - model_name (str): The name of the model to get embedding size for. - - Returns: - int: The size of the embedding. - - Raises: - ValueError: If the model name is not found in the supported models. - """ - descriptions = cls._list_supported_models() - embedding_size: int | None = None - for description in descriptions: - if description.model.lower() == model_name.lower(): - embedding_size = description.dim - break - if embedding_size is None: - model_names = [description.model for description in descriptions] - raise ValueError( - f"Embedding size for model {model_name} was None. " - f"Available model names: {model_names}" - ) - return embedding_size - - def embed_text( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of documents into list of embeddings. 
- - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self.model.embed_text(documents, batch_size, parallel, **kwargs) - - def embed_image( - self, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 16, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of images into list of embeddings. - - Args: - images: Iterator of image paths or single image path to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per image - """ - yield from self.model.embed_image(images, batch_size, parallel, **kwargs) - - def token_count( - self, - texts: str | Iterable[str], - batch_size: int = 1024, - include_extension: bool = False, - **kwargs: Any, - ) -> int: - """Returns the number of tokens in the texts. - - Args: - texts (str | Iterable[str]): The list of texts to embed. - batch_size (int): Batch size for encoding - include_extension (bool): Whether to include tokens added by preprocessing - - Returns: - int: Sum of number of tokens in the texts. 
- """ - return self.model.token_count( - texts, batch_size=batch_size, include_extension=include_extension, **kwargs - ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py deleted file mode 100644 index 72a87fe5e..000000000 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Iterable, Any - - -from fastembed.common import ImageInput -from fastembed.common.model_description import DenseModelDescription -from fastembed.common.model_management import ModelManagement -from fastembed.common.types import NumpyArray - - -class LateInteractionMultimodalEmbeddingBase(ModelManagement[DenseModelDescription]): - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - **kwargs: Any, - ): - self.model_name = model_name - self.cache_dir = cache_dir - self.threads = threads - self._local_files_only = kwargs.pop("local_files_only", False) - self._embedding_size: int | None = None - - def embed_text( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Embeds a list of documents into a list of embeddings. - - Args: - documents (Iterable[str]): The list of texts to embed. - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - **kwargs: Additional keyword argument to pass to the embed method. - - Yields: - Iterable[NumpyArray]: The embeddings. 
- """ - raise NotImplementedError() - - def embed_image( - self, - images: ImageInput | Iterable[ImageInput], - batch_size: int = 16, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of images into list of embeddings. - Args: - images: Iterator of image paths or single image path to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per image - """ - raise NotImplementedError() - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Returns embedding size of the chosen model.""" - raise NotImplementedError("Subclasses must implement this method") - - @property - def embedding_size(self) -> int: - """Returns embedding size for the current model""" - raise NotImplementedError("Subclasses must implement this method") - - def token_count( - self, - texts: str | Iterable[str], - **kwargs: Any, - ) -> int: - """Returns the number of tokens in the texts.""" - raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py deleted file mode 100644 index 934368957..000000000 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ /dev/null @@ -1,291 +0,0 @@ -import contextlib -import os -from multiprocessing import get_all_start_methods -from pathlib import Path -from typing import Any, Iterable, Sequence, Type - -import numpy as np -from PIL import Image -from tokenizers import Encoding, Tokenizer - -from fastembed.common import OnnxProvider, ImageInput -from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, 
OnnxOutputContext, T -from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor -from fastembed.common.types import NumpyArray, Device -from fastembed.common.utils import iter_batch -from fastembed.image.transform.operators import Compose -from fastembed.parallel_processor import ParallelWorkerPool - - -class OnnxMultimodalModel(OnnxModel[T]): - ONNX_OUTPUT_NAMES: list[str] | None = None - - def __init__(self) -> None: - super().__init__() - self.tokenizer: Tokenizer | None = None - self.processor: Compose | None = None - self.special_token_to_id: dict[str, int] = {} - - def _preprocess_onnx_text_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Preprocess the onnx input. - """ - return onnx_input - - def _preprocess_onnx_image_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Preprocess the onnx input. - """ - return onnx_input - - @classmethod - def _get_text_worker_class(cls) -> Type["TextEmbeddingWorker[T]"]: - raise NotImplementedError("Subclasses must implement this method") - - @classmethod - def _get_image_worker_class(cls) -> Type["ImageEmbeddingWorker[T]"]: - raise NotImplementedError("Subclasses must implement this method") - - def _post_process_onnx_image_output(self, output: OnnxOutputContext) -> Iterable[T]: - raise NotImplementedError("Subclasses must implement this method") - - def _post_process_onnx_text_output(self, output: OnnxOutputContext) -> Iterable[T]: - raise NotImplementedError("Subclasses must implement this method") - - def _load_onnx_model( - self, - model_dir: Path, - model_file: str, - threads: int | None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_id: int | None = None, - extra_session_options: dict[str, Any] | None = None, - ) -> None: - super()._load_onnx_model( - model_dir=model_dir, - model_file=model_file, - threads=threads, - providers=providers, - 
cuda=cuda, - device_id=device_id, - extra_session_options=extra_session_options, - ) - self.tokenizer, self.special_token_to_id = load_tokenizer(model_dir=model_dir) - assert self.tokenizer is not None - self.processor = load_preprocessor(model_dir=model_dir) - - def load_onnx_model(self) -> None: - raise NotImplementedError("Subclasses must implement this method") - - def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: - return self.tokenizer.encode_batch(documents) # type: ignore[union-attr] - - def onnx_embed_text( - self, - documents: list[str], - **kwargs: Any, - ) -> OnnxOutputContext: - encoded = self.tokenize(documents, **kwargs) - input_ids = np.array([e.ids for e in encoded]) - attention_mask = np.array([e.attention_mask for e in encoded]) # type: ignore[union-attr] - input_names = {node.name for node in self.model.get_inputs()} # type: ignore[union-attr] - onnx_input: dict[str, NumpyArray] = { - "input_ids": np.array(input_ids, dtype=np.int64), - } - if "attention_mask" in input_names: - onnx_input["attention_mask"] = np.array(attention_mask, dtype=np.int64) - if "token_type_ids" in input_names: - onnx_input["token_type_ids"] = np.array( - [np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64 - ) - - onnx_input = self._preprocess_onnx_text_input(onnx_input, **kwargs) - model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) # type: ignore[union-attr] - return OnnxOutputContext( - model_output=model_output[0], - attention_mask=onnx_input.get("attention_mask", attention_mask), - input_ids=onnx_input.get("input_ids", input_ids), - ) - - def _embed_documents( - self, - model_name: str, - cache_dir: str, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - local_files_only: bool = False, - specific_model_path: str | None = None, - 
extra_session_options: dict[str, Any] | None = None, - **kwargs: Any, - ) -> Iterable[T]: - is_small = False - - if isinstance(documents, str): - documents = [documents] - is_small = True - - if isinstance(documents, list): - if len(documents) < batch_size: - is_small = True - - if parallel is None or is_small: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - for batch in iter_batch(documents, batch_size): - yield from self._post_process_onnx_text_output(self.onnx_embed_text(batch)) - else: - if parallel == 0: - parallel = os.cpu_count() - - start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" - params = { - "model_name": model_name, - "cache_dir": cache_dir, - "providers": providers, - "local_files_only": local_files_only, - "specific_model_path": specific_model_path, - **kwargs, - } - - if extra_session_options is not None: - params.update(extra_session_options) - - pool = ParallelWorkerPool( - num_workers=parallel or 1, - worker=self._get_text_worker_class(), - cuda=cuda, - device_ids=device_ids, - start_method=start_method, - ) - for batch in pool.ordered_map(iter_batch(documents, batch_size), **params): - yield from self._post_process_onnx_text_output(batch) # type: ignore - - def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: - with contextlib.ExitStack() as stack: - image_files = [ - stack.enter_context(Image.open(image)) - if not isinstance(image, Image.Image) - else image - for image in images - ] - assert self.processor is not None, "Processor is not initialized" - encoded = np.array(self.processor(image_files)) - onnx_input = {"pixel_values": encoded} - onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) - model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] - embeddings = model_output[0].reshape(len(images), -1) - return OnnxOutputContext(model_output=embeddings) - - def _embed_images( - self, - model_name: str, - 
cache_dir: str, - images: Iterable[ImageInput] | ImageInput, - batch_size: int = 256, - parallel: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - local_files_only: bool = False, - specific_model_path: str | None = None, - extra_session_options: dict[str, Any] | None = None, - **kwargs: Any, - ) -> Iterable[T]: - is_small = False - - if isinstance(images, (str, Path, Image.Image)): - images = [images] - is_small = True - - if isinstance(images, list) and len(images) < batch_size: - is_small = True - - if parallel is None or is_small: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - - for batch in iter_batch(images, batch_size): - yield from self._post_process_onnx_image_output(self.onnx_embed_image(batch)) - else: - if parallel == 0: - parallel = os.cpu_count() - - start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" - params = { - "model_name": model_name, - "cache_dir": cache_dir, - "providers": providers, - "local_files_only": local_files_only, - "specific_model_path": specific_model_path, - **kwargs, - } - - if extra_session_options is not None: - params.update(extra_session_options) - - pool = ParallelWorkerPool( - num_workers=parallel or 1, - worker=self._get_image_worker_class(), - cuda=cuda, - device_ids=device_ids, - start_method=start_method, - ) - for batch in pool.ordered_map(iter_batch(images, batch_size), **params): - yield from self._post_process_onnx_image_output(batch) # type: ignore - - -class TextEmbeddingWorker(EmbeddingWorker[T]): - def __init__( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ): - self.model: OnnxMultimodalModel - super().__init__(model_name, cache_dir, **kwargs) - - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxMultimodalModel: - raise NotImplementedError() - - def process(self, items: 
Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]: - for idx, batch in items: - onnx_output = self.model.onnx_embed_text(batch) - yield idx, onnx_output - - -class ImageEmbeddingWorker(EmbeddingWorker[T]): - def __init__( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ): - self.model: OnnxMultimodalModel - super().__init__(model_name, cache_dir, **kwargs) - - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxMultimodalModel: - raise NotImplementedError() - - def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]: - for idx, batch in items: - embeddings = self.model.onnx_embed_image(batch) - yield idx, embeddings diff --git a/fastembed/postprocess/__init__.py b/fastembed/postprocess/__init__.py deleted file mode 100644 index 5d03c50e8..000000000 --- a/fastembed/postprocess/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from fastembed.postprocess.muvera import Muvera - -__all__ = ["Muvera"] diff --git a/fastembed/postprocess/muvera.py b/fastembed/postprocess/muvera.py deleted file mode 100644 index 3c6476795..000000000 --- a/fastembed/postprocess/muvera.py +++ /dev/null @@ -1,362 +0,0 @@ -import numpy as np - -from fastembed.common.types import NumpyArray -from fastembed.late_interaction.late_interaction_embedding_base import ( - LateInteractionTextEmbeddingBase, -) -from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( - LateInteractionMultimodalEmbeddingBase, -) - - -MultiVectorModel = LateInteractionTextEmbeddingBase | LateInteractionMultimodalEmbeddingBase -MAX_HAMMING_DISTANCE = 65 # 64 bits + 1 -POPCOUNT_LUT = np.array([bin(x).count("1") for x in range(256)], dtype=np.uint8) - - -def hamming_distance_matrix(ids: np.ndarray) -> np.ndarray: - """Compute full Hamming distance matrix - - Args: - ids: shape (n,) - array of ids, only size of the array matters - - Return: - np.ndarray (n, n) - hamming distance matrix - """ - n = len(ids) - 
xor_vals = np.bitwise_xor(ids[:, None], ids[None, :]) # (n, n) uint64 - bytes_view = xor_vals.view(np.uint8).reshape(n, n, 8) # (n, n, 8) - return POPCOUNT_LUT[bytes_view].sum(axis=2) - - -class SimHashProjection: - """ - SimHash projection component for MUVERA clustering. - - This class implements locality-sensitive hashing using random hyperplanes - to partition the vector space into 2^k_sim clusters. Each vector is assigned - to a cluster based on which side of k_sim random hyperplanes it falls on. - - Attributes: - k_sim (int): Number of SimHash functions (hyperplanes) - dim (int): Dimensionality of input vectors - simhash_vectors (np.ndarray): Random hyperplane normal vectors of shape (dim, k_sim) - """ - - def __init__(self, k_sim: int, dim: int, random_generator: np.random.Generator): - """ - Initialize SimHash projection with random hyperplanes. - - Args: - k_sim (int): Number of SimHash functions, determines 2^k_sim clusters - dim (int): Dimensionality of input vectors - random_generator (np.random.Generator): Random number generator for reproducibility - """ - self.k_sim = k_sim - self.dim = dim - # Generate k_sim random hyperplanes (normal vectors) from standard normal distribution - self.simhash_vectors = random_generator.normal(size=(dim, k_sim)) - - def get_cluster_ids(self, vectors: np.ndarray) -> np.ndarray: - """ - Compute the cluster IDs for a given vector using SimHash. - - The cluster ID is determined by computing the dot product of the vector - with each hyperplane normal vector, taking the sign, and interpreting - the resulting binary string as an integer. 
- - Args: - vectors (np.ndarray): Input vectors of shape (n, dim,) - - Returns: - np.ndarray: Cluster IDs in range [0, 2^k_sim - 1] - - Raises: - AssertionError: If a vector shape doesn't match expected dimensionality - """ - dot_product = ( - vectors @ self.simhash_vectors - ) # (token_num, dim) x (dim, k_sim) -> (token_num, k_sim) - cluster_ids = (dot_product > 0) @ (1 << np.arange(self.k_sim)) - return cluster_ids - - -class Muvera: - """ - MUVERA (Multi-Vector Retrieval Architecture) algorithm implementation. - - This class creates Fixed Dimensional Encodings (FDEs) from variable-length - sequences of vectors by using SimHash clustering and random projections. - The process involves: - 1. Clustering vectors using multiple SimHash projections - 2. Computing cluster centers (with different strategies for docs vs queries) - 3. Applying random projections for dimensionality reduction - 4. Concatenating results from all projections - - Attributes: - k_sim (int): Number of SimHash functions per projection - dim (int): Input vector dimensionality - dim_proj (int): Output dimensionality after random projection - r_reps (int): Number of random projection repetitions - random_seed (int): Random seed for consistent random matrix generation - simhash_projections (List[SimHashProjection]): SimHash instances for clustering - dim_reduction_projections (np.ndarray): Random projection matrices of shape (R_reps, d, d_proj) - """ - - def __init__( - self, - dim: int, - k_sim: int = 5, - dim_proj: int = 16, - r_reps: int = 20, - random_seed: int = 42, - ): - """ - Initialize MUVERA algorithm with specified parameters. - - Args: - dim (int): Dimensionality of individual input vectors - k_sim (int, optional): Number of SimHash functions (creates 2^k_sim clusters). - Defaults to 5. - dim_proj (int, optional): Dimensionality after random projection (must be <= dim). - Defaults to 16. - r_reps (int, optional): Number of random projection repetitions for robustness. - Defaults to 20. 
- random_seed (int, optional): Seed for random number generator to ensure - reproducible results. Defaults to 42. - - Raises: - ValueError: If dim_proj > dim (cannot project to higher dimensionality) - """ - if dim_proj > dim: - raise ValueError( - f"Cannot project to a higher dimensionality (dim_proj={dim_proj} > dim={dim})" - ) - - self.k_sim = k_sim - self.dim = dim - self.dim_proj = dim_proj - self.r_reps = r_reps - # Create r_reps independent SimHash projections for robustness - generator = np.random.default_rng(random_seed) - self.simhash_projections = [ - SimHashProjection(k_sim=self.k_sim, dim=self.dim, random_generator=generator) - for _ in range(r_reps) - ] - # Random projection matrices with entries from {-1, +1} for each repetition - self.dim_reduction_projections = generator.choice([-1, 1], size=(r_reps, dim, dim_proj)) - - @classmethod - def from_multivector_model( - cls, - model: MultiVectorModel, - k_sim: int = 5, - dim_proj: int = 16, - r_reps: int = 20, # noqa[naming] - random_seed: int = 42, - ) -> "Muvera": - """ - Create a Muvera instance from a multi-vector embedding model. - - This class method provides a convenient way to initialize a MUVERA - that is compatible with a given multi-vector model by automatically extracting - the embedding dimensionality from the model. - - Args: - model (MultiVectorModel): A late interaction text or multimodal embedding model - that provides multi-vector embeddings. Must have an - `embedding_size` attribute specifying the dimensionality - of individual vectors. - k_sim (int, optional): Number of SimHash functions (creates 2^k_sim clusters). - Defaults to 5. - dim_proj (int, optional): Dimensionality after random projection (must be <= model's - embedding_size). Defaults to 16. - r_reps (int, optional): Number of random projection repetitions for robustness. - Defaults to 20. - random_seed (int, optional): Seed for random number generator to ensure - reproducible results. Defaults to 42. 
- - Returns: - Muvera: A configured MUVERA instance ready to process embeddings from the given model. - - Raises: - ValueError: If dim_proj > model.embedding_size (cannot project to higher dimensionality) - - Example: - >>> from fastembed import LateInteractionTextEmbedding - >>> model = LateInteractionTextEmbedding(model_name="colbert-ir/colbertv2.0") - >>> muvera = Muvera.from_multivector_model( - ... model=model, - ... k_sim=6, - ... dim_proj=32 - ... ) - >>> # Now use postprocessor with embeddings from the model - >>> embeddings = np.array(list(model.embed(["sample text"]))) - >>> fde = muvera.process_document(embeddings[0]) - """ - return cls( - dim=model.embedding_size, - k_sim=k_sim, - dim_proj=dim_proj, - r_reps=r_reps, - random_seed=random_seed, - ) - - def _get_output_dimension(self) -> int: - """ - Get the output dimension of the MUVERA algorithm. - - Returns: - int: Output dimension (r_reps * num_partitions * dim_proj) where b = 2^k_sim - """ - num_partitions = 2**self.k_sim - return self.r_reps * num_partitions * self.dim_proj - - @property - def embedding_size(self) -> int: - return self._get_output_dimension() - - def process_document(self, vectors: NumpyArray) -> NumpyArray: - """ - Encode a document's vectors into a Fixed Dimensional Encoding (FDE). - - Uses document-specific settings: normalizes cluster centers by vector count - and fills empty clusters using Hamming distance-based selection. - - Args: - vectors (NumpyArray): Document vectors of shape (n_tokens, dim) - - Returns: - NumpyArray: Fixed dimensional encodings of shape (r_reps * b * dim_proj,) - """ - return self.process(vectors, fill_empty_clusters=True, normalize_by_count=True) - - def process_query(self, vectors: NumpyArray) -> NumpyArray: - """ - Encode a query's vectors into a Fixed Dimensional Encoding (FDE). - - Uses query-specific settings: no normalization by count and no empty - cluster filling to preserve query vector magnitudes. 
- - Args: - vectors (NumpyArray]): Query vectors of shape (n_tokens, dim) - - Returns: - NumpyArray: Fixed dimensional encoding of shape (r_reps * b * dim_proj,) - """ - return self.process(vectors, fill_empty_clusters=False, normalize_by_count=False) - - def process( - self, - vectors: NumpyArray, - fill_empty_clusters: bool = True, - normalize_by_count: bool = True, - ) -> NumpyArray: - """ - Core encoding method that transforms variable-length vector sequences into FDEs. - - The encoding process: - 1. For each of r_reps random projections: - a. Assign vectors to clusters using SimHash - b. Compute cluster centers (sum of vectors in each cluster) - c. Optionally normalize by cluster size - d. Fill empty clusters using Hamming distance if requested - e. Apply random projection for dimensionality reduction - f. Flatten cluster centers into a vector - 2. Concatenate all projection results - - Args: - vectors (np.ndarray): Input vectors of shape (n_vectors, dim) - fill_empty_clusters (bool): Whether to fill empty clusters using nearest - vectors based on Hamming distance of cluster IDs - normalize_by_count (bool): Whether to normalize cluster centers by the - number of vectors assigned to each cluster - - Returns: - np.ndarray: Fixed dimensional encoding of shape (r_reps * b * dim_proj) - where B = 2^k_sim is the number of clusters - - Raises: - AssertionError: If input vectors don't have expected dimensionality - """ - assert ( - vectors.shape[1] == self.dim - ), f"Expected vectors of shape (n, {self.dim}), got {vectors.shape}" - - # Store results from each random projection - output_vectors = [] - - # num of space partitions in SimHash - num_partitions = 2**self.k_sim - cluster_center_ids = np.arange(num_partitions) - precomputed_hamming_matrix = ( - hamming_distance_matrix(cluster_center_ids) if fill_empty_clusters else None - ) - - for projection_index, simhash in enumerate(self.simhash_projections): - # Initialize cluster centers and count vectors assigned to 
each cluster - cluster_centers = np.zeros((num_partitions, self.dim)) - cluster_center_id_to_vectors: dict[int, list[int]] = { - cluster_center_id: [] for cluster_center_id in cluster_center_ids - } - cluster_vector_counts = None - empty_mask = None - - # Assign each vector to its cluster and accumulate cluster centers - vector_cluster_ids = simhash.get_cluster_ids(vectors) - for cluster_id, (vec_idx, vec) in zip(vector_cluster_ids, enumerate(vectors)): - cluster_centers[cluster_id] += vec - cluster_center_id_to_vectors[cluster_id].append(vec_idx) - - if normalize_by_count or fill_empty_clusters: - cluster_vector_counts = np.bincount(vector_cluster_ids, minlength=num_partitions) - empty_mask = cluster_vector_counts == 0 - - if normalize_by_count: - assert empty_mask is not None - assert cluster_vector_counts is not None - non_empty_mask = ~empty_mask - cluster_centers[non_empty_mask] /= cluster_vector_counts[non_empty_mask][:, None] - - # Fill empty clusters using vectors with minimum Hamming distance - if fill_empty_clusters: - assert empty_mask is not None - assert precomputed_hamming_matrix is not None - masked_hamming = np.where( - empty_mask[None, :], MAX_HAMMING_DISTANCE, precomputed_hamming_matrix - ) - nearest_non_empty = np.argmin(masked_hamming, axis=1) - fill_vectors = np.array( - [ - vectors[cluster_center_id_to_vectors[cluster_id][0]] - for cluster_id in nearest_non_empty[empty_mask] - ] - ).reshape(-1, self.dim) - cluster_centers[empty_mask] = fill_vectors - - # Apply random projection for dimensionality reduction if needed - if self.dim_proj < self.dim: - dim_reduction_projection = self.dim_reduction_projections[ - projection_index - ] # Get projection matrix for this repetition - projected_centers = (1 / np.sqrt(self.dim_proj)) * ( - cluster_centers @ dim_reduction_projection - ) - - # Flatten cluster centers into a single vector and add to output - output_vectors.append(projected_centers.flatten()) - continue - - # If no projection needed (dim_proj 
== dim), use original cluster centers - output_vectors.append(cluster_centers.flatten()) - - # Concatenate results from all R_reps projections into final FDE - return np.concatenate(output_vectors) - - -if __name__ == "__main__": - v_arrs = np.random.randn(10, 100, 128) - muvera = Muvera(128, 4, 8, 20, 42) - - for v_arr in v_arrs: - muvera.process(v_arr) # type: ignore diff --git a/fastembed/rerank/cross_encoder/__init__.py b/fastembed/rerank/cross_encoder/__init__.py deleted file mode 100644 index 23c1e3591..000000000 --- a/fastembed/rerank/cross_encoder/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from fastembed.rerank.cross_encoder.text_cross_encoder import TextCrossEncoder - -__all__ = ["TextCrossEncoder"] diff --git a/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py b/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py deleted file mode 100644 index fc1f6e964..000000000 --- a/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import Sequence, Any - -from fastembed.common import OnnxProvider -from fastembed.common.model_description import BaseModelDescription -from fastembed.common.types import Device -from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder - - -class CustomTextCrossEncoder(OnnxTextCrossEncoder): - SUPPORTED_MODELS: list[BaseModelDescription] = [] - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - super().__init__( - model_name=model_name, - cache_dir=cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - device_id=device_id, - specific_model_path=specific_model_path, - **kwargs, - 
) - - @classmethod - def _list_supported_models(cls) -> list[BaseModelDescription]: - return cls.SUPPORTED_MODELS - - @classmethod - def add_model( - cls, - model_description: BaseModelDescription, - ) -> None: - cls.SUPPORTED_MODELS.append(model_description) diff --git a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py deleted file mode 100644 index 212315513..000000000 --- a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +++ /dev/null @@ -1,239 +0,0 @@ -from typing import Any, Iterable, Sequence, Type - -from loguru import logger - -from fastembed.common import OnnxProvider -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import Device -from fastembed.common.utils import define_cache_dir -from fastembed.rerank.cross_encoder.onnx_text_model import ( - OnnxCrossEncoderModel, - TextRerankerWorker, -) -from fastembed.rerank.cross_encoder.text_cross_encoder_base import TextCrossEncoderBase -from fastembed.common.model_description import BaseModelDescription, ModelSource - -supported_onnx_models: list[BaseModelDescription] = [ - BaseModelDescription( - model="Xenova/ms-marco-MiniLM-L-6-v2", - description="MiniLM-L-6-v2 model optimized for re-ranking tasks.", - license="apache-2.0", - size_in_GB=0.08, - sources=ModelSource(hf="Xenova/ms-marco-MiniLM-L-6-v2"), - model_file="onnx/model.onnx", - ), - BaseModelDescription( - model="Xenova/ms-marco-MiniLM-L-12-v2", - description="MiniLM-L-12-v2 model optimized for re-ranking tasks.", - license="apache-2.0", - size_in_GB=0.12, - sources=ModelSource(hf="Xenova/ms-marco-MiniLM-L-12-v2"), - model_file="onnx/model.onnx", - ), - BaseModelDescription( - model="BAAI/bge-reranker-base", - description="BGE reranker base model for cross-encoder re-ranking.", - license="mit", - size_in_GB=1.04, - sources=ModelSource(hf="BAAI/bge-reranker-base"), - model_file="onnx/model.onnx", - ), - BaseModelDescription( - 
model="jinaai/jina-reranker-v1-tiny-en", - description="Designed for blazing-fast re-ranking with 8K context length and fewer parameters than jina-reranker-v1-turbo-en.", - license="apache-2.0", - size_in_GB=0.13, - sources=ModelSource(hf="jinaai/jina-reranker-v1-tiny-en"), - model_file="onnx/model.onnx", - ), - BaseModelDescription( - model="jinaai/jina-reranker-v1-turbo-en", - description="Designed for blazing-fast re-ranking with 8K context length.", - license="apache-2.0", - size_in_GB=0.15, - sources=ModelSource(hf="jinaai/jina-reranker-v1-turbo-en"), - model_file="onnx/model.onnx", - ), - BaseModelDescription( - model="jinaai/jina-reranker-v2-base-multilingual", - description="A multi-lingual reranker model for cross-encoder re-ranking with 1K context length and sliding window", - license="cc-by-nc-4.0", - size_in_GB=1.11, - sources=ModelSource(hf="jinaai/jina-reranker-v2-base-multilingual"), - model_file="onnx/model.onnx", - ), -] - - -class OnnxTextCrossEncoder(TextCrossEncoderBase, OnnxCrossEncoderModel): - @classmethod - def _list_supported_models(cls) -> list[BaseModelDescription]: - """Lists the supported models. - - Returns: - list[BaseModelDescription]: A list of BaseModelDescription objects containing the model information. - """ - return supported_onnx_models - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. 
Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. - Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device.AUTO. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. - lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. - device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else - - Raises: - ValueError: If the model_name is not in the format / e.g. Xenova/ms-marco-MiniLM-L-6-v2. - """ - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - if self.device_ids is not None and len(self.device_ids) > 1: - logger.warning( - "Parallel execution is currently not supported for cross encoders, " - f"only the first device will be used for inference: {self.device_ids[0]}." 
- ) - - # This device_id will be used if we need to load model in current process - self.device_id: int | None = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - if not self.lazy_load: - self.load_onnx_model() - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - def rerank( - self, - query: str, - documents: Iterable[str], - batch_size: int = 64, - **kwargs: Any, - ) -> Iterable[float]: - """Reranks documents based on their relevance to a given query. - - Args: - query (str): The query string to which document relevance is calculated. - documents (Iterable[str]): Iterable of documents to be reranked. - batch_size (int, optional): The number of documents processed in each batch. Higher batch sizes improve speed - but require more memory. Default is 64. - Returns: - Iterable[float]: An iterable of relevance scores for each document. 
- """ - - yield from self._rerank_documents( - query=query, documents=documents, batch_size=batch_size, **kwargs - ) - - def rerank_pairs( - self, - pairs: Iterable[tuple[str, str]], - batch_size: int = 64, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[float]: - yield from self._rerank_pairs( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - pairs=pairs, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - @classmethod - def _get_worker_class(cls) -> Type[TextRerankerWorker]: - return TextCrossEncoderWorker - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[float]: - return (float(elem) for elem in output.model_output) - - def token_count( - self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **kwargs: Any - ) -> int: - """Returns the number of tokens in the pairs. 
- - Args: - pairs: Iterable of tuples, where each tuple contains a query and a document to be tokenized - batch_size: Batch size for tokenizing - - Returns: - token count: overall number of tokens in the pairs - """ - return self._token_count(pairs, batch_size=batch_size, **kwargs) - - -class TextCrossEncoderWorker(TextRerankerWorker): - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxTextCrossEncoder: - return OnnxTextCrossEncoder( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py deleted file mode 100644 index 55f3ea85c..000000000 --- a/fastembed/rerank/cross_encoder/onnx_text_model.py +++ /dev/null @@ -1,204 +0,0 @@ -import os -from multiprocessing import get_all_start_methods -from pathlib import Path -from typing import Any, Iterable, Sequence, Type - -import numpy as np -from tokenizers import Encoding - -from fastembed.common.onnx_model import ( - EmbeddingWorker, - OnnxModel, - OnnxOutputContext, - OnnxProvider, -) -from fastembed.common.types import NumpyArray, Device -from fastembed.common.preprocessor_utils import load_tokenizer -from fastembed.common.utils import iter_batch -from fastembed.parallel_processor import ParallelWorkerPool - - -class OnnxCrossEncoderModel(OnnxModel[float]): - ONNX_OUTPUT_NAMES: list[str] | None = None - - @classmethod - def _get_worker_class(cls) -> Type["TextRerankerWorker"]: - raise NotImplementedError("Subclasses must implement this method") - - def _load_onnx_model( - self, - model_dir: Path, - model_file: str, - threads: int | None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_id: int | None = None, - extra_session_options: dict[str, Any] | None = None, - ) -> None: - super()._load_onnx_model( - model_dir=model_dir, - model_file=model_file, - threads=threads, - providers=providers, - 
cuda=cuda, - device_id=device_id, - extra_session_options=extra_session_options, - ) - self.tokenizer, _ = load_tokenizer(model_dir=model_dir) - assert self.tokenizer is not None - - def tokenize(self, pairs: list[tuple[str, str]], **_: Any) -> list[Encoding]: - return self.tokenizer.encode_batch(pairs) # type: ignore[union-attr] - - def _build_onnx_input(self, tokenized_input: list[Encoding]) -> dict[str, NumpyArray]: - input_names: set[str] = {node.name for node in self.model.get_inputs()} # type: ignore[union-attr] - inputs: dict[str, NumpyArray] = { - "input_ids": np.array([enc.ids for enc in tokenized_input], dtype=np.int64), - } - if "token_type_ids" in input_names: - inputs["token_type_ids"] = np.array( - [enc.type_ids for enc in tokenized_input], dtype=np.int64 - ) - if "attention_mask" in input_names: - inputs["attention_mask"] = np.array( - [enc.attention_mask for enc in tokenized_input], dtype=np.int64 - ) - return inputs - - def onnx_embed(self, query: str, documents: list[str], **kwargs: Any) -> OnnxOutputContext: - pairs = [(query, doc) for doc in documents] - return self.onnx_embed_pairs(pairs, **kwargs) - - def onnx_embed_pairs(self, pairs: list[tuple[str, str]], **kwargs: Any) -> OnnxOutputContext: - tokenized_input = self.tokenize(pairs, **kwargs) - inputs = self._build_onnx_input(tokenized_input) - onnx_input = self._preprocess_onnx_input(inputs, **kwargs) - outputs = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) # type: ignore[union-attr] - relevant_output = outputs[0] - scores: NumpyArray = relevant_output[:, 0] - return OnnxOutputContext(model_output=scores) - - def _rerank_documents( - self, query: str, documents: Iterable[str], batch_size: int, **kwargs: Any - ) -> Iterable[float]: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - for batch in iter_batch(documents, batch_size): - yield from self._post_process_onnx_output(self.onnx_embed(query, batch, **kwargs)) - - def _rerank_pairs( - self, - model_name: 
str, - cache_dir: str, - pairs: Iterable[tuple[str, str]], - batch_size: int, - parallel: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - local_files_only: bool = False, - specific_model_path: str | None = None, - extra_session_options: dict[str, Any] | None = None, - **kwargs: Any, - ) -> Iterable[float]: - is_small = False - - if isinstance(pairs, tuple): - pairs = [pairs] - is_small = True - - if isinstance(pairs, list): - if len(pairs) < batch_size: - is_small = True - - if parallel is None or is_small: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - for batch in iter_batch(pairs, batch_size): - yield from self._post_process_onnx_output(self.onnx_embed_pairs(batch, **kwargs)) - else: - if parallel == 0: - parallel = os.cpu_count() - - start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" - params = { - "model_name": model_name, - "cache_dir": cache_dir, - "providers": providers, - "local_files_only": local_files_only, - "specific_model_path": specific_model_path, - **kwargs, - } - - if extra_session_options is not None: - params.update(extra_session_options) - - pool = ParallelWorkerPool( - num_workers=parallel or 1, - worker=self._get_worker_class(), - cuda=cuda, - device_ids=device_ids, - start_method=start_method, - ) - for batch in pool.ordered_map(iter_batch(pairs, batch_size), **params): - yield from self._post_process_onnx_output(batch) # type: ignore - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[float]: - """Post-process the ONNX model output to convert it into a usable format. - - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - **kwargs: Additional keyword arguments that may be needed by specific implementations. - - Returns: - Iterable[float]: Post-processed output as an iterable of float values. 
- """ - raise NotImplementedError("Subclasses must implement this method") - - def _preprocess_onnx_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Preprocess the onnx input. - """ - return onnx_input - - def _token_count( - self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **_: Any - ) -> int: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() # loads the tokenizer as well - - token_num = 0 - assert self.tokenizer is not None - for batch in iter_batch(pairs, batch_size): - for tokens in self.tokenizer.encode_batch(batch): - token_num += sum(tokens.attention_mask) - - return token_num - - -class TextRerankerWorker(EmbeddingWorker[float]): - def __init__( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ): - self.model: OnnxCrossEncoderModel - super().__init__(model_name, cache_dir, **kwargs) - - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxCrossEncoderModel: - raise NotImplementedError() - - def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]: - for idx, batch in items: - onnx_output = self.model.onnx_embed_pairs(batch) - yield idx, onnx_output diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder.py b/fastembed/rerank/cross_encoder/text_cross_encoder.py deleted file mode 100644 index 6f98cb24a..000000000 --- a/fastembed/rerank/cross_encoder/text_cross_encoder.py +++ /dev/null @@ -1,178 +0,0 @@ -from typing import Any, Iterable, Sequence, Type -from dataclasses import asdict - -from fastembed.common import OnnxProvider -from fastembed.common.types import Device -from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder -from fastembed.rerank.cross_encoder.custom_text_cross_encoder import CustomTextCrossEncoder - -from fastembed.rerank.cross_encoder.text_cross_encoder_base import TextCrossEncoderBase -from fastembed.common.model_description 
import ( - ModelSource, - BaseModelDescription, -) - - -class TextCrossEncoder(TextCrossEncoderBase): - CROSS_ENCODER_REGISTRY: list[Type[TextCrossEncoderBase]] = [ - OnnxTextCrossEncoder, - CustomTextCrossEncoder, - ] - - @classmethod - def list_supported_models(cls) -> list[dict[str, Any]]: - """Lists the supported models. - - Returns: - list[BaseModelDescription]: A list of dictionaries containing the model information. - - Example: - ``` - [ - { - "model": "Xenova/ms-marco-MiniLM-L-6-v2", - "size_in_GB": 0.08, - "sources": { - "hf": "Xenova/ms-marco-MiniLM-L-6-v2", - }, - "model_file": "onnx/model.onnx", - "description": "MiniLM-L-6-v2 model optimized for re-ranking tasks.", - "license": "apache-2.0", - } - ] - ``` - """ - return [asdict(model) for model in cls._list_supported_models()] - - @classmethod - def _list_supported_models(cls) -> list[BaseModelDescription]: - result: list[BaseModelDescription] = [] - for encoder in cls.CROSS_ENCODER_REGISTRY: - result.extend(encoder._list_supported_models()) - return result - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, threads, **kwargs) - - for CROSS_ENCODER_TYPE in self.CROSS_ENCODER_REGISTRY: - supported_models = CROSS_ENCODER_TYPE._list_supported_models() - if any(model_name.lower() == model.model.lower() for model in supported_models): - self.model = CROSS_ENCODER_TYPE( - model_name=model_name, - cache_dir=cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - **kwargs, - ) - return - - raise ValueError( - f"Model {model_name} is not supported in TextCrossEncoder." 
- "Please check the supported models using `TextCrossEncoder.list_supported_models()`" - ) - - def rerank( - self, query: str, documents: Iterable[str], batch_size: int = 64, **kwargs: Any - ) -> Iterable[float]: - """Rerank a list of documents based on a query. - - Args: - query: Query to rerank the documents against - documents: Iterator of documents to rerank - batch_size: Batch size for reranking - - Returns: - Iterable of scores for each document - """ - yield from self.model.rerank(query, documents, batch_size=batch_size, **kwargs) - - def rerank_pairs( - self, - pairs: Iterable[tuple[str, str]], - batch_size: int = 64, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[float]: - """ - Rerank a list of query-document pairs. - - Args: - pairs (Iterable[tuple[str, str]]): An iterable of tuples, where each tuple contains a query and a document - to be scored together. - batch_size (int, optional): The number of query-document pairs to process in a single batch. Defaults to 64. - parallel (Optional[int], optional): The number of parallel processes to use for reranking. - If None, parallelization is disabled. Defaults to None. - **kwargs (Any): Additional arguments to pass to the underlying reranking model. - - Returns: - Iterable[float]: An iterable of scores corresponding to each query-document pair in the input. - Higher scores indicate a stronger match between the query and the document. 
- - Example: - >>> encoder = TextCrossEncoder("Xenova/ms-marco-MiniLM-L-6-v2") - >>> pairs = [("What is AI?", "Artificial intelligence is ..."), ("What is ML?", "Machine learning is ...")] - >>> scores = list(encoder.rerank_pairs(pairs)) - >>> print(list(map(lambda x: round(x, 2), scores))) - [-1.24, -10.6] - """ - yield from self.model.rerank_pairs( - pairs, batch_size=batch_size, parallel=parallel, **kwargs - ) - - @classmethod - def add_custom_model( - cls, - model: str, - sources: ModelSource, - model_file: str = "onnx/model.onnx", - description: str = "", - license: str = "", - size_in_gb: float = 0.0, - additional_files: list[str] | None = None, - ) -> None: - registered_models = cls._list_supported_models() - for registered_model in registered_models: - if model == registered_model.model: - raise ValueError( - f"Model {model} is already registered in CrossEncoderModel, if you still want to add this model, " - f"please use another model name" - ) - - CustomTextCrossEncoder.add_model( - BaseModelDescription( - model=model, - sources=sources, - model_file=model_file, - description=description, - license=license, - size_in_GB=size_in_gb, - additional_files=additional_files or [], - ) - ) - - def token_count( - self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **kwargs: Any - ) -> int: - """Returns the number of tokens in the pairs. 
- - Args: - pairs: Iterable of tuples, where each tuple contains a query and a document to be tokenized - batch_size: Batch size for tokenizing - - Returns: - token count: overall number of tokens in the pairs - """ - return self.model.token_count(pairs, batch_size=batch_size, **kwargs) diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder_base.py b/fastembed/rerank/cross_encoder/text_cross_encoder_base.py deleted file mode 100644 index 6c2660b08..000000000 --- a/fastembed/rerank/cross_encoder/text_cross_encoder_base.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import Any, Iterable - -from fastembed.common.model_description import BaseModelDescription -from fastembed.common.model_management import ModelManagement - - -class TextCrossEncoderBase(ModelManagement[BaseModelDescription]): - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - **kwargs: Any, - ): - self.model_name = model_name - self.cache_dir = cache_dir - self.threads = threads - self._local_files_only = kwargs.pop("local_files_only", False) - - def rerank( - self, - query: str, - documents: Iterable[str], - batch_size: int = 64, - **kwargs: Any, - ) -> Iterable[float]: - """Rerank a list of documents given a query. - - Args: - query (str): The query to rerank the documents. - documents (Iterable[str]): The list of texts to rerank. - batch_size (int): The batch size to use for reranking. - **kwargs: Additional keyword argument to pass to the rerank method. - - Yields: - Iterable[float]: The scores of the reranked the documents. - """ - raise NotImplementedError("This method should be overridden by subclasses") - - def rerank_pairs( - self, - pairs: Iterable[tuple[str, str]], - batch_size: int = 64, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[float]: - """Rerank query-document pairs. - Args: - pairs (Iterable[tuple[str, str]]): Query-document pairs to rerank - batch_size (int): The batch size to use for reranking. 
- parallel: parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - **kwargs: Additional keyword argument to pass to the rerank method. - Yields: - Iterable[float]: Scores for each individual pair - """ - raise NotImplementedError("This method should be overridden by subclasses") - - def token_count(self, pairs: Iterable[tuple[str, str]], **kwargs: Any) -> int: - """Returns the number of tokens in the pairs.""" - raise NotImplementedError("This method should be overridden by subclasses") diff --git a/fastembed/sparse/__init__.py b/fastembed/sparse/__init__.py deleted file mode 100644 index 93b65e217..000000000 --- a/fastembed/sparse/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from fastembed.sparse.sparse_embedding_base import SparseEmbedding -from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding - -__all__ = ["SparseEmbedding", "SparseTextEmbedding"] diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py deleted file mode 100644 index c31acd12c..000000000 --- a/fastembed/sparse/bm25.py +++ /dev/null @@ -1,359 +0,0 @@ -import os -from collections import defaultdict -from multiprocessing import get_all_start_methods -from pathlib import Path -from typing import Any, Iterable, Type - -import mmh3 -import numpy as np -from py_rust_stemmers import SnowballStemmer -from fastembed.common.utils import ( - define_cache_dir, - iter_batch, - get_all_punctuation, - remove_non_alphanumeric, -) -from fastembed.parallel_processor import ParallelWorkerPool, Worker -from fastembed.sparse.sparse_embedding_base import ( - SparseEmbedding, - SparseTextEmbeddingBase, -) -from fastembed.sparse.utils.tokenizer import SimpleTokenizer -from fastembed.common.model_description import SparseModelDescription, ModelSource - - -supported_languages = [ - "arabic", - "danish", - "dutch", - 
"english", - "finnish", - "french", - "german", - "greek", - "hungarian", - "italian", - "norwegian", - "portuguese", - "romanian", - "russian", - "spanish", - "swedish", - "tamil", - "turkish", -] - -supported_bm25_models: list[SparseModelDescription] = [ - SparseModelDescription( - model="Qdrant/bm25", - vocab_size=0, - description="BM25 as sparse embeddings meant to be used with Qdrant", - license="apache-2.0", - size_in_GB=0.01, - sources=ModelSource(hf="Qdrant/bm25"), - additional_files=[f"{lang}.txt" for lang in supported_languages], - requires_idf=True, - model_file="mock.file", - ), -] - - -class Bm25(SparseTextEmbeddingBase): - """Implements traditional BM25 in a form of sparse embeddings. - Uses a count of tokens in the document to evaluate the importance of the token. - - WARNING: This model is expected to be used with `modifier="idf"` in the sparse vector index of Qdrant. - - BM25 formula: - - score(q, d) = SUM[ IDF(q_i) * (f(q_i, d) * (k + 1)) / (f(q_i, d) + k * (1 - b + b * (|d| / avg_len))) ], - - where IDF is the inverse document frequency, computed on Qdrant's side - f(q_i, d) is the term frequency of the token q_i in the document d - k, b, avg_len are hyperparameters, described below. - - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - k (float, optional): The k parameter in the BM25 formula. Defines the saturation of the term frequency. - I.e. defines how fast the moment when additional terms stop to increase the score. Defaults to 1.2. - b (float, optional): The b parameter in the BM25 formula. Defines the importance of the document length. - Defaults to 0.75. - avg_len (float, optional): The average length of the documents in the corpus. Defaults to 256.0. - language (str): Specifies the language for the stemmer. 
- disable_stemmer (bool): Disable the stemmer. - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. - """ - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - k: float = 1.2, - b: float = 0.75, - avg_len: float = 256.0, - language: str = "english", - token_max_length: int = 40, - disable_stemmer: bool = False, - specific_model_path: str | None = None, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, **kwargs) - - if language not in supported_languages: - raise ValueError(f"{language} language is not supported") - else: - self.language = language - - self.k = k - self.b = b - self.avg_len = avg_len - - model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - self.token_max_length = token_max_length - self.punctuation = set(get_all_punctuation()) - self.disable_stemmer = disable_stemmer - - if disable_stemmer: - self.stopwords: set[str] = set() - self.stemmer = None - else: - self.stopwords = set(self._load_stopwords(self._model_dir, self.language)) - self.stemmer = SnowballStemmer(language) - - self.tokenizer = SimpleTokenizer - - @classmethod - def _list_supported_models(cls) -> list[SparseModelDescription]: - """Lists the supported models. - - Returns: - list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information. 
- """ - return supported_bm25_models - - @classmethod - def _load_stopwords(cls, model_dir: Path, language: str) -> list[str]: - stopwords_path = model_dir / f"{language}.txt" - if not stopwords_path.exists(): - return [] - - with open(stopwords_path, "r") as f: - return f.read().splitlines() - - def _embed_documents( - self, - model_name: str, - cache_dir: str, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - local_files_only: bool = False, - specific_model_path: str | None = None, - ) -> Iterable[SparseEmbedding]: - is_small = False - - if isinstance(documents, str): - documents = [documents] - is_small = True - - if isinstance(documents, list): - if len(documents) < batch_size: - is_small = True - - if parallel is None or is_small: - for batch in iter_batch(documents, batch_size): - yield from self.raw_embed(batch) - else: - if parallel == 0: - parallel = os.cpu_count() - - start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" - params = { - "model_name": model_name, - "cache_dir": cache_dir, - "k": self.k, - "b": self.b, - "avg_len": self.avg_len, - "language": self.language, - "token_max_length": self.token_max_length, - "disable_stemmer": self.disable_stemmer, - "local_files_only": local_files_only, - "specific_model_path": specific_model_path, - } - pool = ParallelWorkerPool( - num_workers=parallel or 1, - worker=self._get_worker_class(), - start_method=start_method, - ) - for batch in pool.ordered_map(iter_batch(documents, batch_size), **params): - for record in batch: - yield record # type: ignore - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[SparseEmbedding]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. 
- - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - def _stem(self, tokens: list[str]) -> list[str]: - stemmed_tokens: list[str] = [] - for token in tokens: - lower_token = token.lower() - - if token in self.punctuation: - continue - - if lower_token in self.stopwords: - continue - - if len(token) > self.token_max_length: - continue - - stemmed_token = self.stemmer.stem_word(lower_token) if self.stemmer else lower_token - - if stemmed_token: - stemmed_tokens.append(stemmed_token) - return stemmed_tokens - - def raw_embed( - self, - documents: list[str], - ) -> list[SparseEmbedding]: - embeddings: list[SparseEmbedding] = [] - for document in documents: - document = remove_non_alphanumeric(document) - tokens = self.tokenizer.tokenize(document) - stemmed_tokens = self._stem(tokens) - token_id2value = self._term_frequency(stemmed_tokens) - embeddings.append(SparseEmbedding.from_dict(token_id2value)) - return embeddings - - def token_count(self, texts: str | Iterable[str], **kwargs: Any) -> int: - token_num = 0 - texts = [texts] if isinstance(texts, str) else texts - for text in texts: - document = remove_non_alphanumeric(text) - tokens = self.tokenizer.tokenize(document) - token_num += len(tokens) - return token_num - - def _term_frequency(self, tokens: list[str]) -> dict[int, float]: - """Calculate the term 
frequency part of the BM25 formula. - - ( - f(q_i, d) * (k + 1) - ) / ( - f(q_i, d) + k * (1 - b + b * (|d| / avg_len)) - ) - - Args: - tokens (list[str]): The list of tokens in the document. - - Returns: - dict[int, float]: The token_id to term frequency mapping. - """ - tf_map: dict[int, float] = {} - counter: defaultdict[str, int] = defaultdict(int) - for stemmed_token in tokens: - counter[stemmed_token] += 1 - - doc_len = len(tokens) - for stemmed_token in counter: - token_id = self.compute_token_id(stemmed_token) - num_occurrences = counter[stemmed_token] - tf_map[token_id] = num_occurrences * (self.k + 1) - tf_map[token_id] /= num_occurrences + self.k * ( - 1 - self.b + self.b * doc_len / self.avg_len - ) - return tf_map - - @classmethod - def compute_token_id(cls, token: str) -> int: - return abs(mmh3.hash(token)) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[SparseEmbedding]: - """To emulate BM25 behaviour, we don't need to use weights in the query, and - it's enough to just hash the tokens and assign a weight of 1.0 to them. 
- """ - if isinstance(query, str): - query = [query] - - for text in query: - text = remove_non_alphanumeric(text) - tokens = self.tokenizer.tokenize(text) - stemmed_tokens = self._stem(tokens) - token_ids = np.array( - list(set(self.compute_token_id(token) for token in stemmed_tokens)), - dtype=np.int32, - ) - values = np.ones_like(token_ids) - yield SparseEmbedding(indices=token_ids, values=values) - - @classmethod - def _get_worker_class(cls) -> Type["Bm25Worker"]: - return Bm25Worker - - -class Bm25Worker(Worker): - def __init__( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ): - self.model = self.init_embedding(model_name, cache_dir, **kwargs) - - @classmethod - def start(cls, model_name: str, cache_dir: str, **kwargs: Any) -> "Bm25Worker": - return cls(model_name=model_name, cache_dir=cache_dir, **kwargs) - - def process( - self, items: Iterable[tuple[int, Any]] - ) -> Iterable[tuple[int, list[SparseEmbedding]]]: - for idx, batch in items: - onnx_output = self.model.raw_embed(batch) - yield idx, onnx_output - - @staticmethod - def init_embedding(model_name: str, cache_dir: str, **kwargs: Any) -> Bm25: - return Bm25(model_name=model_name, cache_dir=cache_dir, **kwargs) diff --git a/fastembed/sparse/bm42.py b/fastembed/sparse/bm42.py deleted file mode 100644 index 2b090f749..000000000 --- a/fastembed/sparse/bm42.py +++ /dev/null @@ -1,369 +0,0 @@ -import math -import string -from pathlib import Path -from typing import Any, Iterable, Sequence, Type - -import mmh3 -import numpy as np -from py_rust_stemmers import SnowballStemmer - -from fastembed.common import OnnxProvider -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import Device -from fastembed.common.utils import define_cache_dir -from fastembed.sparse.sparse_embedding_base import ( - SparseEmbedding, - SparseTextEmbeddingBase, -) -from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker -from fastembed.common.model_description 
import SparseModelDescription, ModelSource - -supported_bm42_models: list[SparseModelDescription] = [ - SparseModelDescription( - model="Qdrant/bm42-all-minilm-l6-v2-attentions", - vocab_size=30522, - description="Light sparse embedding model, which assigns an importance score to each token in the text", - license="apache-2.0", - size_in_GB=0.09, - sources=ModelSource(hf="Qdrant/all_miniLM_L6_v2_with_attentions"), - model_file="model.onnx", - additional_files=["stopwords.txt"], - requires_idf=True, - ), -] - - -_MODEL_TO_LANGUAGE = { - "Qdrant/bm42-all-minilm-l6-v2-attentions": "english", -} -MODEL_TO_LANGUAGE = { - model_name.lower(): language for model_name, language in _MODEL_TO_LANGUAGE.items() -} - - -def get_language_by_model_name(model_name: str) -> str: - return MODEL_TO_LANGUAGE[model_name.lower()] - - -class Bm42(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]): - """ - Bm42 is an extension of BM25, which tries to better evaluate importance of tokens in the documents, - by extracting attention weights from the transformer model. - - Traditional BM25 uses a count of tokens in the document to evaluate the importance of the token, - but this approach doesn't work well with short documents or chunks of text, as almost all tokens - there are unique. - - BM42 addresses this issue by replacing the token count with the attention weights from the transformer model. - This allows sparse embeddings to work well with short documents, handle rare tokens and leverage traditional NLP - techniques like stemming and stopwords. - - WARNING: This model is expected to be used with `modifier="idf"` in the sparse vector index of Qdrant. 
- """ - - ONNX_OUTPUT_NAMES = ["attention_6"] - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - alpha: float = 0.5, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The providers to use for onnxruntime. - alpha (float, optional): Parameter, that defines the importance of the token weight in the document - versus the importance of the token frequency in the corpus. Defaults to 0.5, based on empirical testing. - It is recommended to only change this parameter based on training data for a specific dataset. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device.AUTO. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. - lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. - device_id (Optional[int], optional): The device id to use for loading the model in the worker process. 
- specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. - """ - - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - # This device_id will be used if we need to load model in current process - self.device_id: int | None = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - self.invert_vocab: dict[int, str] = {} - - self.special_tokens: set[str] = set() - self.special_tokens_ids: set[int] = set() - self.punctuation = set(string.punctuation) - self.stopwords = set(self._load_stopwords(self._model_dir)) - self.stemmer = SnowballStemmer(get_language_by_model_name(self.model_name)) - self.alpha = alpha - - if not self.lazy_load: - self.load_onnx_model() - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - for token, idx in self.tokenizer.get_vocab().items(): # type: ignore[union-attr] - self.invert_vocab[idx] = token - self.special_tokens = 
set(self.special_token_to_id.keys()) - self.special_tokens_ids = set(self.special_token_to_id.values()) - self.stopwords = set(self._load_stopwords(self._model_dir)) - - def _filter_pair_tokens(self, tokens: list[tuple[str, Any]]) -> list[tuple[str, Any]]: - result: list[tuple[str, Any]] = [] - for token, value in tokens: - if token in self.stopwords or token in self.punctuation: - continue - result.append((token, value)) - return result - - def _stem_pair_tokens(self, tokens: list[tuple[str, Any]]) -> list[tuple[str, Any]]: - result: list[tuple[str, Any]] = [] - for token, value in tokens: - processed_token = self.stemmer.stem_word(token) - result.append((processed_token, value)) - return result - - @classmethod - def _aggregate_weights( - cls, tokens: list[tuple[str, list[int]]], weights: list[float] - ) -> list[tuple[str, float]]: - result: list[tuple[str, float]] = [] - for token, idxs in tokens: - sum_weight = sum(weights[idx] for idx in idxs) - result.append((token, sum_weight)) - return result - - def _reconstruct_bpe( - self, bpe_tokens: Iterable[tuple[int, str]] - ) -> list[tuple[str, list[int]]]: - result: list[tuple[str, list[int]]] = [] - acc: str = "" - acc_idx: list[int] = [] - - continuing_subword_prefix = self.tokenizer.model.continuing_subword_prefix # type: ignore[union-attr] - continuing_subword_prefix_len = len(continuing_subword_prefix) - - for idx, token in bpe_tokens: - if token in self.special_tokens: - continue - - if token.startswith(continuing_subword_prefix): - acc += token[continuing_subword_prefix_len:] - acc_idx.append(idx) - else: - if acc: - result.append((acc, acc_idx)) - acc_idx = [] - acc = token - acc_idx.append(idx) - - if acc: - result.append((acc, acc_idx)) - - return result - - def _rescore_vector(self, vector: dict[str, float]) -> dict[int, float]: - """ - Orders all tokens in the vector by their importance and generates a new score based on the importance order. 
- So that the scoring doesn't depend on absolute values assigned by the model, but on the relative importance. - """ - - new_vector: dict[int, float] = {} - - for token, value in vector.items(): - token_id = abs(mmh3.hash(token)) - # Examples: - # Num 0: Log(1/1 + 1) = 0.6931471805599453 - # Num 1: Log(1/2 + 1) = 0.4054651081081644 - # Num 2: Log(1/3 + 1) = 0.28768207245178085 - new_vector[token_id] = math.log(1.0 + value) ** self.alpha # value - - return new_vector - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[SparseEmbedding]: - if output.input_ids is None: - raise ValueError("input_ids must be provided for document post-processing") - - token_ids_batch = output.input_ids.astype(int) - - # attention_value shape: (batch_size, num_heads, num_tokens, num_tokens) - pooled_attention = np.mean(output.model_output[:, :, 0], axis=1) * output.attention_mask - - for document_token_ids, attention_value in zip(token_ids_batch, pooled_attention): - document_tokens_with_ids = ( - (idx, self.invert_vocab[token_id]) - for idx, token_id in enumerate(document_token_ids) - ) - - reconstructed = self._reconstruct_bpe(document_tokens_with_ids) - - filtered = self._filter_pair_tokens(reconstructed) - - stemmed = self._stem_pair_tokens(filtered) - - weighted = self._aggregate_weights(stemmed, attention_value) - - max_token_weight: dict[str, float] = {} - - for token, weight in weighted: - max_token_weight[token] = max(max_token_weight.get(token, 0), weight) - - rescored = self._rescore_vector(max_token_weight) - - yield SparseEmbedding.from_dict(rescored) - - @classmethod - def _list_supported_models(cls) -> list[SparseModelDescription]: - """Lists the supported models. - - Returns: - list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information. 
- """ - return supported_bm42_models - - @classmethod - def _load_stopwords(cls, model_dir: Path) -> list[str]: - stopwords_path = model_dir / "stopwords.txt" - if not stopwords_path.exists(): - return [] - - with open(stopwords_path, "r") as f: - return f.read().splitlines() - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[SparseEmbedding]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - alpha=self.alpha, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - ) - - @classmethod - def _query_rehash(cls, tokens: Iterable[str]) -> dict[int, float]: - result: dict[int, float] = {} - for token in tokens: - token_id = abs(mmh3.hash(token)) - result[token_id] = 1.0 - return result - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[SparseEmbedding]: - """ - To emulate BM25 behaviour, we don't need to use smart weights in the query, and - it's enough to just hash the tokens and assign a weight of 1.0 to them. 
- It is also faster, as we don't need to run the model for the query. - """ - if isinstance(query, str): - query = [query] - - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - - for text in query: - encoded = self.tokenizer.encode(text) # type: ignore[union-attr] - document_tokens_with_ids = enumerate(encoded.tokens) - reconstructed = self._reconstruct_bpe(document_tokens_with_ids) - filtered = self._filter_pair_tokens(reconstructed) - stemmed = self._stem_pair_tokens(filtered) - - yield SparseEmbedding.from_dict(self._query_rehash(token for token, _ in stemmed)) - - @classmethod - def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]: - return Bm42TextEmbeddingWorker - - def token_count( - self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any - ) -> int: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() # loads the tokenizer as well - return self._token_count(texts, batch_size=batch_size, **kwargs) - - -class Bm42TextEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Bm42: - return Bm42( - model_name=model_name, - cache_dir=cache_dir, - **kwargs, - ) diff --git a/fastembed/sparse/minicoil.py b/fastembed/sparse/minicoil.py deleted file mode 100644 index 6f29c60fd..000000000 --- a/fastembed/sparse/minicoil.py +++ /dev/null @@ -1,372 +0,0 @@ -from pathlib import Path - -from typing import Any, Sequence, Iterable, Type - -import numpy as np -from numpy.typing import NDArray -from py_rust_stemmers import SnowballStemmer -from tokenizers import Tokenizer - -from fastembed.common.model_description import SparseModelDescription, ModelSource -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common import OnnxProvider -from fastembed.common.types import Device -from fastembed.common.utils import define_cache_dir -from fastembed.sparse.sparse_embedding_base import ( - 
SparseEmbedding, - SparseTextEmbeddingBase, -) -from fastembed.sparse.utils.minicoil_encoder import Encoder -from fastembed.sparse.utils.sparse_vectors_converter import SparseVectorConverter, WordEmbedding -from fastembed.sparse.utils.vocab_resolver import VocabResolver, VocabTokenizer -from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker - - -MINICOIL_MODEL_FILE = "minicoil.triplet.model.npy" -MINICOIL_VOCAB_FILE = "minicoil.triplet.model.vocab" -STOPWORDS_FILE = "stopwords.txt" - - -supported_minicoil_models: list[SparseModelDescription] = [ - SparseModelDescription( - model="Qdrant/minicoil-v1", - vocab_size=19125, - description="Sparse embedding model, that resolves semantic meaning of the words, " - "while keeping exact keyword match behavior. " - "Based on jinaai/jina-embeddings-v2-small-en-tokens", - license="apache-2.0", - size_in_GB=0.09, - sources=ModelSource(hf="Qdrant/minicoil-v1"), - model_file="onnx/model.onnx", - additional_files=[ - STOPWORDS_FILE, - MINICOIL_MODEL_FILE, - MINICOIL_VOCAB_FILE, - ], - requires_idf=True, - ), -] - -_MODEL_TO_LANGUAGE = { - "Qdrant/minicoil-v1": "english", -} -MODEL_TO_LANGUAGE = { - model_name.lower(): language for model_name, language in _MODEL_TO_LANGUAGE.items() -} - - -def get_language_by_model_name(model_name: str) -> str: - return MODEL_TO_LANGUAGE[model_name.lower()] - - -class MiniCOIL(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]): - """ - MiniCOIL is a sparse embedding model, that resolves semantic meaning of the words, - while keeping exact keyword match behavior. - - Each vocabulary token is converted into 4d component of a sparse vector, which is then weighted by the token frequency in the corpus. - If the token is not found in the corpus, it is treated exactly like in BM25. 
- ` - The model is based on `jinaai/jina-embeddings-v2-small-en-tokens` - """ - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - k: float = 1.2, - b: float = 0.75, - avg_len: float = 150.0, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The providers to use for onnxruntime. - k (float, optional): The k parameter in the BM25 formula. Defines the saturation of the term frequency. - I.e. defines how fast the moment when additional terms stop to increase the score. Defaults to 1.2. - b (float, optional): The b parameter in the BM25 formula. Defines the importance of the document length. - Defaults to 0.75. - avg_len (float, optional): The average length of the documents in the corpus. Defaults to 150.0. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device.AUTO. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. - lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. 
- device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. - """ - - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self.device_ids = device_ids - self.cuda = cuda - self.device_id = device_id - self._extra_session_options = self._select_exposed_session_options(kwargs) - - self.k = k - self.b = b - self.avg_len = avg_len - - # Initialize class attributes - self.tokenizer: Tokenizer | None = None - self.invert_vocab: dict[int, str] = {} - self.special_tokens: set[str] = set() - self.special_tokens_ids: set[int] = set() - self.stopwords: set[str] = set() - self.vocab_resolver: VocabResolver | None = None - self.encoder: Encoder | None = None - self.output_dim: int | None = None - self.sparse_vector_converter: SparseVectorConverter | None = None - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - if not self.lazy_load: - self.load_onnx_model() - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - assert self.tokenizer is not None - - for token, idx in self.tokenizer.get_vocab().items(): # type: ignore[union-attr] - self.invert_vocab[idx] = token - self.special_tokens = 
set(self.special_token_to_id.keys()) - self.special_tokens_ids = set(self.special_token_to_id.values()) - self.stopwords = set(self._load_stopwords(self._model_dir)) - - stemmer = SnowballStemmer(get_language_by_model_name(self.model_name)) - - self.vocab_resolver = VocabResolver( - tokenizer=VocabTokenizer(self.tokenizer), - stopwords=self.stopwords, - stemmer=stemmer, - ) - self.vocab_resolver.load_json_vocab(str(self._model_dir / MINICOIL_VOCAB_FILE)) - - weights = np.load(str(self._model_dir / MINICOIL_MODEL_FILE), mmap_mode="r") - self.encoder = Encoder(weights) - self.output_dim = self.encoder.output_dim - - self.sparse_vector_converter = SparseVectorConverter( - stopwords=self.stopwords, - stemmer=stemmer, - k=self.k, - b=self.b, - avg_len=self.avg_len, - ) - - def token_count( - self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any - ) -> int: - return self._token_count(texts, batch_size=batch_size, **kwargs) - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[SparseEmbedding]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. 
- - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - k=self.k, - b=self.b, - avg_len=self.avg_len, - is_query=False, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[SparseEmbedding]: - """ - Encode a list of queries into list of embeddings. - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=query, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - k=self.k, - b=self.b, - avg_len=self.avg_len, - is_query=True, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - **kwargs, - ) - - @classmethod - def _load_stopwords(cls, model_dir: Path) -> list[str]: - stopwords_path = model_dir / STOPWORDS_FILE - if not stopwords_path.exists(): - return [] - - with open(stopwords_path, "r") as f: - return f.read().splitlines() - - @classmethod - def _list_supported_models(cls) -> list[SparseModelDescription]: - """Lists the supported models. - - Returns: - list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information. 
- """ - return supported_minicoil_models - - def _post_process_onnx_output( - self, output: OnnxOutputContext, is_query: bool = False, **kwargs: Any - ) -> Iterable[SparseEmbedding]: - if output.input_ids is None: - raise ValueError("input_ids must be provided for document post-processing") - - assert self.vocab_resolver is not None - assert self.encoder is not None - assert self.sparse_vector_converter is not None - - # Size: (batch_size, sequence_length, hidden_size) - embeddings = output.model_output - # Size: (batch_size, sequence_length) - assert output.attention_mask is not None - masks = output.attention_mask - - vocab_size = self.vocab_resolver.vocab_size() - embedding_size = self.encoder.output_dim - - # For each document we only select those embeddings that are not masked out - - for i in range(embeddings.shape[0]): - # Size: (sequence_length, hidden_size) - token_embeddings = embeddings[i, masks[i] == 1] - - # Size: (sequence_length) - token_ids: NDArray[np.int64] = output.input_ids[i, masks[i] == 1] - - word_ids_array, counts, oov, forms = self.vocab_resolver.resolve_tokens(token_ids) - - # Size: (1, words) - word_ids_array_expanded: NDArray[np.int64] = np.expand_dims(word_ids_array, axis=0) - - # Size: (1, words, embedding_size) - token_embeddings_array: NDArray[np.float32] = np.expand_dims(token_embeddings, axis=0) - - assert word_ids_array_expanded.shape[1] == token_embeddings_array.shape[1] - - # Size of word_ids_mapping: (unique_words, 2) - [vocab_id, batch_id] - # Size of embeddings: (unique_words, embedding_size) - ids_mapping, minicoil_embeddings = self.encoder.forward( - word_ids_array_expanded, token_embeddings_array - ) - - # Size of counts: (unique_words) - words_ids: list[int] = ids_mapping[:, 0].tolist() # type: ignore[assignment] - - sentence_result: dict[str, WordEmbedding] = {} - - words = [self.vocab_resolver.lookup_word(word_id) for word_id in words_ids] - - for word, word_id, emb in zip(words, words_ids, 
minicoil_embeddings.tolist()): # type: ignore[arg-type] - if word_id == 0: - continue - - sentence_result[word] = WordEmbedding( - word=word, - forms=forms[word], - count=int(counts[word_id]), - word_id=int(word_id), - embedding=emb, # type: ignore[arg-type] - ) - - for oov_word, count in oov.items(): - # { - # "word": oov_word, - # "forms": [oov_word], - # "count": int(count), - # "word_id": -1, - # "embedding": [1] - # } - sentence_result[oov_word] = WordEmbedding( - word=oov_word, forms=[oov_word], count=int(count), word_id=-1, embedding=[1] - ) - - if not is_query: - yield self.sparse_vector_converter.embedding_to_vector( - sentence_result, vocab_size=vocab_size, embedding_size=embedding_size - ) - else: - yield self.sparse_vector_converter.embedding_to_vector_query( - sentence_result, vocab_size=vocab_size, embedding_size=embedding_size - ) - - @classmethod - def _get_worker_class(cls) -> Type["MiniCoilTextEmbeddingWorker"]: - return MiniCoilTextEmbeddingWorker - - -class MiniCoilTextEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> MiniCOIL: - return MiniCOIL( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/sparse/sparse_embedding_base.py b/fastembed/sparse/sparse_embedding_base.py deleted file mode 100644 index ea9a49075..000000000 --- a/fastembed/sparse/sparse_embedding_base.py +++ /dev/null @@ -1,90 +0,0 @@ -from dataclasses import dataclass -from typing import Iterable, Any - -import numpy as np -from numpy.typing import NDArray - -from fastembed.common.model_description import SparseModelDescription -from fastembed.common.types import NumpyArray -from fastembed.common.model_management import ModelManagement - - -@dataclass -class SparseEmbedding: - values: NumpyArray - indices: NDArray[np.int64] | NDArray[np.int32] - - def as_object(self) -> dict[str, NumpyArray]: - return { - "values": self.values, - "indices": 
self.indices, - } - - def as_dict(self) -> dict[int, float]: - return {int(i): float(v) for i, v in zip(self.indices, self.values)} # type: ignore - - @classmethod - def from_dict(cls, data: dict[int, float]) -> "SparseEmbedding": - if len(data) == 0: - return cls(values=np.array([]), indices=np.array([])) - indices, values = zip(*data.items()) - return cls(values=np.array(values), indices=np.array(indices)) - - -class SparseTextEmbeddingBase(ModelManagement[SparseModelDescription]): - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - **kwargs: Any, - ): - self.model_name = model_name - self.cache_dir = cache_dir - self.threads = threads - self._local_files_only = kwargs.pop("local_files_only", False) - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[SparseEmbedding]: - raise NotImplementedError() - - def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[SparseEmbedding]: - """ - Embeds a list of text passages into a list of embeddings. - - Args: - texts (Iterable[str]): The list of texts to embed. - **kwargs: Additional keyword argument to pass to the embed method. - - Yields: - Iterable[SparseEmbedding]: The sparse embeddings. - """ - - # This is model-specific, so that different models can have specialized implementations - yield from self.embed(texts, **kwargs) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[SparseEmbedding]: - """ - Embeds queries - - Args: - query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries. - - Returns: - Iterable[SparseEmbedding]: The sparse embeddings. 
- """ - - # This is model-specific, so that different models can have specialized implementations - if isinstance(query, str): - yield from self.embed([query], **kwargs) - else: - yield from self.embed(query, **kwargs) - - def token_count(self, texts: str | Iterable[str], **kwargs: Any) -> int: - """Returns the number of tokens in the texts.""" - raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed/sparse/sparse_text_embedding.py b/fastembed/sparse/sparse_text_embedding.py deleted file mode 100644 index 5b5c83085..000000000 --- a/fastembed/sparse/sparse_text_embedding.py +++ /dev/null @@ -1,143 +0,0 @@ -from typing import Any, Iterable, Sequence, Type -from dataclasses import asdict - -from fastembed.common import OnnxProvider -from fastembed.common.types import Device -from fastembed.sparse.bm25 import Bm25 -from fastembed.sparse.bm42 import Bm42 -from fastembed.sparse.minicoil import MiniCOIL -from fastembed.sparse.sparse_embedding_base import ( - SparseEmbedding, - SparseTextEmbeddingBase, -) -from fastembed.sparse.splade_pp import SpladePP -import warnings -from fastembed.common.model_description import SparseModelDescription - - -class SparseTextEmbedding(SparseTextEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[SparseTextEmbeddingBase]] = [SpladePP, Bm42, Bm25, MiniCOIL] - - @classmethod - def list_supported_models(cls) -> list[dict[str, Any]]: - """ - Lists the supported models. - - Returns: - list[dict[str, Any]]: A list of dictionaries containing the model information. 
- - Example: - ``` - [ - { - "model": "prithvida/SPLADE_PP_en_v1", - "vocab_size": 30522, - "description": "Independent Implementation of SPLADE++ Model for English", - "license": "apache-2.0", - "size_in_GB": 0.532, - "sources": { - "hf": "qdrant/SPLADE_PP_en_v1", - }, - } - ] - ``` - """ - return [asdict(model) for model in cls._list_supported_models()] - - @classmethod - def _list_supported_models(cls) -> list[SparseModelDescription]: - result: list[SparseModelDescription] = [] - for embedding in cls.EMBEDDINGS_REGISTRY: - result.extend(embedding._list_supported_models()) - return result - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, threads, **kwargs) - if model_name.lower() == "prithvida/Splade_PP_en_v1".lower(): - warnings.warn( - "The right spelling is prithivida/Splade_PP_en_v1. " - "Support of this name will be removed soon, please fix the model_name", - DeprecationWarning, - stacklevel=2, - ) - model_name = "prithivida/Splade_PP_en_v1" - - for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: - supported_models = EMBEDDING_MODEL_TYPE._list_supported_models() - if any(model_name.lower() == model.model.lower() for model in supported_models): - self.model = EMBEDDING_MODEL_TYPE( - model_name, - cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - **kwargs, - ) - return - - raise ValueError( - f"Model {model_name} is not supported in SparseTextEmbedding." 
- "Please check the supported models using `SparseTextEmbedding.list_supported_models()`" - ) - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[SparseEmbedding]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self.model.embed(documents, batch_size, parallel, **kwargs) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[SparseEmbedding]: - """ - Embeds queries - - Args: - query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries. - - Returns: - Iterable[SparseEmbedding]: The sparse embeddings. - """ - yield from self.model.query_embed(query, **kwargs) - - def token_count( - self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any - ) -> int: - """Returns the number of tokens in the texts. - - Args: - texts (str | Iterable[str]): The list of texts to embed. - batch_size (int): Batch size for encoding - - Returns: - int: Sum of number of tokens in the texts. 
- """ - return self.model.token_count(texts, batch_size=batch_size, **kwargs) diff --git a/fastembed/sparse/splade_pp.py b/fastembed/sparse/splade_pp.py deleted file mode 100644 index 562ebcd4b..000000000 --- a/fastembed/sparse/splade_pp.py +++ /dev/null @@ -1,196 +0,0 @@ -from typing import Any, Iterable, Sequence, Type - -import numpy as np -from fastembed.common import OnnxProvider -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import Device -from fastembed.common.utils import define_cache_dir -from fastembed.sparse.sparse_embedding_base import ( - SparseEmbedding, - SparseTextEmbeddingBase, -) -from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker -from fastembed.common.model_description import SparseModelDescription, ModelSource - -supported_splade_models: list[SparseModelDescription] = [ - SparseModelDescription( - model="prithivida/Splade_PP_en_v1", - vocab_size=30522, - description="Independent Implementation of SPLADE++ Model for English.", - license="apache-2.0", - size_in_GB=0.532, - sources=ModelSource(hf="Qdrant/Splade_PP_en_v1"), - model_file="model.onnx", - ), - SparseModelDescription( - model="prithvida/Splade_PP_en_v1", - vocab_size=30522, - description="Independent Implementation of SPLADE++ Model for English.", - license="apache-2.0", - size_in_GB=0.532, - sources=ModelSource(hf="Qdrant/Splade_PP_en_v1"), - model_file="model.onnx", - ), -] - - -class SpladePP(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]): - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[SparseEmbedding]: - if output.attention_mask is None: - raise ValueError("attention_mask must be provided for document post-processing") - - relu_log = np.log(1 + np.maximum(output.model_output, 0)) - - weighted_log = relu_log * np.expand_dims(output.attention_mask, axis=-1) - - scores = np.max(weighted_log, axis=1) - - # Score matrix of shape (batch_size, vocab_size) - # 
Most of the values are 0, only a few are non-zero - for row_scores in scores: - indices = row_scores.nonzero()[0] - scores = row_scores[indices] - yield SparseEmbedding(values=scores, indices=indices) - - def token_count( - self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any - ) -> int: - return self._token_count(texts, batch_size=batch_size, **kwargs) - - @classmethod - def _list_supported_models(cls) -> list[SparseModelDescription]: - """Lists the supported models. - - Returns: - list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information. - """ - return supported_splade_models - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. - Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. 
- lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. - device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. - """ - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - # This device_id will be used if we need to load model in current process - self.device_id: int | None = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - if not self.lazy_load: - self.load_onnx_model() - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[SparseEmbedding]: - """ - Encode a list of documents 
into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. - - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - @classmethod - def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]: - return SpladePPEmbeddingWorker - - -class SpladePPEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> SpladePP: - return SpladePP( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/sparse/utils/minicoil_encoder.py b/fastembed/sparse/utils/minicoil_encoder.py deleted file mode 100644 index fc70acd68..000000000 --- a/fastembed/sparse/utils/minicoil_encoder.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Pure numpy implementation of encoder model for a single word. - -This model is not trainable, and should only be used for inference. 
-""" - -import numpy as np -from fastembed.common.types import NumpyArray - - -class Encoder: - """ - Encoder(768, 4, 10000) - - Will look like this: - - - Per-word - Encoder Matrix - โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” - โ”‚ Token Embedding(768)โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ” (10k, 768, 4) - โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” - โ”‚ โ”‚ โ”‚ - โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”Œโ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ - โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ - โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”Œโ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” - โ””โ”€โ”€โ”€โ”€โ–บโ”‚ โ”‚ โ”‚ โ”œโ”€โ”€โ”€โ”€โ”€โ–บโ”‚Tanh โ”‚ - โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ”‚ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”˜ - โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”œโ”€โ”˜ - โ”‚ โ”‚ - โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ”‚ โ”‚ - โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - - Final linear transformation is accompanied by a non-linear activation function: Tanh. - - Tanh is used to ensure that the output is in the range [-1, 1]. - It would be easier to visually interpret the output of the model, assuming that each dimension - would need to encode a type of semantic cluster. 
- """ - - def __init__( - self, - weights: NumpyArray, - ): - self.weights = weights - self.vocab_size, self.input_dim, self.output_dim = weights.shape - - self.encoder_weights: NumpyArray = weights - - # Activation function - self.activation = np.tanh - - @staticmethod - def convert_vocab_ids(vocab_ids: NumpyArray) -> NumpyArray: - """ - Convert vocab_ids of shape (batch_size, seq_len) into (batch_size, seq_len, 2) - by appending batch_id alongside each vocab_id. - """ - batch_size, seq_len = vocab_ids.shape - batch_ids = np.arange(batch_size, dtype=vocab_ids.dtype).reshape(batch_size, 1) - batch_ids = np.repeat(batch_ids, seq_len, axis=1) - # Stack vocab_ids and batch_ids along the last dimension - combined: NumpyArray = np.stack((vocab_ids, batch_ids), axis=2).astype(np.int32) - return combined - - @classmethod - def avg_by_vocab_ids( - cls, vocab_ids: NumpyArray, embeddings: NumpyArray - ) -> tuple[NumpyArray, NumpyArray]: - """ - Takes: - vocab_ids: (batch_size, seq_len) int array - embeddings: (batch_size, seq_len, input_dim) float array - - Returns: - unique_flattened_vocab_ids: (total_unique, 2) array of [vocab_id, batch_id] - unique_flattened_embeddings: (total_unique, input_dim) averaged embeddings - """ - input_dim = embeddings.shape[2] - - # Flatten vocab_ids and embeddings - # flattened_vocab_ids: (batch_size*seq_len, 2) - flattened_vocab_ids = cls.convert_vocab_ids(vocab_ids).reshape(-1, 2) - - # flattened_embeddings: (batch_size*seq_len, input_dim) - flattened_embeddings = embeddings.reshape(-1, input_dim) - - # Find unique (vocab_id, batch_id) pairs - unique_flattened_vocab_ids, inverse_indices = np.unique( - flattened_vocab_ids, axis=0, return_inverse=True - ) - - # Prepare arrays to accumulate sums - unique_count = unique_flattened_vocab_ids.shape[0] - unique_flattened_embeddings = np.zeros((unique_count, input_dim), dtype=np.float32) - unique_flattened_count = np.zeros(unique_count, dtype=np.int32) - - # Use np.add.at to accumulate sums based on 
inverse indices - np.add.at(unique_flattened_embeddings, inverse_indices, flattened_embeddings) - np.add.at(unique_flattened_count, inverse_indices, 1) - - # Compute averages - unique_flattened_embeddings /= unique_flattened_count[:, None] - - return unique_flattened_vocab_ids.astype(np.int32), unique_flattened_embeddings.astype( - np.float32 - ) - - def forward( - self, vocab_ids: NumpyArray, embeddings: NumpyArray - ) -> tuple[NumpyArray, NumpyArray]: - """ - Args: - vocab_ids: (batch_size, seq_len) int array - embeddings: (batch_size, seq_len, input_dim) float array - - Returns: - unique_flattened_vocab_ids_and_batch_ids: (total_unique, 2) - unique_flattened_encoded: (total_unique, output_dim) - """ - # Average embeddings for duplicate vocab_ids - unique_flattened_vocab_ids_and_batch_ids, unique_flattened_embeddings = ( - self.avg_by_vocab_ids(vocab_ids, embeddings) - ) - - # Select the encoder weights for each unique vocab_id - unique_flattened_vocab_ids = unique_flattened_vocab_ids_and_batch_ids[:, 0].astype( - np.int32 - ) - - # unique_encoder_weights: (total_unique, input_dim, output_dim) - unique_encoder_weights = self.encoder_weights[unique_flattened_vocab_ids] - - # Compute linear transform: (total_unique, output_dim) - # Using Einstein summation for matrix multiplication: - # 'bi,bio->bo' means: for each "b" (batch element), multiply embeddings (b,i) by weights (b,i,o) -> (b,o) - unique_flattened_encoded = np.einsum( - "bi,bio->bo", unique_flattened_embeddings, unique_encoder_weights - ) - - # Apply Tanh activation and ensure float32 type - unique_flattened_encoded = self.activation(unique_flattened_encoded).astype(np.float32) - - return unique_flattened_vocab_ids_and_batch_ids.astype(np.int32), unique_flattened_encoded diff --git a/fastembed/sparse/utils/sparse_vectors_converter.py b/fastembed/sparse/utils/sparse_vectors_converter.py deleted file mode 100644 index f856f0de4..000000000 --- a/fastembed/sparse/utils/sparse_vectors_converter.py +++ 
/dev/null @@ -1,244 +0,0 @@ -import copy -from dataclasses import dataclass - -import mmh3 -import numpy as np -from py_rust_stemmers import SnowballStemmer - -from fastembed.common.utils import get_all_punctuation, remove_non_alphanumeric -from fastembed.sparse.sparse_embedding_base import SparseEmbedding - -GAP = 32000 -INT32_MAX = 2**31 - 1 - - -@dataclass -class WordEmbedding: - word: str - forms: list[str] - count: int - word_id: int - embedding: list[float] - - -class SparseVectorConverter: - def __init__( - self, - stopwords: set[str], - stemmer: SnowballStemmer, - k: float = 1.2, - b: float = 0.75, - avg_len: float = 150.0, - ): - punctuation = set(get_all_punctuation()) - special_tokens = {"[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]"} - - self.stemmer = stemmer - self.unwanted_tokens = punctuation | special_tokens | stopwords - - self.k = k - self.b = b - self.avg_len = avg_len - - @classmethod - def unkn_word_token_id( - cls, word: str, shift: int - ) -> int: # 2-3 words can collide in 1 index with this mapping, not considering mm3 collisions - token_hash = abs(mmh3.hash(word)) - - range_size = INT32_MAX - shift - remapped_hash = shift + (token_hash % range_size) - - return remapped_hash - - def bm25_tf(self, num_occurrences: int, sentence_len: int) -> float: - res = num_occurrences * (self.k + 1) - res /= num_occurrences + self.k * (1 - self.b + self.b * sentence_len / self.avg_len) - return res - - @classmethod - def normalize_vector(cls, vector: list[float]) -> list[float]: - norm = sum([x**2 for x in vector]) ** 0.5 - if norm < 1e-8: - return vector - return [x / norm for x in vector] - - def clean_words( - self, sentence_embedding: dict[str, WordEmbedding], token_max_length: int = 40 - ) -> dict[str, WordEmbedding]: - """ - Clean miniCOIL-produced sentence_embedding, as unknown to the miniCOIL's stemmer tokens should fully resemble - our BM25 token representation. 
- - sentence_embedding = {"9ยฐ": {"word": "9ยฐ", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9ยฐ"]}, - "9": {"word": "9", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9"]}, - "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]}, - "9ยฐ9": {"word": "9ยฐ9", "word_id": -1, "count": 1, "embedding": [1], "forms": ["9ยฐ9"]}, - "screech": {"word": "screech", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screech"]}, - "screeched": {"word": "screeched", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screeched"]} - } - cleaned_embedding_ground_truth = { - "9": {"word": "9", "word_id": -1, "count": 6, "embedding": [1], "forms": ["9ยฐ", "9", "9ยฐ9", "9ยฐ9"]}, - "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]}, - "screech": {"word": "screech", "word_id": -1, "count": 2, "embedding": [1], "forms": ["screech", "screeched"]} - } - """ - - new_sentence_embedding: dict[str, WordEmbedding] = {} - - for word, embedding in sentence_embedding.items(): - # embedding = { - # "word": "vector", - # "forms": ["vector", "vectors"], - # "count": 2, - # "word_id": 1231, - # "embedding": [0.1, 0.2, 0.3, 0.4] - # } - if embedding.word_id > 0: - # Known word, no need to clean - new_sentence_embedding[word] = embedding - else: - # Unknown word - if word in self.unwanted_tokens: - continue - - # Example complex word split: - # word = `word^vec` - word_cleaned = remove_non_alphanumeric(word).strip() - # word_cleaned = `word vec` - - if len(word_cleaned) > 0: - # Subwords: ['word', 'vec'] - for subword in word_cleaned.split(): - stemmed_subword: str = self.stemmer.stem_word(subword) - if ( - len(stemmed_subword) <= token_max_length - and stemmed_subword not in self.unwanted_tokens - ): - if stemmed_subword not in new_sentence_embedding: - new_sentence_embedding[stemmed_subword] = copy.deepcopy(embedding) - 
new_sentence_embedding[stemmed_subword].word = stemmed_subword - else: - new_sentence_embedding[stemmed_subword].count += embedding.count - new_sentence_embedding[stemmed_subword].forms += embedding.forms - - return new_sentence_embedding - - def embedding_to_vector( - self, - sentence_embedding: dict[str, WordEmbedding], - embedding_size: int, - vocab_size: int, - ) -> SparseEmbedding: - """ - Convert miniCOIL sentence embedding to Qdrant sparse vector - - Example input: - - ``` - { - "vector": WordEmbedding({ // Vocabulary word, encoded with miniCOIL normally - "word": "vector", - "forms": ["vector", "vectors"], - "count": 2, - "word_id": 1231, - "embedding": [0.1, 0.2, 0.3, 0.4] - }), - "axiotic": WordEmbedding({ // Out-of-vocabulary word, fallback to BM25 - "word": "axiotic", - "forms": ["axiotics"], - "count": 1, - "word_id": -1, - }) - } - ``` - - """ - - indices: list[int] = [] - values: list[float] = [] - - # Example: - # vocab_size = 10000 - # embedding_size = 4 - # GAP = 32000 - # - # We want to start random words section from the bucket, that is guaranteed to not - # include any vocab words. - # We need (vocab_size * embedding_size) slots for vocab words. - # Therefore we need (vocab_size * embedding_size) // GAP + 1 buckets for vocab words. 
- # Therefore, we can start random words from bucket (vocab_size * embedding_size) // GAP + 1 + 1 - - # ID at which the scope of OOV words starts - unknown_words_shift = ((vocab_size * embedding_size) // GAP + 2) * GAP - sentence_embedding_cleaned = self.clean_words(sentence_embedding) - - # Calculate sentence length after cleaning - sentence_len = 0 - for embedding in sentence_embedding_cleaned.values(): - sentence_len += embedding.count - - for embedding in sentence_embedding_cleaned.values(): - word_id = embedding.word_id - num_occurrences = embedding.count - tf = self.bm25_tf(num_occurrences, sentence_len) - if ( - word_id > 0 - ): # miniCOIL starts with ID 1, we generally won't have word_id == 0 (UNK), as we don't add - # these words to sentence_embedding - embedding_values = embedding.embedding - normalized_embedding = self.normalize_vector(embedding_values) - - for val_id, value in enumerate(normalized_embedding): - indices.append( - word_id * embedding_size + val_id - ) # since miniCOIL IDs start with 1 - values.append(value * tf) - else: - indices.append(self.unkn_word_token_id(embedding.word, unknown_words_shift)) - values.append(tf) - - return SparseEmbedding( - indices=np.array(indices, dtype=np.int32), - values=np.array(values, dtype=np.float32), - ) - - def embedding_to_vector_query( - self, - sentence_embedding: dict[str, WordEmbedding], - embedding_size: int, - vocab_size: int, - ) -> SparseEmbedding: - """ - Same as `embedding_to_vector`, but no TF - """ - - indices: list[int] = [] - values: list[float] = [] - - # ID at which the scope of OOV words starts - unknown_words_shift = ((vocab_size * embedding_size) // GAP + 2) * GAP - - sentence_embedding_cleaned = self.clean_words(sentence_embedding) - - for embedding in sentence_embedding_cleaned.values(): - word_id = embedding.word_id - tf = 1.0 - - if word_id >= 0: # miniCOIL starts with ID 1 - embedding_values = embedding.embedding - normalized_embedding = self.normalize_vector(embedding_values) - - 
for val_id, value in enumerate(normalized_embedding): - indices.append( - word_id * embedding_size + val_id - ) # since miniCOIL IDs start with 1 - values.append(value * tf) - else: - indices.append(self.unkn_word_token_id(embedding.word, unknown_words_shift)) - values.append(tf) - - return SparseEmbedding( - indices=np.array(indices, dtype=np.int32), - values=np.array(values, dtype=np.float32), - ) diff --git a/fastembed/sparse/utils/tokenizer.py b/fastembed/sparse/utils/tokenizer.py deleted file mode 100644 index e21290832..000000000 --- a/fastembed/sparse/utils/tokenizer.py +++ /dev/null @@ -1,120 +0,0 @@ -# This code is a modified copy of the `NLTKWordTokenizer` class from `NLTK` library. - -import re - - -class SimpleTokenizer: - @staticmethod - def tokenize(text: str) -> list[str]: - text = re.sub(r"[^\w]", " ", text.lower()) - text = re.sub(r"\s+", " ", text) - - return text.strip().split() - - -class WordTokenizer: - """The tokenizer is "destructive" such that the regexes applied will munge the - input string to a state beyond re-construction. - """ - - # Starting quotes. - STARTING_QUOTES = [ - (re.compile("([ยซโ€œโ€˜โ€ž]|[`]+)", re.U), r" \1 "), - (re.compile(r"^\""), r"``"), - (re.compile(r"(``)"), r" \1 "), - (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "), - (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"), - ] - - # Ending quotes. - ENDING_QUOTES = [ - (re.compile("([ยปโ€โ€™])", re.U), r" \1 "), - (re.compile(r"''"), " '' "), - (re.compile(r'"'), " '' "), - (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), - (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), - ] - - # Punctuation. 
- PUNCTUATION = [ - (re.compile(r'([^\.])(\.)([\]\)}>"\'' "ยปโ€โ€™ " r"]*)\s*$", re.U), r"\1 \2 \3 "), - (re.compile(r"([:,])([^\d])"), r" \1 \2"), - (re.compile(r"([:,])$"), r" \1 "), - ( - re.compile(r"\.{2,}", re.U), - r" \g<0> ", - ), - (re.compile(r"[;@#$%&]"), r" \g<0> "), - ( - re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), - r"\1 \2\3 ", - ), # Handles the final period. - (re.compile(r"[?!]"), r" \g<0> "), - (re.compile(r"([^'])' "), r"\1 ' "), - ( - re.compile(r"[*]", re.U), - r" \g<0> ", - ), - ] - - # Pads parentheses - PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ") - DOUBLE_DASHES = (re.compile(r"--"), r" -- ") - - # List of contractions adapted from Robert MacIntyre's tokenizer. - CONTRACTIONS2 = [ - re.compile(pattern) - for pattern in ( - r"(?i)\b(can)(?#X)(not)\b", - r"(?i)\b(d)(?#X)('ye)\b", - r"(?i)\b(gim)(?#X)(me)\b", - r"(?i)\b(gon)(?#X)(na)\b", - r"(?i)\b(got)(?#X)(ta)\b", - r"(?i)\b(lem)(?#X)(me)\b", - r"(?i)\b(more)(?#X)('n)\b", - r"(?i)\b(wan)(?#X)(na)(?=\s)", - ) - ] - CONTRACTIONS3 = [ - re.compile(pattern) for pattern in (r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b") - ] - - @classmethod - def tokenize(cls, text: str) -> list[str]: - """Return a tokenized copy of `text`. - - >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.''' - >>> WordTokenizer().tokenize(s) - ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', 'euros', ')', 'in', 'New', 'York', '.'] - - Args: - text: The text to be tokenized. - - Returns: - A list of tokens. - """ - for regexp, substitution in cls.STARTING_QUOTES: - text = regexp.sub(substitution, text) - - for regexp, substitution in cls.PUNCTUATION: - text = regexp.sub(substitution, text) - - # Handles parentheses. - regexp, substitution = cls.PARENS_BRACKETS - text = regexp.sub(substitution, text) - - # Handles double dash. 
- regexp, substitution = cls.DOUBLE_DASHES - text = regexp.sub(substitution, text) - - # add extra space to make things easier - text = " " + text + " " - - for regexp, substitution in cls.ENDING_QUOTES: - text = regexp.sub(substitution, text) - - for regexp in cls.CONTRACTIONS2: - text = regexp.sub(r" \1 \2 ", text) - for regexp in cls.CONTRACTIONS3: - text = regexp.sub(r" \1 \2 ", text) - return text.split() diff --git a/fastembed/sparse/utils/vocab_resolver.py b/fastembed/sparse/utils/vocab_resolver.py deleted file mode 100644 index 1a3c52acc..000000000 --- a/fastembed/sparse/utils/vocab_resolver.py +++ /dev/null @@ -1,202 +0,0 @@ -from collections import defaultdict -from typing import Iterable - -from py_rust_stemmers import SnowballStemmer -import numpy as np -from tokenizers import Tokenizer -from numpy.typing import NDArray - -from fastembed.common.types import NumpyArray - - -class VocabTokenizerBase: - def tokenize(self, sentence: str) -> NumpyArray: - raise NotImplementedError() - - def convert_ids_to_tokens(self, token_ids: NumpyArray) -> list[str]: - raise NotImplementedError() - - -class VocabTokenizer(VocabTokenizerBase): - def __init__(self, tokenizer: Tokenizer): - self.tokenizer = tokenizer - - def tokenize(self, sentence: str) -> NumpyArray: - return np.array(self.tokenizer.encode(sentence).ids) - - def convert_ids_to_tokens(self, token_ids: NumpyArray) -> list[str]: - return [self.tokenizer.id_to_token(token_id) for token_id in token_ids] - - -class VocabResolver: - def __init__(self, tokenizer: VocabTokenizerBase, stopwords: set[str], stemmer: SnowballStemmer): - # Word to id mapping - self.vocab: dict[str, int] = {} - # Id to word mapping - self.words: list[str] = [] - # Lemma to word mapping - self.stem_mapping: dict[str, str] = {} - self.tokenizer: VocabTokenizerBase = tokenizer - self.stemmer = stemmer - self.stopwords: set[str] = stopwords - - def tokenize(self, sentence: str) -> NumpyArray: - return self.tokenizer.tokenize(sentence) - - 
def lookup_word(self, word_id: int) -> str: - if word_id == 0: - return "UNK" - return self.words[word_id - 1] - - def convert_ids_to_tokens(self, token_ids: NumpyArray) -> list[str]: - return self.tokenizer.convert_ids_to_tokens(token_ids) - - def vocab_size(self) -> int: - # We need +1 for UNK token - return len(self.vocab) + 1 - - def save_vocab(self, path: str) -> None: - with open(path, "w") as f: - for word in self.words: - f.write(word + "\n") - - def save_json_vocab(self, path: str) -> None: - import json - - with open(path, "w") as f: - json.dump({"vocab": self.words, "stem_mapping": self.stem_mapping}, f, indent=2) - - def load_json_vocab(self, path: str) -> None: - import json - - with open(path, "r") as f: - data = json.load(f) - self.words = data["vocab"] - self.vocab = {word: idx + 1 for idx, word in enumerate(self.words)} - self.stem_mapping = data["stem_mapping"] - - def add_word(self, word: str) -> None: - if word not in self.vocab: - self.vocab[word] = len(self.vocab) + 1 - self.words.append(word) - stem = self.stemmer.stem_word(word) - if stem not in self.stem_mapping: - self.stem_mapping[stem] = word - else: - existing_word = self.stem_mapping[stem] - if len(existing_word) > len(word): - # Prefer shorter words for the same stem - # Example: "swim" is preferred over "swimming" - self.stem_mapping[stem] = word - - def load_vocab(self, path: str) -> None: - with open(path, "r") as f: - for line in f: - self.add_word(line.strip()) - - @classmethod - def _reconstruct_bpe( - cls, bpe_tokens: Iterable[tuple[int, str]] - ) -> list[tuple[str, list[int]]]: - result: list[tuple[str, list[int]]] = [] - acc: str = "" - acc_idx: list[int] = [] - - continuing_subword_prefix = "##" - continuing_subword_prefix_len = len(continuing_subword_prefix) - - for idx, token in bpe_tokens: - if token.startswith(continuing_subword_prefix): - acc += token[continuing_subword_prefix_len:] - acc_idx.append(idx) - else: - if acc: - result.append((acc, acc_idx)) - acc_idx = [] - 
acc = token - acc_idx.append(idx) - - if acc: - result.append((acc, acc_idx)) - return result - - def resolve_tokens( - self, token_ids: NDArray[np.int64] - ) -> tuple[NDArray[np.int64], dict[int, int], dict[str, int], dict[str, list[str]]]: - """ - Mark known tokens (including composed tokens) with vocab ids. - - Args: - token_ids: (seq_len) - list of ids of tokens - Example: - [ - 101, 3897, 19332, 12718, 23348, - 1010, 1996, 7151, 2296, 4845, - 2359, 2005, 4234, 1010, 4332, - 2871, 3191, 2062, 102 - ] - - returns: - - token_ids with vocab ids - [ - 0, 151, 151, 0, 0, - 912, 0, 0, 0, 332, - 332, 332, 0, 7121, 191, - 0, 0, 332, 0 - ] - - counts of each token - { - 151: 1, - 332: 3, - 7121: 1, - 191: 1, - 912: 1 - } - - oov counts of each token - { - "the": 1, - "a": 1, - "[CLS]": 1, - "[SEP]": 1, - ... - } - - forms of each token - { - "hello": ["hello"], - "world": ["worlds", "world", "worlding"], - } - - """ - tokens = self.convert_ids_to_tokens(token_ids) - tokens_mapping = self._reconstruct_bpe(enumerate(tokens)) - - counts: dict[int, int] = defaultdict(int) - oov_count: dict[str, int] = defaultdict(int) - - forms: dict[str, list[str]] = defaultdict(list) - - for token, mapped_token_ids in tokens_mapping: - vocab_id = 0 - if token in self.stopwords: - vocab_id = 0 - elif token in self.vocab: - vocab_id = self.vocab[token] - forms[token].append(token) - elif token in self.stem_mapping: - vocab_id = self.vocab[self.stem_mapping[token]] - forms[self.stem_mapping[token]].append(token) - else: - stem = self.stemmer.stem_word(token) - if stem in self.stem_mapping: - vocab_id = self.vocab[self.stem_mapping[stem]] - forms[self.stem_mapping[stem]].append(token) - - for token_id in mapped_token_ids: - token_ids[token_id] = vocab_id - - if vocab_id == 0: - oov_count[token] += 1 - else: - counts[vocab_id] += 1 - return token_ids, counts, oov_count, forms - diff --git a/fastembed/text/__init__.py b/fastembed/text/__init__.py deleted file mode 100644 index 
493d0d438..000000000 --- a/fastembed/text/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from fastembed.text.text_embedding import TextEmbedding - -__all__ = ["TextEmbedding"] diff --git a/fastembed/text/clip_embedding.py b/fastembed/text/clip_embedding.py deleted file mode 100644 index b918e9f26..000000000 --- a/fastembed/text/clip_embedding.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Any, Iterable, Type - -from fastembed.common.types import NumpyArray -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.text.onnx_embedding import OnnxTextEmbedding, OnnxTextEmbeddingWorker -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_clip_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="Qdrant/clip-ViT-B-32-text", - dim=512, - description=( - "Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2021 year" - ), - license="mit", - size_in_GB=0.25, - sources=ModelSource(hf="Qdrant/clip-ViT-B-32-text"), - model_file="model.onnx", - ), -] - - -class CLIPOnnxEmbedding(OnnxTextEmbedding): - @classmethod - def _get_worker_class(cls) -> Type[OnnxTextEmbeddingWorker]: - return CLIPEmbeddingWorker - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. 
- """ - return supported_clip_models - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[NumpyArray]: - return output.model_output - - -class CLIPEmbeddingWorker(OnnxTextEmbeddingWorker): - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxTextEmbedding: - return CLIPOnnxEmbedding( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/text/custom_text_embedding.py b/fastembed/text/custom_text_embedding.py deleted file mode 100644 index 465ffd251..000000000 --- a/fastembed/text/custom_text_embedding.py +++ /dev/null @@ -1,97 +0,0 @@ -from typing import Sequence, Any, Iterable -from dataclasses import dataclass - -import numpy as np -from numpy.typing import NDArray - -from fastembed.common import OnnxProvider -from fastembed.common.model_description import ( - PoolingType, - DenseModelDescription, -) -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import NumpyArray, Device -from fastembed.common.utils import normalize, mean_pooling -from fastembed.text.onnx_embedding import OnnxTextEmbedding - - -@dataclass(frozen=True) -class PostprocessingConfig: - pooling: PoolingType - normalization: bool - - -class CustomTextEmbedding(OnnxTextEmbedding): - SUPPORTED_MODELS: list[DenseModelDescription] = [] - POSTPROCESSING_MAPPING: dict[str, PostprocessingConfig] = {} - - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - super().__init__( - model_name=model_name, - cache_dir=cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - 
device_id=device_id, - specific_model_path=specific_model_path, - **kwargs, - ) - self._pooling = self.POSTPROCESSING_MAPPING[model_name].pooling - self._normalization = self.POSTPROCESSING_MAPPING[model_name].normalization - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - return cls.SUPPORTED_MODELS - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[NumpyArray]: - return self._normalize(self._pool(output.model_output, output.attention_mask)) - - def _pool( - self, embeddings: NumpyArray, attention_mask: NDArray[np.int64] | None = None - ) -> NumpyArray: - if self._pooling == PoolingType.CLS: - return embeddings[:, 0] - - if self._pooling == PoolingType.MEAN: - if attention_mask is None: - raise ValueError("attention_mask must be provided for mean pooling") - return mean_pooling(embeddings, attention_mask) - - if self._pooling == PoolingType.DISABLED: - return embeddings - - raise ValueError( - f"Unsupported pooling type {self._pooling}. " - f"Supported types are: {PoolingType.CLS}, {PoolingType.MEAN}, {PoolingType.DISABLED}." 
- ) - - def _normalize(self, embeddings: NumpyArray) -> NumpyArray: - return normalize(embeddings) if self._normalization else embeddings - - @classmethod - def add_model( - cls, - model_description: DenseModelDescription, - pooling: PoolingType, - normalization: bool, - ) -> None: - cls.SUPPORTED_MODELS.append(model_description) - cls.POSTPROCESSING_MAPPING[model_description.model] = PostprocessingConfig( - pooling=pooling, normalization=normalization - ) diff --git a/fastembed/text/multitask_embedding.py b/fastembed/text/multitask_embedding.py deleted file mode 100644 index 73ce2b1bd..000000000 --- a/fastembed/text/multitask_embedding.py +++ /dev/null @@ -1,109 +0,0 @@ -from enum import Enum -from typing import Any, Type, Iterable - -import numpy as np - -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.types import NumpyArray -from fastembed.text.pooled_normalized_embedding import PooledNormalizedEmbedding -from fastembed.text.onnx_embedding import OnnxTextEmbeddingWorker -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_multitask_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="jinaai/jina-embeddings-v3", - dim=1024, - tasks={ - "retrieval.query": 0, - "retrieval.passage": 1, - "separation": 2, - "classification": 3, - "text-matching": 4, - }, - description=( - "Multi-task unimodal (text) embedding model, multi-lingual (~100), " - "1024 tokens truncation, and 8192 sequence length. Prefixes for queries/documents: not necessary, 2024 year." 
- ), - license="cc-by-nc-4.0", - size_in_GB=2.29, - sources=ModelSource(hf="jinaai/jina-embeddings-v3"), - model_file="onnx/model.onnx", - additional_files=["onnx/model.onnx_data"], - ), -] - - -class Task(int, Enum): - RETRIEVAL_QUERY = 0 - RETRIEVAL_PASSAGE = 1 - SEPARATION = 2 - CLASSIFICATION = 3 - TEXT_MATCHING = 4 - - -class JinaEmbeddingV3(PooledNormalizedEmbedding): - PASSAGE_TASK = Task.RETRIEVAL_PASSAGE - QUERY_TASK = Task.RETRIEVAL_QUERY - - def __init__(self, *args: Any, task_id: int | None = None, **kwargs: Any): - super().__init__(*args, **kwargs) - self.default_task_id: Task | int = task_id if task_id is not None else self.PASSAGE_TASK - - @classmethod - def _get_worker_class(cls) -> Type[OnnxTextEmbeddingWorker]: - return JinaEmbeddingV3Worker - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - return supported_multitask_models - - def _preprocess_onnx_input( - self, - onnx_input: dict[str, NumpyArray], - task_id: int | Task | None = None, - **kwargs: Any, - ) -> dict[str, NumpyArray]: - if task_id is None: - raise ValueError(f"task_id must be provided for JinaEmbeddingV3, got <{task_id}>") - onnx_input["task_id"] = np.array(task_id, dtype=np.int64) - return onnx_input - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - task_id: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - task_id = ( - task_id if task_id is not None else self.default_task_id - ) # required for multiprocessing - yield from super().embed(documents, batch_size, parallel, task_id=task_id, **kwargs) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - yield from super().embed(query, task_id=self.QUERY_TASK, **kwargs) - - def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - yield from super().embed(texts, task_id=self.PASSAGE_TASK, **kwargs) - - -class 
JinaEmbeddingV3Worker(OnnxTextEmbeddingWorker): - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> JinaEmbeddingV3: - return JinaEmbeddingV3( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) - - def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, OnnxOutputContext]]: - self.model: JinaEmbeddingV3 # mypy complaints `self.model` does not have `default_task_id` - for idx, batch in items: - onnx_output = self.model.onnx_embed(batch, task_id=self.model.default_task_id) - yield idx, onnx_output diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py deleted file mode 100644 index 1e9978f75..000000000 --- a/fastembed/text/onnx_embedding.py +++ /dev/null @@ -1,353 +0,0 @@ -from typing import Any, Iterable, Sequence, Type - -from fastembed.common.types import NumpyArray, OnnxProvider, Device -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.utils import define_cache_dir, normalize -from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker -from fastembed.text.text_embedding_base import TextEmbeddingBase -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_onnx_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="BAAI/bge-base-en", - dim=768, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2023 year." 
- ), - license="mit", - size_in_GB=0.42, - sources=ModelSource( - hf="Qdrant/fast-bge-base-en", - url="https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz", - _deprecated_tar_struct=True, - ), - model_file="model_optimized.onnx", - ), - DenseModelDescription( - model="BAAI/bge-base-en-v1.5", - dim=768, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: not so necessary, 2023 year." - ), - license="mit", - size_in_GB=0.21, - sources=ModelSource( - hf="qdrant/bge-base-en-v1.5-onnx-q", - url="https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz", - _deprecated_tar_struct=True, - ), - model_file="model_optimized.onnx", - ), - DenseModelDescription( - model="BAAI/bge-large-en-v1.5", - dim=1024, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: not so necessary, 2023 year." - ), - license="mit", - size_in_GB=1.20, - sources=ModelSource(hf="qdrant/bge-large-en-v1.5-onnx"), - model_file="model.onnx", - ), - DenseModelDescription( - model="BAAI/bge-small-en", - dim=384, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2023 year." - ), - license="mit", - size_in_GB=0.13, - sources=ModelSource( - hf="Qdrant/bge-small-en", - url="https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz", - _deprecated_tar_struct=True, - ), - model_file="model_optimized.onnx", - ), - DenseModelDescription( - model="BAAI/bge-small-en-v1.5", - dim=384, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: not so necessary, 2023 year." 
- ), - license="mit", - size_in_GB=0.067, - sources=ModelSource(hf="qdrant/bge-small-en-v1.5-onnx-q"), - model_file="model_optimized.onnx", - ), - DenseModelDescription( - model="BAAI/bge-small-zh-v1.5", - dim=512, - description=( - "Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, " - "Prefixes for queries/documents: not so necessary, 2023 year." - ), - license="mit", - size_in_GB=0.09, - sources=ModelSource( - hf="Qdrant/bge-small-zh-v1.5", - url="https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz", - _deprecated_tar_struct=True, - ), - model_file="model_optimized.onnx", - ), - DenseModelDescription( - model="mixedbread-ai/mxbai-embed-large-v1", - dim=1024, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.64, - sources=ModelSource(hf="mixedbread-ai/mxbai-embed-large-v1"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="snowflake/snowflake-arctic-embed-xs", - dim=384, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.09, - sources=ModelSource(hf="snowflake/snowflake-arctic-embed-xs"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="snowflake/snowflake-arctic-embed-s", - dim=384, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." 
- ), - license="apache-2.0", - size_in_GB=0.13, - sources=ModelSource(hf="snowflake/snowflake-arctic-embed-s"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="snowflake/snowflake-arctic-embed-m", - dim=768, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.43, - sources=ModelSource(hf="Snowflake/snowflake-arctic-embed-m"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="snowflake/snowflake-arctic-embed-m-long", - dim=768, - description=( - "Text embeddings, Unimodal (text), English, 2048 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.54, - sources=ModelSource(hf="snowflake/snowflake-arctic-embed-m-long"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="snowflake/snowflake-arctic-embed-l", - dim=1024, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=1.02, - sources=ModelSource(hf="snowflake/snowflake-arctic-embed-l"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="jinaai/jina-clip-v1", - dim=768, - description=( - "Text embeddings, Multimodal (text&image), English, Prefixes for queries/documents: " - "not necessary, 2024 year" - ), - license="apache-2.0", - size_in_GB=0.55, - sources=ModelSource(hf="jinaai/jina-clip-v1"), - model_file="onnx/text_model.onnx", - ), -] - - -class OnnxTextEmbedding(TextEmbeddingBase, OnnxTextModel[NumpyArray]): - """Implementation of the Flag Embedding model.""" - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """ - Lists the supported models. 
- - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_onnx_models - - def __init__( - self, - model_name: str = "BAAI/bge-small-en-v1.5", - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - device_id: int | None = None, - specific_model_path: str | None = None, - **kwargs: Any, - ): - """ - Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the cache directory. - Can be set using the `FASTEMBED_CACHE_PATH` env variable. - Defaults to `fastembed_cache` in the system's temp directory. - threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. - providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. - Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. - cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers` - Defaults to Device.AUTO. - device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in - workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive - with `providers`. Defaults to None. - lazy_load (bool, optional): Whether to load the model during class initialization or on demand. - Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. - device_id (Optional[int], optional): The device id to use for loading the model in the worker process. - specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else - - Raises: - ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. 
- """ - super().__init__(model_name, cache_dir, threads, **kwargs) - self.providers = providers - self.lazy_load = lazy_load - self._extra_session_options = self._select_exposed_session_options(kwargs) - # List of device ids, that can be used for data parallel processing in workers - self.device_ids = device_ids - self.cuda = cuda - - # This device_id will be used if we need to load model in current process - self.device_id: int | None = None - if device_id is not None: - self.device_id = device_id - elif self.device_ids is not None: - self.device_id = self.device_ids[0] - - self.model_description = self._get_model_description(model_name) - self.cache_dir = str(define_cache_dir(cache_dir)) - self._specific_model_path = specific_model_path - self._model_dir = self.download_model( - self.model_description, - self.cache_dir, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - ) - - if not self.lazy_load: - self.load_onnx_model() - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. 
- - Returns: - List of embeddings, one per document - """ - yield from self._embed_documents( - model_name=self.model_name, - cache_dir=str(self.cache_dir), - documents=documents, - batch_size=batch_size, - parallel=parallel, - providers=self.providers, - cuda=self.cuda, - device_ids=self.device_ids, - local_files_only=self._local_files_only, - specific_model_path=self._specific_model_path, - extra_session_options=self._extra_session_options, - **kwargs, - ) - - @classmethod - def _get_worker_class(cls) -> Type["TextEmbeddingWorker[NumpyArray]"]: - return OnnxTextEmbeddingWorker - - def _preprocess_onnx_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray]: - """ - Preprocess the onnx input. - """ - return onnx_input - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[NumpyArray]: - embeddings = output.model_output - - if embeddings.ndim == 3: # (batch_size, seq_len, embedding_dim) - processed_embeddings = embeddings[:, 0] - elif embeddings.ndim == 2: # (batch_size, embedding_dim) - processed_embeddings = embeddings - else: - raise ValueError(f"Unsupported embedding shape: {embeddings.shape}") - return normalize(processed_embeddings) - - def load_onnx_model(self) -> None: - self._load_onnx_model( - model_dir=self._model_dir, - model_file=self.model_description.model_file, - threads=self.threads, - providers=self.providers, - cuda=self.cuda, - device_id=self.device_id, - extra_session_options=self._extra_session_options, - ) - - def token_count( - self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any - ) -> int: - return self._token_count(texts, batch_size=batch_size, **kwargs) - - -class OnnxTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxTextEmbedding: - return OnnxTextEmbedding( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff 
--git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py deleted file mode 100644 index c8001a917..000000000 --- a/fastembed/text/onnx_text_model.py +++ /dev/null @@ -1,180 +0,0 @@ -import os -from multiprocessing import get_all_start_methods -from pathlib import Path -from typing import Any, Iterable, Sequence, Type - -import numpy as np -from numpy.typing import NDArray -from tokenizers import Encoding, Tokenizer - -from fastembed.common.types import NumpyArray, OnnxProvider, Device -from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T -from fastembed.common.preprocessor_utils import load_tokenizer -from fastembed.common.utils import iter_batch -from fastembed.parallel_processor import ParallelWorkerPool - - -class OnnxTextModel(OnnxModel[T]): - ONNX_OUTPUT_NAMES: list[str] | None = None - - @classmethod - def _get_worker_class(cls) -> Type["TextEmbeddingWorker[T]"]: - raise NotImplementedError("Subclasses must implement this method") - - def _post_process_onnx_output(self, output: OnnxOutputContext, **kwargs: Any) -> Iterable[T]: - """Post-process the ONNX model output to convert it into a usable format. - - Args: - output (OnnxOutputContext): The raw output from the ONNX model. - **kwargs: Additional keyword arguments that may be needed by specific implementations. - - Returns: - Iterable[T]: Post-processed output as an iterable of type T. - """ - raise NotImplementedError("Subclasses must implement this method") - - def __init__(self) -> None: - super().__init__() - self.tokenizer: Tokenizer | None = None - self.special_token_to_id: dict[str, int] = {} - - def _preprocess_onnx_input( - self, onnx_input: dict[str, NumpyArray], **kwargs: Any - ) -> dict[str, NumpyArray | NDArray[np.int64]]: - """ - Preprocess the onnx input. 
- """ - return onnx_input - - def _load_onnx_model( - self, - model_dir: Path, - model_file: str, - threads: int | None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_id: int | None = None, - extra_session_options: dict[str, Any] | None = None, - ) -> None: - super()._load_onnx_model( - model_dir=model_dir, - model_file=model_file, - threads=threads, - providers=providers, - cuda=cuda, - device_id=device_id, - extra_session_options=extra_session_options, - ) - self.tokenizer, self.special_token_to_id = load_tokenizer(model_dir=model_dir) - - def load_onnx_model(self) -> None: - raise NotImplementedError("Subclasses must implement this method") - - def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: - return self.tokenizer.encode_batch(documents) # type: ignore[union-attr] - - def onnx_embed( - self, - documents: list[str], - **kwargs: Any, - ) -> OnnxOutputContext: - encoded = self.tokenize(documents, **kwargs) - input_ids = np.array([e.ids for e in encoded]) - attention_mask = np.array([e.attention_mask for e in encoded]) - input_names = {node.name for node in self.model.get_inputs()} # type: ignore[union-attr] - onnx_input: dict[str, NumpyArray] = { - "input_ids": np.array(input_ids, dtype=np.int64), - } - if "attention_mask" in input_names: - onnx_input["attention_mask"] = np.array(attention_mask, dtype=np.int64) - if "token_type_ids" in input_names: - onnx_input["token_type_ids"] = np.array( - [np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64 - ) - onnx_input = self._preprocess_onnx_input(onnx_input, **kwargs) - - model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) # type: ignore[union-attr] - return OnnxOutputContext( - model_output=model_output[0], - attention_mask=onnx_input.get("attention_mask", attention_mask), - input_ids=onnx_input.get("input_ids", input_ids), - ) - - def _embed_documents( - self, - model_name: str, - cache_dir: str, - documents: 
str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - local_files_only: bool = False, - specific_model_path: str | None = None, - extra_session_options: dict[str, Any] | None = None, - **kwargs: Any, - ) -> Iterable[T]: - is_small = False - - if isinstance(documents, str): - documents = [documents] - is_small = True - - if isinstance(documents, list): - if len(documents) < batch_size: - is_small = True - - if parallel is None or is_small: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() - for batch in iter_batch(documents, batch_size): - yield from self._post_process_onnx_output( - self.onnx_embed(batch, **kwargs), **kwargs - ) - else: - if parallel == 0: - parallel = os.cpu_count() - - start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn" - params = { - "model_name": model_name, - "cache_dir": cache_dir, - "providers": providers, - "local_files_only": local_files_only, - "specific_model_path": specific_model_path, - **kwargs, - } - - if extra_session_options is not None: - params.update(extra_session_options) - - pool = ParallelWorkerPool( - num_workers=parallel or 1, - worker=self._get_worker_class(), - cuda=cuda, - device_ids=device_ids, - start_method=start_method, - ) - for batch in pool.ordered_map(iter_batch(documents, batch_size), **params): - yield from self._post_process_onnx_output(batch, **kwargs) # type: ignore - - def _token_count(self, texts: str | Iterable[str], batch_size: int = 1024, **_: Any) -> int: - if not hasattr(self, "model") or self.model is None: - self.load_onnx_model() # loads the tokenizer as well - - token_num = 0 - assert self.tokenizer is not None - texts = [texts] if isinstance(texts, str) else texts - for batch in iter_batch(texts, batch_size): - for tokens in self.tokenizer.encode_batch(batch): - token_num += 
sum(tokens.attention_mask) - - return token_num - - -class TextEmbeddingWorker(EmbeddingWorker[T]): - def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, OnnxOutputContext]]: - for idx, batch in items: - onnx_output = self.model.onnx_embed(batch) - yield idx, onnx_output diff --git a/fastembed/text/pooled_embedding.py b/fastembed/text/pooled_embedding.py deleted file mode 100644 index f3fe607e9..000000000 --- a/fastembed/text/pooled_embedding.py +++ /dev/null @@ -1,136 +0,0 @@ -from typing import Any, Iterable, Type - -import numpy as np -from numpy.typing import NDArray - -from fastembed.common.types import NumpyArray -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.utils import mean_pooling -from fastembed.text.onnx_embedding import OnnxTextEmbedding, OnnxTextEmbeddingWorker -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_pooled_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="nomic-ai/nomic-embed-text-v1.5", - dim=768, - description=( - "Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.52, - sources=ModelSource(hf="nomic-ai/nomic-embed-text-v1.5"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="nomic-ai/nomic-embed-text-v1.5-Q", - dim=768, - description=( - "Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." 
- ), - license="apache-2.0", - size_in_GB=0.13, - sources=ModelSource(hf="nomic-ai/nomic-embed-text-v1.5"), - model_file="onnx/model_quantized.onnx", - ), - DenseModelDescription( - model="nomic-ai/nomic-embed-text-v1", - dim=768, - description=( - "Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.52, - sources=ModelSource(hf="nomic-ai/nomic-embed-text-v1"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - dim=384, - description=( - "Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2019 year." - ), - license="apache-2.0", - size_in_GB=0.22, - sources=ModelSource(hf="qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q"), - model_file="model_optimized.onnx", - ), - DenseModelDescription( - model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", - dim=768, - description=( - "Text embeddings, Unimodal (text), Multilingual (~50 languages), 384 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2021 year." - ), - license="apache-2.0", - size_in_GB=1.00, - sources=ModelSource(hf="xenova/paraphrase-multilingual-mpnet-base-v2"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="intfloat/multilingual-e5-large", - dim=1024, - description=( - "Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, " - "Prefixes for queries/documents: necessary, 2024 year." 
- ), - license="mit", - size_in_GB=2.24, - sources=ModelSource( - hf="qdrant/multilingual-e5-large-onnx", - url="https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz", - _deprecated_tar_struct=True, - ), - model_file="model.onnx", - additional_files=["model.onnx_data"], - ), -] - - -class PooledEmbedding(OnnxTextEmbedding): - @classmethod - def _get_worker_class(cls) -> Type[OnnxTextEmbeddingWorker]: - return PooledEmbeddingWorker - - @classmethod - def mean_pooling( - cls, model_output: NumpyArray, attention_mask: NDArray[np.int64] - ) -> NumpyArray: - return mean_pooling(model_output, attention_mask) - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_pooled_models - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[NumpyArray]: - if output.attention_mask is None: - raise ValueError("attention_mask must be provided for document post-processing") - - embeddings = output.model_output - attn_mask = output.attention_mask - return self.mean_pooling(embeddings, attn_mask) - - -class PooledEmbeddingWorker(OnnxTextEmbeddingWorker): - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxTextEmbedding: - return PooledEmbedding( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/text/pooled_normalized_embedding.py b/fastembed/text/pooled_normalized_embedding.py deleted file mode 100644 index 5711da19f..000000000 --- a/fastembed/text/pooled_normalized_embedding.py +++ /dev/null @@ -1,164 +0,0 @@ -from typing import Any, Iterable, Type - - -from fastembed.common.types import NumpyArray -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.utils import normalize -from 
fastembed.text.onnx_embedding import OnnxTextEmbedding, OnnxTextEmbeddingWorker -from fastembed.text.pooled_embedding import PooledEmbedding -from fastembed.common.model_description import DenseModelDescription, ModelSource - -supported_pooled_normalized_models: list[DenseModelDescription] = [ - DenseModelDescription( - model="sentence-transformers/all-MiniLM-L6-v2", - dim=384, - description=( - "Text embeddings, Unimodal (text), English, 256 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2021 year." - ), - license="apache-2.0", - size_in_GB=0.09, - sources=ModelSource( - url="https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz", - hf="qdrant/all-MiniLM-L6-v2-onnx", - _deprecated_tar_struct=True, - ), - model_file="model.onnx", - ), - DenseModelDescription( - model="jinaai/jina-embeddings-v2-base-en", - dim=768, - description=( - "Text embeddings, Unimodal (text), English, 8192 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2023 year." - ), - license="apache-2.0", - size_in_GB=0.52, - sources=ModelSource(hf="xenova/jina-embeddings-v2-base-en"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="jinaai/jina-embeddings-v2-small-en", - dim=512, - description=( - "Text embeddings, Unimodal (text), English, 8192 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2023 year." - ), - license="apache-2.0", - size_in_GB=0.12, - sources=ModelSource(hf="xenova/jina-embeddings-v2-small-en"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="jinaai/jina-embeddings-v2-base-de", - dim=768, - description=( - "Text embeddings, Unimodal (text), Multilingual (German, English), 8192 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2024 year." 
- ), - license="apache-2.0", - size_in_GB=0.32, - sources=ModelSource(hf="jinaai/jina-embeddings-v2-base-de"), - model_file="onnx/model_fp16.onnx", - ), - DenseModelDescription( - model="jinaai/jina-embeddings-v2-base-code", - dim=768, - description=( - "Text embeddings, Unimodal (text), Multilingual (English, 30 programming languages), " - "8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.64, - sources=ModelSource(hf="jinaai/jina-embeddings-v2-base-code"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="jinaai/jina-embeddings-v2-base-zh", - dim=768, - description=( - "Text embeddings, Unimodal (text), supports mixed Chinese-English input text, " - "8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.64, - sources=ModelSource(hf="jinaai/jina-embeddings-v2-base-zh"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="jinaai/jina-embeddings-v2-base-es", - dim=768, - description=( - "Text embeddings, Unimodal (text), supports mixed Spanish-English input text, " - "8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2024 year." - ), - license="apache-2.0", - size_in_GB=0.64, - sources=ModelSource(hf="jinaai/jina-embeddings-v2-base-es"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="thenlper/gte-base", - dim=768, - description=( - "General text embeddings, Unimodal (text), supports English only input text, " - "512 input tokens truncation, Prefixes for queries/documents: not necessary, 2024 year." 
- ), - license="mit", - size_in_GB=0.44, - sources=ModelSource(hf="thenlper/gte-base"), - model_file="onnx/model.onnx", - ), - DenseModelDescription( - model="thenlper/gte-large", - dim=1024, - description=( - "Text embeddings, Unimodal (text), English, 512 input tokens truncation, " - "Prefixes for queries/documents: not necessary, 2023 year." - ), - license="mit", - size_in_GB=1.20, - sources=ModelSource(hf="qdrant/gte-large-onnx"), - model_file="model.onnx", - ), -] - - -class PooledNormalizedEmbedding(PooledEmbedding): - @classmethod - def _get_worker_class(cls) -> Type[OnnxTextEmbeddingWorker]: - return PooledNormalizedEmbeddingWorker - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - """Lists the supported models. - - Returns: - list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. - """ - return supported_pooled_normalized_models - - def _post_process_onnx_output( - self, output: OnnxOutputContext, **kwargs: Any - ) -> Iterable[NumpyArray]: - if output.attention_mask is None: - raise ValueError("attention_mask must be provided for document post-processing") - - embeddings = output.model_output - attn_mask = output.attention_mask - return normalize(self.mean_pooling(embeddings, attn_mask)) - - -class PooledNormalizedEmbeddingWorker(OnnxTextEmbeddingWorker): - def init_embedding( - self, - model_name: str, - cache_dir: str, - **kwargs: Any, - ) -> OnnxTextEmbedding: - return PooledNormalizedEmbedding( - model_name=model_name, - cache_dir=cache_dir, - threads=1, - **kwargs, - ) diff --git a/fastembed/text/text_embedding.py b/fastembed/text/text_embedding.py deleted file mode 100644 index a4ae48cc5..000000000 --- a/fastembed/text/text_embedding.py +++ /dev/null @@ -1,228 +0,0 @@ -import warnings -from typing import Any, Iterable, Sequence, Type -from dataclasses import asdict - -from fastembed.common.types import NumpyArray, OnnxProvider, Device -from fastembed.text.clip_embedding 
import CLIPOnnxEmbedding -from fastembed.text.custom_text_embedding import CustomTextEmbedding -from fastembed.text.pooled_normalized_embedding import PooledNormalizedEmbedding -from fastembed.text.pooled_embedding import PooledEmbedding -from fastembed.text.multitask_embedding import JinaEmbeddingV3 -from fastembed.text.onnx_embedding import OnnxTextEmbedding -from fastembed.text.text_embedding_base import TextEmbeddingBase -from fastembed.common.model_description import DenseModelDescription, ModelSource, PoolingType - - -class TextEmbedding(TextEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[TextEmbeddingBase]] = [ - OnnxTextEmbedding, - CLIPOnnxEmbedding, - PooledNormalizedEmbedding, - PooledEmbedding, - JinaEmbeddingV3, - CustomTextEmbedding, - ] - - @classmethod - def list_supported_models(cls) -> list[dict[str, Any]]: - """Lists the supported models. - - Returns: - list[dict[str, Any]]: A list of dictionaries containing the model information. - """ - return [asdict(model) for model in cls._list_supported_models()] - - @classmethod - def _list_supported_models(cls) -> list[DenseModelDescription]: - result: list[DenseModelDescription] = [] - for embedding in cls.EMBEDDINGS_REGISTRY: - result.extend(embedding._list_supported_models()) - return result - - @classmethod - def add_custom_model( - cls, - model: str, - pooling: PoolingType, - normalization: bool, - sources: ModelSource, - dim: int, - model_file: str = "onnx/model.onnx", - description: str = "", - license: str = "", - size_in_gb: float = 0.0, - additional_files: list[str] | None = None, - ) -> None: - registered_models = cls._list_supported_models() - for registered_model in registered_models: - if model.lower() == registered_model.model.lower(): - raise ValueError( - f"Model {model} is already registered in TextEmbedding, if you still want to add this model, " - f"please use another model name" - ) - - CustomTextEmbedding.add_model( - DenseModelDescription( - model=model, - sources=sources, - 
dim=dim, - model_file=model_file, - description=description, - license=license, - size_in_GB=size_in_gb, - additional_files=additional_files or [], - ), - pooling=pooling, - normalization=normalization, - ) - - def __init__( - self, - model_name: str = "BAAI/bge-small-en-v1.5", - cache_dir: str | None = None, - threads: int | None = None, - providers: Sequence[OnnxProvider] | None = None, - cuda: bool | Device = Device.AUTO, - device_ids: list[int] | None = None, - lazy_load: bool = False, - **kwargs: Any, - ): - super().__init__(model_name, cache_dir, threads, **kwargs) - if model_name.lower() == "nomic-ai/nomic-embed-text-v1.5-Q".lower(): - warnings.warn( - "The model 'nomic-ai/nomic-embed-text-v1.5-Q' has been updated on HuggingFace. Please review " - "the latest documentation on HF and release notes to ensure compatibility with your workflow. ", - UserWarning, - stacklevel=2, - ) - if model_name.lower() in { - "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2".lower(), - "thenlper/gte-large".lower(), - "intfloat/multilingual-e5-large".lower(), - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".lower(), - }: - warnings.warn( - f"The model {model_name} now uses mean pooling instead of CLS embedding. " - f"In order to preserve the previous behaviour, consider either pinning fastembed version to 0.5.1 or " - "using `add_custom_model` functionality.", - UserWarning, - stacklevel=2, - ) - for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: - supported_models = EMBEDDING_MODEL_TYPE._list_supported_models() - if any(model_name.lower() == model.model.lower() for model in supported_models): - self.model = EMBEDDING_MODEL_TYPE( - model_name=model_name, - cache_dir=cache_dir, - threads=threads, - providers=providers, - cuda=cuda, - device_ids=device_ids, - lazy_load=lazy_load, - **kwargs, - ) - return - - raise ValueError( - f"Model {model_name} is not supported in TextEmbedding. 
" - "Please check the supported models using `TextEmbedding.list_supported_models()`" - ) - - @property - def embedding_size(self) -> int: - """Get the embedding size of the current model""" - if self._embedding_size is None: - self._embedding_size = self.get_embedding_size(self.model_name) - return self._embedding_size - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Get the embedding size of the passed model - - Args: - model_name (str): The name of the model to get embedding size for. - - Returns: - int: The size of the embedding. - - Raises: - ValueError: If the model name is not found in the supported models. - """ - descriptions = cls._list_supported_models() - embedding_size: int | None = None - for description in descriptions: - if description.model.lower() == model_name.lower(): - embedding_size = description.dim - break - if embedding_size is None: - model_names = [description.model for description in descriptions] - raise ValueError( - f"Embedding size for model {model_name} was None. " - f"Available model names: {model_names}" - ) - return embedding_size - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - """ - Encode a list of documents into list of embeddings. - We use mean pooling with attention so that the model can handle variable-length inputs. - - Args: - documents: Iterator of documents or single document to embed - batch_size: Batch size for encoding -- higher values will use more memory, but be faster - parallel: - If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. - If 0, use all available cores. - If None, don't use data-parallel processing, use default onnxruntime threading instead. 
- - Returns: - List of embeddings, one per document - """ - yield from self.model.embed(documents, batch_size, parallel, **kwargs) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - """ - Embeds queries - - Args: - query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries. - - Returns: - Iterable[NumpyArray]: The embeddings. - """ - # This is model-specific, so that different models can have specialized implementations - yield from self.model.query_embed(query, **kwargs) - - def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - """ - Embeds a list of text passages into a list of embeddings. - - Args: - texts (Iterable[str]): The list of texts to embed. - **kwargs: Additional keyword argument to pass to the embed method. - - Yields: - Iterable[SparseEmbedding]: The sparse embeddings. - """ - # This is model-specific, so that different models can have specialized implementations - yield from self.model.passage_embed(texts, **kwargs) - - def token_count( - self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any - ) -> int: - """Returns the number of tokens in the texts. - - Args: - texts (str | Iterable[str]): The list of texts to embed. - batch_size (int): Batch size for encoding - - Returns: - int: Sum of number of tokens in the texts. 
- """ - return self.model.token_count(texts, batch_size=batch_size, **kwargs) diff --git a/fastembed/text/text_embedding_base.py b/fastembed/text/text_embedding_base.py deleted file mode 100644 index 13bc4f732..000000000 --- a/fastembed/text/text_embedding_base.py +++ /dev/null @@ -1,75 +0,0 @@ -from typing import Iterable, Any - -from fastembed.common.model_description import DenseModelDescription -from fastembed.common.types import NumpyArray -from fastembed.common.model_management import ModelManagement - - -class TextEmbeddingBase(ModelManagement[DenseModelDescription]): - def __init__( - self, - model_name: str, - cache_dir: str | None = None, - threads: int | None = None, - **kwargs: Any, - ): - self.model_name = model_name - self.cache_dir = cache_dir - self.threads = threads - self._local_files_only = kwargs.pop("local_files_only", False) - self._embedding_size: int | None = None - - def embed( - self, - documents: str | Iterable[str], - batch_size: int = 256, - parallel: int | None = None, - **kwargs: Any, - ) -> Iterable[NumpyArray]: - raise NotImplementedError() - - def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - """ - Embeds a list of text passages into a list of embeddings. - - Args: - texts (Iterable[str]): The list of texts to embed. - **kwargs: Additional keyword argument to pass to the embed method. - - Yields: - Iterable[NumpyArray]: The embeddings. - """ - - # This is model-specific, so that different models can have specialized implementations - yield from self.embed(texts, **kwargs) - - def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: - """ - Embeds queries - - Args: - query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries. - - Returns: - Iterable[NumpyArray]: The embeddings. 
- """ - - # This is model-specific, so that different models can have specialized implementations - if isinstance(query, str): - yield from self.embed([query], **kwargs) - else: - yield from self.embed(query, **kwargs) - - @classmethod - def get_embedding_size(cls, model_name: str) -> int: - """Returns embedding size of the passed model.""" - raise NotImplementedError("Subclasses must implement this method") - - @property - def embedding_size(self) -> int: - """Returns embedding size for the current model""" - raise NotImplementedError("Subclasses must implement this method") - - def token_count(self, texts: str | Iterable[str], **kwargs: Any) -> int: - """Returns the number of tokens in the texts.""" - raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed_bio/__init__.py b/fastembed_bio/__init__.py new file mode 100644 index 000000000..fa6de9cd7 --- /dev/null +++ b/fastembed_bio/__init__.py @@ -0,0 +1,18 @@ +import importlib.metadata + +from fastembed_bio.common import DNAInput, ProteinInput +from fastembed_bio.dna import DNAEmbedding +from fastembed_bio.protein import ProteinEmbedding + +try: + version = importlib.metadata.version("fastembed-bio") +except importlib.metadata.PackageNotFoundError: + version = "0.0.0" + +__version__ = version +__all__ = [ + "DNAEmbedding", + "DNAInput", + "ProteinEmbedding", + "ProteinInput", +] \ No newline at end of file diff --git a/fastembed_bio/common/__init__.py b/fastembed_bio/common/__init__.py new file mode 100644 index 000000000..5f7ee695d --- /dev/null +++ b/fastembed_bio/common/__init__.py @@ -0,0 +1,4 @@ +from fastembed_bio.common.inputs import DNAInput, ProteinInput +from fastembed_bio.common.types import OnnxProvider, PathInput + +__all__ = ["DNAInput", "OnnxProvider", "PathInput", "ProteinInput"] \ No newline at end of file diff --git a/fastembed_bio/common/inputs.py b/fastembed_bio/common/inputs.py new file mode 100644 index 000000000..703c9c94e --- /dev/null +++ 
b/fastembed_bio/common/inputs.py @@ -0,0 +1,65 @@ +from dataclasses import dataclass + + +@dataclass(frozen=True) +class DNAInput: + """ + Input for DNA embedding models that support additional metadata. + + Attributes: + sequence: DNA sequence string (A, T, C, G, N characters) + species: Species name for conditioning (e.g., "human", "mouse"). + Only used by models that support species conditioning. + + Example: + >>> inp = DNAInput("ATCGATCG", species="human") + >>> inp.sequence + 'ATCGATCG' + """ + + sequence: str + species: str = "human" + + def __post_init__(self) -> None: + if not self.sequence: + raise ValueError("sequence cannot be empty") + + @classmethod + def from_dict(cls, data: dict) -> "DNAInput": + """ + Create a DNAInput instance from a dictionary. + + Args: + data: Dictionary with keys 'sequence' and optional 'species'. + Returns: + DNAInput instance. + """ + return cls( + sequence=data["sequence"], + species=data.get("species", "human") + ) + + +@dataclass(frozen=True) +class ProteinInput: + """ + Input for protein embedding models. + + Currently protein models only need sequences, but this class + future-proofs the API for models that may need additional metadata + (e.g., organism, structure hints). 
+ + Attributes: + sequence: Protein sequence string (amino acid characters) + + Example: + >>> inp = ProteinInput("MKTVRQERLKS") + >>> inp.sequence + 'MKTVRQERLKS' + """ + + sequence: str + + def __post_init__(self) -> None: + if not self.sequence: + raise ValueError("sequence cannot be empty") \ No newline at end of file diff --git a/fastembed/common/model_description.py b/fastembed_bio/common/model_description.py similarity index 100% rename from fastembed/common/model_description.py rename to fastembed_bio/common/model_description.py diff --git a/fastembed/common/model_management.py b/fastembed_bio/common/model_management.py similarity index 99% rename from fastembed/common/model_management.py rename to fastembed_bio/common/model_management.py index 5301def80..a77ac053e 100644 --- a/fastembed/common/model_management.py +++ b/fastembed_bio/common/model_management.py @@ -17,7 +17,7 @@ ) from loguru import logger from tqdm import tqdm -from fastembed.common.model_description import BaseModelDescription +from fastembed_bio.common.model_description import BaseModelDescription T = TypeVar("T", bound=BaseModelDescription) diff --git a/fastembed/common/onnx_model.py b/fastembed_bio/common/onnx_model.py similarity index 98% rename from fastembed/common/onnx_model.py rename to fastembed_bio/common/onnx_model.py index d357f2c1f..acc06660c 100644 --- a/fastembed/common/onnx_model.py +++ b/fastembed_bio/common/onnx_model.py @@ -9,8 +9,8 @@ from numpy.typing import NDArray from tokenizers import Tokenizer -from fastembed.common.types import OnnxProvider, NumpyArray, Device -from fastembed.parallel_processor import Worker +from fastembed_bio.common.types import OnnxProvider, NumpyArray, Device +from fastembed_bio.parallel_processor import Worker # Holds type of the embedding result T = TypeVar("T") diff --git a/fastembed/common/preprocessor_utils.py b/fastembed_bio/common/preprocessor_utils.py similarity index 83% rename from fastembed/common/preprocessor_utils.py rename to 
fastembed_bio/common/preprocessor_utils.py index 3b702f799..a9546f8e0 100644 --- a/fastembed/common/preprocessor_utils.py +++ b/fastembed_bio/common/preprocessor_utils.py @@ -4,8 +4,6 @@ from tokenizers import AddedToken, Tokenizer -from fastembed.image.transform.operators import Compose - def load_special_tokens(model_dir: Path) -> dict[str, Any]: tokens_map_path = model_dir / "special_tokens_map.json" @@ -72,13 +70,3 @@ def load_tokenizer(model_dir: Path) -> tuple[Tokenizer, dict[str, int]]: return tokenizer, special_token_to_id - -def load_preprocessor(model_dir: Path) -> Compose: - preprocessor_config_path = model_dir / "preprocessor_config.json" - if not preprocessor_config_path.exists(): - raise ValueError(f"Could not find preprocessor_config.json in {model_dir}") - - with open(str(preprocessor_config_path)) as preprocessor_config_file: - preprocessor_config = json.load(preprocessor_config_file) - transforms = Compose.from_config(preprocessor_config) - return transforms diff --git a/fastembed/common/types.py b/fastembed_bio/common/types.py similarity index 87% rename from fastembed/common/types.py rename to fastembed_bio/common/types.py index 80dee021c..e68d79042 100644 --- a/fastembed/common/types.py +++ b/fastembed_bio/common/types.py @@ -4,7 +4,7 @@ import numpy as np from numpy.typing import NDArray -from PIL import Image + class Device(str, Enum): @@ -14,7 +14,6 @@ class Device(str, Enum): PathInput: TypeAlias = str | Path -ImageInput: TypeAlias = PathInput | Image.Image OnnxProvider: TypeAlias = str | tuple[str, dict[Any, Any]] NumpyArray: TypeAlias = ( diff --git a/fastembed/common/utils.py b/fastembed_bio/common/utils.py similarity index 97% rename from fastembed/common/utils.py rename to fastembed_bio/common/utils.py index b61a8b9ce..a439b9225 100644 --- a/fastembed/common/utils.py +++ b/fastembed_bio/common/utils.py @@ -10,7 +10,7 @@ import numpy as np from numpy.typing import NDArray -from fastembed.common.types import NumpyArray +from 
fastembed_bio.common.types import NumpyArray T = TypeVar("T") diff --git a/fastembed_bio/dna/__init__.py b/fastembed_bio/dna/__init__.py new file mode 100644 index 000000000..1853c99dc --- /dev/null +++ b/fastembed_bio/dna/__init__.py @@ -0,0 +1,3 @@ +from fastembed_bio.dna.embedding import DNAEmbedding + +__all__ = ["DNAEmbedding"] \ No newline at end of file diff --git a/fastembed_bio/dna/embedding.py b/fastembed_bio/dna/embedding.py new file mode 100644 index 000000000..f223ab905 --- /dev/null +++ b/fastembed_bio/dna/embedding.py @@ -0,0 +1,712 @@ +import json +from dataclasses import asdict +from pathlib import Path +from typing import Any, Iterable, Sequence, Type + +import numpy as np +from tokenizers import Tokenizer + +from fastembed_bio.common.inputs import DNAInput +from fastembed_bio.common.model_description import DenseModelDescription, ModelSource +from fastembed_bio.common.model_management import ModelManagement +from fastembed_bio.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext +from fastembed_bio.common.types import Device, NumpyArray, OnnxProvider +from fastembed_bio.common.utils import define_cache_dir, iter_batch, normalize + +# type alias for inputs that can be either strings or DNAInput objects +DNAInputType = str | DNAInput + + +def _normalize_dna_inputs( + inputs: DNAInputType | Iterable[DNAInputType], + default_species: str = "human", +) -> tuple[list[str], list[str]]: + """ + Normalize DNA inputs to lists of sequences and species. + + Handles mixed inputs of strings and DNAInput objects. Strings use + the default_species, DNAInput objects use their own species. 
+ + Args: + inputs: Single input or iterable of inputs (str or DNAInput) + default_species: Species to use for plain string inputs + + Returns: + Tuple of (sequences, species) lists, same length + """ + if isinstance(inputs, str): + return [inputs], [default_species] + if isinstance(inputs, DNAInput): + return [inputs.sequence], [inputs.species] + + sequences: list[str] = [] + species_list: list[str] = [] + + for inp in inputs: + if isinstance(inp, str): + sequences.append(inp) + species_list.append(default_species) + elif isinstance(inp, DNAInput): + sequences.append(inp.sequence) + species_list.append(inp.species) + else: + raise TypeError( + f"Expected str or DNAInput, got {type(inp).__name__}" + ) + + return sequences, species_list + + +supported_dna_models: list[DenseModelDescription] = [ + # DenseModelDescription( + # model="InstaDeepAI/NTv3_650M_post", + # dim=1536, + # description="Nucleotide Transformer v3, 650M parameters, 1536 dimensions, species-conditioned DNA embeddings", + # license="cc-by-nc-sa-4.0", + # size_in_GB=2.6, + # sources=ModelSource(hf="nleroy917/ntv3-650m-post-onnx"), + # model_file="model.onnx", + # additional_files=[ + # "vocab.json", + # "tokenizer_config.json", + # "special_tokens_map.json", + # "species_config.json", + # "model_config.json", + # "model.onnx.data", + # ], + # ), + DenseModelDescription( + model="PoetschLab/GROVER", + dim=768, + description="GROVER DNA foundation model, 768 dimensions, trained on human genome", + license="apache-2.0", + size_in_GB=0.33, + sources=ModelSource(hf="nleroy917/grover-onnx"), + model_file="model.onnx", + additional_files=[ + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "model_config.json", + "model.onnx.data", + ], + ), +] + +# models that require species conditioning +SPECIES_CONDITIONED_MODELS = {"InstaDeepAI/NTv3_650M_post"} + + +def load_dna_tokenizer( + model_dir: Path, + max_length: int = 6144, + requires_species: bool = False, +) -> Tokenizer: + """ + 
Load a DNA tokenizer from model directory. + + Attempts to load in order: + 1. tokenizer.json (standard HuggingFace fast tokenizer format) + 2. Build from vocab.json (fallback for NTv3-style tokenizers) + + Args: + model_dir: Path to model directory containing tokenizer files + max_length: Maximum sequence length (default 6144) + requires_species: Whether the model requires species conditioning (affects padding) + + Returns: + Configured Tokenizer instance + """ + from tokenizers import pre_tokenizers + from tokenizers.models import WordLevel + + tokenizer_json_path = model_dir / "tokenizer.json" + tokenizer_config_path = model_dir / "tokenizer_config.json" + vocab_json_path = model_dir / "vocab.json" + + # read config for settings + pad_token = "[PAD]" + pad_token_id = 0 + if tokenizer_config_path.exists(): + with open(tokenizer_config_path) as f: + config = json.load(f) + config_max_length = config.get("model_max_length", max_length) + if config_max_length and config_max_length <= max_length: + max_length = config_max_length + pad_token = config.get("pad_token", pad_token) + + # try to load tokenizer.json directly (preferred for GROVER and others) + if tokenizer_json_path.exists(): + tokenizer = Tokenizer.from_file(str(tokenizer_json_path)) + tokenizer.enable_truncation(max_length=max_length) + + # NTv3-style models need padding to multiple of 128 + if requires_species: + tokenizer.enable_padding(pad_id=1, pad_token="", pad_to_multiple_of=128) + else: + # standard padding for models like GROVER + tokenizer.enable_padding(pad_id=pad_token_id, pad_token=pad_token) + return tokenizer + + # fall back to building from vocab.json (NTv3 style) + if not vocab_json_path.exists(): + raise ValueError( + f"Could not find tokenizer.json or vocab.json in {model_dir}" + ) + + with open(vocab_json_path) as f: + vocab: dict[str, int] = json.load(f) + + # build tokenizer from vocab + unk_token = "" + pad_token = "" + + tokenizer = Tokenizer(WordLevel(vocab=vocab, 
unk_token=unk_token)) + + # character-level pre-tokenizer (split each character) + tokenizer.pre_tokenizer = pre_tokenizers.Split( + pattern="", behavior="isolated", invert=False + ) + + # no special tokens added for NTv3 (add_special_tokens=False in original) + pad_token_id = vocab.get(pad_token, 1) + tokenizer.enable_padding( + pad_id=pad_token_id, pad_token=pad_token, pad_to_multiple_of=128 + ) + tokenizer.enable_truncation(max_length=max_length) + + return tokenizer + + +def load_species_config(model_dir: Path) -> dict[str, Any] | None: + """ + Load species configuration from model directory if it exists. + + Args: + model_dir: Path to model directory + + Returns: + Dictionary with species_to_id, id_to_species, and supported_species, + or None if the model doesn't use species conditioning. + """ + species_config_path = model_dir / "species_config.json" + if not species_config_path.exists(): + return None + + with open(species_config_path) as f: + return json.load(f) + + +class DNAEmbeddingBase(ModelManagement[DenseModelDescription]): + """ + Base class for DNA sequence embeddings. + """ + + def __init__( + self, + model_name: str, + cache_dir: str | None = None, + threads: int | None = None, + **kwargs: Any, + ): + self.model_name = model_name + self.cache_dir = cache_dir + self.threads = threads + self._local_files_only = kwargs.pop("local_files_only", False) + self._embedding_size: int | None = None + + def embed( + self, + inputs: DNAInputType | Iterable[DNAInputType], + batch_size: int = 32, + parallel: int | None = None, + species: str = "human", + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Embed DNA sequences. + + Args: + inputs: DNA input(s). 
Can be: + - A single sequence string + - A DNAInput object with sequence and species + - An iterable of strings and/or DNAInput objects + batch_size: Batch size for encoding + parallel: Number of parallel workers (None for single-threaded) + species: Default species for plain string inputs (default: "human"). + Ignored for DNAInput objects which specify their own species. + + Yields: + Embeddings as numpy arrays + + Example: + # Simple usage with strings (all use default species) + model.embed(["ATCG", "GCTA"]) + + # Per-sequence species with DNAInput + model.embed([ + DNAInput("ATCG", species="human"), + DNAInput("GCTA", species="mouse"), + ]) + + # Mixed inputs + model.embed(["ATCG", DNAInput("GCTA", species="mouse")]) + """ + raise NotImplementedError() + + @classmethod + def get_embedding_size(cls, model_name: str) -> int: + """ + Returns embedding size of the passed model. + + Args: + model_name: Name of the model + """ + descriptions = cls._list_supported_models() + for description in descriptions: + if description.model.lower() == model_name.lower(): + if description.dim is not None: + return description.dim + raise ValueError(f"Model {model_name} not found") + + @property + def embedding_size(self) -> int: + """ + Returns embedding size for the current model. + """ + if self._embedding_size is None: + self._embedding_size = self.get_embedding_size(self.model_name) + return self._embedding_size + + def list_supported_species(self) -> list[str]: + """ + Returns list of supported species for conditioning. + + Returns: + List of species names + """ + raise NotImplementedError() + + +class OnnxDNAModel(OnnxModel[NumpyArray]): + """ + ONNX model handler for DNA embeddings. 
+ """ + + ONNX_OUTPUT_NAMES: list[str] | None = None + # NTv3 model uses special tokens offset for species IDs + # The actual species_id = base_index + num_species_special_tokens + NUM_SPECIES_SPECIAL_TOKENS: int = 13 + + def __init__(self) -> None: + super().__init__() + self.tokenizer: Tokenizer | None = None + self.species_config: dict[str, Any] | None = None + self._requires_species: bool = False + + def _load_onnx_model( + self, + model_dir: Path, + model_file: str, + threads: int | None, + model_name: str = "", + providers: Sequence[OnnxProvider] | None = None, + cuda: bool | Device = Device.AUTO, + device_id: int | None = None, + extra_session_options: dict[str, Any] | None = None, + ) -> None: + super()._load_onnx_model( + model_dir=model_dir, + model_file=model_file, + threads=threads, + providers=providers, + cuda=cuda, + device_id=device_id, + extra_session_options=extra_session_options, + ) + # check if this model requires species conditioning + self._requires_species = model_name in SPECIES_CONDITIONED_MODELS + self.tokenizer = load_dna_tokenizer(model_dir, requires_species=self._requires_species) + self.species_config = load_species_config(model_dir) + + def _get_species_id(self, species: str) -> int: + """Convert species name to model species ID. + + Args: + species: Species name (e.g., "human", "mouse") + + Returns: + Species ID for the model (includes special token offset) + """ + if self.species_config is None: + raise ValueError("Species config not loaded") + + species_to_id = self.species_config.get("species_to_id", {}) + if species not in species_to_id: + supported = list(species_to_id.keys()) + raise ValueError( + f"Species '{species}' not supported. 
Supported species: {supported}" + ) + + base_id = species_to_id[species] + # add special token offset to get actual model species ID + return base_id + self.NUM_SPECIES_SPECIAL_TOKENS + + def onnx_embed( + self, + sequences: list[str], + species_list: list[str] | None = None, + **kwargs: Any, + ) -> OnnxOutputContext: + """ + Run ONNX inference on DNA sequences. + + Args: + sequences: List of DNA sequences (A, T, C, G, N characters) + species_list: List of species names, one per sequence. If None, + defaults to "human" for all sequences. Ignored for + models that don't support species conditioning. + + Returns: + OnnxOutputContext containing model output and inputs + """ + assert self.tokenizer is not None + + # normalize sequences to uppercase + sequences = [seq.upper() for seq in sequences] + + # tokenize + encoded = self.tokenizer.encode_batch(sequences) + input_ids = np.array([e.ids for e in encoded], dtype=np.int64) + attention_mask = np.array([e.attention_mask for e in encoded], dtype=np.int64) + + # build onnx input based on model requirements + onnx_input: dict[str, NumpyArray] = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + + # add species IDs only for models that require it + if self._requires_species: + if species_list is None: + species_list = ["human"] * len(sequences) + + if len(species_list) != len(sequences): + raise ValueError( + f"species_list length ({len(species_list)}) must match " + f"sequences length ({len(sequences)})" + ) + + species_ids = np.array( + [self._get_species_id(sp) for sp in species_list], dtype=np.int64 + ) + onnx_input["species_ids"] = species_ids + + model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) # type: ignore[union-attr] + + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=attention_mask, + input_ids=input_ids, + ) + + def _post_process_onnx_output( + self, output: OnnxOutputContext, **kwargs: Any + ) -> Iterable[NumpyArray]: + """Convert ONNX output to 
embeddings with mean pooling.""" + embeddings = output.model_output + attention_mask = output.attention_mask + + if attention_mask is None: + raise ValueError("attention_mask is required for mean pooling") + + # Mean pooling over sequence length + mask_expanded = np.expand_dims(attention_mask, axis=-1) + sum_embeddings = np.sum(embeddings * mask_expanded, axis=1) + sum_mask = np.sum(mask_expanded, axis=1) + sum_mask = np.clip(sum_mask, a_min=1e-9, a_max=None) + mean_embeddings = sum_embeddings / sum_mask + + return normalize(mean_embeddings) + + +class OnnxDNAEmbedding(DNAEmbeddingBase, OnnxDNAModel): + """ + ONNX-based DNA embedding implementation. + """ + + @classmethod + def _list_supported_models(cls) -> list[DenseModelDescription]: + return supported_dna_models + + def __init__( + self, + model_name: str = "InstaDeepAI/NTv3_650M_post", + cache_dir: str | None = None, + threads: int | None = None, + providers: Sequence[OnnxProvider] | None = None, + cuda: bool | Device = Device.AUTO, + device_ids: list[int] | None = None, + lazy_load: bool = False, + device_id: int | None = None, + specific_model_path: str | None = None, + **kwargs: Any, + ): + super().__init__(model_name, cache_dir, threads, **kwargs) + self.providers = providers + self.lazy_load = lazy_load + self._extra_session_options = self._select_exposed_session_options(kwargs) + self.device_ids = device_ids + self.cuda = cuda + + self.device_id: int | None = None + if device_id is not None: + self.device_id = device_id + elif self.device_ids is not None: + self.device_id = self.device_ids[0] + + self.model_description = self._get_model_description(model_name) + self.cache_dir = str(define_cache_dir(cache_dir)) + self._specific_model_path = specific_model_path + self._model_dir = self.download_model( + self.model_description, + self.cache_dir, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + ) + + if not self.lazy_load: + self.load_onnx_model() + + def 
load_onnx_model(self) -> None: + self._load_onnx_model( + model_dir=self._model_dir, + model_file=self.model_description.model_file, + threads=self.threads, + model_name=self.model_name, + providers=self.providers, + cuda=self.cuda, + device_id=self.device_id, + extra_session_options=self._extra_session_options, + ) + + def embed( + self, + inputs: DNAInputType | Iterable[DNAInputType], + batch_size: int = 32, + parallel: int | None = None, + species: str = "human", + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Embed DNA sequences. + + Args: + inputs: DNA input(s). Can be: + - A single sequence string + - A DNAInput object with sequence and species + - An iterable of strings and/or DNAInput objects + batch_size: Batch size for encoding + parallel: Number of parallel workers (not yet supported) + species: Default species for plain string inputs (default: "human"). + Ignored for DNAInput objects which specify their own species. + + Yields: + Embeddings as numpy arrays, one per sequence + """ + # normalize inputs to sequences and species lists + sequences, species_list = _normalize_dna_inputs(inputs, default_species=species) + + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() + + # batch sequences and species together + seq_species_pairs = list(zip(sequences, species_list)) + for batch in iter_batch(seq_species_pairs, batch_size): + batch_seqs = [pair[0] for pair in batch] + batch_species = [pair[1] for pair in batch] + yield from self._post_process_onnx_output( + self.onnx_embed(batch_seqs, species_list=batch_species, **kwargs), + **kwargs, + ) + + def list_supported_species(self) -> list[str]: + """ + Returns list of supported species for conditioning. + + Returns an empty list for models that don't support species conditioning. 
+ """ + if not hasattr(self, "_requires_species"): + # model not loaded yet + self.load_onnx_model() + + if not self._requires_species: + return [] + + if self.species_config is None: + return [] + + return self.species_config.get("supported_species", []) + + @classmethod + def _get_worker_class(cls) -> Type["DNAEmbeddingWorker"]: + return DNAEmbeddingWorker + + +class DNAEmbeddingWorker(EmbeddingWorker[NumpyArray]): + """Worker class for parallel DNA embedding processing.""" + + def init_embedding( + self, + model_name: str, + cache_dir: str, + **kwargs: Any, + ) -> OnnxDNAEmbedding: + return OnnxDNAEmbedding( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) + + def process( + self, items: Iterable[tuple[int, Any]] + ) -> Iterable[tuple[int, OnnxOutputContext]]: + for idx, batch in items: + onnx_output = self.model.onnx_embed(batch) + yield idx, onnx_output + + +class DNAEmbedding(DNAEmbeddingBase): + """ + DNA sequence embedding using Nucleotide Transformer v3 and similar models. + + Example: + >>> from fastembed_bio import DNAEmbedding, DNAInput + >>> model = DNAEmbedding("InstaDeepAI/NTv3_650M_post") + + # Simple usage - all sequences use default species ("human") + >>> embeddings = list(model.embed(["ATCGATCGATCG", "GCTAGCTAGCTA"])) + >>> print(embeddings[0].shape) + (1536,) + + # Per-sequence species with DNAInput + >>> embeddings = list(model.embed([ + ... DNAInput("ATCGATCGATCG", species="human"), + ... DNAInput("GCTAGCTAGCTA", species="mouse"), + ... ])) + + The model supports species-conditioned embeddings. Use `list_supported_species()` + to see available species options. + """ + + EMBEDDINGS_REGISTRY: list[Type[DNAEmbeddingBase]] = [OnnxDNAEmbedding] + + @classmethod + def list_supported_models(cls) -> list[dict[str, Any]]: + """Lists the supported models. + + Returns: + list[dict[str, Any]]: A list of dictionaries containing the model information. 
+ """ + return [asdict(model) for model in cls._list_supported_models()] + + @classmethod + def _list_supported_models(cls) -> list[DenseModelDescription]: + result: list[DenseModelDescription] = [] + for embedding in cls.EMBEDDINGS_REGISTRY: + result.extend(embedding._list_supported_models()) + return result + + def __init__( + self, + model_name: str = "InstaDeepAI/NTv3_650M_post", + cache_dir: str | None = None, + threads: int | None = None, + providers: Sequence[OnnxProvider] | None = None, + cuda: bool | Device = Device.AUTO, + device_ids: list[int] | None = None, + lazy_load: bool = False, + **kwargs: Any, + ): + """ + Initialize DNAEmbedding. + + Args: + model_name: Name of the model to use + cache_dir: Path to cache directory + threads: Number of threads for ONNX runtime + providers: ONNX execution providers + cuda: Whether to use CUDA + device_ids: List of device IDs for multi-GPU + lazy_load: Whether to load model lazily + """ + super().__init__(model_name, cache_dir, threads, **kwargs) + + for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: + supported_models = EMBEDDING_MODEL_TYPE._list_supported_models() + if any( + model_name.lower() == model.model.lower() for model in supported_models + ): + self.model = EMBEDDING_MODEL_TYPE( + model_name=model_name, + cache_dir=cache_dir, + threads=threads, + providers=providers, + cuda=cuda, + device_ids=device_ids, + lazy_load=lazy_load, + **kwargs, + ) + return + + raise ValueError( + f"Model {model_name} is not supported in DNAEmbedding. " + "Please check the supported models using `DNAEmbedding.list_supported_models()`" + ) + + def embed( + self, + inputs: DNAInputType | Iterable[DNAInputType], + batch_size: int = 32, + parallel: int | None = None, + species: str = "human", + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """Embed DNA sequences. + + Args: + inputs: DNA input(s). 
Can be: + - A single sequence string + - A DNAInput object with sequence and species + - An iterable of strings and/or DNAInput objects + batch_size: Batch size for encoding + parallel: Number of parallel workers + species: Default species for plain string inputs (default: "human"). + Ignored for DNAInput objects which specify their own species. + + Yields: + Embeddings as numpy arrays, one per sequence + + Example: + # Simple usage with strings (all use default species) + model.embed(["ATCG", "GCTA"]) + + # Per-sequence species with DNAInput + model.embed([ + DNAInput("ATCG", species="human"), + DNAInput("GCTA", species="mouse"), + ]) + """ + yield from self.model.embed( + inputs, batch_size, parallel, species=species, **kwargs + ) + + def list_supported_species(self) -> list[str]: + """ + Returns list of supported species for conditioning. + + Returns: + List of species names (e.g., ["human", "mouse", ...]) + """ + return self.model.list_supported_species() \ No newline at end of file diff --git a/fastembed/parallel_processor.py b/fastembed_bio/parallel_processor.py similarity index 99% rename from fastembed/parallel_processor.py rename to fastembed_bio/parallel_processor.py index bfaaf0c54..5d84eb360 100644 --- a/fastembed/parallel_processor.py +++ b/fastembed_bio/parallel_processor.py @@ -10,7 +10,7 @@ from queue import Empty from typing import Any, Iterable, Type -from fastembed.common.types import Device +from fastembed_bio.common.types import Device # Single item should be processed in less than: processing_timeout = 10 * 60 # seconds diff --git a/fastembed_bio/protein/__init__.py b/fastembed_bio/protein/__init__.py new file mode 100644 index 000000000..6293a5bbe --- /dev/null +++ b/fastembed_bio/protein/__init__.py @@ -0,0 +1,3 @@ +from fastembed_bio.protein.embedding import ProteinEmbedding + +__all__ = ["ProteinEmbedding"] \ No newline at end of file diff --git a/fastembed_bio/protein/embedding.py b/fastembed_bio/protein/embedding.py new file mode 100644 
index 000000000..9623b9f00 --- /dev/null +++ b/fastembed_bio/protein/embedding.py @@ -0,0 +1,459 @@ +import json + +from dataclasses import asdict +from pathlib import Path +from typing import Any, Iterable, Sequence, Type + +import numpy as np + +from tokenizers import Tokenizer, pre_tokenizers, processors +from tokenizers.models import WordLevel + +from fastembed_bio.common.model_description import DenseModelDescription, ModelSource +from fastembed_bio.common.model_management import ModelManagement +from fastembed_bio.common.onnx_model import OnnxModel, OnnxOutputContext, EmbeddingWorker +from fastembed_bio.common.types import NumpyArray, OnnxProvider, Device +from fastembed_bio.common.utils import define_cache_dir, iter_batch, normalize + + +supported_protein_models: list[DenseModelDescription] = [ + DenseModelDescription( + model="facebook/esm2_t12_35M_UR50D", + dim=480, + description="Protein embeddings, ESM-2 35M parameters, 480 dimensions, 1024 max sequence length", + license="mit", + size_in_GB=0.13, + sources=ModelSource(hf="nleroy917/esm2_t12_35M_UR50D-onnx"), + model_file="model.onnx", + additional_files=["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"], + ), +] + + +def load_protein_tokenizer(model_dir: Path, max_length: int = 1024) -> Tokenizer: + """ + Load a protein tokenizer from model directory using HuggingFace tokenizers. + + Attempts to load in order: + 1. tokenizer.json (standard HuggingFace fast tokenizer format) + 2. 
Build from vocab.txt (fallback for models without tokenizer.json) + + Args: + model_dir: Path to model directory containing tokenizer files + max_length: Maximum sequence length (default, can be overridden by config) + + Returns: + Configured Tokenizer instance + """ + tokenizer_json_path = model_dir / "tokenizer.json" + tokenizer_config_path = model_dir / "tokenizer_config.json" + vocab_path = model_dir / "vocab.txt" + + # Try to load tokenizer.json directly (preferred) + if tokenizer_json_path.exists(): + tokenizer = Tokenizer.from_file(str(tokenizer_json_path)) + # Read max_length from config if available + if tokenizer_config_path.exists(): + with open(tokenizer_config_path) as f: + config = json.load(f) + config_max_length = config.get("model_max_length", max_length) + # Cap at reasonable value (transformers defaults can be huge) + if config_max_length <= max_length: + max_length = config_max_length + tokenizer.enable_truncation(max_length=max_length) + return tokenizer + + # Fall back to building from vocab.txt + if not vocab_path.exists(): + raise ValueError( + f"Could not find tokenizer.json or vocab.txt in {model_dir}" + ) + + # Read max_length from config if available + if tokenizer_config_path.exists(): + with open(tokenizer_config_path) as f: + config = json.load(f) + max_length = config.get("model_max_length", max_length) + + vocab: dict[str, int] = {} + with open(vocab_path) as f: + for idx, line in enumerate(f): + token = line.strip() + vocab[token] = idx + + unk_token = "" + cls_token = "" + eos_token = "" + pad_token = "" + + tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token=unk_token)) + + tokenizer.pre_tokenizer = pre_tokenizers.Split( + pattern="", behavior="isolated", invert=False + ) + + cls_token_id = vocab.get(cls_token, 0) + eos_token_id = vocab.get(eos_token, 2) + + tokenizer.post_processor = processors.TemplateProcessing( + single=f"{cls_token}:0 $A:0 {eos_token}:0", + special_tokens=[ + (cls_token, cls_token_id), + (eos_token, 
eos_token_id), + ], + ) + + pad_token_id = vocab.get(pad_token, 1) + tokenizer.enable_padding(pad_id=pad_token_id, pad_token=pad_token) + tokenizer.enable_truncation(max_length=max_length) + + return tokenizer + + +class ProteinEmbeddingBase(ModelManagement[DenseModelDescription]): + def __init__( + self, + model_name: str, + cache_dir: str | None = None, + threads: int | None = None, + **kwargs: Any, + ): + self.model_name = model_name + self.cache_dir = cache_dir + self.threads = threads + self._local_files_only = kwargs.pop("local_files_only", False) + self._embedding_size: int | None = None + + def embed( + self, + sequences: str | Iterable[str], + batch_size: int = 32, + parallel: int | None = None, + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Embed protein sequences. + + Args: + sequences: Single protein sequence or iterable of sequences + batch_size: Batch size for encoding + parallel: Number of parallel workers (None for single-threaded) + + Yields: + Embeddings as numpy arrays + """ + raise NotImplementedError() + + @classmethod + def get_embedding_size(cls, model_name: str) -> int: + """ + Returns embedding size of the passed model. + + Args: + model_name: Name of the model + """ + descriptions = cls._list_supported_models() + for description in descriptions: + if description.model.lower() == model_name.lower(): + if description.dim is not None: + return description.dim + raise ValueError(f"Model {model_name} not found") + + @property + def embedding_size(self) -> int: + """ + Returns embedding size for the current model. + """ + if self._embedding_size is None: + self._embedding_size = self.get_embedding_size(self.model_name) + return self._embedding_size + + +class OnnxProteinModel(OnnxModel[NumpyArray]): + """ + ONNX model handler for protein embeddings. 
+ """ + + ONNX_OUTPUT_NAMES: list[str] | None = None + + def __init__(self) -> None: + super().__init__() + self.tokenizer: Tokenizer | None = None + + def _load_onnx_model( + self, + model_dir: Path, + model_file: str, + threads: int | None, + providers: Sequence[OnnxProvider] | None = None, + cuda: bool | Device = Device.AUTO, + device_id: int | None = None, + extra_session_options: dict[str, Any] | None = None, + ) -> None: + super()._load_onnx_model( + model_dir=model_dir, + model_file=model_file, + threads=threads, + providers=providers, + cuda=cuda, + device_id=device_id, + extra_session_options=extra_session_options, + ) + self.tokenizer = load_protein_tokenizer(model_dir) + + def onnx_embed(self, sequences: list[str], **kwargs: Any) -> OnnxOutputContext: + """ + Run ONNX inference on protein sequences. + + Args: + sequences: List of protein sequences + Returns: + OnnxOutputContext containing model output and inputs + """ + assert self.tokenizer is not None + + sequences = [seq.upper() for seq in sequences] + encoded = self.tokenizer.encode_batch(sequences) + input_ids = np.array([e.ids for e in encoded], dtype=np.int64) + attention_mask = np.array([e.attention_mask for e in encoded], dtype=np.int64) + + input_names = {node.name for node in self.model.get_inputs()} # type: ignore[union-attr] + onnx_input: dict[str, NumpyArray] = { + "input_ids": input_ids, + } + if "attention_mask" in input_names: + onnx_input["attention_mask"] = attention_mask + + model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input) # type: ignore[union-attr] + + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=attention_mask, + input_ids=input_ids, + ) + + def _post_process_onnx_output( + self, output: OnnxOutputContext, **kwargs: Any + ) -> Iterable[NumpyArray]: + """Convert ONNX output to embeddings with mean pooling.""" + embeddings = output.model_output + attention_mask = output.attention_mask + + if attention_mask is None: + raise 
ValueError("attention_mask is required for mean pooling") + + mask_expanded = np.expand_dims(attention_mask, axis=-1) + sum_embeddings = np.sum(embeddings * mask_expanded, axis=1) + sum_mask = np.sum(mask_expanded, axis=1) + sum_mask = np.clip(sum_mask, a_min=1e-9, a_max=None) + mean_embeddings = sum_embeddings / sum_mask + + return normalize(mean_embeddings) + + +class OnnxProteinEmbedding(ProteinEmbeddingBase, OnnxProteinModel): + """ + ONNX-based protein embedding implementation. + """ + + @classmethod + def _list_supported_models(cls) -> list[DenseModelDescription]: + return supported_protein_models + + def __init__( + self, + model_name: str = "facebook/esm2_t12_35M_UR50D", + cache_dir: str | None = None, + threads: int | None = None, + providers: Sequence[OnnxProvider] | None = None, + cuda: bool | Device = Device.AUTO, + device_ids: list[int] | None = None, + lazy_load: bool = False, + device_id: int | None = None, + specific_model_path: str | None = None, + **kwargs: Any, + ): + super().__init__(model_name, cache_dir, threads, **kwargs) + self.providers = providers + self.lazy_load = lazy_load + self._extra_session_options = self._select_exposed_session_options(kwargs) + self.device_ids = device_ids + self.cuda = cuda + + self.device_id: int | None = None + if device_id is not None: + self.device_id = device_id + elif self.device_ids is not None: + self.device_id = self.device_ids[0] + + self.model_description = self._get_model_description(model_name) + self.cache_dir = str(define_cache_dir(cache_dir)) + self._specific_model_path = specific_model_path + self._model_dir = self.download_model( + self.model_description, + self.cache_dir, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + ) + + if not self.lazy_load: + self.load_onnx_model() + + def load_onnx_model(self) -> None: + self._load_onnx_model( + model_dir=self._model_dir, + model_file=self.model_description.model_file, + threads=self.threads, + 
providers=self.providers, + cuda=self.cuda, + device_id=self.device_id, + extra_session_options=self._extra_session_options, + ) + + def embed( + self, + sequences: str | Iterable[str], + batch_size: int = 32, + parallel: int | None = None, + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Embed protein sequences. + + Args: + sequences: Single protein sequence or iterable of sequences (amino acid strings) + batch_size: Batch size for encoding + parallel: Number of parallel workers (not yet supported) + + Yields: + Embeddings as numpy arrays, one per sequence + """ + if isinstance(sequences, str): + sequences = [sequences] + + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() + + for batch in iter_batch(sequences, batch_size): + yield from self._post_process_onnx_output(self.onnx_embed(batch, **kwargs), **kwargs) + + @classmethod + def _get_worker_class(cls) -> Type["ProteinEmbeddingWorker"]: + return ProteinEmbeddingWorker + + +class ProteinEmbeddingWorker(EmbeddingWorker[NumpyArray]): + def init_embedding( + self, + model_name: str, + cache_dir: str, + **kwargs: Any, + ) -> OnnxProteinEmbedding: + return OnnxProteinEmbedding( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) + + def process( + self, items: Iterable[tuple[int, Any]] + ) -> Iterable[tuple[int, OnnxOutputContext]]: + for idx, batch in items: + onnx_output = self.model.onnx_embed(batch) + yield idx, onnx_output + + +class ProteinEmbedding(ProteinEmbeddingBase): + """ + Protein sequence embedding using ESM-2 and similar models. 
+ + Example: + >>> from fastembed_bio import ProteinEmbedding + >>> model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D") + >>> embeddings = list(model.embed(["MKTVRQERLKS", "GKGDPKKPRGKM"])) + >>> print(embeddings[0].shape) + (480,) + """ + + EMBEDDINGS_REGISTRY: list[Type[ProteinEmbeddingBase]] = [OnnxProteinEmbedding] + + @classmethod + def list_supported_models(cls) -> list[dict[str, Any]]: + """Lists the supported models. + + Returns: + list[dict[str, Any]]: A list of dictionaries containing the model information. + """ + return [asdict(model) for model in cls._list_supported_models()] + + @classmethod + def _list_supported_models(cls) -> list[DenseModelDescription]: + result: list[DenseModelDescription] = [] + for embedding in cls.EMBEDDINGS_REGISTRY: + result.extend(embedding._list_supported_models()) + return result + + def __init__( + self, + model_name: str = "facebook/esm2_t12_35M_UR50D", + cache_dir: str | None = None, + threads: int | None = None, + providers: Sequence[OnnxProvider] | None = None, + cuda: bool | Device = Device.AUTO, + device_ids: list[int] | None = None, + lazy_load: bool = False, + **kwargs: Any, + ): + """ + Initialize ProteinEmbedding. 
+ + Args: + model_name: Name of the model to use + cache_dir: Path to cache directory + threads: Number of threads for ONNX runtime + providers: ONNX execution providers + cuda: Whether to use CUDA + device_ids: List of device IDs for multi-GPU + lazy_load: Whether to load model lazily + """ + super().__init__(model_name, cache_dir, threads, **kwargs) + + for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY: + supported_models = EMBEDDING_MODEL_TYPE._list_supported_models() + if any(model_name.lower() == model.model.lower() for model in supported_models): + self.model = EMBEDDING_MODEL_TYPE( + model_name=model_name, + cache_dir=cache_dir, + threads=threads, + providers=providers, + cuda=cuda, + device_ids=device_ids, + lazy_load=lazy_load, + **kwargs, + ) + return + + raise ValueError( + f"Model {model_name} is not supported in ProteinEmbedding. " + "Please check the supported models using `ProteinEmbedding.list_supported_models()`" + ) + + def embed( + self, + sequences: str | Iterable[str], + batch_size: int = 32, + parallel: int | None = None, + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """Embed protein sequences. 
+ + Args: + sequences: Single protein sequence or iterable of sequences (amino acid strings) + batch_size: Batch size for encoding + parallel: Number of parallel workers + + Yields: + Embeddings as numpy arrays, one per sequence + """ + yield from self.model.embed(sequences, batch_size, parallel, **kwargs) \ No newline at end of file diff --git a/fastembed/py.typed b/fastembed_bio/py.typed similarity index 100% rename from fastembed/py.typed rename to fastembed_bio/py.typed diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index be5b7a69c..000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,94 +0,0 @@ -site_name: FastEmbed -site_url: https://qdrant.github.io/fastembed/ -site_author: Nirant Kasliwal -repo_url: https://github.com/qdrant/fastembed/ -repo_name: qdrant/fastembed - -remote_branch: gh-pages -remote_name: origin - -copyright: | - Maintained by Qdrant. Originally created by Nirant Kasliwal. - -theme: - name: material - logo: assets/favicon.png - favicon: assets/favicon.png - custom_dir: docs/overrides - icon: - repo: fontawesome/brands/github - features: - - search.suggest - - search.highlight - - navigation.instant - - navigation.tracking - - navigation.expand - - navigation.sections - - content.code.annotate - - toc.follow - - header.autohide - - announce.dismiss - accent: - # Primary color - color: "#3f51b5" - # Text color for primary color - text: "#ffffff" - - palette: - # Palette toggle for light mode - - scheme: default - toggle: - icon: material/brightness-7 - name: Switch to dark mode - - # Palette toggle for dark mode - - scheme: slate - toggle: - icon: material/brightness-4 - name: Switch to light mode - -markdown_extensions: - - abbr - - admonition - - attr_list - # - highlight - - def_list - - toc: - permalink: true - toc_depth: 3 - -plugins: - - search - - mkdocstrings: - default_handler: python - handlers: - python: - options: - show_source: false - show_bases: false - show_if_no_docstring: true - merge_init_into_class: true - 
show_root_toc_entry: false - show_inheritance: true - show_private: false - show_special_members: false - - mknotebooks: - execute: false - timeout: 100 - allow_errors: false - tag_remove_configs: - remove_cell_tags: - - Remove_cell - remove_all_outputs_tags: - - Remove_all_output - remove_single_output_tags: - - Remove_single_output - remove_input_tags: - - Remove_input - -markdown_extensions: - - pymdownx.superfences: - custom_fences: - - name: mermaid - class: mermaid - format: !!python/name:pymdownx.superfences.fence_code_format diff --git a/poetry.lock b/poetry.lock index d6f688a61..e2f0ce785 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.1 and should not be changed by hand. [[package]] name = "anyio" @@ -1275,7 +1275,7 @@ fqdn = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} idna = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} isoduration = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} jsonpointer = {version = ">1.13", optional = true, markers = "extra == \"format-nongpl\""} -jsonschema-specifications = ">=2023.03.6" +jsonschema-specifications = ">=2023.3.6" referencing = ">=0.28.4" rfc3339-validator = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} rfc3986-validator = {version = ">0.1.0", optional = true, markers = "extra == \"format-nongpl\""} @@ -1640,7 +1640,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] -dev = ["Sphinx (==7.2.5) ; python_version >= \"3.9\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; 
python_version < \"3.8\"", "freezegun (==1.2.2) ; python_version >= \"3.8\"", "mypy (==v0.910) ; python_version < \"3.6\"", "mypy (==v0.971) ; python_version == \"3.6\"", "mypy (==v1.4.1) ; python_version == \"3.7\"", "mypy (==v1.5.1) ; python_version >= \"3.8\"", "pre-commit (==3.4.0) ; python_version >= \"3.8\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==7.4.0) ; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==4.1.0) ; python_version >= \"3.8\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.0.0) ; python_version >= \"3.8\"", "sphinx-autobuild (==2021.3.14) ; python_version >= \"3.9\"", "sphinx-rtd-theme (==1.3.0) ; python_version >= \"3.9\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.11.0) ; python_version >= \"3.8\""] +dev = ["Sphinx (==7.2.5) ; python_version >= \"3.9\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; python_version < \"3.8\"", "freezegun (==1.2.2) ; python_version >= \"3.8\"", "mypy (==0.910) ; python_version < \"3.6\"", "mypy (==0.971) ; python_version == \"3.6\"", "mypy (==1.4.1) ; python_version == \"3.7\"", "mypy (==1.5.1) ; python_version >= \"3.8\"", "pre-commit (==3.4.0) ; python_version >= \"3.8\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==7.4.0) ; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==4.1.0) ; python_version >= \"3.8\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.0.0) ; python_version >= \"3.8\"", "sphinx-autobuild (==2021.3.14) ; python_version >= \"3.9\"", "sphinx-rtd-theme (==1.3.0) ; python_version >= \"3.9\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.11.0) ; 
python_version >= \"3.8\""] [[package]] name = "loguru" @@ -1660,7 +1660,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] -dev = ["Sphinx (==8.1.3) ; python_version >= \"3.11\"", "build (==1.2.2) ; python_version >= \"3.11\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; python_version < \"3.8\"", "freezegun (==1.5.0) ; python_version >= \"3.8\"", "mypy (==v0.910) ; python_version < \"3.6\"", "mypy (==v0.971) ; python_version == \"3.6\"", "mypy (==v1.13.0) ; python_version >= \"3.8\"", "mypy (==v1.4.1) ; python_version == \"3.7\"", "myst-parser (==4.0.0) ; python_version >= \"3.11\"", "pre-commit (==4.0.1) ; python_version >= \"3.9\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==8.3.2) ; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==5.0.0) ; python_version == \"3.8\"", "pytest-cov (==6.0.0) ; python_version >= \"3.9\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.1.0) ; python_version >= \"3.8\"", "sphinx-rtd-theme (==3.0.2) ; python_version >= \"3.11\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.23.2) ; python_version >= \"3.8\"", "twine (==6.0.1) ; python_version >= \"3.11\""] +dev = ["Sphinx (==8.1.3) ; python_version >= \"3.11\"", "build (==1.2.2) ; python_version >= \"3.11\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; python_version < \"3.8\"", "freezegun (==1.5.0) ; python_version >= \"3.8\"", "mypy (==0.910) ; python_version < \"3.6\"", "mypy (==0.971) ; python_version 
== \"3.6\"", "mypy (==1.13.0) ; python_version >= \"3.8\"", "mypy (==1.4.1) ; python_version == \"3.7\"", "myst-parser (==4.0.0) ; python_version >= \"3.11\"", "pre-commit (==4.0.1) ; python_version >= \"3.9\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==8.3.2) ; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==5.0.0) ; python_version == \"3.8\"", "pytest-cov (==6.0.0) ; python_version >= \"3.9\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.1.0) ; python_version >= \"3.8\"", "sphinx-rtd-theme (==3.0.2) ; python_version >= \"3.11\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.23.2) ; python_version >= \"3.8\"", "twine (==6.0.1) ; python_version >= \"3.11\""] [[package]] name = "markdown" @@ -2749,7 +2749,7 @@ version = "11.3.0" description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.9" -groups = ["main", "docs"] +groups = ["docs"] files = [ {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"}, {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"}, @@ -4034,6 +4034,36 @@ typing-extensions = ">=3.7.4.3" [package.extras] standard = ["rich (>=10.11.0)", "shellingham (>=1.3.0)"] +[[package]] +name = "types-requests" +version = "2.32.4.20260107" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.9" +groups = ["types"] +files = [ + {file = "types_requests-2.32.4.20260107-py3-none-any.whl", hash = "sha256:b703fe72f8ce5b31ef031264fe9395cac8f46a04661a79f7ed31a80fb308730d"}, + {file = "types_requests-2.32.4.20260107.tar.gz", hash = "sha256:018a11ac158f801bfa84857ddec1650750e393df8a004a8a9ae2a9bec6fcb24f"}, +] + +[package.dependencies] +urllib3 = ">=2" + +[[package]] +name = "types-tqdm" +version = 
"4.67.0.20250809" +description = "Typing stubs for tqdm" +optional = false +python-versions = ">=3.9" +groups = ["types"] +files = [ + {file = "types_tqdm-4.67.0.20250809-py3-none-any.whl", hash = "sha256:1a73053b31fcabf3c1f3e2a9d5ecdba0f301bde47a418cd0e0bdf774827c5c57"}, + {file = "types_tqdm-4.67.0.20250809.tar.gz", hash = "sha256:02bf7ab91256080b9c4c63f9f11b519c27baaf52718e5fdab9e9606da168d500"}, +] + +[package.dependencies] +types-requests = "*" + [[package]] name = "typing-extensions" version = "4.15.0" @@ -4080,7 +4110,7 @@ version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" -groups = ["main", "dev", "docs"] +groups = ["main", "dev", "docs", "types"] files = [ {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, @@ -4229,4 +4259,4 @@ dev = ["black (>=19.3b0) ; python_version >= \"3.6\"", "pytest (>=4.6.2)"] [metadata] lock-version = "2.1" python-versions = ">=3.10.0" -content-hash = "cdceed9de4790023c96f06433d9e31c3b72c4562eed63c46f08e7388efb9b349" +content-hash = "6d5e0178e8f32fc7a73b901dda68f51e8bfbd27d0738b73e256a5229c5eb63d8" diff --git a/pyproject.toml b/pyproject.toml index 3b72a3053..826ccbf22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,14 @@ [tool.poetry] -name = "fastembed" -version = "0.7.4" -description = "Fast, light, accurate library built for retrieval embedding generation" -authors = ["Qdrant Team ", "NirantK "] +name = "fastembed-bio" +version = "0.1.2" +description = "Fast, light, accurate library for biological sequence embeddings (proteins, DNA, RNA)" +authors = ["Nathan LeRoy "] license = "Apache License" readme = "README.md" -packages = [{include = "fastembed"}] -homepage = "https://github.com/qdrant/fastembed" -repository = 
"https://github.com/qdrant/fastembed" -keywords = ["vector", "embedding", "neural", "search", "qdrant", "sentence-transformers"] +packages = [{include = "fastembed_bio"}] +homepage = "https://github.com/nleroy917/fastembed-bio" +repository = "https://github.com/nleroy917/fastembed-bio" +keywords = ["vector", "embedding", "protein", "esm2", "bioinformatics", "onnx", "biological-sequences"] [tool.poetry.dependencies] python = ">=3.10.0" @@ -28,11 +28,6 @@ requests = "^2.31" tokenizers = ">=0.15,<1.0" huggingface-hub = ">=0.20,<2.0" loguru = "^0.7.2" -pillow = [ - { version = ">=10.3.0,<11.0", python = "<3.10" }, - { version = ">=10.3.0,<12.0", python = ">=3.10,<3.13" }, - { version = ">=11.0.0,<12.0", python = ">=3.13" }, -] mmh3 = ">=4.1.0,<6.0.0" py-rust-stemmers = "^0.1.0" @@ -58,6 +53,8 @@ mknotebooks = "^0.8.0" [tool.poetry.group.types.dependencies] pyright = ">=1.1.293" mypy = "^1.0.0" +types-requests = "^2.31" +types-tqdm = "^4.66" [build-system] requires = ["poetry-core"] @@ -66,5 +63,8 @@ build-backend = "poetry.core.masonry.api" [tool.pyright] typeCheckingMode = "strict" +[tool.mypy] +ignore_missing_imports = true + [tool.ruff] line-length = 99 diff --git a/scripts/onnx_conversion/convert_grover_to_onnx.py b/scripts/onnx_conversion/convert_grover_to_onnx.py new file mode 100644 index 000000000..177ba17fe --- /dev/null +++ b/scripts/onnx_conversion/convert_grover_to_onnx.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +""" +Convert GROVER (PoetschLab/GROVER) model to ONNX format for fastembed-bio. 
+ +Usage: + python scripts/onnx_conversion/convert_grover_to_onnx.py \ + --model PoetschLab/GROVER \ + --output ./converted_models/grover-onnx + +Requirements: + pip install torch transformers onnx onnxruntime +""" +import argparse +import json + +from pathlib import Path + +import torch +import numpy as np + +from transformers import AutoTokenizer, AutoModel + + +def convert_to_onnx( + model_name: str, + output_dir: str, + opset_version: int = 18, + max_length: int = 512, +) -> Path: + """ + Convert GROVER model to ONNX format. + + Args: + model_name: HuggingFace model identifier (e.g., PoetschLab/GROVER) + output_dir: Directory to save ONNX model and tokenizer files + opset_version: ONNX opset version (default 18) + max_length: Maximum sequence length for dummy input + + Returns: + Path to output directory + """ + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + print(f"Loading model: {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModel.from_pretrained(model_name, trust_remote_code=True) + model.eval() + + # create dummy input + dummy_sequence = "ATCG" * (max_length // 4) + print(f"Creating dummy input with sequence length: {len(dummy_sequence)}") + + batch = tokenizer( + [dummy_sequence], + return_tensors="pt", + padding="max_length", + max_length=max_length, + truncation=True, + ) + + print(f"Input shapes - input_ids: {batch['input_ids'].shape}, attention_mask: {batch['attention_mask'].shape}") + + # export to ONNX + print("Exporting to ONNX...") + onnx_path = output_path / "model.onnx" + + model.eval() + + with torch.no_grad(): + torch.onnx.export( + model, + (batch["input_ids"], batch["attention_mask"]), + onnx_path, + input_names=["input_ids", "attention_mask"], + output_names=["embeddings"], + dynamic_axes={ + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + "embeddings": {0: "batch_size", 1: 
"sequence_length"}, + }, + opset_version=opset_version, + do_constant_folding=True, + ) + + print(f"ONNX model saved to: {onnx_path}") + + # save tokenizer files + print("Saving tokenizer...") + tokenizer.save_pretrained(output_path) + + # save model config info + model_config = { + "hidden_size": model.config.hidden_size, + "model_type": "grover", + "source_model": model_name, + "output_type": "embeddings_only", + } + model_config_path = output_path / "model_config.json" + with open(model_config_path, "w") as f: + json.dump(model_config, f, indent=2) + print(f"Model config saved to: {model_config_path}") + + return output_path + + +def validate_onnx_model( + model_name: str, + onnx_path: str, +): + """ + Validate the ONNX model by checking its structure and running a test inference. + + Args: + model_name: HuggingFace model identifier + onnx_path: Path to the ONNX model file + """ + import onnxruntime as ort + + model = AutoModel.from_pretrained(model_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model.eval() + + session = ort.InferenceSession(onnx_path, providers=['CPUExecutionProvider']) + test_seq = 'ATCGATCGATCGATCG' * 8 + batch = tokenizer([test_seq], return_tensors='pt', padding=True, truncation=True, max_length=512) + + with torch.no_grad(): + pt_out = model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + return_dict=True, + ) + + onnx_out = session.run(None, { + 'input_ids': batch['input_ids'].numpy().astype(np.int64), + 'attention_mask': batch['attention_mask'].numpy().astype(np.int64), + })[0] + + pt_emb = pt_out.last_hidden_state.numpy() + print(f'PyTorch: {pt_emb.shape}, ONNX: {onnx_out.shape}') + print(f'Max diff: {np.abs(pt_emb - onnx_out).max():.2e}') + print('โœ“ Match!' 
if np.allclose(pt_emb, onnx_out, atol=1e-3) else 'โœ— Mismatch') + + +def main(): + parser = argparse.ArgumentParser( + description="Convert GROVER model to ONNX format for fastembed-bio" + ) + parser.add_argument( + "--model", + type=str, + default="PoetschLab/GROVER", + help="HuggingFace model identifier", + ) + parser.add_argument( + "--output", + type=str, + default="./converted_models/grover-onnx", + help="Output directory for ONNX model", + ) + parser.add_argument( + "--opset", + type=int, + default=18, + help="ONNX opset version (default: 18)", + ) + parser.add_argument( + "--max-length", + type=int, + default=512, + help="Max sequence length for export", + ) + parser.add_argument( + "--validate", + action="store_true", + help="Validate the exported ONNX model", + ) + + args = parser.parse_args() + output_path = convert_to_onnx(args.model, args.output, args.opset, args.max_length) + + if args.validate: + validate_onnx_model(args.model, str(output_path / "model.onnx")) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/config.py b/tests/config.py deleted file mode 100644 index 3b172e8dd..000000000 --- a/tests/config.py +++ /dev/null @@ -1,4 +0,0 @@ -from pathlib import Path - -TEST_DIR = Path(__file__).parent -TEST_MISC_DIR = TEST_DIR / "misc" diff --git a/tests/misc/image.jpeg b/tests/misc/image.jpeg deleted file mode 100644 index e131e8ecd..000000000 Binary files a/tests/misc/image.jpeg and /dev/null differ diff --git a/tests/misc/small_image.jpeg b/tests/misc/small_image.jpeg deleted file mode 100644 index 2e0250aba..000000000 Binary files a/tests/misc/small_image.jpeg and /dev/null differ diff --git a/tests/profiling.py b/tests/profiling.py deleted file mode 100644 index 1edf3b047..000000000 --- a/tests/profiling.py +++ /dev/null @@ -1,148 +0,0 @@ -# %% [markdown] -# # ๐Ÿค— Huggingface vs โšก FastEmbed๏ธ -# -# Comparing the performance of Huggingface's ๐Ÿค— Transformers and โšก FastEmbed๏ธ on a simple task on the 
following machine: Apple M2 Max, 32 GB RAM -# -# ## ๐Ÿ“ฆ Imports -# -# Importing the necessary libraries for this comparison. - -# %% -import time -from typing import Callable - -import matplotlib.pyplot as plt -import torch.nn.functional as F -from transformers import AutoModel, AutoTokenizer - -from fastembed.embedding import DefaultEmbedding - -# %% [markdown] -# ## ๐Ÿ“– Data -# -# data is a list of strings, each string is a document. - -# %% -documents: list[str] = [ - "Chandrayaan-3 is India's third lunar mission", - "It aimed to land a rover on the Moon's surface - joining the US, China and Russia", - "The mission is a follow-up to Chandrayaan-2, which had partial success", - "Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)", - "The estimated cost of the mission is around $35 million", - "It will carry instruments to study the lunar surface and atmosphere", - "Chandrayaan-3 landed on the Moon's surface on 23rd August 2023", - "It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.", - "The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit", - "The mission used GSLV Mk III rocket for its launch", - "Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota", - "Chandrayaan-3 was launched earlier in the year 2023", -] -len(documents) - -# %% [markdown] -# ## Setting up ๐Ÿค— Huggingface -# -# We'll be using the [Huggingface Transformers](https://huggingface.co/transformers/) with PyTorch library to generate embeddings. We'll be using the same model across both libraries for a fair(er?) comparison. 
- - -# %% -class HF: - """ - HuggingFace Transformer implementation of FlagEmbedding - Based on https://huggingface.co/BAAI/bge-base-en - """ - - def __init__(self, model_id: str): - self.model = AutoModel.from_pretrained(model_id) - self.tokenizer = AutoTokenizer.from_pretrained(model_id) - - def embed(self, texts: list[str]): - encoded_input = self.tokenizer( - texts, max_length=512, padding=True, truncation=True, return_tensors="pt" - ) - model_output = self.model(**encoded_input) - sentence_embeddings = model_output[0][:, 0] - sentence_embeddings = F.normalize(sentence_embeddings) - return sentence_embeddings - - -hf = HF(model_id="BAAI/bge-small-en") -hf.embed(documents).shape - -# %% [markdown] -# ## Setting up โšก๏ธFastEmbed -# -# Sorry, don't have a lot to set up here. We'll be using the default model, which is Flag Embedding, same as the Huggingface model. - -# %% -embedding_model = DefaultEmbedding() - -# %% [markdown] -# ## ๐Ÿ“Š Comparison -# -# We'll be comparing the following metrics: Minimum, Maximum, Mean, across k runs. 
Let's write a function to do that: -# -# ### ๐Ÿš€ Calculating Stats - - -# %% -def calculate_time_stats( - embed_func: Callable, documents: list, k: int -) -> tuple[float, float, float]: - times = [] - for _ in range(k): - # Timing the embed_func call - start_time = time.time() - embed_func(documents) - end_time = time.time() - - times.append(end_time - start_time) - - # Returning mean, max, and min time for the call - return (sum(times) / k, max(times), min(times)) - - -# %% -hf_stats = calculate_time_stats(hf.embed, documents, k=2) -print(f"Huggingface Transformers (Average, Max, Min): {hf_stats}") -fst_stats = calculate_time_stats(lambda x: list(embedding_model.embed(x)), documents, k=2) -print(f"FastEmbed (Average, Max, Min): {fst_stats}") - - -# %% -def plot_character_per_second_comparison( - hf_stats: tuple[float, float, float], - fst_stats: tuple[float, float, float], - documents: list, -): - # Calculating total characters in documents - total_characters = sum(len(doc) for doc in documents) - - # Calculating characters per second for each model - hf_chars_per_sec = total_characters / hf_stats[0] # Mean time is at index 0 - fst_chars_per_sec = total_characters / fst_stats[0] - - # Plotting the bar chart - models = ["HF Embed (Torch)", "FastEmbed"] - chars_per_sec = [hf_chars_per_sec, fst_chars_per_sec] - - bars = plt.bar(models, chars_per_sec, color=["#1f356c", "#dd1f4b"]) - plt.ylabel("Characters per Second") - plt.title("Characters Processed per Second Comparison") - - # Adding the number at the top of each bar - for bar, chars in zip(bars, chars_per_sec): - plt.text( - bar.get_x() + bar.get_width() / 2, - bar.get_height(), - f"{chars:.1f}", - ha="center", - va="bottom", - color="#1f356c", - fontsize=12, - ) - - plt.show() - - -plot_character_per_second_comparison(hf_stats, fst_stats, documents) diff --git a/tests/test_attention_embeddings.py b/tests/test_attention_embeddings.py deleted file mode 100644 index 8f62af435..000000000 --- 
a/tests/test_attention_embeddings.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -from contextlib import contextmanager - -import numpy as np -import pytest - -from fastembed import SparseTextEmbedding -from tests.utils import delete_model_cache - - -_MODELS_TO_CACHE = ("Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25") -MODELS_TO_CACHE = tuple([x.lower() for x in _MODELS_TO_CACHE]) - - -@pytest.fixture(scope="module") -def model_cache(): - is_ci = os.getenv("CI") - cache = {} - - @contextmanager - def get_model(model_name: str): - lowercase_model_name = model_name.lower() - if lowercase_model_name not in cache: - cache[lowercase_model_name] = SparseTextEmbedding(lowercase_model_name) - yield cache[lowercase_model_name] - if lowercase_model_name not in MODELS_TO_CACHE: - print("deleting model") - model_inst = cache.pop(lowercase_model_name) - if is_ci: - delete_model_cache(model_inst.model._model_dir) - del model_inst - - yield get_model - - if is_ci: - for name, model in cache.items(): - delete_model_cache(model.model._model_dir) - cache.clear() - - -@pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"]) -def test_attention_embeddings(model_cache, model_name: str) -> None: - with model_cache(model_name) as model: - output = list( - model.query_embed( - [ - "I must not fear. Fear is the mind-killer.", - ] - ) - ) - - assert len(output) == 1 - - for result in output: - assert len(result.indices) == len(result.values) - assert np.allclose(result.values, np.ones(len(result.values))) - - quotes = [ - "I must not fear. Fear is the mind-killer.", - "All animals are equal, but some animals are more equal than others.", - "It was a pleasure to burn.", - "The sky above the port was the color of television, tuned to a dead channel.", - "In the beginning, the universe was created." 
- " This has made a lot of people very angry and been widely regarded as a bad move.", - "It's a truth universally acknowledged that a zombie in possession of brains must be in want of more brains.", - "War is peace. Freedom is slavery. Ignorance is strength.", - "We're not in Infinity; we're in the suburbs.", - "I was a thousand times more evil than thou!", - "History is merely a list of surprises... It can only prepare us to be surprised yet again.", - ".", # Empty string - ] - - output = list(model.embed(quotes)) - - assert len(output) == len(quotes) - - for result in output[:-1]: - assert len(result.indices) == len(result.values) - assert len(result.indices) > 0 - - assert len(output[-1].indices) == 0 - - # Test support for unknown languages - output = list( - model.query_embed( - [ - "ะฟั€ะธะฒะตั‚ ะผะธั€!", - ] - ) - ) - - assert len(output) == 1 - - for result in output: - assert len(result.indices) == len(result.values) - assert len(result.indices) == 2 - - -@pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"]) -def test_parallel_processing(model_cache, model_name: str) -> None: - with model_cache(model_name) as model: - docs = [ - "hello world", - "attention embedding", - "Mangez-vous vraiment des grenouilles?", - ] * 100 - embeddings = list(model.embed(docs, batch_size=10, parallel=2)) - - embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None)) - - embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0)) - - assert len(embeddings) == len(docs) - - for emb_1, emb_2, emb_3 in zip(embeddings, embeddings_2, embeddings_3): - assert np.allclose(emb_1.indices, emb_2.indices) - assert np.allclose(emb_1.indices, emb_3.indices) - assert np.allclose(emb_1.values, emb_2.values) - assert np.allclose(emb_1.values, emb_3.values) - - -@pytest.mark.parametrize("model_name", ["Qdrant/bm25"]) -def test_multilanguage(model_cache, model_name: str) -> None: - docs = ["Mangez-vous vraiment des grenouilles?", "Je 
suis au lit"] - - model = SparseTextEmbedding(model_name=model_name, language="french") - embeddings = list(model.embed(docs))[:2] - assert embeddings[0].values.shape == (3,) - assert embeddings[0].indices.shape == (3,) - - assert embeddings[1].values.shape == (1,) - assert embeddings[1].indices.shape == (1,) - - with model_cache(model_name) as model: # language = "english" - embeddings = list(model.embed(docs))[:2] - assert embeddings[0].values.shape == (5,) - assert embeddings[0].indices.shape == (5,) - - assert embeddings[1].values.shape == (4,) - assert embeddings[1].indices.shape == (4,) - - -@pytest.mark.parametrize("model_name", ["Qdrant/bm25"]) -def test_special_characters(model_cache, model_name: str) -> None: - with model_cache(model_name) as model: - docs = [ - "รœber den grรถรŸten Flรผssen ร–sterreichs รคuรŸern sich Experten hรคufig: ร–ko-Systeme mรผssen geschรผtzt werden!", - "L'รฉlรจve franรงais s'รฉcrie : ยซ Oรน est mon crayon ? J'ai besoin de finir cet exercice avant la rรฉcrรฉation!", - "รŽntr-o zi รฎnsoritฤƒ, ศ˜tefan ศ™i Ioana au mรขncat mฤƒmฤƒligฤƒ cu brรขnzฤƒ ศ™i au bฤƒut ศ›uicฤƒ la cabanฤƒ.", - "รœzgรผn รถฤŸretmen รถฤŸrencilere seslendi: Lรผtfen gรผrรผltรผ yapmayฤฑn, sฤฑnavฤฑnฤฑzฤฑ bitirmeye รงalฤฑลŸฤฑyorum!", - "ฮŸ ฮžฮตฮฝฮฟฯ†ฯŽฮฝ ฮตฮฏฯ€ฮต: ยซฮจฮฌฯ‡ฮฝฯ‰ ฮณฮนฮฑ ฮญฮฝฮฑ ฯ‰ฯฮฑฮฏฮฟ ฮดฯŽฯฮฟ ฮณฮนฮฑ ฯ„ฮท ฮณฮนฮฑฮณฮนฮฌ ฮผฮฟฯ…. ฮŠฯƒฯ‰ฯ‚ ฮญฮฝฮฑ ฯ†ฯ…ฯ„ฯŒ ฮฎ ฮญฮฝฮฑ ฮฒฮนฮฒฮปฮฏฮฟ;ยป", - "Hola! ยฟCรณmo estรกs? Estoy muy emocionado por el cumpleaรฑos de mi hermano, ยกva a ser increรญble! 
Tambiรฉn quiero comprar un pastel de chocolate con fresas y un regalo especial: un libro titulado ยซCien aรฑos de soledad", - ] - embeddings = list(model.embed(docs)) - for idx, shape in enumerate([14, 18, 15, 10, 15]): - assert embeddings[idx].values.shape == (shape,) - assert embeddings[idx].indices.shape == (shape,) - - -@pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions"]) -def test_lazy_load(model_name: str) -> None: - model = SparseTextEmbedding(model_name=model_name, lazy_load=True) - assert not hasattr(model.model, "model") - docs = ["hello world", "flag embedding"] - list(model.embed(docs)) - assert hasattr(model.model, "model") - - model = SparseTextEmbedding(model_name=model_name, lazy_load=True) - list(model.query_embed(docs)) - - model = SparseTextEmbedding(model_name=model_name, lazy_load=True) - list(model.passage_embed(docs)) diff --git a/tests/test_common.py b/tests/test_common.py deleted file mode 100644 index 372c2e01f..000000000 --- a/tests/test_common.py +++ /dev/null @@ -1,30 +0,0 @@ -from fastembed import ( - TextEmbedding, - SparseTextEmbedding, - ImageEmbedding, - LateInteractionMultimodalEmbedding, - LateInteractionTextEmbedding, -) - - -def test_text_list_supported_models(): - for model_type in [ - TextEmbedding, - SparseTextEmbedding, - ImageEmbedding, - LateInteractionMultimodalEmbedding, - LateInteractionTextEmbedding, - ]: - supported_models = model_type.list_supported_models() - assert isinstance(supported_models, list) - description = supported_models[0] - assert isinstance(description, dict) - - assert "model" in description and description["model"] - if model_type != SparseTextEmbedding: - assert "dim" in description and description["dim"] - assert "license" in description and description["license"] - assert "size_in_GB" in description and description["size_in_GB"] - assert "model_file" in description and description["model_file"] - assert "sources" in description and description["sources"] - assert 
"hf" in description["sources"] or "url" in description["sources"] diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py deleted file mode 100644 index dcca7d89c..000000000 --- a/tests/test_custom_models.py +++ /dev/null @@ -1,243 +0,0 @@ -import itertools -import os -import numpy as np -import pytest - -from fastembed.common.model_description import ( - PoolingType, - ModelSource, - DenseModelDescription, - BaseModelDescription, -) -from fastembed.common.onnx_model import OnnxOutputContext -from fastembed.common.utils import normalize, mean_pooling -from fastembed.text.custom_text_embedding import CustomTextEmbedding, PostprocessingConfig -from fastembed.rerank.cross_encoder.custom_text_cross_encoder import CustomTextCrossEncoder -from fastembed.rerank.cross_encoder import TextCrossEncoder -from fastembed.text.text_embedding import TextEmbedding -from tests.utils import delete_model_cache - - -@pytest.fixture(autouse=True) -def restore_custom_models_fixture(): - CustomTextEmbedding.SUPPORTED_MODELS = [] - CustomTextCrossEncoder.SUPPORTED_MODELS = [] - yield - CustomTextEmbedding.SUPPORTED_MODELS = [] - CustomTextCrossEncoder.SUPPORTED_MODELS = [] - - -def test_text_custom_model(): - is_ci = os.getenv("CI") - custom_model_name = "intfloat/multilingual-e5-small" - canonical_vector = np.array( - [3.1317e-02, 3.0939e-02, -3.5117e-02, -6.7274e-02, 8.5084e-02], dtype=np.float32 - ) - pooling = PoolingType.MEAN - normalization = True - dim = 384 - size_in_gb = 0.47 - source = ModelSource(hf=custom_model_name) - - TextEmbedding.add_custom_model( - custom_model_name, - pooling=pooling, - normalization=normalization, - sources=source, - dim=dim, - size_in_gb=size_in_gb, - ) - - assert CustomTextEmbedding.SUPPORTED_MODELS[0] == DenseModelDescription( - model=custom_model_name, - sources=source, - model_file="onnx/model.onnx", - description="", - license="", - size_in_GB=size_in_gb, - additional_files=[], - dim=dim, - tasks={}, - ) - assert 
CustomTextEmbedding.POSTPROCESSING_MAPPING[custom_model_name] == PostprocessingConfig( - pooling=pooling, normalization=normalization - ) - - model = TextEmbedding(custom_model_name) - docs = ["hello world", "flag embedding"] - embeddings = list(model.embed(docs)) - embeddings = np.stack(embeddings, axis=0) - assert embeddings.shape == (2, dim) - - assert np.allclose(embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3) - - if is_ci: - delete_model_cache(model.model._model_dir) - - CustomTextEmbedding.SUPPORTED_MODELS.clear() - CustomTextEmbedding.POSTPROCESSING_MAPPING.clear() - - -def test_cross_encoder_custom_model(): - is_ci = os.getenv("CI") - custom_model_name = "Xenova/ms-marco-MiniLM-L-4-v2" - size_in_gb = 0.08 - source = ModelSource(hf=custom_model_name) - canonical_vector = np.array([-5.7170815, -11.112114], dtype=np.float32) - - TextCrossEncoder.add_custom_model( - custom_model_name, - model_file="onnx/model.onnx", - sources=source, - size_in_gb=size_in_gb, - ) - - assert CustomTextCrossEncoder.SUPPORTED_MODELS[0] == BaseModelDescription( - model=custom_model_name, - sources=source, - model_file="onnx/model.onnx", - description="", - license="", - size_in_GB=size_in_gb, - ) - - model = TextCrossEncoder(custom_model_name) - pairs = [ - ("What is AI?", "Artificial intelligence is ..."), - ("What is ML?", "Machine learning is ..."), - ] - scores = list(model.rerank_pairs(pairs)) - - embeddings = np.stack(scores, axis=0) - assert embeddings.shape == (2,) - assert np.allclose(embeddings, canonical_vector, atol=1e-3) - if is_ci: - delete_model_cache(model.model._model_dir) - - CustomTextCrossEncoder.SUPPORTED_MODELS.clear() - - -def test_mock_add_custom_models(): - dim = 5 - size_in_gb = 0.1 - source = ModelSource(hf="artificial") - - num_tokens = 10 - dummy_pooled_embedding = np.random.random((1, dim)).astype(np.float32) - dummy_token_embedding = np.random.random((1, num_tokens, dim)).astype(np.float32) - dummy_attention_mask = np.ones((1, 
num_tokens)).astype(np.int64) - - dummy_token_output = OnnxOutputContext( - model_output=dummy_token_embedding, attention_mask=dummy_attention_mask - ) - dummy_pooled_output = OnnxOutputContext(model_output=dummy_pooled_embedding) - input_data = { - f"{PoolingType.MEAN.lower()}-normalized": dummy_token_output, - f"{PoolingType.MEAN.lower()}": dummy_token_output, - f"{PoolingType.CLS.lower()}-normalized": dummy_token_output, - f"{PoolingType.CLS.lower()}": dummy_token_output, - f"{PoolingType.DISABLED.lower()}-normalized": dummy_pooled_output, - f"{PoolingType.DISABLED.lower()}": dummy_pooled_output, - } - - expected_output = { - f"{PoolingType.MEAN.lower()}-normalized": normalize( - mean_pooling(dummy_token_embedding, dummy_attention_mask) - ), - f"{PoolingType.MEAN.lower()}": mean_pooling(dummy_token_embedding, dummy_attention_mask), - f"{PoolingType.CLS.lower()}-normalized": normalize(dummy_token_embedding[:, 0]), - f"{PoolingType.CLS.lower()}": dummy_token_embedding[:, 0], - f"{PoolingType.DISABLED.lower()}-normalized": normalize(dummy_pooled_embedding), - f"{PoolingType.DISABLED.lower()}": dummy_pooled_embedding, - } - - for pooling, normalization in itertools.product( - (PoolingType.MEAN, PoolingType.CLS, PoolingType.DISABLED), (True, False) - ): - model_name = f"{pooling.name.lower()}{'-normalized' if normalization else ''}" - TextEmbedding.add_custom_model( - model_name, - pooling=pooling, - normalization=normalization, - sources=source, - dim=dim, - size_in_gb=size_in_gb, - ) - - custom_text_embedding = CustomTextEmbedding( - model_name, - lazy_load=True, - specific_model_path="./", # disable model downloading and loading - ) - - post_processed_output = next( - iter(custom_text_embedding._post_process_onnx_output(input_data[model_name])) - ) - assert np.allclose(post_processed_output, expected_output[model_name], atol=1e-3) - - CustomTextEmbedding.SUPPORTED_MODELS.clear() - CustomTextEmbedding.POSTPROCESSING_MAPPING.clear() - - -def 
test_do_not_add_existing_model(): - existing_base_model = "sentence-transformers/all-MiniLM-L6-v2" - custom_model_name = "intfloat/multilingual-e5-small" - - with pytest.raises(ValueError, match=f"Model {existing_base_model} is already registered"): - TextEmbedding.add_custom_model( - existing_base_model, - pooling=PoolingType.MEAN, - normalization=True, - sources=ModelSource(hf=existing_base_model), - dim=384, - size_in_gb=0.47, - ) - - TextEmbedding.add_custom_model( - custom_model_name, - pooling=PoolingType.MEAN, - normalization=False, - sources=ModelSource(hf=existing_base_model), - dim=384, - size_in_gb=0.47, - ) - - with pytest.raises(ValueError, match=f"Model {custom_model_name} is already registered"): - TextEmbedding.add_custom_model( - custom_model_name, - pooling=PoolingType.MEAN, - normalization=True, - sources=ModelSource(hf=custom_model_name), - dim=384, - size_in_gb=0.47, - ) - - CustomTextEmbedding.SUPPORTED_MODELS.clear() - CustomTextEmbedding.POSTPROCESSING_MAPPING.clear() - - -def test_do_not_add_existing_cross_encoder(): - existing_base_model = "Xenova/ms-marco-MiniLM-L-6-v2" - custom_model_name = "Xenova/ms-marco-MiniLM-L-4-v2" - - with pytest.raises(ValueError, match=f"Model {existing_base_model} is already registered"): - TextCrossEncoder.add_custom_model( - existing_base_model, - sources=ModelSource(hf=existing_base_model), - size_in_gb=0.08, - ) - - TextCrossEncoder.add_custom_model( - custom_model_name, - sources=ModelSource(hf=existing_base_model), - size_in_gb=0.08, - ) - - with pytest.raises(ValueError, match=f"Model {custom_model_name} is already registered"): - TextCrossEncoder.add_custom_model( - custom_model_name, - sources=ModelSource(hf=custom_model_name), - size_in_gb=0.08, - ) - - CustomTextCrossEncoder.SUPPORTED_MODELS.clear() diff --git a/tests/test_dna_embeddings.py b/tests/test_dna_embeddings.py new file mode 100644 index 000000000..fca02e28c --- /dev/null +++ b/tests/test_dna_embeddings.py @@ -0,0 +1,153 @@ +import os + 
+import numpy as np +import pytest + +from fastembed_bio import DNAEmbedding, DNAInput +from tests.utils import delete_model_cache + + +# sample DNA sequences for testing +SAMPLE_SEQUENCES = [ + "ATCGATCGATCGATCG" * 8, # 128 nucleotides + "GCTAGCTAGCTAGCTA" * 8, +] + + +@pytest.fixture(scope="module") +def model_fixture(): + """ + Fixture that provides the DNA embedding model and handles cleanup. + """ + is_ci = os.getenv("CI") + model = DNAEmbedding("PoetschLab/GROVER") + yield model + if is_ci: + delete_model_cache(model.model._model_dir) + + +def test_dna_embedding(model_fixture) -> None: + """ + Test basic DNA embedding functionality. + """ + model = model_fixture + dim = 768 # GROVER has 768 dimensions + + embeddings = list(model.embed(SAMPLE_SEQUENCES)) + embeddings_arr = np.stack(embeddings, axis=0) + + assert embeddings_arr.shape == ( + 2, + dim, + ), f"Expected shape (2, {dim}), got {embeddings_arr.shape}" + + # check that embeddings are normalized (L2 norm close to 1) + norms = np.linalg.norm(embeddings_arr, axis=1) + assert np.allclose( + norms, 1.0, atol=1e-5 + ), f"Embeddings should be normalized, got norms: {norms}" + + +def test_dna_embedding_single_sequence(model_fixture) -> None: + """ + Test embedding a single sequence passed as a string. + """ + model = model_fixture + dim = 768 + + # single sequence as string + embedding = list(model.embed("ATCGATCGATCGATCG" * 8)) + assert len(embedding) == 1 + assert embedding[0].shape == (dim,) + + +def test_dna_embedding_size() -> None: + """ + Test get_embedding_size class method. + """ + assert DNAEmbedding.get_embedding_size("PoetschLab/GROVER") == 768 + + +def test_dna_embedding_lazy_load() -> None: + """ + Test lazy loading functionality. 
+ """ + is_ci = os.getenv("CI") + + model = DNAEmbedding("PoetschLab/GROVER", lazy_load=True) + # model should not be loaded yet + assert not hasattr(model.model, "model") or model.model.model is None + + # after embedding, model should be loaded + list(model.embed(SAMPLE_SEQUENCES[:1])) + assert model.model.model is not None + + if is_ci: + delete_model_cache(model.model._model_dir) + + +def test_list_supported_models() -> None: + """ + Test listing supported DNA models. + """ + models = DNAEmbedding.list_supported_models() + assert len(models) > 0 + assert any(m["model"] == "PoetschLab/GROVER" for m in models) + + # check required fields + for model_info in models: + assert "model" in model_info + assert "dim" in model_info + assert "description" in model_info + + +def test_unsupported_model() -> None: + """ + Test that unsupported model raises ValueError. + """ + with pytest.raises(ValueError, match="not supported"): + DNAEmbedding("nonexistent/model") + + +def test_dna_input_basic() -> None: + """ + Test DNAInput dataclass basic functionality. + """ + inp = DNAInput("ATCGATCG", species="human") + assert inp.sequence == "ATCGATCG" + assert inp.species == "human" + + # default species + inp_default = DNAInput("GCTAGCTA") + assert inp_default.species == "human" + + +def test_dna_input_empty_sequence() -> None: + """ + Test that DNAInput rejects empty sequences. + """ + with pytest.raises(ValueError, match="cannot be empty"): + DNAInput("") + + +def test_dna_embedding_batch(model_fixture) -> None: + """ + Test batch embedding with different batch sizes. 
+ """ + model = model_fixture + dim = 768 + + sequences = SAMPLE_SEQUENCES * 5 # 10 sequences + + # test with small batch size + embeddings_small_batch = list(model.embed(sequences, batch_size=2)) + embeddings_small_batch = np.stack(embeddings_small_batch, axis=0) + assert embeddings_small_batch.shape == (len(sequences), dim) + + # test with larger batch size + embeddings_large_batch = list(model.embed(sequences, batch_size=8)) + embeddings_large_batch = np.stack(embeddings_large_batch, axis=0) + assert embeddings_large_batch.shape == (len(sequences), dim) + + # results should be the same regardless of batch size + assert np.allclose(embeddings_small_batch, embeddings_large_batch, atol=1e-5) \ No newline at end of file diff --git a/tests/test_image_onnx_embeddings.py b/tests/test_image_onnx_embeddings.py deleted file mode 100644 index 5ac5e44f1..000000000 --- a/tests/test_image_onnx_embeddings.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -from contextlib import contextmanager -from io import BytesIO - -import numpy as np -import pytest -import requests -from PIL import Image - -from fastembed import ImageEmbedding -from tests.config import TEST_MISC_DIR -from tests.utils import delete_model_cache, should_test_model - -CANONICAL_VECTOR_VALUES = { - "Qdrant/clip-ViT-B-32-vision": np.array([-0.0098, 0.0128, -0.0274, 0.002, -0.0059]), - "Qdrant/resnet50-onnx": np.array( - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01046245, 0.01171397, 0.00705971, 0.0] - ), - "Qdrant/Unicom-ViT-B-16": np.array( - [0.0170, -0.0361, 0.0125, -0.0428, -0.0232, 0.0232, -0.0602, -0.0333, 0.0155, 0.0497] - ), - "Qdrant/Unicom-ViT-B-32": np.array( - [0.0418, 0.0550, 0.0003, 0.0253, -0.0185, 0.0016, -0.0368, -0.0402, -0.0891, -0.0186] - ), - "jinaai/jina-clip-v1": np.array( - [-0.029, 0.0216, 0.0396, 0.0283, -0.0023, 0.0151, 0.011, -0.0235, 0.0251, -0.0343] - ), -} - -_MODELS_TO_CACHE = ("Qdrant/clip-ViT-B-32-vision",) -MODELS_TO_CACHE = tuple([x.lower() for x in _MODELS_TO_CACHE]) - - 
-@pytest.fixture(scope="module") -def model_cache(): - is_ci = os.getenv("CI") - cache = {} - - @contextmanager - def get_model(model_name: str): - lowercase_model_name = model_name.lower() - if lowercase_model_name not in cache: - cache[lowercase_model_name] = ImageEmbedding(lowercase_model_name) - yield cache[lowercase_model_name] - if lowercase_model_name not in MODELS_TO_CACHE: - model_inst = cache.pop(lowercase_model_name) - if is_ci: - delete_model_cache(model_inst.model._model_dir) - del model_inst - - yield get_model - - if is_ci: - for name, model in cache.items(): - delete_model_cache(model.model._model_dir) - cache.clear() - - -@pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"]) -def test_embedding(model_cache, model_name: str) -> None: - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - - for model_desc in ImageEmbedding._list_supported_models(): - if not should_test_model(model_desc, model_name, is_ci, is_manual): - continue - - dim = model_desc.dim - - with model_cache(model_desc.model) as model: - images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR / "small_image.jpeg"), - Image.open((TEST_MISC_DIR / "small_image.jpeg")), - Image.open(BytesIO(requests.get("https://qdrant.tech/img/logo.png").content)), - ] - embeddings = list(model.embed(images)) - embeddings = np.stack(embeddings, axis=0) - assert embeddings.shape == (len(images), dim) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_desc.model] - - assert np.allclose( - embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3 - ), model_desc.model - - assert np.allclose(embeddings[1], embeddings[2]), model_desc.model - - -@pytest.mark.parametrize("n_dims,model_name", [(512, "Qdrant/clip-ViT-B-32-vision")]) -def test_batch_embedding(model_cache, n_dims: int, model_name: str) -> None: - with model_cache(model_name) as model: - n_images = 32 - test_images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR 
/ "small_image.jpeg"), - Image.open(TEST_MISC_DIR / "small_image.jpeg"), - ] - images = test_images * n_images - - embeddings = list(model.embed(images, batch_size=10)) - embeddings = np.stack(embeddings, axis=0) - assert np.allclose(embeddings[1], embeddings[2]) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_name] - - assert embeddings.shape == (len(test_images) * n_images, n_dims) - assert np.allclose(embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3) - - -@pytest.mark.parametrize("n_dims,model_name", [(512, "Qdrant/clip-ViT-B-32-vision")]) -def test_parallel_processing(model_cache, n_dims: int, model_name: str) -> None: - with model_cache(model_name) as model: - n_images = 32 - test_images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR / "small_image.jpeg"), - Image.open(TEST_MISC_DIR / "small_image.jpeg"), - ] - images = test_images * n_images - embeddings = list(model.embed(images, batch_size=10, parallel=2)) - embeddings = np.stack(embeddings, axis=0) - - embeddings_2 = list(model.embed(images, batch_size=10, parallel=None)) - embeddings_2 = np.stack(embeddings_2, axis=0) - - embeddings_3 = list(model.embed(images, batch_size=10, parallel=0)) - embeddings_3 = np.stack(embeddings_3, axis=0) - - assert embeddings.shape == (n_images * len(test_images), n_dims) - assert np.allclose(embeddings, embeddings_2, atol=1e-3) - assert np.allclose(embeddings, embeddings_3, atol=1e-3) - - -@pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"]) -def test_lazy_load(model_name: str) -> None: - is_ci = os.getenv("CI") - model = ImageEmbedding(model_name=model_name, lazy_load=True) - assert not hasattr(model.model, "model") - images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR / "small_image.jpeg"), - ] - list(model.embed(images)) - assert hasattr(model.model, "model") - if is_ci: - delete_model_cache(model.model._model_dir) - - -def test_get_embedding_size() -> None: - assert 
ImageEmbedding.get_embedding_size(model_name="Qdrant/clip-ViT-B-32-vision") == 512 - assert ImageEmbedding.get_embedding_size(model_name="Qdrant/clip-vit-b-32-vision") == 512 - - -def test_embedding_size() -> None: - is_ci = os.getenv("CI") - model_name = "Qdrant/clip-ViT-B-32-vision" - model = ImageEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 512 - - model_name = "Qdrant/clip-vit-b-32-vision" - model = ImageEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 512 - if is_ci: - delete_model_cache(model.model._model_dir) - - -@pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"]) -def test_session_options(model_cache, model_name) -> None: - with model_cache(model_name) as default_model: - default_session_options = default_model.model.model.get_session_options() - assert default_session_options.enable_cpu_mem_arena is True - model = ImageEmbedding(model_name=model_name, enable_cpu_mem_arena=False) - session_options = model.model.model.get_session_options() - assert session_options.enable_cpu_mem_arena is False diff --git a/tests/test_late_interaction_embeddings.py b/tests/test_late_interaction_embeddings.py deleted file mode 100644 index ea83e76a3..000000000 --- a/tests/test_late_interaction_embeddings.py +++ /dev/null @@ -1,344 +0,0 @@ -import os -from contextlib import contextmanager - -import pytest -import numpy as np - -from fastembed.late_interaction.late_interaction_text_embedding import ( - LateInteractionTextEmbedding, -) -from tests.utils import delete_model_cache, should_test_model - -# vectors are abridged and rounded for brevity -CANONICAL_COLUMN_VALUES = { - "colbert-ir/colbertv2.0": np.array( - [ - [0.0759, 0.0841, -0.0299, 0.0374, 0.0254], - [0.0005, -0.0163, -0.0127, 0.2165, 0.1517], - [-0.0257, -0.0575, 0.0135, 0.2202, 0.1896], - [0.0846, 0.0122, 0.0032, -0.0109, -0.1041], - [0.0477, 0.1078, -0.0314, 0.016, 0.0156], - ] - ), - "answerdotai/answerai-colbert-small-v1": 
np.array( - [ - [-0.07281, 0.04632, -0.04711, 0.00762, -0.07374], - [-0.04464, 0.04426, -0.074, 0.01801, -0.05233], - [0.09936, -0.05123, -0.04925, -0.05276, -0.08944], - [0.01644, 0.0203, -0.03789, 0.03165, -0.06501], - [-0.07281, 0.04633, -0.04711, 0.00762, -0.07374], - ] - ), - "jinaai/jina-colbert-v2": np.array( - [ - [0.0742, 0.0591, -0.2403, -0.1774, 0.02], - [0.1318, 0.0882, -0.1138, -0.2066, 0.146], - [-0.0183, -0.1354, -0.0139, -0.1079, -0.051], - [0.0003, -0.1184, -0.07, -0.0479, -0.0649], - [0.0766, 0.0452, -0.2343, -0.183, 0.0058], - ] - ), -} - -CANONICAL_QUERY_VALUES = { - "colbert-ir/colbertv2.0": np.array( - [ - [0.0824, 0.0872, -0.0324, 0.0418, 0.024], - [-0.0007, -0.0154, -0.0113, 0.2277, 0.1528], - [-0.0251, -0.0565, 0.0136, 0.2236, 0.1838], - [0.0848, 0.0056, 0.0041, -0.0036, -0.1032], - [0.0574, 0.1072, -0.0332, 0.0233, 0.0209], - [0.1041, 0.0364, -0.0058, -0.027, -0.0704], - [0.106, 0.0371, -0.0055, -0.0339, -0.0719], - [0.1063, 0.0363, 0.0014, -0.0334, -0.0698], - [0.112, 0.036, 0.0026, -0.0355, -0.0675], - [0.1184, 0.0441, 0.0166, -0.0169, -0.0244], - [0.1033, 0.035, 0.0183, 0.0475, 0.0612], - [-0.0028, -0.014, -0.016, 0.2175, 0.1537], - [0.0547, 0.0219, -0.007, 0.1748, 0.1154], - [-0.001, -0.0184, -0.0112, 0.2197, 0.1523], - [-0.0012, -0.0149, -0.0119, 0.2147, 0.152], - [-0.0186, -0.0239, -0.014, 0.2196, 0.156], - [-0.017, -0.0232, -0.0108, 0.2212, 0.157], - [-0.0109, -0.0024, -0.003, 0.1972, 0.1391], - [0.0898, 0.0219, -0.0255, 0.0734, -0.0096], - [0.1143, 0.015, -0.022, 0.0417, -0.0421], - [0.1056, 0.0091, -0.0137, 0.0129, -0.0619], - [0.0234, 0.004, -0.0285, 0.1565, 0.0883], - [-0.0037, -0.0079, -0.0204, 0.1982, 0.1502], - [0.0988, 0.0377, 0.0226, 0.0309, 0.0508], - [-0.0103, -0.0128, -0.0035, 0.2114, 0.155], - [-0.0103, -0.0184, -0.011, 0.2252, 0.157], - [-0.0033, -0.0292, -0.0097, 0.2237, 0.1607], - [-0.0198, -0.0257, -0.0193, 0.2265, 0.165], - [-0.0227, -0.0028, -0.0084, 0.1995, 0.1306], - [0.0916, 0.0185, -0.0186, 0.0173, -0.0577], - 
[0.1022, 0.0228, -0.0174, -0.0102, -0.065], - [0.1043, 0.0231, -0.0144, -0.0246, -0.067], - ] - ), - "answerdotai/answerai-colbert-small-v1": np.array( - [ - [-0.07284, 0.04657, -0.04746, 0.00786, -0.07342], - [-0.0473, 0.04615, -0.07551, 0.01591, -0.0517], - [0.09658, -0.0506, -0.04593, -0.05225, -0.09086], - [0.01815, 0.0165, -0.03366, 0.03214, -0.07019], - [-0.07284, 0.04657, -0.04746, 0.00787, -0.07342], - [-0.07748, 0.04493, -0.055, 0.00481, -0.0486], - [-0.0803, 0.04229, -0.0589, 0.00379, -0.04506], - [-0.08477, 0.03724, -0.06162, 0.00578, -0.04554], - [-0.08392, 0.03805, -0.06202, 0.00899, -0.0409], - [-0.07945, 0.04163, -0.06151, 0.00569, -0.04432], - [-0.08469, 0.03985, -0.05765, 0.00485, -0.04485], - [-0.08306, 0.04111, -0.05774, 0.00583, -0.04325], - [-0.08244, 0.04597, -0.05842, 0.00433, -0.04025], - [-0.08385, 0.04745, -0.05845, 0.00469, -0.04002], - [-0.08402, 0.05014, -0.05941, 0.00692, -0.03452], - [-0.08303, 0.05693, -0.05701, 0.00504, -0.03565], - [-0.08216, 0.05516, -0.05687, 0.0057, -0.03748], - [-0.08051, 0.05751, -0.05647, 0.00283, -0.03645], - [-0.08172, 0.05608, -0.06064, 0.00252, -0.03533], - [-0.08073, 0.06144, -0.06373, 0.00935, -0.03154], - [-0.06651, 0.06697, -0.06769, 0.01717, -0.03369], - [-0.06526, 0.06931, -0.06935, 0.0139, -0.03702], - [-0.05435, 0.05829, -0.06593, 0.01708, -0.04559], - [-0.03648, 0.05234, -0.06759, 0.02057, -0.05053], - [-0.03461, 0.05032, -0.06747, 0.02216, -0.05209], - [-0.03444, 0.04835, -0.06812, 0.02296, -0.05276], - [-0.03292, 0.04853, -0.06811, 0.02348, -0.05303], - [-0.03349, 0.04783, -0.06846, 0.02393, -0.05334], - [-0.03485, 0.04677, -0.06826, 0.02362, -0.05326], - [-0.03408, 0.04744, -0.06931, 0.02302, -0.05288], - [-0.03444, 0.04838, -0.06945, 0.02133, -0.05277], - [-0.03473, 0.04792, -0.07033, 0.02196, -0.05314], - ] - ), - "jinaai/jina-colbert-v2": np.array( - [ - [0.0477, 0.0255, -0.2224, -0.1085, -0.03], - [0.0206, -0.0845, -0.0075, -0.1712, 0.0156], - [-0.0056, -0.0957, -0.0147, -0.1277, -0.0225], 
- [0.0486, -0.0499, -0.1609, 0.0194, 0.0274], - [0.0481, 0.0253, -0.2278, -0.1126, -0.0294], - [0.0599, -0.0678, -0.0956, -0.0757, 0.0236], - [0.0592, -0.0862, -0.0621, -0.1084, 0.0155], - [0.0874, -0.0714, -0.0772, -0.1414, 0.037], - [0.1009, -0.0552, -0.0669, -0.163, 0.0493], - [0.1135, -0.047, -0.0576, -0.1699, 0.0538], - [0.1228, -0.0428, -0.0507, -0.1725, 0.0562], - [0.1291, -0.0388, -0.042, -0.1753, 0.0569], - [0.1365, -0.0337, -0.0326, -0.1786, 0.0574], - [0.1439, -0.026, -0.024, -0.1831, 0.0574], - [0.1527, -0.0099, -0.0179, -0.1874, 0.057], - [0.1555, 0.0186, -0.023, -0.1801, 0.0539], - [0.1389, 0.054, -0.0345, -0.1636, 0.0429], - [0.1058, 0.0862, -0.0418, -0.1455, 0.0222], - [0.0713, 0.1061, -0.0438, -0.1288, 0.0002], - [0.0453, 0.1143, -0.0457, -0.1119, -0.019], - [0.0346, 0.1131, -0.0487, -0.0952, -0.0338], - [0.0355, 0.1073, -0.0493, -0.0823, -0.0438], - [0.0424, 0.1041, -0.0459, -0.0761, -0.048], - [0.048, 0.102, -0.0421, -0.0718, -0.0477], - [0.0474, 0.0989, -0.0413, -0.0654, -0.0431], - [0.0434, 0.095, -0.0415, -0.0589, -0.0345], - [0.0408, 0.0897, -0.0405, -0.0554, -0.0197], - [0.0433, 0.0811, -0.0407, -0.0545, 0.0055], - [0.0514, 0.0629, -0.0446, -0.0549, 0.0368], - [0.058, 0.048, -0.0527, -0.0607, 0.0568], - [0.0561, 0.0447, -0.0661, -0.0702, 0.0764], - [0.0204, -0.0856, -0.0386, -0.1232, -0.0332], - ] - ), -} - -_MODELS_TO_CACHE = ("answerdotai/answerai-colbert-small-v1",) -MODELS_TO_CACHE = tuple([x.lower() for x in _MODELS_TO_CACHE]) - - -@pytest.fixture(scope="module") -def model_cache(): - is_ci = os.getenv("CI") - cache = {} - - @contextmanager - def get_model(model_name: str): - lowercase_model_name = model_name.lower() - if lowercase_model_name not in cache: - cache[lowercase_model_name] = LateInteractionTextEmbedding(lowercase_model_name) - yield cache[lowercase_model_name] - if lowercase_model_name not in MODELS_TO_CACHE: - model_inst = cache.pop(lowercase_model_name) - if is_ci: - delete_model_cache(model_inst.model._model_dir) - del 
model_inst - - yield get_model - - if is_ci: - for name, model in cache.items(): - delete_model_cache(model.model._model_dir) - cache.clear() - - -docs = ["Hello World"] - - -@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"]) -def test_batch_embedding(model_cache, model_name: str): - docs_to_embed = docs * 10 - - with model_cache(model_name) as model: - result = list(model.embed(docs_to_embed, batch_size=6)) - expected_result = CANONICAL_COLUMN_VALUES[model_name] - - for value in result: - token_num, abridged_dim = expected_result.shape - assert np.allclose(value[:, :abridged_dim], expected_result, atol=2e-3) - - -@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"]) -def test_batch_inference_size_same_as_single_inference(model_cache, model_name: str): - with model_cache(model_name) as model: - docs_to_embed = [ - "short document", - "A bit longer document, which should not affect the size", - ] - result = list(model.embed(docs_to_embed, batch_size=1)) - result_2 = list(model.embed(docs_to_embed, batch_size=2)) - assert len(result[0]) == len(result_2[0]) - - -@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"]) -def test_single_embedding(model_cache, model_name: str): - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - docs_to_embed = docs - - for model_desc in LateInteractionTextEmbedding._list_supported_models(): - if not should_test_model(model_desc, model_name, is_ci, is_manual): - continue - - print("evaluating", model_name) - with model_cache(model_desc.model) as model: - whole_result = list(model.embed(docs_to_embed, batch_size=6)) - assert len(whole_result) == 1 - result = whole_result[0] - expected_result = CANONICAL_COLUMN_VALUES[model_desc.model] - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3) - - -@pytest.mark.parametrize("model_name", 
["answerdotai/answerai-colbert-small-v1"]) -def test_single_embedding_query(model_cache, model_name: str): - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - queries_to_embed = docs - - for model_desc in LateInteractionTextEmbedding._list_supported_models(): - if not should_test_model(model_desc, model_name, is_ci, is_manual): - continue - - print("evaluating", model_desc.model) - with model_cache(model_desc.model) as model: - whole_result = list(model.query_embed(queries_to_embed)) - assert len(whole_result) == 1 - result = whole_result[0] - expected_result = CANONICAL_QUERY_VALUES[model_desc.model] - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3) - - -@pytest.mark.parametrize("token_dim,model_name", [(96, "answerdotai/answerai-colbert-small-v1")]) -def test_parallel_processing(model_cache, token_dim: int, model_name: str): - with model_cache(model_name) as model: - docs = ["hello world", "flag embedding"] * 100 - embeddings = list(model.embed(docs, batch_size=10, parallel=2)) - - embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None)) - - # embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0)) # inherits OnnxTextModel which - # # is tested in TextEmbedding, disabling it here to reduce number of requests to hf - # # multiprocessing is enough to test with `parallel=2`, and `parallel=None` is okay to tests since it reuses - # # model from cache - - assert len(embeddings) == len(docs) and embeddings[0].shape[-1] == token_dim - - for i in range(len(embeddings)): - assert np.allclose(embeddings[i], embeddings_2[i], atol=1e-3) - # assert np.allclose(embeddings[i], embeddings_3[i], atol=1e-3) - - -@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"]) -def test_lazy_load(model_name: str): - is_ci = os.getenv("CI") - - model = LateInteractionTextEmbedding(model_name=model_name, lazy_load=True) - assert 
not hasattr(model.model, "model") - - docs = ["hello world", "flag embedding"] - list(model.embed(docs)) - assert hasattr(model.model, "model") - - model = LateInteractionTextEmbedding(model_name=model_name, lazy_load=True) - list(model.query_embed(docs)) - - model = LateInteractionTextEmbedding(model_name=model_name, lazy_load=True) - list(model.passage_embed(docs)) - - if is_ci: - delete_model_cache(model.model._model_dir) - - -def test_get_embedding_size(): - model_name = "answerdotai/answerai-colbert-small-v1" - assert LateInteractionTextEmbedding.get_embedding_size(model_name) == 96 - - model_name = "answerdotai/answerai-ColBERT-small-v1" - assert LateInteractionTextEmbedding.get_embedding_size(model_name) == 96 - - -def test_embedding_size(): - is_ci = os.getenv("CI") - model_name = "answerdotai/answerai-colbert-small-v1" - model = LateInteractionTextEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 96 - - model_name = "answerdotai/answerai-ColBERT-small-v1" - model = LateInteractionTextEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 96 - if is_ci: - delete_model_cache(model.model._model_dir) - - -@pytest.mark.parametrize("model_name", ["answerdotai/answerai-ColBERT-small-v1"]) -def test_session_options(model_cache, model_name) -> None: - with model_cache(model_name) as default_model: - default_session_options = default_model.model.model.get_session_options() - assert default_session_options.enable_cpu_mem_arena is True - model = LateInteractionTextEmbedding(model_name=model_name, enable_cpu_mem_arena=False) - session_options = model.model.model.get_session_options() - assert session_options.enable_cpu_mem_arena is False - - -@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"]) -def test_token_count(model_cache, model_name) -> None: - with model_cache(model_name) as model: - documents = ["short doc", "it is a long document to check attention mask for paddings"] - 
short_doc_token_count = model.token_count(documents[0]) - long_doc_token_count = model.token_count(documents[1]) - documents_token_count = model.token_count(documents) - assert short_doc_token_count + long_doc_token_count == documents_token_count - # 2 is 2*DOC_MARKER_TOKEN_ID for each document - assert short_doc_token_count + long_doc_token_count + 2 == model.token_count( - documents, include_extension=True - ) - assert short_doc_token_count + long_doc_token_count == model.token_count( - documents, batch_size=1 - ) - assert short_doc_token_count + long_doc_token_count == model.token_count( - documents, is_doc=False - ) - # query min length is 32 - assert model.token_count(documents, is_doc=False, include_extension=True) == 64 - very_long_query = "It's a very long query which definitely contains more than 32 tokens and we're using it to check whether the method can handle large query properly without cutting it to 32 tokens" - assert model.token_count(very_long_query, is_doc=False, include_extension=True) > 32 diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py deleted file mode 100644 index 94ae47e7a..000000000 --- a/tests/test_late_interaction_multimodal.py +++ /dev/null @@ -1,165 +0,0 @@ -import os -from contextlib import contextmanager - -import pytest -from PIL import Image -import numpy as np - -from fastembed import LateInteractionMultimodalEmbedding -from tests.config import TEST_MISC_DIR -from tests.utils import delete_model_cache - -# vectors are abridged and rounded for brevity -CANONICAL_IMAGE_VALUES = { - "Qdrant/colpali-v1.3-fp16": np.array( - [ - [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], - [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], - [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], - [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], - [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], - [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, 
-0.0301], - [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], - ] - ), - "Qdrant/colmodernvbert": np.array( - [ - [0.11614, -0.15793, -0.11194, 0.0688, 0.08001, 0.10575, -0.07871], - [0.10094, -0.13301, -0.12069, 0.10932, 0.04645, 0.09884, 0.04048], - [0.13106, -0.18613, -0.13469, 0.10566, 0.03659, 0.07712, -0.03916], - [0.09754, -0.09596, -0.04839, 0.14991, 0.05692, 0.10569, -0.08349], - [0.02576, -0.15651, -0.09977, 0.09707, 0.13412, 0.09994, -0.09931], - [-0.06741, -0.1787, -0.19677, -0.07618, 0.13102, -0.02131, -0.02437], - [-0.02776, -0.10187, -0.13793, 0.03835, 0.04766, 0.04701, -0.15635], - ] - ), -} - -CANONICAL_QUERY_VALUES = { - "Qdrant/colpali-v1.3-fp16": np.array( - [ - [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], - [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], - [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], - [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], - [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], - [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], - [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], - ] - ), - "Qdrant/colmodernvbert": np.array( - [ - [0.05, 0.06557, 0.04026, 0.14981, 0.1842, 0.0263, -0.18706], - [-0.05664, -0.14028, 0.00649, -0.02849, 0.09034, -0.01494, 0.10693], - [-0.10147, -0.00716, 0.09084, -0.08236, -0.01849, -0.00972, -0.00461], - [-0.1233, -0.10814, -0.02337, -0.00329, 0.05984, 0.09934, 0.09846], - [-0.07053, -0.13119, -0.06487, 0.01508, 0.07459, 0.07655, 0.14821], - [0.00526, -0.13842, -0.05837, -0.02721, 0.13009, 0.05076, 0.17962], - [0.00924, -0.14383, -0.03057, -0.03691, 0.11718, 0.037, 0.13344], - ] - ), -} - -queries = ["hello world", "flag embedding"] -images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR / "image.jpeg"), - Image.open((TEST_MISC_DIR / "image.jpeg")), -] - -_MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) -MODELS_TO_CACHE = tuple(model_name.lower() for model_name in _MODELS_TO_CACHE) - - 
-@pytest.fixture(scope="module") -def model_cache(): - is_ci = os.getenv("CI") - cache = {} - - @contextmanager - def get_model(model_name: str): - lowercase_model_name = model_name.lower() - if lowercase_model_name not in cache: - cache[lowercase_model_name] = LateInteractionMultimodalEmbedding(lowercase_model_name) - yield cache[lowercase_model_name] - if lowercase_model_name not in MODELS_TO_CACHE: - model_inst = cache.pop(lowercase_model_name) - if is_ci: - delete_model_cache(model_inst.model._model_dir) - del model_inst - - yield get_model - - if is_ci: - for _, model in cache.items(): - delete_model_cache(model.model._model_dir) - cache.clear() - - -def test_batch_embedding(model_cache): - for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): - if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): - continue # colpali is too large for ci - - print("evaluating", model_name) - with model_cache(model_name) as model: - result = list(model.embed_image(images, batch_size=2)) - - for value in result: - token_num, abridged_dim = expected_result.shape - assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3) - - -def test_single_embedding(model_cache): - for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): - if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): - continue # colpali is too large for ci - print("evaluating", model_name) - with model_cache(model_name) as model: - result = next(iter(model.embed_image(images, batch_size=6))) - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) - - -def test_single_embedding_query(model_cache): - for model_name, expected_result in CANONICAL_QUERY_VALUES.items(): - if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): - continue # colpali is too large for ci - print("evaluating", model_name) - with 
model_cache(model_name) as model: - result = next(iter(model.embed_text(queries))) - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) - - -def test_get_embedding_size(): - model_name = "Qdrant/colpali-v1.3-fp16" - assert LateInteractionMultimodalEmbedding.get_embedding_size(model_name) == 128 - - model_name = "Qdrant/ColPali-v1.3-fp16" - assert LateInteractionMultimodalEmbedding.get_embedding_size(model_name) == 128 - - model_name = "Qdrant/colmodernvbert" - assert LateInteractionMultimodalEmbedding.get_embedding_size(model_name) == 128 - - -def test_embedding_size(): - model_name = "Qdrant/colmodernvbert" - model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 128 - - -def test_token_count(model_cache) -> None: - model_name = "Qdrant/colmodernvbert" - with model_cache(model_name) as model: - documents = ["short doc", "it is a long document to check attention mask for paddings"] - short_doc_token_count = model.token_count(documents[0]) - long_doc_token_count = model.token_count(documents[1]) - documents_token_count = model.token_count(documents) - assert short_doc_token_count + long_doc_token_count == documents_token_count - assert short_doc_token_count + long_doc_token_count == model.token_count( - documents, batch_size=1 - ) - assert short_doc_token_count + long_doc_token_count < model.token_count( - documents, include_extension=True - ) diff --git a/tests/test_multi_gpu.py b/tests/test_multi_gpu.py deleted file mode 100644 index 235fb0eae..000000000 --- a/tests/test_multi_gpu.py +++ /dev/null @@ -1,221 +0,0 @@ -import pytest - -from fastembed import ( - TextEmbedding, - SparseTextEmbedding, - LateInteractionTextEmbedding, - ImageEmbedding, -) -from fastembed.rerank.cross_encoder import TextCrossEncoder -from tests.config import TEST_MISC_DIR - -CACHE_DIR = "../model_cache" - - -@pytest.mark.skip(reason="Requires a 
multi-gpu server") -@pytest.mark.parametrize("device_id", [None, 0, 1]) -def test_gpu_via_providers(device_id: int | None) -> None: - docs = ["hello world", "flag embedding"] - - device_id = device_id if device_id is not None else 0 - providers = ( - ["CUDAExecutionProvider"] - if device_id is None - else [("CUDAExecutionProvider", {"device_id": device_id})] - ) - embedding_model = TextEmbedding( - "sentence-transformers/all-MiniLM-L6-v2", - providers=providers, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str(device_id) - - embedding_model = SparseTextEmbedding( - "prithvida/Splade_PP_en_v1", - providers=providers, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str(device_id) - - embedding_model = SparseTextEmbedding( - "Qdrant/bm42-all-minilm-l6-v2-attentions", - providers=providers, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str(device_id) - - embedding_model = LateInteractionTextEmbedding( - "colbert-ir/colbertv2.0", - providers=providers, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str(device_id) - - embedding_model = ImageEmbedding( - model_name="Qdrant/clip-ViT-B-32-vision", - providers=providers, - cache_dir=CACHE_DIR, - ) - images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR / "small_image.jpeg"), - ] - list(embedding_model.embed(images)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str(device_id) - - model = 
TextCrossEncoder( - model_name="Xenova/ms-marco-MiniLM-L-6-v2", - providers=providers, - cache_dir=CACHE_DIR, - ) - query = "What is the capital of France?" - documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] - list(model.rerank(query, documents)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str(device_id) - - -@pytest.mark.skip(reason="Requires a multi-gpu server") -@pytest.mark.parametrize("device_ids", [None, [0], [1], [0, 1]]) -def test_gpu_cuda_device_ids(device_ids: list[int] | None) -> None: - docs = ["hello world", "flag embedding"] - device_id = device_ids[0] if device_ids else 0 - embedding_model = TextEmbedding( - "sentence-transformers/all-MiniLM-L6-v2", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str( - device_id - ), f"Text embedding: {options}" - - embedding_model = SparseTextEmbedding( - "prithvida/Splade_PP_en_v1", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str( - device_id - ), f"Sparse text embedding: {options}" - - embedding_model = SparseTextEmbedding( - "Qdrant/bm42-all-minilm-l6-v2-attentions", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str(device_id), f"Bm42: {options}" - - embedding_model = LateInteractionTextEmbedding( - "colbert-ir/colbertv2.0", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs)) - options = 
embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str( - device_id - ), f"Late interaction text embedding: {options}" - - embedding_model = ImageEmbedding( - model_name="Qdrant/clip-ViT-B-32-vision", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR / "small_image.jpeg"), - ] - list(embedding_model.embed(images)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str( - device_id - ), f"Image embedding: {options}" - - if device_ids is None or len(device_ids) == 1: - model = TextCrossEncoder( - model_name="Xenova/ms-marco-MiniLM-L-6-v2", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - query = "What is the capital of France?" - documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] - list(model.rerank(query, documents)) - options = embedding_model.model.model.get_provider_options() - assert options["CUDAExecutionProvider"]["device_id"] == str( - device_id - ), f"Text cross encoder: {options}" - - -@pytest.mark.skip(reason="Requires a multi-gpu server") -@pytest.mark.parametrize( - "device_ids,parallel", [(None, None), (None, 2), ([1], None), ([1], 1), ([1], 2), ([0, 1], 2)] -) -def test_multi_gpu_parallel_inference(device_ids: list[int] | None, parallel: int) -> None: - docs = ["hello world", "flag embedding"] * 100 - batch_size = 5 - - embedding_model = TextEmbedding( - "sentence-transformers/all-MiniLM-L6-v2", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - lazy_load=True, - ) - list(embedding_model.embed(docs, batch_size=batch_size, parallel=parallel)) - - embedding_model = SparseTextEmbedding( - "prithvida/Splade_PP_en_v1", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs, batch_size=batch_size, parallel=parallel)) - - embedding_model = 
SparseTextEmbedding( - "Qdrant/bm42-all-minilm-l6-v2-attentions", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs, batch_size=batch_size, parallel=parallel)) - - embedding_model = LateInteractionTextEmbedding( - "colbert-ir/colbertv2.0", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - list(embedding_model.embed(docs, batch_size=batch_size, parallel=parallel)) - - embedding_model = ImageEmbedding( - model_name="Qdrant/clip-ViT-B-32-vision", - cuda=True, - device_ids=device_ids, - cache_dir=CACHE_DIR, - ) - images = [ - TEST_MISC_DIR / "image.jpeg", - str(TEST_MISC_DIR / "small_image.jpeg"), - ] * 100 - list(embedding_model.embed(images, batch_size=batch_size, parallel=parallel)) diff --git a/tests/test_postprocess.py b/tests/test_postprocess.py deleted file mode 100644 index 82576f115..000000000 --- a/tests/test_postprocess.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np - -from fastembed import LateInteractionTextEmbedding -from fastembed.postprocess import Muvera - -CANONICAL_VALUES = [-2.61810007e-04, 1.89005750e00, -2.32070747e00] -CANONICAL_QUERY_VALUES = [ - -0.85783903, - 1.1077204, - -0.09522747, -] # part of the values are zeros, should be compared with the result of nonzero mask - -DIM = 128 -K_SIM = 5 -DIM_PROJ = 16 -R_REPS = 20 - - -def test_single_input(): - model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0", lazy_load=True) - random_generator = np.random.default_rng(42) - multivector = random_generator.random((10, 128)) - - for muvera in ( - Muvera(dim=DIM, k_sim=K_SIM, dim_proj=DIM_PROJ, r_reps=R_REPS, random_seed=42), - Muvera.from_multivector_model(model, k_sim=K_SIM, dim_proj=DIM_PROJ, r_reps=R_REPS), - ): - fde = muvera.process(multivector) - assert fde.shape[0] == muvera.embedding_size - assert np.allclose(fde[:3], CANONICAL_VALUES) - - fde_doc = muvera.process_document(multivector) - assert fde_doc.shape[0] == muvera.embedding_size - assert np.allclose(fde, 
fde_doc) - - fde_query = muvera.process_query(multivector) - assert fde_query.shape[0] == muvera.embedding_size - assert np.allclose(fde_query[np.nonzero(fde_query)][:3], CANONICAL_QUERY_VALUES) diff --git a/tests/test_protein_embeddings.py b/tests/test_protein_embeddings.py new file mode 100644 index 000000000..0b924d8bb --- /dev/null +++ b/tests/test_protein_embeddings.py @@ -0,0 +1,147 @@ +import os + +import numpy as np +import pytest + +from fastembed_bio import ProteinEmbedding +from tests.utils import delete_model_cache + + +# Sample protein sequences for testing +SAMPLE_SEQUENCES = [ + "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG", + "GKGDPKKPRGKMSSYAFFVQTSREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFEDMAKADKARYEREMKTY", +] + + +CANONICAL_VECTOR_VALUES = { + "facebook/esm2_t12_35M_UR50D": np.array( + [-0.0055, -0.0144, 0.0355, -0.0049, 0.0071] + ), +} + + +@pytest.fixture(scope="module") +def model_fixture(): + """ + Fixture that provides the protein embedding model and handles cleanup. + """ + is_ci = os.getenv("CI") + model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D") + yield model + if is_ci: + delete_model_cache(model.model._model_dir) + + +def test_protein_embedding(model_fixture) -> None: + """Test basic protein embedding functionality.""" + model = model_fixture + dim = 480 # ESM2 t12 35M has 480 dimensions + + embeddings = list(model.embed(SAMPLE_SEQUENCES)) + embeddings = np.stack(embeddings, axis=0) + + assert embeddings.shape == (2, dim), f"Expected shape (2, {dim}), got {embeddings.shape}" + + # Check that embeddings are normalized (L2 norm close to 1) + norms = np.linalg.norm(embeddings, axis=1) + assert np.allclose(norms, 1.0, atol=1e-5), f"Embeddings should be normalized, got norms: {norms}" + + +def test_protein_embedding_canonical_values(model_fixture) -> None: + """ + Test that embeddings match expected canonical values. 
+ """ + model = model_fixture + canonical_vector = CANONICAL_VECTOR_VALUES["facebook/esm2_t12_35M_UR50D"] + + embeddings = list(model.embed(SAMPLE_SEQUENCES[:1])) + embedding = embeddings[0] + + assert np.allclose( + embedding[: canonical_vector.shape[0]], canonical_vector, atol=1e-3 + ), f"First 5 values {embedding[:5]} don't match canonical {canonical_vector}" + + +def test_protein_embedding_single_sequence(model_fixture) -> None: + """ + Test embedding a single sequence passed as a string. + """ + model = model_fixture + dim = 480 + + # Single sequence as string + embedding = list(model.embed("MKTVRQERLKS")) + assert len(embedding) == 1 + assert embedding[0].shape == (dim,) + + +def test_protein_embedding_batch(model_fixture) -> None: + """ + Test batch embedding with different batch sizes. + """ + model = model_fixture + dim = 480 + + sequences = SAMPLE_SEQUENCES * 10 # 20 sequences + + # test with small batch size + embeddings_small_batch = list(model.embed(sequences, batch_size=4)) + embeddings_small_batch = np.stack(embeddings_small_batch, axis=0) + assert embeddings_small_batch.shape == (len(sequences), dim) + + # test with larger batch size + embeddings_large_batch = list(model.embed(sequences, batch_size=16)) + embeddings_large_batch = np.stack(embeddings_large_batch, axis=0) + assert embeddings_large_batch.shape == (len(sequences), dim) + + # results should be the same regardless of batch size + assert np.allclose(embeddings_small_batch, embeddings_large_batch, atol=1e-5) + + +def test_protein_embedding_size() -> None: + """ + Test get_embedding_size class method. + """ + assert ProteinEmbedding.get_embedding_size("facebook/esm2_t12_35M_UR50D") == 480 + + +def test_protein_embedding_lazy_load() -> None: + """ + Test lazy loading functionality. 
+ """ + is_ci = os.getenv("CI") + + model = ProteinEmbedding("facebook/esm2_t12_35M_UR50D", lazy_load=True) + # model should not be loaded yet + assert not hasattr(model.model, "model") or model.model.model is None + + # after embedding, model should be loaded + list(model.embed(SAMPLE_SEQUENCES[:1])) + assert model.model.model is not None + + if is_ci: + delete_model_cache(model.model._model_dir) + + +def test_list_supported_models() -> None: + """ + Test listing supported protein models. + """ + models = ProteinEmbedding.list_supported_models() + assert len(models) > 0 + assert any(m["model"] == "facebook/esm2_t12_35M_UR50D" for m in models) + + # check required fields + for model_info in models: + assert "model" in model_info + assert "dim" in model_info + assert "description" in model_info + + +def test_unsupported_model() -> None: + """ + Test that unsupported model raises ValueError. + """ + with pytest.raises(ValueError, match="not supported"): + ProteinEmbedding("nonexistent/model") \ No newline at end of file diff --git a/tests/test_sparse_embeddings.py b/tests/test_sparse_embeddings.py deleted file mode 100644 index c2a7e2ffa..000000000 --- a/tests/test_sparse_embeddings.py +++ /dev/null @@ -1,322 +0,0 @@ -import os -from contextlib import contextmanager - -import pytest -import numpy as np - -from fastembed.sparse.bm25 import Bm25 -from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding -from tests.utils import delete_model_cache, should_test_model - -CANONICAL_COLUMN_VALUES = { - "prithivida/Splade_PP_en_v1": { - "indices": [ - 2040, - 2047, - 2088, - 2299, - 2748, - 3011, - 3376, - 3795, - 4774, - 5304, - 5798, - 6160, - 7592, - 7632, - 8484, - ], - "values": [ - 0.4219532012939453, - 0.4320072531700134, - 2.766580104827881, - 0.3314574658870697, - 1.395172119140625, - 0.021595917642116547, - 0.43770670890808105, - 0.0008370947907678783, - 0.5187209844589233, - 0.17124654352664948, - 0.14742016792297363, - 0.8142819404602051, - 
2.803262710571289, - 2.1904349327087402, - 1.0531445741653442, - ], - }, - "Qdrant/minicoil-v1": { - "indices": [80, 81, 82, 83, 6664, 6665, 6666, 6667], - "values": [ - 0.52634597, - 0.8711344, - 1.2264385, - 0.52123857, - 0.974713, - -0.97803956, - -0.94312465, - -0.12508166, - ], - }, -} - -CANONICAL_QUERY_VALUES = { - "Qdrant/minicoil-v1": { - "indices": [80, 81, 82, 83, 6664, 6665, 6666, 6667], - "values": [ - 0.31389374, - 0.5195128, - 0.7314033, - 0.3108479, - 0.5812834, - -0.5832673, - -0.5624452, - -0.0745942, - ], - }, -} - - -_MODELS_TO_CACHE = ( - "prithivida/Splade_PP_en_v1", - "Qdrant/minicoil-v1", - "Qdrant/bm25", - "Qdrant/bm42-all-minilm-l6-v2-attentions", -) -MODELS_TO_CACHE = tuple([x.lower() for x in _MODELS_TO_CACHE]) - - -@pytest.fixture(scope="module") -def model_cache(): - is_ci = os.getenv("CI") - cache = {} - - @contextmanager - def get_model(model_name: str): - lowercase_model_name = model_name.lower() - if lowercase_model_name not in cache: - cache[lowercase_model_name] = SparseTextEmbedding(lowercase_model_name) - yield cache[lowercase_model_name] - if lowercase_model_name not in MODELS_TO_CACHE: - model_inst = cache.pop(lowercase_model_name) - if is_ci: - delete_model_cache(model_inst.model._model_dir) - del model_inst - - yield get_model - - if is_ci: - for name, model in cache.items(): - delete_model_cache(model.model._model_dir) - cache.clear() - - -docs = ["Hello World"] - - -@pytest.mark.parametrize( - "model_name", - ["prithivida/Splade_PP_en_v1", "Qdrant/minicoil-v1"], -) -def test_batch_embedding(model_cache, model_name: str) -> None: - docs_to_embed = docs * 10 - - with model_cache(model_name) as model: - result = next(iter(model.embed(docs_to_embed, batch_size=6))) - expected_result = CANONICAL_COLUMN_VALUES[model_name] - assert result.indices.tolist() == expected_result["indices"] - - for i, value in enumerate(result.values): - assert pytest.approx(value, abs=0.001) == expected_result["values"][i] - - -def 
test_single_embedding(model_cache) -> None: - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - - for model_desc in SparseTextEmbedding._list_supported_models(): - if ( - model_desc.model not in CANONICAL_COLUMN_VALUES - ): # attention models and bm25 are also parts of - # SparseTextEmbedding, however, they have their own tests - continue - if not should_test_model(model_desc, model_desc.model, is_ci, is_manual): - continue - - with model_cache(model_desc.model) as model: - passage_result = next(iter(model.embed(docs, batch_size=6))) - query_result = next(iter(model.query_embed(docs))) - expected_result = CANONICAL_COLUMN_VALUES[model_desc.model] - expected_query_result = CANONICAL_QUERY_VALUES.get(model_desc.model, expected_result) - assert passage_result.indices.tolist() == expected_result["indices"] - for i, value in enumerate(passage_result.values): - assert pytest.approx(value, abs=0.001) == expected_result["values"][i] - - assert query_result.indices.tolist() == expected_query_result["indices"] - for i, value in enumerate(query_result.values): - assert pytest.approx(value, abs=0.001) == expected_query_result["values"][i] - - -@pytest.mark.parametrize( - "model_name", - ["prithivida/Splade_PP_en_v1", "Qdrant/minicoil-v1"], -) -def test_parallel_processing(model_cache, model_name: str) -> None: - with model_cache(model_name) as model: - docs = ["hello world", "flag embedding"] * 30 - sparse_embeddings_duo = list(model.embed(docs, batch_size=10, parallel=2)) - # sparse_embeddings_all = list(model.embed(docs, batch_size=10, parallel=0)) # inherits OnnxTextModel which - # is tested in TextEmbedding, disabling it here to reduce number of requests to hf - # multiprocessing is enough to test with `parallel=2`, and `parallel=None` is okay to tests since it reuses - # model from cache - sparse_embeddings = list(model.embed(docs, batch_size=10, parallel=None)) - - assert ( - len(sparse_embeddings) - == 
len(sparse_embeddings_duo) - # == len(sparse_embeddings_all) - == len(docs) - ) - - for ( - sparse_embedding, - sparse_embedding_duo, - # sparse_embedding_all - ) in zip( - sparse_embeddings, - sparse_embeddings_duo, - # sparse_embeddings_all - ): - assert ( - sparse_embedding.indices.tolist() == sparse_embedding_duo.indices.tolist() - # == sparse_embedding_all.indices.tolist() - ) - assert np.allclose(sparse_embedding.values, sparse_embedding_duo.values, atol=1e-3) - # assert np.allclose(sparse_embedding.values, sparse_embedding_all.values, atol=1e-3) - - -def test_stem_with_stopwords_and_punctuation(model_cache) -> None: - with model_cache("Qdrant/bm25") as model: - bm25_instance = model.model - # Setup - original_stopwords = bm25_instance.stopwords.copy() - original_punctuation = bm25_instance.punctuation.copy() - - bm25_instance.stopwords = {"the", "is", "a"} - bm25_instance.punctuation = {".", ",", "!"} - - # Test data - tokens = ["The", "quick", "brown", "fox", "is", "a", "test", "sentence", ".", "!"] - - # Execute - result = bm25_instance._stem(tokens) - - # Assert - expected = ["quick", "brown", "fox", "test", "sentenc"] - assert result == expected, f"Expected {expected}, but got {result}" - - bm25_instance.stopwords = original_stopwords - bm25_instance.punctuation = original_punctuation - - -def test_stem_case_insensitive_stopwords(model_cache) -> None: - with model_cache("Qdrant/bm25") as model: - bm25_instance = model.model - original_stopwords = bm25_instance.stopwords.copy() - original_punctuation = bm25_instance.punctuation.copy() - - # Setup - bm25_instance.stopwords = {"the", "is", "a"} - bm25_instance.punctuation = {".", ",", "!"} - - # Test data - tokens = ["THE", "Quick", "Brown", "Fox", "IS", "A", "Test", "Sentence", ".", "!"] - - # Execute - result = bm25_instance._stem(tokens) - - # Assert - expected = ["quick", "brown", "fox", "test", "sentenc"] - assert result == expected, f"Expected {expected}, but got {result}" - bm25_instance.stopwords = 
original_stopwords - bm25_instance.punctuation = original_punctuation - - -@pytest.mark.parametrize("disable_stemmer", [True, False]) -def test_disable_stemmer_behavior(disable_stemmer: bool) -> None: - # Setup - model = Bm25("Qdrant/bm25", language="english", disable_stemmer=disable_stemmer) - model.stopwords = {"the", "is", "a"} - model.punctuation = {".", ",", "!"} - - # Test data - tokens = ["The", "quick", "brown", "fox", "is", "a", "test", "sentence", ".", "!"] - - # Execute - result = model._stem(tokens) - - # Assert - if disable_stemmer: - expected = ["quick", "brown", "fox", "test", "sentence"] # no stemming, lower case only - else: - expected = ["quick", "brown", "fox", "test", "sentenc"] - assert result == expected, f"Expected {expected}, but got {result}" - - -@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"]) -def test_lazy_load(model_name: str) -> None: - is_ci = os.getenv("CI") - model = SparseTextEmbedding(model_name=model_name, lazy_load=True) - assert not hasattr(model.model, "model") - - docs = ["hello world", "flag embedding"] - list(model.embed(docs)) - assert hasattr(model.model, "model") - - model = SparseTextEmbedding(model_name=model_name, lazy_load=True) - list(model.query_embed(docs)) - - model = SparseTextEmbedding(model_name=model_name, lazy_load=True) - list(model.passage_embed(docs)) - - if is_ci: - delete_model_cache(model.model._model_dir) - - -@pytest.mark.parametrize( - "model_name", - [ - "prithivida/Splade_PP_en_v1", - "Qdrant/minicoil-v1", - "Qdrant/bm42-all-minilm-l6-v2-attentions", - ], -) -def test_session_options(model_cache, model_name) -> None: - with model_cache(model_name) as default_model: - default_session_options = default_model.model.model.get_session_options() - assert default_session_options.enable_cpu_mem_arena is True - model = SparseTextEmbedding(model_name=model_name, enable_cpu_mem_arena=False) - session_options = model.model.model.get_session_options() - assert 
session_options.enable_cpu_mem_arena is False - - -@pytest.mark.parametrize( - "model_name", - [ - "prithivida/Splade_PP_en_v1", - "Qdrant/minicoil-v1", - "Qdrant/bm42-all-minilm-l6-v2-attentions", - "Qdrant/bm25", - ], -) -def test_token_count(model_cache, model_name) -> None: - with model_cache(model_name) as model: - documents = [ - "Name me a couple of cities were the capitals of Germany?", - "Berlin is the current capital of Germany, Bonn is a former capital of Germany.", - ] - first_doc_token_count = model.token_count(documents[0]) - second_doc_token_count = model.token_count(documents[1]) - doc_token_count = model.token_count(documents) - assert first_doc_token_count + second_doc_token_count == doc_token_count - assert doc_token_count == model.token_count(documents, batch_size=1) diff --git a/tests/test_text_cross_encoder.py b/tests/test_text_cross_encoder.py deleted file mode 100644 index 4d0d5b7d6..000000000 --- a/tests/test_text_cross_encoder.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -from contextlib import contextmanager - -import numpy as np -import pytest - -from fastembed.rerank.cross_encoder import TextCrossEncoder -from tests.utils import delete_model_cache, should_test_model - -CANONICAL_SCORE_VALUES = { - "Xenova/ms-marco-MiniLM-L-6-v2": np.array([8.500708, -2.541011]), - "Xenova/ms-marco-MiniLM-L-12-v2": np.array([9.330912, -2.0380247]), - "BAAI/bge-reranker-base": np.array([6.15733337, -3.65939403]), - "jinaai/jina-reranker-v1-tiny-en": np.array([2.5911, 0.1122]), - "jinaai/jina-reranker-v1-turbo-en": np.array([1.8295, -2.8908]), - "jinaai/jina-reranker-v2-base-multilingual": np.array([1.6533, -1.6455]), -} - - -_MODELS_TO_CACHE = ("Xenova/ms-marco-MiniLM-L-6-v2",) -MODELS_TO_CACHE = tuple([x.lower() for x in _MODELS_TO_CACHE]) - - -@pytest.fixture(scope="module") -def model_cache(): - is_ci = os.getenv("CI") - cache = {} - - @contextmanager - def get_model(model_name: str): - lowercase_model_name = model_name.lower() - if 
lowercase_model_name not in cache: - cache[lowercase_model_name] = TextCrossEncoder(lowercase_model_name) - yield cache[lowercase_model_name] - if lowercase_model_name not in MODELS_TO_CACHE: - model_inst = cache.pop(lowercase_model_name) - if is_ci: - delete_model_cache(model_inst.model._model_dir) - del model_inst - - yield get_model - - if is_ci: - for name, model in cache.items(): - delete_model_cache(model.model._model_dir) - cache.clear() - - -@pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) -def test_rerank(model_cache, model_name: str) -> None: - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - - for model_desc in TextCrossEncoder._list_supported_models(): - if not should_test_model(model_desc, model_name, is_ci, is_manual): - continue - - with model_cache(model_desc.model) as model: - query = "What is the capital of France?" - documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] - scores = np.array(list(model.rerank(query, documents))) - - pairs = [(query, doc) for doc in documents] - scores2 = np.array(list(model.rerank_pairs(pairs))) - assert np.allclose( - scores, scores2, atol=1e-5 - ), f"Model: {model_desc.model}, Scores: {scores}, Scores2: {scores2}" - - canonical_scores = CANONICAL_SCORE_VALUES[model_desc.model] - assert np.allclose( - scores, canonical_scores, atol=1e-3 - ), f"Model: {model_desc.model}, Scores: {scores}, Expected: {canonical_scores}" - - -@pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) -def test_batch_rerank(model_cache, model_name: str) -> None: - with model_cache(model_name) as model: - query = "What is the capital of France?" 
- documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] * 50 - scores = np.array(list(model.rerank(query, documents, batch_size=10))) - - pairs = [(query, doc) for doc in documents] - scores2 = np.array(list(model.rerank_pairs(pairs))) - assert np.allclose( - scores, scores2, atol=1e-5 - ), f"Model: {model_name}, Scores: {scores}, Scores2: {scores2}" - - canonical_scores = np.tile(CANONICAL_SCORE_VALUES[model_name], 50) - - assert scores.shape == canonical_scores.shape, f"Unexpected shape for model {model_name}" - assert np.allclose( - scores, canonical_scores, atol=1e-3 - ), f"Model: {model_name}, Scores: {scores}, Expected: {canonical_scores}" - - -@pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) -def test_lazy_load(model_name: str) -> None: - is_ci = os.getenv("CI") - model = TextCrossEncoder(model_name=model_name, lazy_load=True) - assert not hasattr(model.model, "model") - query = "What is the capital of France?" - documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] - list(model.rerank(query, documents)) - assert hasattr(model.model, "model") - - if is_ci: - delete_model_cache(model.model._model_dir) - - -@pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) -def test_rerank_pairs_parallel(model_cache, model_name: str) -> None: - with model_cache(model_name) as model: - query = "What is the capital of France?" 
- documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."] * 10 - pairs = [(query, doc) for doc in documents] - scores_parallel = np.array(list(model.rerank_pairs(pairs, parallel=2, batch_size=10))) - scores_sequential = np.array(list(model.rerank_pairs(pairs, batch_size=10))) - assert np.allclose( - scores_parallel, scores_sequential, atol=1e-5 - ), f"Model: {model_name}, Scores (Parallel): {scores_parallel}, Scores (Sequential): {scores_sequential}" - canonical_scores = CANONICAL_SCORE_VALUES[model_name] - assert np.allclose( - scores_parallel[: len(canonical_scores)], canonical_scores, atol=1e-3 - ), f"Model: {model_name}, Scores (Parallel): {scores_parallel}, Expected: {canonical_scores}" - - -@pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) -def test_token_count(model_cache, model_name: str) -> None: - with model_cache(model_name) as model: - pairs = [ - ("What is the capital of France?", "Paris is the capital of France."), - ( - "Name me a couple of cities were the capitals of Germany?", - "Berlin is the current capital of Germany, Bonn is a former capital of Germany.", - ), - ] - first_pair_token_count = model.token_count([pairs[0]]) - second_pair_token_count = model.token_count([pairs[1]]) - pairs_token_count = model.token_count(pairs) - assert first_pair_token_count + second_pair_token_count == pairs_token_count - assert pairs_token_count == model.token_count(pairs, batch_size=1) - - -@pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) -def test_session_options(model_cache, model_name) -> None: - with model_cache(model_name) as default_model: - default_session_options = default_model.model.model.get_session_options() - assert default_session_options.enable_cpu_mem_arena is True - model = TextCrossEncoder(model_name=model_name, enable_cpu_mem_arena=False) - session_options = model.model.model.get_session_options() - assert session_options.enable_cpu_mem_arena is False diff --git 
a/tests/test_text_multitask_embeddings.py b/tests/test_text_multitask_embeddings.py deleted file mode 100644 index f246b489f..000000000 --- a/tests/test_text_multitask_embeddings.py +++ /dev/null @@ -1,241 +0,0 @@ -import os - -import numpy as np -import pytest - -from fastembed import TextEmbedding -from fastembed.text.multitask_embedding import JinaEmbeddingV3, Task -from tests.utils import delete_model_cache - - -CANONICAL_VECTOR_VALUES = { - "jinaai/jina-embeddings-v3": [ - { - "task_id": Task.RETRIEVAL_QUERY, - "vectors": np.array( - [ - [0.0623, -0.0402, 0.1706, -0.0143, 0.0617], - [-0.1064, -0.0733, 0.0353, 0.0096, 0.0667], - ] - ), - }, - { - "task_id": Task.RETRIEVAL_PASSAGE, - "vectors": np.array( - [ - [0.0513, -0.0247, 0.1751, -0.0075, 0.0679], - [-0.0987, -0.0786, 0.09, 0.0087, 0.0577], - ] - ), - }, - { - "task_id": Task.SEPARATION, - "vectors": np.array( - [ - [0.094, -0.1065, 0.1305, 0.0547, 0.0556], - [0.0315, -0.1468, 0.065, 0.0568, 0.0546], - ] - ), - }, - { - "task_id": Task.CLASSIFICATION, - "vectors": np.array( - [ - [0.0606, -0.0877, 0.1384, 0.0065, 0.0722], - [-0.0502, -0.119, 0.032, 0.0514, 0.0689], - ] - ), - }, - { - "task_id": Task.TEXT_MATCHING, - "vectors": np.array( - [ - [0.0911, -0.0341, 0.1305, -0.026, 0.0576], - [-0.1432, -0.05, 0.0133, 0.0464, 0.0789], - ] - ), - }, - ] -} -docs = ["Hello World", "Follow the white rabbit."] - - -@pytest.mark.parametrize("dim,model_name", [(1024, "jinaai/jina-embeddings-v3")]) -def test_batch_embedding(dim: int, model_name: str): - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - if is_ci and not is_manual: - pytest.skip("Skipping multitask models in CI non-manual mode") - - docs_to_embed = docs * 10 - default_task = Task.RETRIEVAL_PASSAGE - - model = TextEmbedding(model_name=model_name) - - embeddings = list(model.embed(documents=docs_to_embed, batch_size=6)) - embeddings = np.stack(embeddings, axis=0) - - assert embeddings.shape == 
(len(docs_to_embed), dim) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_name][default_task]["vectors"] - assert np.allclose( - embeddings[: len(docs), : canonical_vector.shape[1]], canonical_vector, atol=1e-4 - ), model_name - - if is_ci: - delete_model_cache(model.model._model_dir) - - -def test_single_embedding(): - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - if is_ci and not is_manual: - pytest.skip("Skipping multitask models in CI non-manual mode") - - for model_desc in JinaEmbeddingV3._list_supported_models(): - # todo: once we add more models, we should not test models >1GB size locally - model_name = model_desc.model - dim = model_desc.dim - - model = TextEmbedding(model_name=model_name) - - for task in CANONICAL_VECTOR_VALUES[model_name]: - print(f"evaluating {model_name} task_id: {task['task_id']}") - - embeddings = list(model.embed(documents=docs, task_id=task["task_id"])) - embeddings = np.stack(embeddings, axis=0) - - assert embeddings.shape == (len(docs), dim) - - canonical_vector = task["vectors"] - assert np.allclose( - embeddings[:, : canonical_vector.shape[1]], canonical_vector, atol=1e-4 - ), model_desc.model - - classification_embeddings = list(model.embed(documents=docs, task_id=Task.CLASSIFICATION)) - classification_embeddings = np.stack(classification_embeddings, axis=0) - - assert classification_embeddings.shape == (len(docs), dim) - - model = TextEmbedding(model_name=model_name, task_id=Task.CLASSIFICATION) - default_embeddings = list(model.embed(documents=docs)) - default_embeddings = np.stack(default_embeddings, axis=0) - - assert default_embeddings.shape == (len(docs), dim) - - assert np.allclose( - classification_embeddings, - default_embeddings, - atol=1e-4, - ), model_desc.model - if is_ci: - delete_model_cache(model.model._model_dir) - - -def test_single_embedding_query(): - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - if is_ci 
and not is_manual: - pytest.skip("Skipping multitask models in CI non-manual mode") - - task_id = Task.RETRIEVAL_QUERY - - for model_desc in JinaEmbeddingV3._list_supported_models(): - # todo: once we add more models, we should not test models >1GB size locally - model_name = model_desc.model - dim = model_desc.dim - - model = TextEmbedding(model_name=model_name) - - print(f"evaluating {model_name} query_embed task_id: {task_id}") - - embeddings = list(model.query_embed(query=docs)) - embeddings = np.stack(embeddings, axis=0) - - assert embeddings.shape == (len(docs), dim) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_name][task_id]["vectors"] - assert np.allclose( - embeddings[:, : canonical_vector.shape[1]], canonical_vector, atol=1e-4 - ), model_desc.model - - if is_ci: - delete_model_cache(model.model._model_dir) - - -def test_single_embedding_passage(): - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - if is_ci and not is_manual: - pytest.skip("Skipping multitask models in CI non-manual mode") - - task_id = Task.RETRIEVAL_PASSAGE - - for model_desc in JinaEmbeddingV3._list_supported_models(): - # todo: once we add more models, we should not test models >1GB size locally - - model_name = model_desc.model - dim = model_desc.dim - - model = TextEmbedding(model_name=model_name) - - print(f"evaluating {model_name} passage_embed task_id: {task_id}") - - embeddings = list(model.passage_embed(texts=docs)) - embeddings = np.stack(embeddings, axis=0) - - assert embeddings.shape == (len(docs), dim) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_name][task_id]["vectors"] - assert np.allclose( - embeddings[:, : canonical_vector.shape[1]], canonical_vector, atol=1e-4 - ), model_desc.model - - if is_ci: - delete_model_cache(model.model._model_dir) - - -@pytest.mark.parametrize("dim,model_name", [(1024, "jinaai/jina-embeddings-v3")]) -def test_parallel_processing(dim: int, model_name: str): - is_ci = os.getenv("CI") - 
is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - if is_ci and not is_manual: - pytest.skip("Skipping in CI non-manual mode") - - docs = ["Hello World", "Follow the white rabbit."] * 10 - - model = TextEmbedding(model_name=model_name) - - task_id = Task.SEPARATION - embeddings_1 = list(model.embed(docs, batch_size=10, parallel=None, task_id=task_id)) - embeddings_1 = np.stack(embeddings_1, axis=0) - - embeddings_2 = list(model.embed(docs, batch_size=10, parallel=1, task_id=task_id)) - embeddings_2 = np.stack(embeddings_2, axis=0) - - assert embeddings_1.shape[0] == len(docs) and embeddings_1.shape[-1] == dim - assert np.allclose(embeddings_1, embeddings_2, atol=1e-4) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_name][task_id]["vectors"] - assert np.allclose(embeddings_2[:2, : canonical_vector.shape[1]], canonical_vector, atol=1e-4) - - if is_ci: - delete_model_cache(model.model._model_dir) - - -@pytest.mark.parametrize("model_name", ["jinaai/jina-embeddings-v3"]) -def test_lazy_load(model_name: str): - is_ci = os.getenv("CI") - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - - if is_ci and not is_manual: - pytest.skip("Skipping in CI non-manual mode") - - model = TextEmbedding(model_name=model_name, lazy_load=True) - assert not hasattr(model.model, "model") - - list(model.embed(docs)) - assert hasattr(model.model, "model") - - if is_ci: - delete_model_cache(model.model._model_dir) diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py deleted file mode 100644 index e919faf9d..000000000 --- a/tests/test_text_onnx_embeddings.py +++ /dev/null @@ -1,219 +0,0 @@ -import os -import platform -from contextlib import contextmanager - -import numpy as np -import pytest - -from fastembed.text.text_embedding import TextEmbedding -from tests.utils import delete_model_cache, should_test_model - -CANONICAL_VECTOR_VALUES = { - "BAAI/bge-small-en": np.array([-0.0232, -0.0255, 0.0174, -0.0639, -0.0006]), - 
"BAAI/bge-small-en-v1.5": np.array( - [0.01522374, -0.02271799, 0.00860278, -0.07424029, 0.00386434] - ), - "BAAI/bge-small-en-v1.5-quantized": np.array( - [0.01522374, -0.02271799, 0.00860278, -0.07424029, 0.00386434] - ), - "BAAI/bge-small-zh-v1.5": np.array( - [-0.01023294, 0.07634465, 0.0691722, -0.04458365, -0.03160762] - ), - "BAAI/bge-base-en": np.array([0.0115, 0.0372, 0.0295, 0.0121, 0.0346]), - "BAAI/bge-base-en-v1.5": np.array( - [0.01129394, 0.05493144, 0.02615099, 0.00328772, 0.02996045] - ), - "BAAI/bge-large-en-v1.5": np.array( - [0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825] - ), - "BAAI/bge-large-en-v1.5-quantized": np.array( - [0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825] - ), - "sentence-transformers/all-MiniLM-L6-v2": np.array( - [-0.034478, 0.03102, 0.00673, 0.02611, -0.039362] - ), - "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array( - [0.0361, 0.1862, 0.2776, 0.2461, -0.1904] - ), - "intfloat/multilingual-e5-large": np.array([0.4544, -0.0968, 0.1054, -1.3753, 0.1500]), - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": np.array( - [0.0047, 0.1334, -0.0102, 0.0714, 0.1930] - ), - "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), - "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), - "jinaai/jina-embeddings-v2-base-de": np.array([-0.0085, 0.0417, 0.0342, 0.0309, -0.0149]), - "jinaai/jina-embeddings-v2-base-code": np.array([0.0145, -0.0164, 0.0136, -0.0170, 0.0734]), - "jinaai/jina-embeddings-v2-base-zh": np.array([0.0381, 0.0286, -0.0231, 0.0052, -0.0151]), - "jinaai/jina-embeddings-v2-base-es": np.array([-0.0108, -0.0092, -0.0373, 0.0171, -0.0301]), - "nomic-ai/nomic-embed-text-v1": np.array([0.3708, 0.2031, -0.3406, -0.2114, -0.3230]), - "nomic-ai/nomic-embed-text-v1.5": np.array( - [-0.15407836, -0.03053198, -3.9138033, 0.1910364, 0.13224715] - ), - "nomic-ai/nomic-embed-text-v1.5-Q": 
np.array( - [0.0802303, 0.3700881, -4.3053818, 0.4431803, -0.271572] - ), - "thenlper/gte-large": np.array( - [-0.00986551, -0.00018734, 0.00605892, -0.03289612, -0.0387564], - ), - "mixedbread-ai/mxbai-embed-large-v1": np.array( - [0.02295546, 0.03196154, 0.016512, -0.04031524, -0.0219634] - ), - "snowflake/snowflake-arctic-embed-xs": np.array([0.0092, 0.0619, 0.0196, 0.009, -0.0114]), - "snowflake/snowflake-arctic-embed-s": np.array([-0.0416, -0.0867, 0.0209, 0.0554, -0.0272]), - "snowflake/snowflake-arctic-embed-m": np.array([-0.0329, 0.0364, 0.0481, 0.0016, 0.0328]), - "snowflake/snowflake-arctic-embed-m-long": np.array( - [0.0080, -0.0266, -0.0335, 0.0282, 0.0143] - ), - "snowflake/snowflake-arctic-embed-l": np.array([0.0189, -0.0673, 0.0183, 0.0124, 0.0146]), - "Qdrant/clip-ViT-B-32-text": np.array([0.0083, 0.0103, -0.0138, 0.0199, -0.0069]), - "thenlper/gte-base": np.array([0.0038, 0.0355, 0.0181, 0.0092, 0.0654]), - "jinaai/jina-clip-v1": np.array([-0.0862, -0.0101, -0.0056, 0.0375, -0.0472]), -} - -MULTI_TASK_MODELS = ["jinaai/jina-embeddings-v3"] - -_MODELS_TO_CACHE = ("BAAI/bge-small-en-v1.5",) -MODELS_TO_CACHE = tuple([x.lower() for x in _MODELS_TO_CACHE]) - - -@pytest.fixture(scope="module") -def model_cache(): - is_ci = os.getenv("CI") - cache = {} - - @contextmanager - def get_model(model_name: str): - lowercase_model_name = model_name.lower() - if lowercase_model_name not in cache: - cache[lowercase_model_name] = TextEmbedding(lowercase_model_name) - yield cache[lowercase_model_name] - if lowercase_model_name not in MODELS_TO_CACHE: - model_inst = cache.pop(lowercase_model_name) - if is_ci: - delete_model_cache(model_inst.model._model_dir) - del model_inst - - yield get_model - - if is_ci: - for name, model in cache.items(): - delete_model_cache(model.model._model_dir) - cache.clear() - - -@pytest.mark.parametrize("model_name", ["BAAI/bge-small-en-v1.5"]) -def test_embedding(model_cache, model_name: str) -> None: - is_ci = os.getenv("CI") - is_mac = 
platform.system() == "Darwin" - is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch" - - for model_desc in TextEmbedding._list_supported_models(): - if model_desc.model in MULTI_TASK_MODELS or ( - is_mac and model_desc.model == "nomic-ai/nomic-embed-text-v1.5-Q" - ): - continue - if not should_test_model(model_desc, model_name, is_ci, is_manual): - continue - - dim = model_desc.dim - - with model_cache(model_desc.model) as model: - docs = ["hello world", "flag embedding"] - embeddings = list(model.embed(docs)) - embeddings = np.stack(embeddings, axis=0) - assert embeddings.shape == (2, dim) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_desc.model] - assert np.allclose( - embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3 - ), model_desc.model - - -@pytest.mark.parametrize("n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5")]) -def test_batch_embedding(model_cache, n_dims: int, model_name: str) -> None: - with model_cache(model_name) as model: - docs = ["hello world", "flag embedding"] * 100 - embeddings = list(model.embed(docs, batch_size=10)) - embeddings = np.stack(embeddings, axis=0) - - assert embeddings.shape == (len(docs), n_dims) - - -@pytest.mark.parametrize("n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5")]) -def test_parallel_processing(model_cache, n_dims: int, model_name: str) -> None: - with model_cache(model_name) as model: - docs = ["hello world", "flag embedding"] * 100 - embeddings = list(model.embed(docs, batch_size=10, parallel=2)) - embeddings = np.stack(embeddings, axis=0) - - embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None)) - embeddings_2 = np.stack(embeddings_2, axis=0) - - embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0)) - embeddings_3 = np.stack(embeddings_3, axis=0) - - assert embeddings.shape == (len(docs), n_dims) - assert np.allclose(embeddings, embeddings_2, atol=1e-3) - assert np.allclose(embeddings, embeddings_3, atol=1e-3) - - 
-@pytest.mark.parametrize("model_name", ["BAAI/bge-small-en-v1.5"]) -def test_lazy_load(model_name: str) -> None: - is_ci = os.getenv("CI") - model = TextEmbedding(model_name=model_name, lazy_load=True) - assert not hasattr(model.model, "model") - docs = ["hello world", "flag embedding"] - list(model.embed(docs)) - assert hasattr(model.model, "model") - - model = TextEmbedding(model_name=model_name, lazy_load=True) - list(model.query_embed(docs)) - - model = TextEmbedding(model_name=model_name, lazy_load=True) - list(model.passage_embed(docs)) - - if is_ci: - delete_model_cache(model.model._model_dir) - - -def test_get_embedding_size() -> None: - assert TextEmbedding.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2") == 384 - assert TextEmbedding.get_embedding_size("sentence-transformers/all-minilm-l6-v2") == 384 - - -def test_embedding_size() -> None: - is_ci = os.getenv("CI") - model_name = "sentence-transformers/all-MiniLM-L6-v2" - model = TextEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 384 - - model_name = "sentence-transformers/all-minilm-l6-v2" - model = TextEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 384 - - if is_ci: - delete_model_cache(model.model._model_dir) - - -@pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"]) -def test_session_options(model_cache, model_name) -> None: - with model_cache(model_name) as default_model: - default_session_options = default_model.model.model.get_session_options() - assert default_session_options.enable_cpu_mem_arena is True - model = TextEmbedding(model_name=model_name, enable_cpu_mem_arena=False) - session_options = model.model.model.get_session_options() - assert session_options.enable_cpu_mem_arena is False - - -@pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"]) -def test_token_count(model_cache, model_name) -> None: - with model_cache(model_name) as model: - documents = [ 
- "Name me a couple of cities were the capitals of Germany?", - "Berlin is the current capital of Germany, Bonn is a former capital of Germany.", - ] - first_doc_token_count = model.token_count(documents[0]) - second_doc_token_count = model.token_count(documents[1]) - doc_token_count = model.token_count(documents) - assert first_doc_token_count + second_doc_token_count == doc_token_count - assert doc_token_count == model.token_count(documents, batch_size=1) diff --git a/tests/type_stub.py b/tests/type_stub.py deleted file mode 100644 index b22f80d90..000000000 --- a/tests/type_stub.py +++ /dev/null @@ -1,56 +0,0 @@ -from fastembed import TextEmbedding, LateInteractionTextEmbedding, SparseTextEmbedding -from fastembed.sparse.bm25 import Bm25 -from fastembed.rerank.cross_encoder import TextCrossEncoder - - -text_embedder = TextEmbedding(cache_dir="models") -late_interaction_embedder = LateInteractionTextEmbedding(model_name="", cache_dir="models") -reranker = TextCrossEncoder(model_name="", cache_dir="models") -sparse_embedder = SparseTextEmbedding(model_name="", cache_dir="models") -bm25_embedder = Bm25( - model_name="", - k=1.0, - b=1.0, - avg_len=1.0, - language="", - token_max_length=1, - disable_stemmer=False, - specific_model_path="models", -) - -text_embedder.list_supported_models() -text_embedder.embed(documents=[""], batch_size=1, parallel=1) -text_embedder.embed(documents="", parallel=None, task_id=1) -text_embedder.query_embed(query=[""], batch_size=1, parallel=1) -text_embedder.query_embed(query="", parallel=None) -text_embedder.passage_embed(texts=[""], batch_size=1, parallel=1) -text_embedder.passage_embed(texts=[""], parallel=None) - -late_interaction_embedder.list_supported_models() -late_interaction_embedder.embed(documents=[""], batch_size=1, parallel=1) -late_interaction_embedder.embed(documents="", parallel=None) -late_interaction_embedder.query_embed(query=[""], batch_size=1, parallel=1) -late_interaction_embedder.query_embed(query="", 
parallel=None) -late_interaction_embedder.passage_embed(texts=[""], batch_size=1, parallel=1) -late_interaction_embedder.passage_embed(texts=[""], parallel=None) - -reranker.list_supported_models() -reranker.rerank(query="", documents=[""], batch_size=1, parallel=1) -reranker.rerank(query="", documents=[""], parallel=None) -reranker.rerank_pairs(pairs=[("", "")], batch_size=1, parallel=1) -reranker.rerank_pairs(pairs=[("", "")], parallel=None) - -sparse_embedder.list_supported_models() -sparse_embedder.embed(documents=[""], batch_size=1, parallel=1) -sparse_embedder.embed(documents="", batch_size=1, parallel=None) -sparse_embedder.query_embed(query=[""], batch_size=1, parallel=1) -sparse_embedder.query_embed(query="", batch_size=1, parallel=None) -sparse_embedder.passage_embed(texts=[""], batch_size=1, parallel=1) -sparse_embedder.passage_embed(texts=[""], batch_size=1, parallel=None) - -bm25_embedder.list_supported_models() -bm25_embedder.embed(documents=[""], batch_size=1, parallel=1) -bm25_embedder.embed(documents="", batch_size=1, parallel=None) -bm25_embedder.query_embed(query=[""], batch_size=1, parallel=1) -bm25_embedder.query_embed(query="", batch_size=1, parallel=None) -bm25_embedder.raw_embed(documents=[""]) diff --git a/tests/utils.py b/tests/utils.py index 481c05045..193ae36d0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,7 +5,7 @@ from types import TracebackType from typing import Callable, Any, Type -from fastembed.common.model_description import BaseModelDescription +from fastembed_bio.common.model_description import BaseModelDescription def delete_model_cache(model_dir: str | Path) -> None: diff --git a/uv.lock b/uv.lock new file mode 100644 index 000000000..bda020730 --- /dev/null +++ b/uv.lock @@ -0,0 +1,3 @@ +version = 1 +revision = 3 +requires-python = ">=3.13" diff --git a/zensical.toml b/zensical.toml new file mode 100644 index 000000000..fab918af9 --- /dev/null +++ b/zensical.toml @@ -0,0 +1,310 @@ +# 
============================================================================ +# +# The configuration produced by default is meant to highlight the features +# that Zensical provides and to serve as a starting point for your own +# projects. +# +# ============================================================================ + +[project] + +# The site_name is shown in the page header and the browser window title +# +# Read more: https://zensical.org/docs/setup/basics/#site_name +site_name = "Documentation for fastembed-bio" + +# The site_description is included in the HTML head and should contain a +# meaningful description of the site content for use by search engines. +# +# Read more: https://zensical.org/docs/setup/basics/#site_description +site_description = "Fast, lightweight biological sequence embeddings using ONNX" + +# The site_author attribute. This is used in the HTML head element. +# +# Read more: https://zensical.org/docs/setup/basics/#site_author +site_author = "Nathan LeRoy" + +# The site_url is the canonical URL for your site. When building online +# documentation you should set this. +# Read more: https://zensical.org/docs/setup/basics/#site_url +#site_url = "https://www.example.com/" + +# The copyright notice appears in the page footer and can contain an HTML +# fragment. +# +# Read more: https://zensical.org/docs/setup/basics/#copyright +copyright = """ +Copyright © 2026 The authors +""" + +# Zensical supports both implicit navigation and explicitly defined navigation. +# If you decide not to define a navigation here then Zensical will simply +# derive the navigation structure from the directory structure of your +# "docs_dir". The definition below demonstrates how a navigation structure +# can be defined using TOML syntax. 
+# +# Read more: https://zensical.org/docs/setup/navigation/ +nav = [ + { "Home" = "index.md" }, + { "Quickstart" = [ + { "Overview" = "quickstart/index.md" }, + { "DNA Embeddings" = "quickstart/dna.md" }, + { "Protein Embeddings" = "quickstart/protein.md" }, + ]}, +] + +# With the "extra_css" option you can add your own CSS styling to customize +# your Zensical project according to your needs. You can add any number of +# CSS files. +# +# The path provided should be relative to the "docs_dir". +# +# Read more: https://zensical.org/docs/customization/#additional-css +# +#extra_css = ["stylesheets/extra.css"] + +# With the `extra_javascript` option you can add your own JavaScript to your +# project to customize the behavior according to your needs. +# +# The path provided should be relative to the "docs_dir". +# +# Read more: https://zensical.org/docs/customization/#additional-javascript +#extra_javascript = ["javascripts/extra.js"] + +# ---------------------------------------------------------------------------- +# Section for configuring theme options +# ---------------------------------------------------------------------------- +[project.theme] + +# change this to "classic" to use the traditional Material for MkDocs look. +#variant = "classic" + +# Zensical allows you to override specific blocks, partials, or whole +# templates as well as to define your own templates. To do this, uncomment +# the custom_dir setting below and set it to a directory in which you +# keep your template overrides. +# +# Read more: +# - https://zensical.org/docs/customization/#extending-the-theme +# +#custom_dir = "overrides" + +# With the "favicon" option you can set your own image to use as the icon +# browsers will use in the browser title bar or tab bar. The path provided +# must be relative to the "docs_dir". 
+# +# Read more: +# - https://zensical.org/docs/setup/logo-and-icons/#favicon +# - https://developer.mozilla.org/en-US/docs/Glossary/Favicon +# +#favicon = "images/favicon.png" + +# Zensical supports more than 60 different languages. This means that the +# labels and tooltips that Zensical's templates produce are translated. +# The "language" option allows you to set the language used. This language +# is also indicated in the HTML head element to help with accessibility +# and guide search engines and translation tools. +# +# The default language is "en" (English). It is possible to create +# sites with multiple languages and configure a language selector. See +# the documentation for details. +# +# Read more: +# - https://zensical.org/docs/setup/language/ +# +language = "en" + +# Zensical provides a number of feature toggles that change the behavior +# of the documentation site. +features = [ + # Zensical includes an announcement bar. This feature allows users to + # dismiss it when they have read the announcement. + # https://zensical.org/docs/setup/header/#announcement-bar + "announce.dismiss", + + # If you have a repository configured and turn on this feature, Zensical + # will generate an edit button for the page. This works for common + # repository hosting services. + # https://zensical.org/docs/setup/repository/#code-actions + #"content.action.edit", + + # If you have a repository configured and turn on this feature, Zensical + # will generate a button that allows the user to view the Markdown + # code for the current page. + # https://zensical.org/docs/setup/repository/#code-actions + #"content.action.view", + + # Code annotations allow you to add an icon with a tooltip to your + # code blocks to provide explanations at crucial points. 
+ # https://zensical.org/docs/authoring/code-blocks/#code-annotations + "content.code.annotate", + + # This feature turns on a button in code blocks that allows users to + # copy the content to their clipboard without first selecting it. + # https://zensical.org/docs/authoring/code-blocks/#code-copy-button + "content.code.copy", + + # Code blocks can include a button to allow for the selection of line + # ranges by the user. + # https://zensical.org/docs/authoring/code-blocks/#code-selection-button + "content.code.select", + + # Zensical can render footnotes as inline tooltips, so the user can read + # the footnote without leaving the context of the document. + # https://zensical.org/docs/authoring/footnotes/#footnote-tooltips + "content.footnote.tooltips", + + # If you have many content tabs that have the same titles (e.g., "Python", + # "JavaScript", "Cobol"), this feature causes all of them to switch + # at the same time when the user chooses their language in one. + # https://zensical.org/docs/authoring/content-tabs/#linked-content-tabs + "content.tabs.link", + + # TODO: not sure I understand this one? Is there a demo of this in the docs? + # https://zensical.org/docs/authoring/tooltips/#improved-tooltips + "content.tooltips", + + # With this feature enabled, Zensical will automatically hide parts + # of the header when the user scrolls past a certain point. + # https://zensical.org/docs/setup/header/#automatic-hiding + # "header.autohide", + + # Turn on this feature to expand all collapsible sections in the + # navigation sidebar by default. + # https://zensical.org/docs/setup/navigation/#navigation-expansion + # "navigation.expand", + + # This feature turns on navigation elements in the footer that allow the + # user to navigate to a next or previous page. 
+ # https://zensical.org/docs/setup/footer/#navigation + "navigation.footer", + + # When section index pages are enabled, documents can be directly attached + # to sections, which is particularly useful for providing overview pages. + # https://zensical.org/docs/setup/navigation/#section-index-pages + "navigation.indexes", + + # When instant navigation is enabled, clicks on all internal links will be + # intercepted and dispatched via XHR without fully reloading the page. + # https://zensical.org/docs/setup/navigation/#instant-navigation + "navigation.instant", + + # With instant prefetching, your site will start to fetch a page once the + # user hovers over a link. This will reduce the perceived loading time + # for the user. + # https://zensical.org/docs/setup/navigation/#instant-prefetching + "navigation.instant.prefetch", + + # In order to provide a better user experience on slow connections when + # using instant navigation, a progress indicator can be enabled. + # https://zensical.org/docs/setup/navigation/#progress-indicator + #"navigation.instant.progress", + + # When navigation paths are activated, a breadcrumb navigation is rendered + # above the title of each page + # https://zensical.org/docs/setup/navigation/#navigation-path + "navigation.path", + + # When pruning is enabled, only the visible navigation items are included + # in the rendered HTML, reducing the size of the built site by 33% or more. + # https://zensical.org/docs/setup/navigation/#navigation-pruning + #"navigation.prune", + + # When sections are enabled, top-level sections are rendered as groups in + # the sidebar for viewports above 1220px, but remain as-is on mobile. + # https://zensical.org/docs/setup/navigation/#navigation-sections + "navigation.sections", + + # When tabs are enabled, top-level sections are rendered in a menu layer + # below the header for viewports above 1220px, but remain as-is on mobile. 
+ # https://zensical.org/docs/setup/navigation/#navigation-tabs + #"navigation.tabs", + + # When sticky tabs are enabled, navigation tabs will lock below the header + # and always remain visible when scrolling down. + # https://zensical.org/docs/setup/navigation/#sticky-navigation-tabs + #"navigation.tabs.sticky", + + # A back-to-top button can be shown when the user, after scrolling down, + # starts to scroll up again. + # https://zensical.org/docs/setup/navigation/#back-to-top-button + "navigation.top", + + # When anchor tracking is enabled, the URL in the address bar is + # automatically updated with the active anchor as highlighted in the table + # of contents. + # https://zensical.org/docs/setup/navigation/#anchor-tracking + "navigation.tracking", + + # When search highlighting is enabled and a user clicks on a search result, + # Zensical will highlight all occurrences after following the link. + # https://zensical.org/docs/setup/search/#search-highlighting + "search.highlight", + + # When anchor following for the table of contents is enabled, the sidebar + # is automatically scrolled so that the active anchor is always visible. + # https://zensical.org/docs/setup/navigation/#anchor-following + # "toc.follow", + + # When navigation integration for the table of contents is enabled, it is + # always rendered as part of the navigation sidebar on the left. + # https://zensical.org/docs/setup/navigation/#navigation-integration + #"toc.integrate", +] + +# ---------------------------------------------------------------------------- +# In the "palette" subsection you can configure options for the color scheme. +# You can configure different color schemes, e.g., to turn on dark mode, +# that the user can switch between. Each color scheme can be further +# customized. 
+# +# Read more: +# - https://zensical.org/docs/setup/colors/ +# ---------------------------------------------------------------------------- +[[project.theme.palette]] +scheme = "default" +toggle.icon = "lucide/sun" +toggle.name = "Switch to dark mode" + +[[project.theme.palette]] +scheme = "slate" +toggle.icon = "lucide/moon" +toggle.name = "Switch to light mode" + +# ---------------------------------------------------------------------------- +# In the "font" subsection you can configure the fonts used. By default, fonts +# are loaded from Google Fonts, giving you a wide range of choices from a set +# of suitably licensed fonts. There are options for a normal text font and for +# a monospaced font used in code blocks. +# ---------------------------------------------------------------------------- +#[project.theme.font] +#text = "Inter" +#code = "Jetbrains Mono" + +# ---------------------------------------------------------------------------- +# You can configure your own logo to be shown in the header using the "logo" +# option in the "icons" subsection. The logo can be a path to a file in your +# "docs_dir" or it can be a path to an icon. +# +# Likewise, you can customize the logo used for the repository section of the +# header. Zensical derives the default logo for this from the repository URL. +# See below... +# +# There are other icons you can customize. See the documentation for details. +# +# Read more: +# - https://zensical.org/docs/setup/logo-and-icons +# - https://zensical.org/docs/authoring/icons-emojis/#search +# ---------------------------------------------------------------------------- +#[project.theme.icon] +#logo = "lucide/smile" +#repo = "lucide/smile" + +# ---------------------------------------------------------------------------- +# The "extra" section contains miscellaneous settings. 
+# ---------------------------------------------------------------------------- +#[[project.extra.social]] +#icon = "fontawesome/brands/github" +#link = "https://github.com/user/repo" + +