Skip to content

Commit

Permalink
Add similarity_search_with_normalized_similarities (langchain-ai#2916)
Browse files Browse the repository at this point in the history
Add a method that exposes a similarity search with corresponding
normalized similarity scores. Implement only for FAISS now.

### Motivation:

Some memory definitions combine `relevance` with other scores, like
recency, importance, etc.

While many (but not all) of the `VectorStore`s expose a
`similarity_search_with_score` method, they don't all interpret the
units of that score the same way (it depends on the distance metric and
whether or not the embeddings are normalized).

This PR proposes a `similarity_search_with_normalized_similarities`
method that lets consumers of the vector store not have to worry about
the metric and embedding scale.

*Most providers default to euclidean distance, with Pinecone being one
exception (defaults to cosine _similarity_).*

---------

Co-authored-by: Harrison Chase <[email protected]>
  • Loading branch information
vowelparrot and hwchase17 authored Apr 16, 2023
1 parent b9db204 commit 4ffc58e
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 4 deletions.
37 changes: 36 additions & 1 deletion langchain/vectorstores/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import asyncio
from abc import ABC, abstractmethod
from functools import partial
from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar

from pydantic import BaseModel, Field, root_validator

Expand Down Expand Up @@ -81,6 +81,41 @@ def similarity_search(
) -> List[Document]:
"""Return docs most similar to query."""

def similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs and relevance scores in the range [0, 1].

    0 is dissimilar, 1 is most similar.

    This is the public entry point: it delegates the actual search to
    ``self._similarity_search_with_relevance_scores`` and then validates
    every score that comes back.

    Args:
        query: Query text, forwarded unchanged to the hook.
        k: Forwarded to the hook as its ``k`` keyword argument.
        **kwargs: Extra keyword arguments forwarded unchanged to the hook.

    Returns:
        The list of ``(document, score)`` pairs produced by the hook,
        returned as-is once validated.

    Raises:
        ValueError: If any score is not within [0, 1] (NaN included).
    """
    docs_and_similarities = self._similarity_search_with_relevance_scores(
        query, k=k, **kwargs
    )
    # A negated chained comparison is used on purpose: NaN compares False
    # against every bound, so "s < 0.0 or s > 1.0" would let it through.
    if any(
        not 0.0 <= similarity <= 1.0 for _, similarity in docs_and_similarities
    ):
        raise ValueError(
            "Relevance scores must be between"
            f" 0 and 1, got {docs_and_similarities}"
        )
    return docs_and_similarities

def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Store-specific hook behind ``similarity_search_with_relevance_scores``.

    An implementation is expected to return ``(document, score)`` pairs
    whose scores are normalized on a scale from 0 to 1, where 0 is
    dissimilar and 1 is most similar.

    This base version provides no implementation.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

async def asimilarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
Expand Down
52 changes: 49 additions & 3 deletions langchain/vectorstores/faiss.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Wrapper around FAISS vector database."""
from __future__ import annotations

import math
import pickle
import uuid
from pathlib import Path
Expand Down Expand Up @@ -29,6 +30,20 @@ def dependable_faiss_import() -> Any:
return faiss


def _default_relevance_score_fn(score: float) -> float:
    """Convert a distance-style score into a similarity score.

    The mapping is linear: a score of 0 becomes 1.0 and a score of
    sqrt(2) becomes 0.0.
    """
    # There is no single 'correct' relevance function. The right choice
    # depends on, among other things:
    # - the distance / similarity metric used by the VectorStore
    # - the scale of the embeddings (OpenAI's are unit normed. Many others
    #   are not!)
    # - embedding dimensionality
    # This default treats the score as a euclidean distance between
    # normalized embeddings, taking 0 as most similar and sqrt(2) as most
    # dissimilar.
    # NOTE(review): nothing here clamps the result, so any score above
    # sqrt(2) yields a negative value - confirm the index in use cannot
    # produce such scores.
    most_dissimilar = math.sqrt(2)
    return 1.0 - score / most_dissimilar


class FAISS(VectorStore):
"""Wrapper around FAISS vector database.
Expand All @@ -48,12 +63,16 @@ def __init__(
index: Any,
docstore: Docstore,
index_to_docstore_id: Dict[int, str],
relevance_score_fn: Optional[
Callable[[float], float]
] = _default_relevance_score_fn,
):
"""Initialize with necessary components."""
self.embedding_function = embedding_function
self.index = index
self.docstore = docstore
self.index_to_docstore_id = index_to_docstore_id
self.relevance_score_fn = relevance_score_fn

def __add(
self,
Expand Down Expand Up @@ -318,7 +337,7 @@ def __from(
docstore = InMemoryDocstore(
{index_to_id[i]: doc for i, doc in enumerate(documents)}
)
return cls(embedding.embed_query, index, docstore, index_to_id)
return cls(embedding.embed_query, index, docstore, index_to_id, **kwargs)

@classmethod
def from_texts(
Expand Down Expand Up @@ -346,7 +365,13 @@ def from_texts(
faiss = FAISS.from_texts(texts, embeddings)
"""
embeddings = embedding.embed_documents(texts)
return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)
return cls.__from(
texts,
embeddings,
embedding,
metadatas,
**kwargs,
)

@classmethod
def from_embeddings(
Expand Down Expand Up @@ -375,7 +400,13 @@ def from_embeddings(
"""
texts = [t[0] for t in text_embeddings]
embeddings = [t[1] for t in text_embeddings]
return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)
return cls.__from(
texts,
embeddings,
embedding,
metadatas,
**kwargs,
)

def save_local(self, folder_path: str, index_name: str = "index") -> None:
"""Save FAISS index, docstore, and index_to_docstore_id to disk.
Expand Down Expand Up @@ -421,3 +452,18 @@ def load_local(
with open(path / "{index_name}.pkl".format(index_name=index_name), "rb") as f:
docstore, index_to_docstore_id = pickle.load(f)
return cls(embeddings.embed_query, index, docstore, index_to_docstore_id)

def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs and their scores converted by ``self.relevance_score_fn``.

    Runs ``self.similarity_search_with_score(query, k=k)`` and passes each
    raw score through ``self.relevance_score_fn``. The intent is a score on
    a scale from 0 to 1, but the actual range is whatever that function
    returns.

    Args:
        query: Query text passed to ``similarity_search_with_score``.
        k: Passed to ``similarity_search_with_score`` as ``k``.
        **kwargs: Accepted for signature compatibility; not forwarded.

    Returns:
        A list of ``(document, converted score)`` pairs, in the order
        returned by ``similarity_search_with_score``.

    Raises:
        ValueError: If ``self.relevance_score_fn`` is None.
    """
    score_fn = self.relevance_score_fn
    if score_fn is None:
        # The message names the attribute/constructor argument that is
        # actually checked here, so the error is actionable.
        raise ValueError(
            "relevance_score_fn must be provided to"
            " FAISS constructor to normalize scores"
        )
    docs_and_scores = self.similarity_search_with_score(query, k=k)
    return [(doc, score_fn(score)) for doc, score in docs_and_scores]
35 changes: 35 additions & 0 deletions tests/integration_tests/vectorstores/test_faiss.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test FAISS functionality."""
import math
import tempfile

import pytest
Expand Down Expand Up @@ -109,3 +110,37 @@ def test_faiss_local_save_load() -> None:
docsearch.save_local(temp_file.name)
new_docsearch = FAISS.load_local(temp_file.name, FakeEmbeddings())
assert new_docsearch.index is not None


def test_faiss_similarity_search_with_relevance_scores() -> None:
    """Test relevance-score search with an explicitly supplied score function."""
    texts = ["foo", "bar", "baz"]
    docsearch = FAISS.from_texts(
        texts,
        FakeEmbeddings(),
        # Extra keyword arguments are forwarded to the FAISS constructor, so
        # the keyword must be the constructor's own name for this argument.
        relevance_score_fn=lambda score: 1.0 - score / math.sqrt(2),
    )
    outputs = docsearch.similarity_search_with_relevance_scores("foo", k=1)
    output, score = outputs[0]
    assert output == Document(page_content="foo")
    assert score == 1.0


def test_faiss_invalid_normalize_fn() -> None:
    """Test that a score function returning out-of-range values is rejected."""
    texts = ["foo", "bar", "baz"]
    docsearch = FAISS.from_texts(
        # The keyword must match the FAISS constructor argument name; 2.0 is
        # deliberately outside the permitted [0, 1] range.
        texts, FakeEmbeddings(), relevance_score_fn=lambda _: 2.0
    )
    # The pattern mirrors the message raised by the range check in
    # VectorStore.similarity_search_with_relevance_scores.
    with pytest.raises(
        ValueError, match="Relevance scores must be between 0 and 1"
    ):
        docsearch.similarity_search_with_relevance_scores("foo", k=1)


def test_missing_normalize_score_fn() -> None:
    """Test doesn't perform similarity search without a relevance score function."""
    texts = ["foo", "bar", "baz"]
    # The FAISS constructor defaults relevance_score_fn to a working function,
    # so the "missing" case has to be requested explicitly with None.
    faiss_instance = FAISS.from_texts(
        texts, FakeEmbeddings(), relevance_score_fn=None
    )
    # Only the call under test sits inside the raises block, so a ValueError
    # raised during setup cannot make the test pass by accident.
    with pytest.raises(ValueError, match="must be provided to"):
        faiss_instance.similarity_search_with_relevance_scores("foo", k=2)

0 comments on commit 4ffc58e

Please sign in to comment.