Update SingleStoreDB vectorstore (langchain-ai#6423)

1. Introduced support for new distance strategies, **DOT_PRODUCT** and **EUCLIDEAN_DISTANCE**, for enhanced flexibility. 2. Implemented a feature to filter results based on metadata fields. 3. Incorporated connection attributes specifying "langchain python sdk" usage for enhanced traceability and debugging. 4. Expanded the suite of integration tests for improved code reliability. 5. Updated the existing notebook with a usage example. @dev2049 --------- Co-authored-by: Volodymyr Tkachuk <[email protected]> Co-authored-by: Harrison Chase <[email protected]>
richstep · Jun 20, 2023 · d2e9b62 · d2e9b62
1 parent 6efd5fa
commit d2e9b62
Show file tree

Hide file tree

Showing 5 changed files with 812 additions and 46 deletions.
diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb b/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb
@@ -5,9 +5,8 @@
    "id": "2b9582dc",
    "metadata": {},
    "source": [
-    "# SingleStoreDB vector search\n",
-    "[SingleStore DB](https://singlestore.com) is a high-performance distributed database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. For a significant duration, it has provided support for vector functions such as [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html), thereby positioning itself as an ideal solution for AI applications that require text similarity matching. \n",
-    "This tutorial illustrates how to utilize the features of the SingleStore DB Vector Store."
+    "# SingleStoreDB\n",
+    "[SingleStoreDB](https://singlestore.com/) is a high-performance distributed SQL database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. It provides vector storage, and vector functions including [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html) and [euclidean_distance](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/euclidean_distance.html), thereby supporting AI applications that require text similarity matching. This tutorial illustrates how to [work with vector data in SingleStoreDB](https://docs.singlestore.com/managed-service/en/developer-resources/functional-extensions/working-with-vector-data.html)."
    ]
   },
   {
@@ -58,10 +57,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load text samples\n",
-    "from langchain.document_loaders import TextLoader\n",
-    "\n",
-    "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+    "# Load text samples \n",
+    "loader = TextLoader('../../../state_of_the_union.txt')\n",
     "documents = loader.load()\n",
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
     "docs = text_splitter.split_documents(documents)\n",
@@ -91,7 +88,7 @@
     "docsearch = SingleStoreDB.from_documents(\n",
     "    docs,\n",
     "    embeddings,\n",
-    "    table_name=\"noteook\",  # use table with a custom name\n",
+    "    table_name = \"notebook\", # use table with a custom name \n",
     ")"
    ]
   },

diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py
@@ -1,6 +1,7 @@
 """Wrapper around SingleStore DB."""
 from __future__ import annotations
 
+import enum
 import json
 from typing import (
     Any,
@@ -20,6 +21,19 @@
 from langchain.vectorstores.base import VectorStore, VectorStoreRetriever
 
 
+class DistanceStrategy(str, enum.Enum):
+    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
+    DOT_PRODUCT = "DOT_PRODUCT"
+
+
+DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT
+
+ORDERING_DIRECTIVE: dict = {
+    DistanceStrategy.EUCLIDEAN_DISTANCE: "",
+    DistanceStrategy.DOT_PRODUCT: "DESC",
+}
+
+
 class SingleStoreDB(VectorStore):
     """
     This class serves as a Pythonic interface to the SingleStore DB database.
@@ -45,6 +59,7 @@ def __init__(
         self,
         embedding: Embeddings,
         *,
+        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
         table_name: str = "embeddings",
         content_field: str = "content",
         metadata_field: str = "metadata",
@@ -59,6 +74,18 @@ def __init__(
         Args:
             embedding (Embeddings): A text embedding model.
 
+            distance_strategy (DistanceStrategy, optional):
+                Determines the strategy employed for calculating
+                the distance between vectors in the embedding space.
+                Defaults to DOT_PRODUCT.
+                Available options are:
+                - DOT_PRODUCT: Computes the scalar product of two vectors.
+                    This is the default behavior
+                - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between
+                    two vectors. This metric considers the geometric distance in
+                    the vector space, and might be more suitable for embeddings
+                    that rely on spatial relationships.
+
             table_name (str, optional): Specifies the name of the table in use.
                 Defaults to "embeddings".
             content_field (str, optional): Specifies the field to store the content.
@@ -137,6 +164,7 @@ def __init__(
 
                 vectorstore = SingleStoreDB(
                     OpenAIEmbeddings(),
+                    distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
                     host="127.0.0.1",
                     port=3306,
                     user="user",
@@ -159,6 +187,7 @@ def __init__(
         """
 
         self.embedding = embedding
+        self.distance_strategy = distance_strategy
         self.table_name = table_name
         self.content_field = content_field
         self.metadata_field = metadata_field
@@ -167,6 +196,17 @@ def __init__(
         """Pass the rest of the kwargs to the connection."""
         self.connection_kwargs = kwargs
 
+        """Add program name and version to connection attributes."""
+        if "conn_attrs" not in self.connection_kwargs:
+            self.connection_kwargs["conn_attrs"] = dict()
+        if "program_name" not in self.connection_kwargs["conn_attrs"]:
+            self.connection_kwargs["conn_attrs"][
+                "program_name"
+            ] = "langchain python sdk"
+            self.connection_kwargs["conn_attrs"][
+                "program_version"
+            ] = "0.0.205"  # the version of SingleStoreDB VectorStore implementation
+
         """Create connection pool."""
         self.connection_pool = QueuePool(
             self._get_connection,
@@ -246,7 +286,7 @@ def add_texts(
         return []
 
     def similarity_search(
-        self, query: str, k: int = 4, **kwargs: Any
+        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
     ) -> List[Document]:
         """Returns the most similar indexed documents to the query text.
 
@@ -255,21 +295,38 @@ def similarity_search(
         Args:
             query (str): The query text for which to find similar documents.
             k (int): The number of documents to return. Default is 4.
+            filter (dict): A dictionary of metadata fields and values to filter by.
 
         Returns:
             List[Document]: A list of documents that are most similar to the query text.
+
+        Examples:
+            .. code-block:: python
+                from langchain.vectorstores import SingleStoreDB
+                from langchain.embeddings import OpenAIEmbeddings
+                s2 = SingleStoreDB.from_documents(
+                    docs,
+                    OpenAIEmbeddings(),
+                    host="username:password@localhost:3306/database"
+                )
+                s2.similarity_search("query text", 1,
+                    {"metadata_field": "metadata_value"})
         """
-        docs_and_scores = self.similarity_search_with_score(query, k=k)
+        docs_and_scores = self.similarity_search_with_score(
+            query=query, k=k, filter=filter
+        )
         return [doc for doc, _ in docs_and_scores]
 
     def similarity_search_with_score(
-        self, query: str, k: int = 4
+        self, query: str, k: int = 4, filter: Optional[dict] = None
     ) -> List[Tuple[Document, float]]:
         """Return docs most similar to query. Uses the configured distance strategy.
 
         Args:
             query: Text to look up documents similar to.
             k: Number of Documents to return. Defaults to 4.
+            filter: A dictionary of metadata fields and values to filter by.
+                    Defaults to None.
 
         Returns:
             List of Documents most similar to the query and score for each
@@ -278,21 +335,52 @@ def similarity_search_with_score(
         embedding = self.embedding.embed_query(query)
         conn = self.connection_pool.connect()
         result = []
+        where_clause: str = ""
+        where_clause_values: List[Any] = []
+        if filter:
+            where_clause = "WHERE "
+            arguments = []
+
+            def build_where_clause(
+                where_clause_values: List[Any],
+                sub_filter: dict,
+                prefix_args: List[str] = [],
+            ) -> None:
+                for key in sub_filter.keys():
+                    if isinstance(sub_filter[key], dict):
+                        build_where_clause(
+                            where_clause_values, sub_filter[key], prefix_args + [key]
+                        )
+                    else:
+                        arguments.append(
+                            "JSON_EXTRACT_JSON({}, {}) = %s".format(
+                                self.metadata_field,
+                                ", ".join(["%s"] * (len(prefix_args) + 1)),
+                            )
+                        )
+                        where_clause_values += prefix_args + [key]
+                        where_clause_values.append(json.dumps(sub_filter[key]))
+
+            build_where_clause(where_clause_values, filter)
+            where_clause += " AND ".join(arguments)
+
         try:
             cur = conn.cursor()
             try:
                 cur.execute(
-                    """SELECT {}, {}, DOT_PRODUCT({}, JSON_ARRAY_PACK(%s)) as __score 
-                    FROM {} ORDER BY __score DESC LIMIT %s""".format(
+                    """SELECT {}, {}, {}({}, JSON_ARRAY_PACK(%s)) as __score
+                    FROM {} {} ORDER BY __score {} LIMIT %s""".format(
                         self.content_field,
                         self.metadata_field,
+                        self.distance_strategy,
                         self.vector_field,
                         self.table_name,
+                        where_clause,
+                        ORDERING_DIRECTIVE[self.distance_strategy],
                     ),
-                    (
-                        "[{}]".format(",".join(map(str, embedding))),
-                        k,
-                    ),
+                    ("[{}]".format(",".join(map(str, embedding))),)
+                    + tuple(where_clause_values)
+                    + (k,),
                 )
 
                 for row in cur.fetchall():
@@ -310,6 +398,7 @@ def from_texts(
         texts: List[str],
         embedding: Embeddings,
         metadatas: Optional[List[dict]] = None,
+        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
         table_name: str = "embeddings",
         content_field: str = "content",
         metadata_field: str = "metadata",
@@ -338,6 +427,7 @@ def from_texts(
 
         instance = cls(
             embedding,
+            distance_strategy=distance_strategy,
             table_name=table_name,
             content_field=content_field,
             metadata_field=metadata_field,