Remove Langchain dependency (#122)
* remove langchain dependency

* convert embeddings to list

* remove instruction templates

* adapt readme and remove langchain from requirements

* rename and move stuff

* move some more stuff, fix imports

* add __init__ to BM25Retriever

* create FaissRetriever class and add BM25Retriever import

* remove invoke method from MyQdrantSparseVectorRetriever

* fix metadata type annotation

* add TextSplitter base class

* revert accidental edit in request header

* fix details wrt. qdrant-client import

* load embedding model in float32 if device is CPU
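As an illustration of the last point, a hypothetical sketch (not the commit's actual code) of choosing the embedding dtype by device; the model name is an example:

# Hypothetical sketch, not the commit's actual code: load in float32 on CPU,
# half precision on GPU, since float16 is slow or unsupported on many CPUs.
import torch
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float32 if device == "cpu" else torch.float16
model = SentenceTransformer("all-MiniLM-L6-v2", device=device,
                            model_kwargs={"torch_dtype": dtype})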
mamei16 authored Oct 30, 2024
1 parent 51d5657 commit d4d0f57
Showing 17 changed files with 672 additions and 267 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@ This project gives local LLMs the ability to search the web by outputting a spec
command. Once the command has been found in the model output using a regular expression,
[duckduckgo-search](https://pypi.org/project/duckduckgo-search/)
is used to search the web and return a number of result pages. Finally, an
-ensemble of LangChain's [Contextual compression](https://python.langchain.com/docs/modules/data_connection/retrievers/contextual_compression/) and
+ensemble of a dense embedding model and
[Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) (Or alternatively, [SPLADE](https://github.com/naver/splade))
is used to extract the relevant parts (if any) of each web page in the search results
and the results are appended to the model's output.
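As an illustration of the ensemble described in the README excerpt above, here is a minimal hypothetical sketch (not the project's actual retrieval code); the model name and the simple score-averaging scheme are assumptions:

# Hypothetical sketch of a dense + BM25 ensemble; not the project's actual code.
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util


def rank_chunks(query: str, chunks: list, top_k: int = 5) -> list:
    # Dense scores: cosine similarity between query and chunk embeddings.
    model = SentenceTransformer("all-MiniLM-L6-v2")  # example model, an assumption
    dense_scores = util.cos_sim(model.encode(query), model.encode(chunks))[0].tolist()

    # Sparse scores: Okapi BM25 over whitespace-tokenized chunks.
    bm25 = BM25Okapi([c.split() for c in chunks])
    sparse_scores = list(bm25.get_scores(query.split()))

    def normalize(xs):
        lo, hi = min(xs), max(xs)
        return [(x - lo) / (hi - lo + 1e-9) for x in xs]

    # Naive ensemble: average the normalized dense and sparse scores.
    combined = [(d + s) / 2 for d, s in zip(normalize(dense_scores), normalize(sparse_scores))]
    ranked = sorted(zip(combined, chunks), key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in ranked[:top_k]]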
137 changes: 137 additions & 0 deletions chunkers/base_chunker.py
@@ -0,0 +1,137 @@
import warnings
import copy
from typing import Any, List, Literal, Optional, Union, Callable, Iterable, Sequence
from abc import abstractmethod

try:
    from ..utils import Document
except:
    from utils import Document


class TextSplitter:
    """Interface for splitting text into chunks.
    Source: https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/base.py#L30
    """

    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        keep_separator: Union[bool, Literal["start", "end"]] = False,
        add_start_index: bool = False,
        strip_whitespace: bool = True,
    ) -> None:
        """Create a new TextSplitter.
        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                every document
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._length_function = length_function
        self._keep_separator = keep_separator
        self._add_start_index = add_start_index
        self._strip_whitespace = strip_whitespace

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split text into multiple components."""

    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            index = 0
            previous_chunk_len = 0
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    offset = index + previous_chunk_len - self._chunk_overlap
                    index = text.find(chunk, max(0, offset))
                    metadata["start_index"] = index
                    previous_chunk_len = len(chunk)
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents

    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
        return self.create_documents(texts, metadatas=metadatas)

    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        if self._strip_whitespace:
            text = text.strip()
        if text == "":
            return None
        else:
            return text

    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        # We now want to combine these smaller pieces into medium size
        # chunks to send to the LLM.
        separator_len = self._length_function(separator)

        docs = []
        current_doc: List[str] = []
        total = 0
        for d in splits:
            _len = self._length_function(d)
            if (
                total + _len + (separator_len if len(current_doc) > 0 else 0)
                > self._chunk_size
            ):
                if total > self._chunk_size:
                    warnings.warn(
                        f"Created a chunk of size {total}, "
                        f"which is longer than the specified {self._chunk_size}"
                    )
                if len(current_doc) > 0:
                    doc = self._join_docs(current_doc, separator)
                    if doc is not None:
                        docs.append(doc)
                    # Keep on popping if:
                    # - we have a larger chunk than in the chunk overlap
                    # - or if we still have any chunks and the length is long
                    while total > self._chunk_overlap or (
                        total + _len + (separator_len if len(current_doc) > 0 else 0)
                        > self._chunk_size
                        and total > 0
                    ):
                        total -= self._length_function(current_doc[0]) + (
                            separator_len if len(current_doc) > 1 else 0
                        )
                        current_doc = current_doc[1:]
            current_doc.append(d)
            total += _len + (separator_len if len(current_doc) > 1 else 0)
        doc = self._join_docs(current_doc, separator)
        if doc is not None:
            docs.append(doc)
        return docs

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))
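For context, a small hypothetical usage sketch (not part of this commit) showing how a subclass of the TextSplitter interface above might be defined and used; the ParagraphSplitter class is invented for illustration:

# Hypothetical sketch, not part of this commit: a minimal TextSplitter subclass
# that treats each blank-line-separated paragraph as one chunk.
from typing import List

from chunkers.base_chunker import TextSplitter


class ParagraphSplitter(TextSplitter):
    def split_text(self, text: str) -> List[str]:
        # Only split_text needs to be implemented; create_documents() in the
        # base class attaches metadata such as each chunk's start index.
        return [p for p in text.split("\n\n") if p.strip()]


splitter = ParagraphSplitter(chunk_size=1000, add_start_index=True)
docs = splitter.create_documents(["First paragraph.\n\nSecond paragraph."])
print(docs[0].metadata)  # {'start_index': 0}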
102 changes: 102 additions & 0 deletions chunkers/character_chunker.py
@@ -0,0 +1,102 @@
from typing import Any, List, Literal, Optional, Union, Callable
import re

try:
    from ..chunkers.base_chunker import TextSplitter
except:
    from chunkers.base_chunker import TextSplitter


class RecursiveCharacterTextSplitter(TextSplitter):
    """Splits text by recursively looking at characters.
    Recursively tries to split by different characters to find one
    that works.
    Adapted from Langchain:
    https://github.com/langchain-ai/langchain/blob/0606aabfa39acb2ec575ea8bbfa4c8e662a6134f/libs/text-splitters/langchain_text_splitters/character.py#L58
    """
    def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200, length_function: Callable[[str], int] = len,
                 add_start_index: bool = False, strip_whitespace: bool = True, separators: Optional[List[str]] = None,
                 keep_separator: Union[bool, Literal["start", "end"]] = True, is_separator_regex: bool = False,
                 **kwargs: Any) -> None:
        """Create a new TextSplitter."""
        super().__init__(chunk_size, chunk_overlap, length_function, keep_separator, add_start_index, strip_whitespace)
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )

        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks."""
        final_chunks = []
        # Get appropriate separator to use
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                new_separators = separators[i + 1 :]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(text, _separator, self._keep_separator)

        # Now go merging things, recursively splitting longer texts.
        _good_splits = []
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
            else:
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        return final_chunks

    def split_text(self, text: str) -> List[str]:
        return self._split_text(text, self._separators)


def _split_text_with_regex(
    text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
) -> List[str]:
    # Now that we have the separator, split the text
    if separator:
        if keep_separator:
            # The parentheses in the pattern keep the delimiters in the result.
            _splits = re.split(f"({separator})", text)
            splits = (
                ([_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)])
                if keep_separator == "end"
                else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)])
            )
            if len(_splits) % 2 == 0:
                splits += _splits[-1:]
            splits = (
                (splits + [_splits[-1]])
                if keep_separator == "end"
                else ([_splits[0]] + splits)
            )
        else:
            splits = re.split(separator, text)
    else:
        splits = list(text)
    return [s for s in splits if s != ""]
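A brief hypothetical usage sketch (not part of this commit) of the recursive splitter defined above; the sample text and parameters are invented:

# Hypothetical sketch, not part of this commit: split a long page into
# roughly 500-character chunks with 50 characters of overlap.
from chunkers.character_chunker import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
long_text = "\n\n".join(f"Paragraph {i}: " + "lorem ipsum " * 40 for i in range(10))
chunks = splitter.split_text(long_text)

# The splitter prefers paragraph and line boundaries and only falls back to
# word-level splits for pieces that are still too large.
print(len(chunks), max(len(c) for c in chunks))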
77 changes: 21 additions & 56 deletions semantic_chunker.py → chunkers/semantic_chunker.py
@@ -1,14 +1,17 @@
import copy
import re
from typing import Any, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, cast
from typing import Dict, List, Literal, Optional, Tuple, cast

import numpy as np
from langchain_community.utils.math import (
    cosine_similarity,
)
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

try:
    from ..chunkers.character_chunker import RecursiveCharacterTextSplitter
    from ..chunkers.base_chunker import TextSplitter
    from ..utils import Document, cosine_similarity
except:
    from chunkers.character_chunker import RecursiveCharacterTextSplitter
    from chunkers.base_chunker import TextSplitter
    from utils import Document, cosine_similarity


def calculate_cosine_distances(sentence_embeddings) -> np.array:
@@ -31,26 +34,20 @@ def calculate_cosine_distances(sentence_embeddings) -> np.array:
}


class BoundedSemanticChunker(BaseDocumentTransformer):
class BoundedSemanticChunker(TextSplitter):
"""First splits the text using semantic chunking according to the specified
'breakpoint_threshold_amount', but then uses a RecursiveCharacterTextSplitter
to split all chunks that are larger than 'max_chunk_size'.
Adapted from langchain_experimental.text_splitter.SemanticChunker"""

    def __init__(
        self,
        embeddings: Embeddings,
        buffer_size: int = 1,
        add_start_index: bool = False,
        breakpoint_threshold_type: BreakpointThresholdType = "percentile",
        breakpoint_threshold_amount: Optional[float] = None,
        number_of_chunks: Optional[int] = None,
        max_chunk_size: int = 500,
        min_chunk_size: int = 4
    ):
    def __init__(self, embedding_model: SentenceTransformer, buffer_size: int = 1, add_start_index: bool = False,
                 breakpoint_threshold_type: BreakpointThresholdType = "percentile",
                 breakpoint_threshold_amount: Optional[float] = None, number_of_chunks: Optional[int] = None,
                 max_chunk_size: int = 500, min_chunk_size: int = 4):
        super().__init__(add_start_index=add_start_index)
        self._add_start_index = add_start_index
        self.embeddings = embeddings
        self.embedding_model = embedding_model
        self.buffer_size = buffer_size
        self.breakpoint_threshold_type = breakpoint_threshold_type
        self.number_of_chunks = number_of_chunks
Expand All @@ -72,8 +69,9 @@ def _calculate_sentence_distances(
        self, sentences: List[dict]
    ) -> Tuple[List[float], List[dict]]:
        """Split text into multiple components."""
        embeddings = self.embeddings.embed_documents(sentences)
        return calculate_cosine_distances(embeddings)
        sentences = list(map(lambda x: x.replace("\n", " "), sentences))
        embeddings = self.embedding_model.encode(sentences)
        return calculate_cosine_distances(embeddings.tolist())

    def _calculate_breakpoint_threshold(self, distances: np.array, alt_breakpoint_threshold_amount=None) -> float:
        if alt_breakpoint_threshold_amount is None:
@@ -202,36 +200,3 @@ def split_text(
            chunks.extend(recursive_splitter.split_text(bad_sentence))
        return chunks

    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            index = -1
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    index = text.find(chunk, index + 1)
                    metadata["start_index"] = index
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents

    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
        return self.create_documents(texts, metadatas=metadatas)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))
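A hypothetical usage sketch (not part of this commit) of the chunker above after the switch from a LangChain Embeddings object to a SentenceTransformer model; the model name, thresholds, and sample text are assumptions:

# Hypothetical sketch, not part of this commit.
from sentence_transformers import SentenceTransformer

from chunkers.semantic_chunker import BoundedSemanticChunker

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
chunker = BoundedSemanticChunker(embedding_model,
                                 breakpoint_threshold_type="percentile",
                                 breakpoint_threshold_amount=95,
                                 max_chunk_size=500)
page_text = ("The first few sentences stay on one topic. They keep discussing it. "
             "Then the text suddenly switches to something unrelated. The new topic continues here.")
chunks = chunker.split_text(page_text)  # semantic chunks, each at most 500 characters
print(chunks)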



2 changes: 0 additions & 2 deletions environment.yml
@@ -8,8 +8,6 @@ dependencies:
  - pip:
    - duckduckgo_search==6.3.1
    - beautifulsoup4==4.12.3
    - langchain==0.2.1
    - langchain-community==0.2.1
    - unstructured==0.15.13
    - rank_bm25==0.2.2
    - sentence-transformers==3.0.1
17 changes: 0 additions & 17 deletions instruction_templates/Llama-3.yaml

This file was deleted.
