Remove Langchain dependency (#122)
* remove langchain dependency

* convert embeddings to list

* remove instruction templates

* adapt readme and remove langchain from requirements

* rename and move stuff

* move some more stuff, fix imports

* add __init__ to BM25Retriever

* create FaissRetriever class and add BM25Retriever import

* remove invoke method from MyQdrantSparseVectorRetriever

* fix metadata type annotation

* add TextSplitter base class

* revert accidental edit in request header

* fix details wrt. qdrant-client import

* load embedding model in float32 if device is CPU
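As an illustration of the last point, a hypothetical sketch (not the commit's actual code) of choosing the embedding dtype by device; the model name is an example:

# Hypothetical sketch, not the commit's actual code: load in float32 on CPU,
# half precision on GPU, since float16 is slow or unsupported on many CPUs.
import torch
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float32 if device == "cpu" else torch.float16
model = SentenceTransformer("all-MiniLM-L6-v2", device=device,
                            model_kwargs={"torch_dtype": dtype})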
mamei16 authored Oct 30, 2024
1 parent 51d5657 commit d4d0f57
Showing 17 changed files with 672 additions and 267 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@ This project gives local LLMs the ability to search the web by outputting a spec
command. Once the command has been found in the model output using a regular expression,
[duckduckgo-search](https://pypi.org/project/duckduckgo-search/)
is used to search the web and return a number of result pages. Finally, an
-ensemble of LangChain's [Contextual compression](https://python.langchain.com/docs/modules/data_connection/retrievers/contextual_compression/) and
+ensemble of a dense embedding model and
[Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) (Or alternatively, [SPLADE](https://github.com/naver/splade))
is used to extract the relevant parts (if any) of each web page in the search results
and the results are appended to the model's output.
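As an illustration of the ensemble described in the README excerpt above, here is a minimal hypothetical sketch (not the project's actual retrieval code); the model name and the simple score-averaging scheme are assumptions:

# Hypothetical sketch of a dense + BM25 ensemble; not the project's actual code.
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util


def rank_chunks(query: str, chunks: list, top_k: int = 5) -> list:
    # Dense scores: cosine similarity between query and chunk embeddings.
    model = SentenceTransformer("all-MiniLM-L6-v2")  # example model, an assumption
    dense_scores = util.cos_sim(model.encode(query), model.encode(chunks))[0].tolist()

    # Sparse scores: Okapi BM25 over whitespace-tokenized chunks.
    bm25 = BM25Okapi([c.split() for c in chunks])
    sparse_scores = list(bm25.get_scores(query.split()))

    def normalize(xs):
        lo, hi = min(xs), max(xs)
        return [(x - lo) / (hi - lo + 1e-9) for x in xs]

    # Naive ensemble: average the normalized dense and sparse scores.
    combined = [(d + s) / 2 for d, s in zip(normalize(dense_scores), normalize(sparse_scores))]
    ranked = sorted(zip(combined, chunks), key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in ranked[:top_k]]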
137 changes: 137 additions & 0 deletions chunkers/base_chunker.py
@@ -0,0 +1,137 @@
import warnings
import copy
from typing import Any, List, Literal, Optional, Union, Callable, Iterable, Sequence
from abc import abstractmethod

try:
    from ..utils import Document
except:
    from utils import Document


class TextSplitter:
    """Interface for splitting text into chunks.
    Source: https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/base.py#L30
    """

    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        keep_separator: Union[bool, Literal["start", "end"]] = False,
        add_start_index: bool = False,
        strip_whitespace: bool = True,
    ) -> None:
        """Create a new TextSplitter.
        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                every document
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._length_function = length_function
        self._keep_separator = keep_separator
        self._add_start_index = add_start_index
        self._strip_whitespace = strip_whitespace

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split text into multiple components."""

    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            index = 0
            previous_chunk_len = 0
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    offset = index + previous_chunk_len - self._chunk_overlap
                    index = text.find(chunk, max(0, offset))
                    metadata["start_index"] = index
                    previous_chunk_len = len(chunk)
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents

    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
        return self.create_documents(texts, metadatas=metadatas)

    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        if self._strip_whitespace:
            text = text.strip()
        if text == "":
            return None
        else:
            return text

    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        # We now want to combine these smaller pieces into medium size
        # chunks to send to the LLM.
        separator_len = self._length_function(separator)

        docs = []
        current_doc: List[str] = []
        total = 0
        for d in splits:
            _len = self._length_function(d)
            if (
                total + _len + (separator_len if len(current_doc) > 0 else 0)
                > self._chunk_size
            ):
                if total > self._chunk_size:
                    warnings.warn(
                        f"Created a chunk of size {total}, "
                        f"which is longer than the specified {self._chunk_size}"
                    )
                if len(current_doc) > 0:
                    doc = self._join_docs(current_doc, separator)
                    if doc is not None:
                        docs.append(doc)
                    # Keep on popping if:
                    # - we have a larger chunk than in the chunk overlap
                    # - or if we still have any chunks and the length is long
                    while total > self._chunk_overlap or (
                        total + _len + (separator_len if len(current_doc) > 0 else 0)
                        > self._chunk_size
                        and total > 0
                    ):
                        total -= self._length_function(current_doc[0]) + (
                            separator_len if len(current_doc) > 1 else 0
                        )
                        current_doc = current_doc[1:]
            current_doc.append(d)
            total += _len + (separator_len if len(current_doc) > 1 else 0)
        doc = self._join_docs(current_doc, separator)
        if doc is not None:
            docs.append(doc)
        return docs

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))
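For context, a small hypothetical usage sketch (not part of this commit) showing how a subclass of the TextSplitter interface above might be defined and used; the ParagraphSplitter class is invented for illustration:

# Hypothetical sketch, not part of this commit: a minimal TextSplitter subclass
# that treats each blank-line-separated paragraph as one chunk.
from typing import List

from chunkers.base_chunker import TextSplitter


class ParagraphSplitter(TextSplitter):
    def split_text(self, text: str) -> List[str]:
        # Only split_text needs to be implemented; create_documents() in the
        # base class attaches metadata such as each chunk's start index.
        return [p for p in text.split("\n\n") if p.strip()]


splitter = ParagraphSplitter(chunk_size=1000, add_start_index=True)
docs = splitter.create_documents(["First paragraph.\n\nSecond paragraph."])
print(docs[0].metadata)  # {'start_index': 0}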
102 changes: 102 additions & 0 deletions chunkers/character_chunker.py
@@ -0,0 +1,102 @@
from typing import Any, List, Literal, Optional, Union, Callable
import re

try:
    from ..chunkers.base_chunker import TextSplitter
except:
    from chunkers.base_chunker import TextSplitter


class RecursiveCharacterTextSplitter(TextSplitter):
    """Splits text by recursively looking at characters.
    Recursively tries to split by different characters to find one
    that works.
    Adapted from Langchain:
    https://github.com/langchain-ai/langchain/blob/0606aabfa39acb2ec575ea8bbfa4c8e662a6134f/libs/text-splitters/langchain_text_splitters/character.py#L58
    """
    def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200, length_function: Callable[[str], int] = len,
                 add_start_index: bool = False, strip_whitespace: bool = True, separators: Optional[List[str]] = None,
                 keep_separator: Union[bool, Literal["start", "end"]] = True, is_separator_regex: bool = False,
                 **kwargs: Any) -> None:
        """Create a new TextSplitter."""
        super().__init__(chunk_size, chunk_overlap, length_function, keep_separator, add_start_index, strip_whitespace)
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )

        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks."""
        final_chunks = []
        # Get appropriate separator to use
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                new_separators = separators[i + 1 :]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(text, _separator, self._keep_separator)

        # Now go merging things, recursively splitting longer texts.
        _good_splits = []
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
            else:
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        return final_chunks

    def split_text(self, text: str) -> List[str]:
        return self._split_text(text, self._separators)


def _split_text_with_regex(
    text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
) -> List[str]:
    # Now that we have the separator, split the text
    if separator:
        if keep_separator:
            # The parentheses in the pattern keep the delimiters in the result.
            _splits = re.split(f"({separator})", text)
            splits = (
                ([_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)])
                if keep_separator == "end"
                else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)])
            )
            if len(_splits) % 2 == 0:
                splits += _splits[-1:]
            splits = (
                (splits + [_splits[-1]])
                if keep_separator == "end"
                else ([_splits[0]] + splits)
            )
        else:
            splits = re.split(separator, text)
    else:
        splits = list(text)
    return [s for s in splits if s != ""]
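A brief hypothetical usage sketch (not part of this commit) of the recursive splitter defined above; the sample text and parameters are invented:

# Hypothetical sketch, not part of this commit: split a long page into
# roughly 500-character chunks with 50 characters of overlap.
from chunkers.character_chunker import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
long_text = "\n\n".join(f"Paragraph {i}: " + "lorem ipsum " * 40 for i in range(10))
chunks = splitter.split_text(long_text)

# The splitter prefers paragraph and line boundaries and only falls back to
# word-level splits for pieces that are still too large.
print(len(chunks), max(len(c) for c in chunks))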
77 changes: 21 additions & 56 deletions semantic_chunker.py → chunkers/semantic_chunker.py
@@ -1,14 +1,17 @@
import copy
import re
from typing import Any, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, cast
from typing import Dict, List, Literal, Optional, Tuple, cast

import numpy as np
from langchain_community.utils.math import (
    cosine_similarity,
)
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

try:
    from ..chunkers.character_chunker import RecursiveCharacterTextSplitter
    from ..chunkers.base_chunker import TextSplitter
    from ..utils import Document, cosine_similarity
except:
    from chunkers.character_chunker import RecursiveCharacterTextSplitter
    from chunkers.base_chunker import TextSplitter
    from utils import Document, cosine_similarity


def calculate_cosine_distances(sentence_embeddings) -> np.array:
@@ -31,26 +34,20 @@ def calculate_cosine_distances(sentence_embeddings) -> np.array:
}


class BoundedSemanticChunker(BaseDocumentTransformer):
class BoundedSemanticChunker(TextSplitter):
"""First splits the text using semantic chunking according to the specified
'breakpoint_threshold_amount', but then uses a RecursiveCharacterTextSplitter
to split all chunks that are larger than 'max_chunk_size'.
Adapted from langchain_experimental.text_splitter.SemanticChunker"""

    def __init__(
        self,
        embeddings: Embeddings,
        buffer_size: int = 1,
        add_start_index: bool = False,
        breakpoint_threshold_type: BreakpointThresholdType = "percentile",
        breakpoint_threshold_amount: Optional[float] = None,
        number_of_chunks: Optional[int] = None,
        max_chunk_size: int = 500,
        min_chunk_size: int = 4
    ):
    def __init__(self, embedding_model: SentenceTransformer, buffer_size: int = 1, add_start_index: bool = False,
                 breakpoint_threshold_type: BreakpointThresholdType = "percentile",
                 breakpoint_threshold_amount: Optional[float] = None, number_of_chunks: Optional[int] = None,
                 max_chunk_size: int = 500, min_chunk_size: int = 4):
        super().__init__(add_start_index=add_start_index)
        self._add_start_index = add_start_index
        self.embeddings = embeddings
        self.embedding_model = embedding_model
        self.buffer_size = buffer_size
        self.breakpoint_threshold_type = breakpoint_threshold_type
        self.number_of_chunks = number_of_chunks
Expand All @@ -72,8 +69,9 @@ def _calculate_sentence_distances(
        self, sentences: List[dict]
    ) -> Tuple[List[float], List[dict]]:
        """Split text into multiple components."""
        embeddings = self.embeddings.embed_documents(sentences)
        return calculate_cosine_distances(embeddings)
        sentences = list(map(lambda x: x.replace("\n", " "), sentences))
        embeddings = self.embedding_model.encode(sentences)
        return calculate_cosine_distances(embeddings.tolist())

    def _calculate_breakpoint_threshold(self, distances: np.array, alt_breakpoint_threshold_amount=None) -> float:
        if alt_breakpoint_threshold_amount is None:
@@ -202,36 +200,3 @@ def split_text(
            chunks.extend(recursive_splitter.split_text(bad_sentence))
        return chunks

    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            index = -1
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    index = text.find(chunk, index + 1)
                    metadata["start_index"] = index
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents

    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
        return self.create_documents(texts, metadatas=metadatas)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))
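A hypothetical usage sketch (not part of this commit) of the chunker above after the switch from a LangChain Embeddings object to a SentenceTransformer model; the model name, thresholds, and sample text are assumptions:

# Hypothetical sketch, not part of this commit.
from sentence_transformers import SentenceTransformer

from chunkers.semantic_chunker import BoundedSemanticChunker

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
chunker = BoundedSemanticChunker(embedding_model,
                                 breakpoint_threshold_type="percentile",
                                 breakpoint_threshold_amount=95,
                                 max_chunk_size=500)
page_text = ("The first few sentences stay on one topic. They keep discussing it. "
             "Then the text suddenly switches to something unrelated. The new topic continues here.")
chunks = chunker.split_text(page_text)  # semantic chunks, each at most 500 characters
print(chunks)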



2 changes: 0 additions & 2 deletions environment.yml
@@ -8,8 +8,6 @@ dependencies:
  - pip:
    - duckduckgo_search==6.3.1
    - beautifulsoup4==4.12.3
    - langchain==0.2.1
    - langchain-community==0.2.1
    - unstructured==0.15.13
    - rank_bm25==0.2.2
    - sentence-transformers==3.0.1
17 changes: 0 additions & 17 deletions instruction_templates/Llama-3.yaml

This file was deleted.
