Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/add text splitter #256

Open
wants to merge 40 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
2b07ad0
wip on text splitter
JoaquinIglesiasTurina Feb 9, 2025
8ebc780
first text passes
JoaquinIglesiasTurina Feb 9, 2025
cccc70f
add new test and fix assert order
JoaquinIglesiasTurina Feb 9, 2025
1e3c4ce
add module record
JoaquinIglesiasTurina Feb 9, 2025
7e16283
functions use module record and are parametrizable
JoaquinIglesiasTurina Feb 9, 2025
038ffe4
test cleanup
JoaquinIglesiasTurina Feb 9, 2025
3e6b681
added more tests and ran formatter
JoaquinIglesiasTurina Feb 9, 2025
3f50b4a
more tests
JoaquinIglesiasTurina Feb 9, 2025
f29694c
working in failing test
JoaquinIglesiasTurina Feb 9, 2025
1cc252e
all tests working
JoaquinIglesiasTurina Feb 9, 2025
aeac96b
added next test
JoaquinIglesiasTurina Feb 9, 2025
088cc97
add keep_separator param and test
JoaquinIglesiasTurina Feb 9, 2025
74bd284
all character splitter options done
JoaquinIglesiasTurina Feb 10, 2025
cd6a167
separate base from character text splitter
JoaquinIglesiasTurina Feb 12, 2025
2cb8223
Merge pull request #1 from JoaquinIglesiasTurina/feat/refactor-for-in…
JoaquinIglesiasTurina Feb 13, 2025
6fd121e
text splitter
JoaquinIglesiasTurina Feb 13, 2025
2b443eb
Update text_splitter.ex
brainlid Feb 14, 2025
e604b8f
Merge branch 'feat/refactor-for-inheritance' into feat/add-text-splitter
JoaquinIglesiasTurina Feb 14, 2025
0ab9e80
move docs to correct file
JoaquinIglesiasTurina Feb 14, 2025
0981a42
add documentation, examples and doctests
JoaquinIglesiasTurina Feb 16, 2025
8372636
better wording and generate new doc section for text splitter
JoaquinIglesiasTurina Feb 16, 2025
8197fdd
fomatted code
JoaquinIglesiasTurina Feb 16, 2025
6abb643
first test is ready
JoaquinIglesiasTurina Feb 20, 2025
034eba4
recursive_character_text_splitter module ready
JoaquinIglesiasTurina Feb 20, 2025
f3c49be
first green test for recursive test splitter
JoaquinIglesiasTurina Feb 22, 2025
00e82d6
add iterative text splitter test
JoaquinIglesiasTurina Feb 22, 2025
1aed409
refactor tests
JoaquinIglesiasTurina Feb 22, 2025
690e6b0
add new test and fix assert order
JoaquinIglesiasTurina Feb 23, 2025
c9a70f9
join docs with unescaped separator
JoaquinIglesiasTurina Feb 23, 2025
9d4beeb
fix cond to if and empty values
JoaquinIglesiasTurina Feb 23, 2025
ea84383
trim join docs. add separator parameter to merge
JoaquinIglesiasTurina Feb 23, 2025
5b533b2
join_docs returns nil on empty string
JoaquinIglesiasTurina Feb 23, 2025
335b83b
pass separator as parameter
JoaquinIglesiasTurina Feb 23, 2025
a8ff215
clean up and filter nil output
JoaquinIglesiasTurina Feb 23, 2025
6fc3d50
add python tests
JoaquinIglesiasTurina Feb 23, 2025
96512f7
add language separators
JoaquinIglesiasTurina Feb 23, 2025
766bae6
fix rst separators
JoaquinIglesiasTurina Feb 23, 2025
2bb5cbe
all recursive text splitter tests ported
JoaquinIglesiasTurina Feb 23, 2025
8e75eac
format
JoaquinIglesiasTurina Feb 23, 2025
d740284
fix typo
JoaquinIglesiasTurina Feb 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix rst separators
  • Loading branch information
JoaquinIglesiasTurina committed Feb 23, 2025
commit 766bae67d76ef8f36b43760e7f7f666db10104a7
2 changes: 1 addition & 1 deletion lib/text_splitter/language_separators.ex
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ defmodule LangChain.TextSplitter.LanguageSeparators do
"\n-+\n",
"\n\\*+\n",
# Split along directive markers
"\n\n.. *\n\n",
"\n\n.. \*\n\n",
# Split by the normal type of lines
"\n\n",
"\n",
Expand Down
284 changes: 282 additions & 2 deletions test/text_splitter_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ defmodule TextSplitterTest do
alias LangChain.TextSplitter.LanguageSeparators
doctest CharacterTextSplitter

@chunk_size 16

describe "CharacterTextSplitter" do
test "New TextSplitter" do
expected_splitter = %CharacterTextSplitter{
Expand Down Expand Up @@ -300,7 +302,7 @@ Bye!\n\n-I."
end
end

describe "Language splitters" do
describe "Programming languages splitters" do
test "Python test splitter" do
fake_python_text = """
class Foo:
Expand Down Expand Up @@ -359,7 +361,285 @@ hello_world()
RecursiveCharacterTextSplitter.new!(%{
separators: LanguageSeparators.python(),
keep_separator: :start,
chunk_size: 16,
chunk_size: @chunk_size,
chunk_overlap: 0
})

splits =
splitter
|> RecursiveCharacterTextSplitter.split_text(code)

assert splits == expected_splits
end

test "Golang splitting" do
  # Recursive splitter driven by the Go separator list, configured with
  # `keep_separator: :start`, the shared @chunk_size and no chunk overlap.
  go_splitter =
    RecursiveCharacterTextSplitter.new!(%{
      chunk_size: @chunk_size,
      chunk_overlap: 0,
      keep_separator: :start,
      separators: LanguageSeparators.go()
    })

  # Small Go program used as the splitter input.
  go_source = "
package main

import \"fmt\"

func helloWorld() {
fmt.Println(\"Hello, World!\")
}

func main() {
helloWorld()
}
"

  # Chunks the splitter is expected to return, in order.
  expected_chunks = [
    "package main",
    "import \"fmt\"",
    "func",
    "helloWorld() {",
    "fmt.Println(\"He",
    "llo,",
    "World!\")",
    "}",
    "func main() {",
    "helloWorld()",
    "}"
  ]

  actual_chunks = RecursiveCharacterTextSplitter.split_text(go_splitter, go_source)

  assert actual_chunks == expected_chunks
end

@tag :wip
test "Rst splitting" do
  # Recursive splitter driven by the RST separator list. The separators are
  # passed with `is_separator_regex: true`, so they are treated as regexes.
  rst_splitter =
    RecursiveCharacterTextSplitter.new!(%{
      chunk_size: @chunk_size,
      chunk_overlap: 0,
      keep_separator: :start,
      is_separator_regex: true,
      separators: LanguageSeparators.rst()
    })

  # Scenario 1: a full document with titles, a list and a comment line.
  rst_document = "
Sample Document
===============

Section
-------

This is the content of the section.

Lists
-------

- Item 1
- Item 2
- Item 3

Comment
*******
Not a comment

.. This is a comment
"

  expected_document_chunks = [
    "Sample Document",
    "===============",
    "Section",
    "-------",
    "This is the",
    "content of the",
    "section.",
    "Lists",
    "-------",
    "- Item 1",
    "- Item 2",
    "- Item 3",
    "Comment",
    "*******",
    "Not a comment",
    ".. This is a",
    "comment"
  ]

  document_chunks = RecursiveCharacterTextSplitter.split_text(rst_splitter, rst_document)

  assert document_chunks == expected_document_chunks

  # Scenario 2: a short snippet containing a line of asterisks, split with the
  # same splitter.
  asterisk_snippet = "harry\n***\nbabylon is"

  snippet_chunks = RecursiveCharacterTextSplitter.split_text(rst_splitter, asterisk_snippet)

  assert snippet_chunks == ["harry", "***\nbabylon is"]
end

test "Proto splitting" do
  # Recursive splitter driven by the Protocol Buffers separator list,
  # configured with `keep_separator: :start`, the shared @chunk_size and no
  # chunk overlap.
  proto_splitter =
    RecursiveCharacterTextSplitter.new!(%{
      chunk_size: @chunk_size,
      chunk_overlap: 0,
      keep_separator: :start,
      separators: LanguageSeparators.proto()
    })

  # Small proto3 definition used as the splitter input.
  proto_source = "
syntax = \"proto3\";

package example;

message Person {
string name = 1;
int32 age = 2;
repeated string hobbies = 3;
}
"

  # Chunks the splitter is expected to return, in order.
  expected_chunks = [
    "syntax =",
    "\"proto3\";",
    "package",
    "example;",
    "message Person",
    "{",
    "string name",
    "= 1;",
    "int32 age =",
    "2;",
    "repeated",
    "string hobbies",
    "= 3;",
    "}"
  ]

  actual_chunks = RecursiveCharacterTextSplitter.split_text(proto_splitter, proto_source)

  assert actual_chunks == expected_chunks
end

test "JavaScript splitting" do
  # Splits a small JavaScript snippet with the JS separator list and checks
  # the resulting chunks. The splitter is configured with
  # `keep_separator: :start`, the shared @chunk_size and no chunk overlap.
  code = "
function helloWorld() {
console.log(\"Hello, World!\");
}

// Call the function
helloWorld();
"

  expected_splits = [
    "function",
    "helloWorld() {",
    "console.log(\"He",
    "llo,",
    "World!\");",
    "}",
    "// Call the",
    "function",
    "helloWorld();"
  ]

  splitter =
    RecursiveCharacterTextSplitter.new!(%{
      separators: LanguageSeparators.js(),
      keep_separator: :start,
      chunk_size: @chunk_size,
      chunk_overlap: 0
    })

  # Direct call instead of a single-step pipe.
  splits = RecursiveCharacterTextSplitter.split_text(splitter, code)

  assert splits == expected_splits
end

test "Cobol splitting" do
  # Recursive splitter driven by the COBOL separator list, configured with
  # `keep_separator: :start`, the shared @chunk_size and no chunk overlap.
  cobol_splitter =
    RecursiveCharacterTextSplitter.new!(%{
      chunk_size: @chunk_size,
      chunk_overlap: 0,
      keep_separator: :start,
      separators: LanguageSeparators.cobol()
    })

  # Small COBOL program used as the splitter input.
  cobol_source = "
IDENTIFICATION DIVISION.
PROGRAM-ID. HelloWorld.
DATA DIVISION.
WORKING-STORAGE SECTION.
01 GREETING PIC X(12) VALUE 'Hello, World!'.
PROCEDURE DIVISION.
DISPLAY GREETING.
STOP RUN.
"

  # Chunks the splitter is expected to return, in order.
  expected_chunks = [
    "IDENTIFICATION",
    "DIVISION.",
    "PROGRAM-ID.",
    "HelloWorld.",
    "DATA DIVISION.",
    "WORKING-STORAGE",
    "SECTION.",
    "01 GREETING",
    "PIC X(12)",
    "VALUE 'Hello,",
    "World!'.",
    "PROCEDURE",
    "DIVISION.",
    "DISPLAY",
    "GREETING.",
    "STOP RUN."
  ]

  actual_chunks = RecursiveCharacterTextSplitter.split_text(cobol_splitter, cobol_source)

  assert actual_chunks == expected_chunks
end

test "Typescript splitting" do
code = "
function helloWorld(): void {
console.log(\"Hello, World!\");
}

// Call the function
helloWorld();
"

expected_splits = [
"function",
"helloWorld():",
"void {",
"console.log(\"He",
"llo,",
"World!\");",
"}",
"// Call the",
"function",
"helloWorld();"
]

splitter =
RecursiveCharacterTextSplitter.new!(%{
separators: LanguageSeparators.ts(),
keep_separator: :start,
chunk_size: @chunk_size,
chunk_overlap: 0
})

Expand Down