Skip to content

Commit

Permalink
chunking, seq len
Browse files Browse the repository at this point in the history
  • Loading branch information
VomV committed Jul 30, 2024
1 parent b970be8 commit 1f53c84
Showing 1 changed file with 178 additions and 5 deletions.
183 changes: 178 additions & 5 deletions rag_basic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -25,6 +25,8 @@
"from langchain.docstore.document import Document as LangchainDocument\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"pd.set_option('display.max_colwidth', None)"
]
},
Expand Down Expand Up @@ -73,11 +75,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"MARDOWN_SEP = [\n",
"MARKDOWN_SEP = [\n",
" \"\\n#{1,6} \",\n",
" \"```\\n\",\n",
" \"\\n\\\\*\\\\*\\\\*+\\n\",\n",
Expand All @@ -87,14 +89,185 @@
" \"\\n\",\n",
" \" \",\n",
" \"\"\n",
" ]"
" ]\n",
"\n",
"# Split every raw document into overlapping chunks, preferring Markdown structure\n",
"# (headings, code fences, horizontal rules) before falling back to newlines/spaces.\n",
"# NOTE(review): chunk_size/chunk_overlap use the splitter's default length function,\n",
"# which presumably counts characters rather than model tokens; confirm against the docs.\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
"    chunk_size=1000,\n",
"    chunk_overlap=100,\n",
"    add_start_index=True,\n",
"    strip_whitespace=True,\n",
"    separators=MARKDOWN_SEP,\n",
"    # MARKDOWN_SEP entries are regex patterns; without this flag they are escaped\n",
"    # and matched as literal text, so the heading/rule separators never match.\n",
"    is_separator_regex=True,\n",
")\n",
"\n",
"# split_documents accepts the whole iterable, so no per-document loop is needed.\n",
"docs_processed = text_splitter.split_documents(RAW_KB)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "85816b3c9d0040d78b991263e64d84d8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"modules.json: 0%| | 0.00/385 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "36edd1aaa4e84415ae247f46dced58e6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"README.md: 0%| | 0.00/68.1k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "13c19a3b738e4cca8bdbeb29ceb5d186",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"sentence_bert_config.json: 0%| | 0.00/57.0 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b75089dc762f442fa849ba87b952a942",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"config.json: 0%| | 0.00/583 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a58e8c57927a443c87ba687a9e54f519",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model.safetensors: 0%| | 0.00/66.7M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89e6ea6e67cb4062aaafbacd7f2eec9c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/394 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f3fb8f737b924f7a8c62d3f76b051f6c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ca87feaf0aa4ed1ab8ffbe5f0a48719",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/712k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "039d38b9fd7c4469a697ae590b0da7ce",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/125 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1fc2aa1dc77e4eecb716bfb58c0f84f9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model's max seq len: 512\n"
]
}
],
"source": [
"# Inner literal uses single quotes so the f-string also parses on Python < 3.12.\n",
"print(f\"Model's max seq len: {SentenceTransformer('thenlper/gte-small').max_seq_length}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Chunking"
"Chunk size should be smaller than the model's maximum sequence length; otherwise, the chunk embeddings will lose relevance."
]
},
{
Expand Down

0 comments on commit 1f53c84

Please sign in to comment.