example_data_loader.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ================================================== #
# This file is a part of PYGPT package               #
# Website: https://pygpt.net                          #
# GitHub: https://github.com/szczyglis-dev/py-gpt     #
# MIT License                                          #
# Created By : Marcin Szczygliński                     #
# Updated Date: 2024.02.28 05:00:00                    #
# ================================================== #

from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.core import Document
from llama_index.core.readers.base import BaseReader

from pygpt_net.provider.loaders.base import BaseLoader  # <--- data loader must inherit from BaseLoader


class ExampleDataLoader(BaseLoader):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.id = "example_csv"  # identifier for the loader, must be unique
        self.name = "CSV files"  # name of the loader
        self.extensions = ["csv"]  # file extensions that the data loader can handle
        self.type = ["file"]  # allowed types: file, web

    def get(self) -> BaseReader:
        """
        Get data reader instance.

        This is the only required method to implement.
        It must return a BaseReader instance.

        Below is an example of how to return a reader instance for CSV files.
        SimpleCSVReader is a data reader downloaded from the Llama Hub.

        :return: Data reader instance
        """
        print("Using example CSV data loader...")
        return SimpleCSVReader()


class SimpleCSVReader(BaseReader):
    """CSV parser. (downloaded from Llama Hub)

    Args:
        encoding (str): Encoding used to open the file.
            utf-8 by default.
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        encoding: str = "utf-8",
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._encoding = encoding

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file."""
        import csv

        text_list = []
        with open(file, "r", encoding=self._encoding) as fp:
            csv_reader = csv.reader(fp)
            for row in csv_reader:
                text_list.append(", ".join(row))

        if self._concat_rows:
            return [Document(text="\n".join(text_list), extra_info=extra_info or {})]
        else:
            return [
                Document(text=text, extra_info=extra_info or {}) for text in text_list
            ]
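

# ------------------------------------------------------------------ #
# Usage sketch (not part of the loader API): a minimal, hypothetical
# way to exercise the reader directly. "example.csv" is a placeholder
# path, not a file shipped with PyGPT; inside the app the loader would
# normally be selected by its registered extensions rather than run
# like this.
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    reader = ExampleDataLoader().get()  # returns a SimpleCSVReader instance
    docs = reader.load_data(Path("example.csv"))  # one Document (concat_rows=True)
    for doc in docs:
        print(doc.text)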