-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
119 lines (96 loc) · 4.17 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict
import os
import pandas as pd
import pdfplumber
import fitz # PyMuPDF
import json
import pytesseract
from PIL import Image
from sentence_transformers import SentenceTransformer
import numpy as np
# FastAPI application object; the startup hook and the /search endpoint in this
# module are registered on it.
app = FastAPI()
# In-memory list of ingested document records. Each record is a dict with the
# keys "type", "content" and "file_path", appended by load_data() at startup.
data_store = []
# Sentence-transformer model shared by every embedding call in this module.
# NOTE: it is instantiated at import time.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # Small, efficient model for embeddings
# ----- Data Ingestion Functions -----
def process_pdf(file_path: str) -> Dict:
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A record with ``type`` set to ``"pdf"``, ``content`` holding the text
        of all pages (each page followed by a newline) and the original
        ``file_path``.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may give None for a page without a text layer
        # (e.g. a scanned image); treat that as empty text instead of
        # failing on ``None + "\n"``.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return {"type": "pdf", "content": "".join(page_texts), "file_path": file_path}
def process_excel(file_path: str) -> Dict:
    """Load an Excel workbook and return its data as a plain dictionary record.

    The ``content`` entry is whatever ``DataFrame.to_dict()`` produces for the
    sheet that ``pandas.read_excel`` reads by default.
    """
    frame = pd.read_excel(file_path)
    record = {"type": "excel"}
    record["content"] = frame.to_dict()
    record["file_path"] = file_path
    return record
def process_text(file_path: str) -> Dict:
    """Read a plain-text file and return it as a document record.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A record with ``type`` set to ``"text"``, ``content`` holding the
        whole file as one string and the original ``file_path``.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # encoding of the machine the service happens to run on.
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return {"type": "text", "content": text, "file_path": file_path}
def process_json(file_path: str) -> Dict:
    """Parse a JSON file and return it as a document record.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A record with ``type`` set to ``"json"``, ``content`` holding the
        parsed JSON value (dict, list, str, number, ...) and the original
        ``file_path``.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Open in binary mode and let the json module detect the encoding
    # (UTF-8 with or without BOM, UTF-16, UTF-32) instead of decoding with
    # the platform's locale encoding.
    with open(file_path, "rb") as f:
        data = json.load(f)
    return {"type": "json", "content": data, "file_path": file_path}
def process_image(file_path: str) -> Dict:
    """Run OCR on an image file and return the recognised text as a record.

    Args:
        file_path: Path of the image to read.

    Returns:
        A record with ``type`` set to ``"image"``, ``content`` holding the
        text produced by ``pytesseract.image_to_string`` and the original
        ``file_path``.
    """
    # Use the image as a context manager so the underlying file handle is
    # closed as soon as OCR has finished instead of lingering until garbage
    # collection.
    with Image.open(file_path) as image:
        text = pytesseract.image_to_string(image)
    return {"type": "image", "content": text, "file_path": file_path}
# Load all data on startup
@app.on_event("startup")
def load_data(directory: str = "path_to_data"):
    """Ingest every supported file found in ``directory`` into ``data_store``.

    Files are dispatched on their extension (compared case-insensitively):
    PDF, Excel (.xlsx/.xls), plain text, JSON and PNG/JPEG images. Files with
    any other extension are ignored.

    Args:
        directory: Folder to scan (non-recursively). Defaults to the
            placeholder ``"path_to_data"``; point it at the real data folder.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    # NOTE(review): recent FastAPI releases recommend lifespan handlers over
    # on_event("startup") -- confirm against the installed version.
    # (suffixes, loader) pairs, checked in order; the first match wins.
    loaders = (
        ((".pdf",), process_pdf),
        ((".xlsx", ".xls"), process_excel),
        ((".txt",), process_text),
        ((".json",), process_json),
        ((".png", ".jpg", ".jpeg"), process_image),
    )
    for filename in os.listdir(directory):
        # Lower-case before matching so "REPORT.PDF" or "scan.JPG" are not
        # silently skipped.
        lowered = filename.lower()
        for suffixes, loader in loaders:
            if lowered.endswith(suffixes):
                data_store.append(loader(os.path.join(directory, filename)))
                break
# ----- Embedding and Search Functions -----
def generate_embedding(text: str) -> np.ndarray:
    """Encode ``text`` with the module-level sentence-transformer model."""
    vector = embedding_model.encode(text)
    return vector
def search_documents(query: str, top_n: int = 5) -> List[Dict]:
    """Rank the documents in ``data_store`` by similarity to ``query``.

    Similarity is the dot product between the query embedding and the
    embedding of each document's content. Excel documents are not embedded
    and always receive a score of 0.

    Args:
        query: Free-text search query.
        top_n: Maximum number of results to return. Values <= 0 yield an
            empty list.

    Returns:
        Up to ``top_n`` dicts of the form ``{"document": record, "score":
        float}``, ordered by descending score.
    """
    if top_n <= 0:
        # A negative slice bound would drop results from the tail instead of
        # returning nothing.
        return []

    query_embedding = generate_embedding(query)
    results = []
    # NOTE(review): every document is re-embedded on each query; consider
    # computing the embeddings once at load time if the store grows.
    for document in data_store:
        if document["type"] == "excel":
            score = 0.0
        else:
            content = document["content"]
            if not isinstance(content, str):
                # JSON documents hold parsed dicts/lists; embed their
                # serialized text rather than handing a container to the
                # encoder.
                content = json.dumps(content, ensure_ascii=False)
            # float() turns the numpy scalar into a plain Python float.
            score = float(np.dot(query_embedding, generate_embedding(content)))
        results.append({"document": document, "score": score})

    # Sort results by similarity score in descending order and return top N
    results.sort(key=lambda item: item["score"], reverse=True)
    return results[:top_n]
# ----- API Models -----
# Request body accepted by POST /search.
class QueryRequest(BaseModel):
    # Free-text query; it is embedded and compared with each stored document.
    query: str
# One entry of the list returned by POST /search.
class DocumentResponse(BaseModel):
    # Value of the record's "type" key ("pdf", "excel", "text", "json" or "image").
    document_type: str
    # Value of the record's "content" key.
    content: str
    # Path the document was loaded from.
    file_path: str
    # Similarity score assigned by search_documents().
    score: float
# ----- API Endpoints -----
def _content_as_text(content) -> str:
    """Return ``content`` as a string so it fits ``DocumentResponse.content``."""
    if isinstance(content, str):
        return content
    try:
        # default=str covers values JSON has no native form for
        # (e.g. timestamps coming out of a spreadsheet).
        return json.dumps(content, ensure_ascii=False, default=str)
    except (TypeError, ValueError):
        # e.g. dict keys that JSON cannot represent; fall back to repr-style text.
        return str(content)


@app.post("/search", response_model=List[DocumentResponse])
def search_data(query_request: QueryRequest):
    """Endpoint to search the databank for relevant documents.

    Args:
        query_request: Request body carrying the free-text ``query``.

    Returns:
        A list of dicts matching ``DocumentResponse``, best match first.

    Raises:
        HTTPException: 404 when the search yields no results.
    """
    results = search_documents(query_request.query)
    if not results:
        raise HTTPException(status_code=404, detail="No relevant data found")
    return [
        {
            "document_type": res["document"]["type"],
            # JSON and Excel records carry dict/list content, which would fail
            # validation against the ``content: str`` response field.
            "content": _content_as_text(res["document"]["content"]),
            "file_path": res["document"]["file_path"],
            # Plain float rather than a numpy scalar.
            "score": float(res["score"]),
        }
        for res in results
    ]
# ----- Run the API -----
# You can run this FastAPI app using Uvicorn from the command line as follows:
# uvicorn main:app --reload