Skip to content

Commit

Permalink
Add telemetry (dgarnitz#91)
Browse files Browse the repository at this point in the history
* implemented posthog telemetry in the api

* pointed towards new endpoint, removed filename, added opt out

* fixed tests

* fixed tests pt II

---------

Co-authored-by: David Garnitz <[email protected]>
  • Loading branch information
dgarnitz and David Garnitz authored Nov 7, 2023
1 parent 4231477 commit 7baa00f
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ deploy/
dist/
build/
vectorflow_ai.egg-info/
*.egg-info/
*.egg-info/
src/api/config.json
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,12 @@ If you wish to validate which chunks you wish to embed, pass a `ChunkValidationU
### S3 Endpoint
VectorFlow is integrated with AWS s3. You can pass a pre-signed s3 URL in the body of the HTTP instead of a file. Use the form field `PreSignedURL` and hit the endpoint `/s3`. This endpoint has the same configuration and restrictions as the `/embed` endpoint.

### Telemetry
VectorFlow uses PostHog to anonymously collect data about usage. This does not collect any personally identifiable information. If you want to disable it, add the following environment variable to your `env_vars.env`:
```
TELEMETRY_DISABLED=True
```

# Contributing

We love feedback from the community. If you have an idea of how to make this project better, we encourage you to open an issue or join our Discord. Please tag `dgarnitz` and `danmeier2`.
Expand Down
9 changes: 8 additions & 1 deletion src/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,15 @@
from pathlib import Path
from llama_index import download_loader
from services.minio.minio_service import create_minio_client
from api.posthog import send_telemetry

auth = Auth()
pipeline = Pipeline()
app = Flask(__name__)
CORS(app)
CORS(app)

logging.basicConfig(filename='./api-log.txt', level=logging.INFO)
logging.basicConfig(filename='./api-errors.txt', level=logging.ERROR)

@app.route("/embed", methods=['POST'])
def embed():
Expand Down Expand Up @@ -70,6 +74,8 @@ def embed():
if file and is_valid_file_type(file):
job = safe_db_operation(job_service.create_job, vectorflow_request, file.filename)
batch_count = process_file(file, vectorflow_request, job.id)
send_telemetry("SINGLE_FILE_UPLOAD_SUCCESS", vectorflow_request)

return jsonify({'message': f"Successfully added {batch_count} batches to the queue", 'JobID': job.id}), 200
else:
return jsonify({'error': 'Uploaded file is not a TXT, PDF, Markdown or DOCX file'}), 400
Expand Down Expand Up @@ -143,6 +149,7 @@ def create_jobs():
pipeline.disconnect()

successfully_uploaded_files[file.filename] = job.id
send_telemetry("MULTI_FILE_UPLOAD_SUCCESS", vectorflow_request)
except Exception as e:
print(f"Error uploading file {file.filename} to min.io, creating job or passing vectorflow request to message broker. \nError: {e}\n\n")
failed_uploads.append(file.filename)
Expand Down
50 changes: 50 additions & 0 deletions src/api/posthog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import sys

# this is needed to import classes from other modules
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))

import json
import uuid
import logging
import datetime
from posthog import Posthog

# Configure the root logger: records at INFO and above are written to ./api-log.txt.
logging.basicConfig(filename='./api-log.txt', level=logging.INFO)
# NOTE(review): logging.basicConfig does nothing once the root logger already
# has handlers, so this second call appears to be a no-op and ./api-errors.txt
# is likely never written. Confirm, and consider attaching a dedicated
# ERROR-level FileHandler instead.
logging.basicConfig(filename='./api-errors.txt', level=logging.ERROR)

# Module-level PostHog client used by send_telemetry; events are sent to the
# eu.posthog.com host with this project API key.
posthog = Posthog(project_api_key='phc_E2V9rY1esOWV6el6WjfGSiwhTj49YFXe8vHv1rcgx9E', host='https://eu.posthog.com')

def get_user_id():
    """Return the anonymous telemetry ID for this installation.

    The ID is a random UUID4 string stored under the ``"user_id"`` key of
    ``config.json`` inside the directory named by the
    ``API_STORAGE_DIRECTORY`` environment variable. It is created and
    written on first use and reused on later calls.

    This function never raises for storage problems. If the environment
    variable is unset, or the file cannot be read, parsed or written, a
    fresh ID is still returned (and a warning is logged when it could not
    be persisted), so telemetry bookkeeping cannot break the caller.

    Returns:
        The stored ``user_id`` value, or a newly generated UUID4 string.
    """
    storage_dir = os.getenv("API_STORAGE_DIRECTORY")
    # os.path.join(None, ...) raises TypeError, so only build the path when
    # the directory is actually configured.
    config_file = os.path.join(storage_dir, "config.json") if storage_dir else None

    if config_file is not None:
        try:
            with open(config_file, "r") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or corrupt file (JSONDecodeError is a
            # ValueError): fall through and regenerate the ID.
            data = None
        # Guard against valid JSON that is not an object (e.g. a list).
        if isinstance(data, dict) and "user_id" in data:
            return data["user_id"]

    user_id = str(uuid.uuid4())
    if config_file is None:
        logging.warning(
            'API_STORAGE_DIRECTORY is not set; telemetry user id will not be persisted'
        )
        return user_id

    try:
        with open(config_file, "w") as f:
            json.dump({"user_id": user_id}, f)
    except OSError as e:
        # An unwritable storage directory should not fail the caller; the ID
        # is simply not reused on the next call.
        logging.warning('Could not persist telemetry user id to %s: %s', config_file, e)
    return user_id

def send_telemetry(event_name, vectorflow_request):
    """Send one anonymous usage event to PostHog on a best-effort basis.

    Telemetry is skipped when the ``TELEMETRY_DISABLED`` environment
    variable is set to anything other than an explicit negative
    (``""``, ``0``, ``false``, ``no``, ``off``; case-insensitive). Any other
    non-empty value still opts out, so ``TELEMETRY_DISABLED=True`` keeps
    working, while ``TELEMETRY_DISABLED=False`` no longer disables
    telemetry by accident.

    Args:
        event_name: Name of the event to record, e.g.
            ``"SINGLE_FILE_UPLOAD_SUCCESS"``.
        vectorflow_request: Request object whose ``vector_db_metadata`` and
            ``embeddings_metadata`` attributes supply the event properties.

    Returns:
        None. Every failure while building or sending the event is logged
        and swallowed so that telemetry can never fail the request that
        triggered it.
    """
    flag = os.getenv("TELEMETRY_DISABLED", "").strip().lower()
    if flag not in ("", "0", "false", "no", "off"):
        return

    try:
        # Everything that can fail (attribute access on the request, reading
        # the user id, the network call) stays inside the try block.
        embeddings = vectorflow_request.embeddings_metadata
        properties = {
            "vector_db_type": vectorflow_request.vector_db_metadata.vector_db_type.value,
            "embeddings_type": embeddings.embeddings_type.value,
            "chunk_size": embeddings.chunk_size,
            "chunk_overlap": embeddings.chunk_overlap,
            "chunk_strategy": embeddings.chunk_strategy.value,
            "time": datetime.datetime.now().strftime('%m/%d/%Y')
        }
        posthog.capture(get_user_id(), event_name, properties)
    except Exception as e:
        logging.error('ERROR sending telemetric data to Posthog. See exception: %s', e)
3 changes: 3 additions & 0 deletions src/api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ anyio==3.7.1
argcomplete==3.1.1
async-timeout==4.0.3
attrs==23.1.0
backoff==2.2.1
beautifulsoup4==4.12.2
blinker==1.6.2
boto3==1.28.27
Expand Down Expand Up @@ -64,6 +65,7 @@ MarkupSafe==2.1.3
marshmallow==3.20.1
matplotlib==3.8.0
minio==7.1.17
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
multidict==6.0.4
Expand All @@ -88,6 +90,7 @@ pika==1.3.2
Pillow==10.0.1
placebo==0.9.0
portalocker==2.8.2
posthog==3.0.2
protobuf==4.24.3
psycopg2-binary==2.9.6
pycocotools==2.0.7
Expand Down
3 changes: 2 additions & 1 deletion src/api/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ def setUp(self):
"X-EmbeddingAPI-Key": "test__embed_key"
}

@patch('api.app.send_telemetry')
@patch('services.database.database.safe_db_operation')
@patch('services.database.job_service.create_job')
@patch('api.app.process_file')
def test_embed_endpoint(self, mock_process_file, mock_create_job, mock_safe_db_operation):
def test_embed_endpoint(self, mock_process_file, mock_create_job, mock_safe_db_operation, mock_send_telemetry):
mock_process_file.return_value = 2
test_embeddings_metadata = EmbeddingsMetadata(embeddings_type=EmbeddingsType.OPEN_AI)
test_vector_db_metadata = VectorDBMetadata(vector_db_type=VectorDBType.PINECONE,
Expand Down

0 comments on commit 7baa00f

Please sign in to comment.