Skip to content

Commit

Permalink
Adds enclave monitoring service (#10)
Browse files Browse the repository at this point in the history
* Adds `ps` action
* Adds client command
* Adds metrics service
* Small updates + readme
  • Loading branch information
kgrofelnik authored Apr 15, 2024
1 parent 9872188 commit 7b5144e
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 1 deletion.
15 changes: 14 additions & 1 deletion admin/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ACTION_GET_ATTESTATION = "get_attestation_doc"
ACTION_SIGN_MESSAGE = "sign_message"
ACTION_SEND_SECRETS = "send_secrets"
ACTION_PS = "ps"


def save_attestation_b64(attestation_b64):
Expand Down Expand Up @@ -70,6 +71,15 @@ def _action_send_secrets(s):
print("Send secrets response:", response.decode())


def _action_ps(s):
    """Send a "ps" request over socket *s* and print the decoded response."""
    payload = json.dumps({"action": ACTION_PS})
    s.send(payload.encode())
    # A single read of up to 65536 bytes is treated as the whole response.
    reply = s.recv(65536)
    print("Metrics:", reply.decode())


def _get_cid():
"""
Determine CID of Current Enclave
Expand Down Expand Up @@ -102,6 +112,8 @@ def main(cid: str, action: str, message: str = None, until_success: bool = False
_action_sign_message(s, message)
elif action == ACTION_SEND_SECRETS:
_action_send_secrets(s)
elif action == ACTION_PS:
_action_ps(s)

# close the connection
s.close()
Expand Down Expand Up @@ -129,7 +141,8 @@ def main(cid: str, action: str, message: str = None, until_success: bool = False
ACTION_PING,
ACTION_GET_ATTESTATION,
ACTION_SIGN_MESSAGE,
ACTION_SEND_SECRETS
ACTION_SEND_SECRETS,
ACTION_PS
],
help="action to run"
)
Expand Down
1 change: 1 addition & 0 deletions enclave/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ openai==1.11.1
python-dotenv==1.0.1
web3==6.15.1
google-cloud-storage==2.14.0
psutil==5.9.8
pytest==8.1.1
pytest-cov==4.1.0
groq==0.4.2
17 changes: 17 additions & 0 deletions enclave/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import base64

import psutil
from NsmUtil import NSMUtil
import key_manager

Expand Down Expand Up @@ -71,6 +72,22 @@ def main():
client_connection.send(str.encode(json.dumps({
"exception": str(exc)
})))
elif request["action"] == "ps":
cpu_usage = psutil.cpu_percent(percpu=True)
memory_usage = psutil.virtual_memory()
disk_usage = psutil.disk_usage('/')
response = {}
response["cpu_count"] = len(cpu_usage)
response["cpu_usage"] = {}
for i, usage in enumerate(cpu_usage):
response[f"cpu_usage"][i] = usage
response["ram_total"] = memory_usage.total
response["ram_available"] = memory_usage.available
response["ram_used"] = memory_usage.used
response["disk_total"] = disk_usage.total
response["disk_free"] = disk_usage.free
response["disk_used"] = disk_usage.used
client_connection.send(str.encode(json.dumps(response)))
else:
client_connection.send(str.encode(json.dumps({
"error": "unknown action: " + request["action"]
Expand Down
33 changes: 33 additions & 0 deletions monitoring/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Enclave Monitoring Service

This service is a FastAPI application that collects system metrics (CPU, memory, and disk usage) from a running enclave and exposes them over HTTP. Two shell scripts are provided to start and stop the server.

## Installation

```bash
cd monitoring
pip3 install -r requirements.txt
```

## Running the service

```bash
./run_monitoring_service.sh
```

## Stopping the service

```bash
./stop_monitoring_service.sh
```


## Usage

Once the service is running, you can access the metrics at:

```bash
http://localhost:9101/metrics
```

This endpoint returns the system metrics in the Prometheus text format, so it can be scraped by Prometheus and used for monitoring and alerting.
110 changes: 110 additions & 0 deletions monitoring/monitoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import socket
import json
import subprocess
import time
from typing import Dict
from fastapi import FastAPI
from fastapi.responses import PlainTextResponse


# Application object on which the /metrics route below is registered.
app = FastAPI()

# Running count of failed metric fetches; incremented by _format_metrics
# whenever it is handed no data, and reported as fetch_metrics_failures_total.
fetch_metrics_failures_count = 0


def _action_ps(s):
    """Request the "ps" metrics over socket *s* and return the parsed reply.

    Sends the JSON-encoded request ``{"action": "ps"}``, reads one response
    of up to 65536 bytes, and returns it decoded from JSON.
    """
    request = json.dumps({"action": "ps"}).encode()
    s.send(request)
    # A single recv call: the reply is taken to arrive in one read.
    raw_reply = s.recv(65536)
    return json.loads(raw_reply.decode())


def _get_enclave_metrics() -> Dict:
    """Fetch a metrics snapshot from the enclave server over vsock.

    Looks up the enclave CID, connects to port 5000 on it, and issues the
    "ps" action.

    Returns:
        The decoded metrics dict, or ``None`` when no CID is available or
        any step fails (the failure is printed, never raised).
    """
    try:
        cid = _get_cid()
        if not cid:
            return None
        # The port should match the server running in enclave
        port = 5000
        # The context manager closes the socket even when connect() or the
        # request raises, so failed scrapes do not leak file descriptors.
        with socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM) as s:
            s.settimeout(100.0)
            s.connect((cid, port))
            return _action_ps(s)
    except Exception as exc:
        # Best-effort boundary: the caller treats None as "enclave not reachable".
        print("Failed to connect, exc:", exc, flush=True)
        return None


def _format_metrics(data: dict):
    """Render enclave metrics as Prometheus-style text.

    Args:
        data: Metrics dict with the keys ``cpu_count``, ``cpu_usage``,
            ``ram_used``, ``ram_total``, ``disk_used`` and ``disk_total``,
            or a falsy value when the fetch failed.

    Returns:
        The metrics text, one HELP/TYPE/sample triple per metric.
        ``enclave_running`` and ``fetch_metrics_failures_total`` are always
        present; the CPU, memory and disk gauges only when *data* is truthy.

    Side effects:
        Increments the module-level ``fetch_metrics_failures_count`` when
        *data* is falsy.
    """
    global fetch_metrics_failures_count
    lines = []

    def add_metric(name, help_text, metric_type, value):
        # Every metric is emitted as the same three-line group.
        lines.append(f"# HELP {name} {help_text}")
        lines.append(f"# TYPE {name} {metric_type}")
        lines.append(f"{name} {value}")

    if data:
        enclave_running = 1
        # CPU metrics
        for i in range(data["cpu_count"]):
            # Per-core values are looked up by the core index as a string.
            add_metric(
                f"cpu_usage_core{i}",
                f"CPU usage percentage for core {i}",
                "gauge",
                data["cpu_usage"][str(i)],
            )
        # Memory metrics
        add_metric("memory_used", "Memory used in bytes", "gauge", data["ram_used"])
        add_metric("memory_total", "Total memory in bytes", "gauge", data["ram_total"])
        # Disk metrics
        add_metric("disk_used", "Disk used in bytes", "gauge", data["disk_used"])
        # The total is reported under its own name; it used to be emitted as a
        # second "disk_used" sample, leaving disk_total without a value.
        add_metric("disk_total", "Disk total in bytes", "gauge", data["disk_total"])
    else:
        enclave_running = 0
        fetch_metrics_failures_count += 1

    add_metric(
        "enclave_running",
        "Whether the Enclave is up and running",
        "gauge",
        enclave_running,
    )
    add_metric(
        "fetch_metrics_failures_total",
        "Total number of times the metrics fetch has failed",
        "counter",
        fetch_metrics_failures_count,
    )
    return "".join(f"{line}\n" for line in lines)


def _get_cid():
    """
    Determine CID of Current Enclave

    Runs ``/bin/nitro-cli describe-enclaves`` and returns the ``EnclaveCID``
    field of the first entry in its JSON output, or ``None`` when the command
    cannot be run or the output does not have that shape.
    """
    try:
        completed = subprocess.run(
            ["/bin/nitro-cli", "describe-enclaves"], stdout=subprocess.PIPE
        )
        enclaves = json.loads(completed.stdout.decode())
        return enclaves[0]["EnclaveCID"]
    except (OSError, ValueError, LookupError, TypeError):
        # OSError: the binary is missing or not executable.
        # ValueError: the output is not valid UTF-8 / JSON.
        # LookupError / TypeError: empty list or unexpected JSON shape.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None


@app.get("/metrics", response_class=PlainTextResponse)
def get_metrics():
    # Each request fetches a fresh snapshot and renders it as plain text;
    # a failed fetch (None) is still rendered by _format_metrics.
    return _format_metrics(_get_enclave_metrics())


if __name__ == "__main__":
    # Direct execution: serve the app on all interfaces, port 9101.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=9101)
2 changes: 2 additions & 0 deletions monitoring/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
fastapi==0.110.1
uvicorn==0.29.0
8 changes: 8 additions & 0 deletions monitoring/run_monitoring_service.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Start the enclave monitoring service in the background and record its PID
# in ./.pid (relative to the current directory).
#
# A .pid file only counts as "already running" when the recorded process is
# still alive; a stale file left behind by a crashed service is overwritten.
if [ -f ".pid" ] && kill -0 "$(cat .pid)" 2>/dev/null; then
    echo "Enclave monitoring service already running"
    exit 1
fi
nohup uvicorn monitoring:app --host 0.0.0.0 --port 9101 &
echo $! > .pid
echo "Enclave monitoring service started"
9 changes: 9 additions & 0 deletions monitoring/stop_monitoring_service.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Stop the enclave monitoring service whose PID is recorded in ./.pid.
if [ ! -f ".pid" ]; then
    echo "Enclave monitoring service not running"
    exit 1
fi
PID=$(cat .pid)
# Quoted so an empty or malformed .pid cannot be word-split into extra
# arguments. The .pid file is removed whether or not the kill succeeds.
kill "$PID"
rm .pid
echo "Enclave monitoring service stopped"

0 comments on commit 7b5144e

Please sign in to comment.