feat: record and send images to PSI

DANCEcollaborative · Oct 1, 2023 · a57180a · a57180a
1 parent 550e901
commit a57180a
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 21 deletions.
diff --git a/config.py b/config.py
@@ -13,6 +13,7 @@
 audio_port = 60001
 doa_port = 60002
 vad_port = 60003
+images_port = 60004
 
 confusion_classifier_res_port = 61001
 

diff --git a/confusion_model/inference.py b/confusion_model/inference.py
@@ -31,7 +31,7 @@ def __init__(
             data_type: str = "window",
             label_dict: dict = EMOTION_NO,
             device: str = "cpu",
-            verbose: bool = "False"
+            verbose: bool = False
     ):
         """
         Initialize trained model for inference
@@ -74,6 +74,7 @@ def __init__(
             data_type: str = "window",
             label_dict: dict = EMOTION_NO,
             device: str = "cpu",
+            cv2_device: str = "cpu",
             multiclass: bool = False,
             haar_path: str = None,
     ):
@@ -82,14 +83,15 @@ def __init__(
         If needed load CNN featurizer models for embedding
         """
         self.feat_type = load_model_path.split("/")[-1].split(".")[0].split("_")[-3]
+        self.cv2_device = cv2_device
         if self.feat_type == "CNN":
             # Default extraction, only works on newer cv2 releases
             if haar_path is None:
                 haar_path = cv2.data.haarcascades + "haarcascade_frontalface_alt.xml"
 
             # If running Haar Cascades on Cuda, will need to use cuda optimized classifier
             # Currently hard-coding Haar cascade Hyperparams
-            if self.device == "cuda":
+            if self.cv2_device == "cuda":
                 self.face_extractor = cv2.cuda_CascadeClassifier.create(haar_path)
                 self.face_extractor.setMinNeighbors(5)
                 self.face_extractor.setMinObjectSize((10, 10))
@@ -127,7 +129,7 @@ def _face_extraction_harr(self, image: Image):
         # Take PIL image and turn it into CV2 image
         col_img, gray_img = convert_from_image_to_cv2(image, new_area=None)
         # If GPU, need to turn from numpy array to GPU Matrix and back
-        if self.device == "cuda":
+        if self.cv2_device == "cuda":
             cuFrame = cv2.cuda_GpuMat(gray_img)
             boxes = self.face_extractor.detectMultiScale(cuFrame).download()
             # Given we return anything, then unpack the value
@@ -250,7 +252,7 @@ def run_inference(
         multiclass=False,
         label_dict=EMOTION_NO,
         device="cuda",
-        haar_path="/home/teledia/Desktop/nvaikunt/ConfusionDataset/data/haarcascade_frontalface_alt_cuda.xml",
+        haar_path="/home/teledia/Desktop/nvaikunt/ConfusionDataset/data/haarcascade_frontalface_alt.xml",
         # device="cpu",
         # haar_path=None
     )

diff --git a/send_nano_ip_to_psi.py b/send_nano_ip_to_psi.py
@@ -17,7 +17,7 @@ def main():
             "doa": f"tcp://{jetson_ip}:{doa_port}",
             "vad": f"tcp://{jetson_ip}:{vad_port}",
             "cvPreds": f"tcp://{jetson_ip}:{confusion_classifier_res_port}",
-
+            "images": f"tcp://{jetson_ip}:{images_port}",
         }
     )  # erebor"
     # request = json.dumps({"sensorVideoText":"tcp://128.2.212.138:40000", "sensorAudio": "tcp://128.2.212.138:40001", "sensorDOA": "tcp://128.2.212.138:40002", "sensorVAD": "tcp://128.2.212.138:40003"})   # erebor"

diff --git a/video_scripts/send_video_dict_with_embed.py b/video_scripts/send_video_dict_with_embed.py
@@ -7,13 +7,15 @@
 from zmq_utils import *
 from confusion_model.inference import ConfusionInference
 from video_scripts.camera import RealSenseCamera
-
+from confusion_model.constants import *
+from PIL import Image
+import base64
 
 camera = RealSenseCamera(res=(640, 480))
 
 context = zmq.Context()
 socket = context.socket(zmq.PUB)
-socket.bind(f"tcp://*:{confusion_classifier_res_port}")
+socket.bind(f"tcp://*:{images_port}")
 
 # Initialize global buffer
 BUF_MAX_LEN = 6
@@ -27,11 +29,11 @@ def confusion_cnn_embed():
         multiclass=False,
         label_dict=EMOTION_NO,
         device="cuda",
-        haar_path="/home/recrafting5/Desktop/DANCEcollaborative/nvaikunt/ConfusionDataset/data/haarcascade_frontalface_alt_cuda.xml"
+        haar_path="/home/recrafting5/Desktop/DANCEcollaborative/nvaikunt/ConfusionDataset/data/haarcascade_frontalface_alt.xml"
     )
     window_len = inference_model.window_len
     num_preds = 0
-    start = time()
+    start = time.time()
 
     while buffer:
         if len(buffer) > window_len:
@@ -40,13 +42,14 @@ def confusion_cnn_embed():
 
             current_images = []
             for bo in buffer_outputs:
-                h, w = bo[0].size
+                img = Image.fromarray(bo[0])
+                h, w = img.size
                 print("hello", h, w)
-                curr_image = bo[0].resize((3 * h // 4,  3 * w // 4))
+                curr_image = img.resize((3 * h // 4,  3 * w // 4))
                 print(curr_image.size)
-                curr_images.append(curr_image)
+                current_images.append(curr_image)
 
-            preds = inference_model.run_inference()
+            preds = inference_model.run_inference(current_images)
             payload = preds #ToDo: Convert "preds" type to something that send_payload expects
             print(preds)
             num_preds += 1
@@ -59,10 +62,10 @@ def confusion_cnn_embed():
             #     send_payload(socket, "Remote_PSI_Text", payload)
             #     print(preds)
             #     inference_model.feats.pop(0)
-            send_payload(socket, "cvpreds", payload, originatingTime=buffer_outputs[0][1]) # sending the time when the first image of input window was captured as the originatingTime
+            # send_payload(socket, "cvpreds", payload, originatingTime=buffer_outputs[0][1]) # sending the time when the first image of input window was captured as the originatingTime
         time.sleep(0.01)
     print(f"Total number of predictions {num_preds}")
-    print(f"Total inference time: {time() - start}")
+    print(f"Total inference time: {time.time() - start}")
 
 def capture_frames():
     try:
@@ -71,19 +74,20 @@ def capture_frames():
             depth, img = camera.get_frame_stream()
             height = img.shape[0]
             width = img.shape[1]
-            print(height, width)
 
             # if frame_count % 10 == 0:  # Add every 10th frame to buffer
             with buffer_lock:
                 buffer.append((img, generate_current_dotnet_datetime_ticks()))  # Appending image and current time as tuple to buffer
 
             time.sleep(0.01)
 
-            cv.imshow("demo", img)
+            # cv.imshow("demo", img)
 
             # print('msg:', msg)
             # print('msg length', len(msg))
-
+            _, img_buffer = cv.imencode('.jpg', img)
+            payload = base64.b64encode(img_buffer)
+            send_payload(socket, "images", payload)
             key = cv.waitKey(1)
             if key == 27:
                 break
@@ -97,13 +101,10 @@ def capture_frames():
 
 def main():
     capture_thread = threading.Thread(target=capture_frames, daemon=True)
-    # inference_thread = threading.Thread(target=perform_inference, daemon=True)
 
     capture_thread.start()
-    # inference_thread.start()
 
     capture_thread.join()
-    # inference_thread.join()
 
 
 if __name__ == "__main__":