# test_video.py

import torch
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import random
from builders.model_builder import build_model


# --- Model configuration ---
model_name = "ENet"
num_classes_ = 11
# Size every frame is resized to before inference and before being written out.
# Alternative resolution: height, width = 360, 480
height, width = 480, 640


weights_path = 'checkpoint/camvid/ENetbs8gpu1_trainval/model_1000.pth'

# Use the GPU when one is present, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = build_model(model_name, num_classes=num_classes_)
# map_location lets a checkpoint that was saved from a GPU run be loaded on a
# host without CUDA.
checkpoint = torch.load(weights_path, map_location=device)
model.load_state_dict(checkpoint['model'])
model.eval()
model.to(device)

# Flat list of colour channel values, three per class, consumed by
# camvid_colorize_mask. The values are random, so colours change between runs.
camvid_palette = [random.randint(0, 255) for _ in range(num_classes_ * 3)]

video_path = 'test_video.mp4'

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail here with a clear message instead of later on a None frame.
    raise IOError('Could not open input video: ' + video_path)

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out_path = model_name+'_output_video.mp4'
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 would change the
# playback speed of the written video.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some containers/backends report no frame rate; use a conventional default
    # so the writer still receives a valid value.
    fps = 25.0
# Native size of the source video. Informational only: frames are resized to
# (width, height) before being written, so the writer uses that size.
out_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
out_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out_video_writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))


def camvid_colorize_mask(mask, palette):
    """Turn a 2-D class-index mask into a 3-channel uint8 colour image.

    ``palette`` is a flat sequence of channel values, three consecutive
    entries per class (class ``k`` uses ``palette[3*k:3*k+3]``). Trailing
    entries that do not form a complete triplet are ignored. Pixels whose
    class index has no palette entry stay zero.
    """
    rows, cols = mask.shape[0], mask.shape[1]
    canvas = np.zeros((rows, cols, 3), dtype=np.uint8)
    # zip stops at the shortest slice, so only complete triplets are visited.
    triplets = zip(palette[0::3], palette[1::3], palette[2::3])
    for class_id, colour in enumerate(triplets):
        canvas[mask == class_id] = colour
    return canvas


# Main loop: read frames, run the segmentation model on each one, blend the
# coloured prediction over the frame, then show it and append it to the output
# video. Stops at the end of the stream or when 'q' is pressed.

# Preprocessing constants, hoisted because they do not change between frames.
f_scale = 1
# Per-channel mean, labelled "camvid" by the original author. It is subtracted
# while the frame is still in OpenCV channel order, i.e. before the channel
# reversal below. NOTE(review): confirm this order matches the training pipeline.
mean = np.array([105.65775, 103.329834, 99.625404])  # camvid
# Send inputs to whichever device the model's weights live on.
device = next(model.parameters()).device

try:
    while True:
        ret, frame = cap.read()
        # Check the read result before touching the frame: at end of stream
        # the frame is None and resizing it would raise.
        if not ret:
            break
        frame = cv2.resize(frame, (width, height))

        # Preprocess the image
        image = frame
        if f_scale != 1:
            image = cv2.resize(frame, None, fx=f_scale, fy=f_scale,
                               interpolation=cv2.INTER_LINEAR)
        image = image.astype(np.float32)  # new array; `frame` stays untouched
        image -= mean
        # Reverse the channel axis (revert to RGB). The reversed view has a
        # negative stride, which torch.from_numpy rejects, so make it contiguous.
        image = np.ascontiguousarray(image[:, :, ::-1])

        # Convert to a 1 x C x H x W tensor
        input_tensor = torch.from_numpy(image).permute(2, 0, 1).float()
        input_batch = input_tensor.unsqueeze(0)

        # Make prediction
        with torch.no_grad():
            output = model(input_batch.to(device))

        # Post-process the prediction
        if device.type == 'cuda':
            torch.cuda.synchronize()
        scores = output[0].detach().cpu().numpy()  # C x H x W
        predictions = np.argmax(scores, axis=0)    # H x W class indices

        # Combine original and segmented image
        colored_semantic_map = camvid_colorize_mask(predictions, camvid_palette)
        if colored_semantic_map.shape[:2] != frame.shape[:2]:
            # addWeighted needs equal sizes; nearest-neighbour keeps the class
            # colours unblended.
            colored_semantic_map = cv2.resize(
                colored_semantic_map, (frame.shape[1], frame.shape[0]),
                interpolation=cv2.INTER_NEAREST)
        final_img = cv2.addWeighted(frame, 0.7, colored_semantic_map, 0.3, 0)

        # Display the original frame and the blended prediction
        cv2.imshow('Original Frame', frame)
        cv2.imshow('Prediction', final_img)

        # Write the blended frame to the output video
        out_video_writer.write(final_img)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the capture and writer, even after an error or Ctrl+C,
    # so the output file gets finalised.
    cap.release()
    out_video_writer.release()
    cv2.destroyAllWindows()