"""
combine_videos.py (forked from fudan-generative-vision/hallo)

Combine per-speaker talking-head video chunks and static pose images into a
single side-by-side video. Speaking turns are read from an RTTM diarization
file; gaps between a speaker's turns are filled with rotating idle-pose
images, and the original audio track is laid over the combined video.
"""
import os
from moviepy.editor import (
VideoFileClip,
ImageClip,
concatenate_videoclips,
CompositeVideoClip,
clips_array,
vfx,
AudioFileClip,
afx
)
# Paths
diarization_file = 'diaraization/diaraization_index.rttm'
video_chunks_path = 'output_videos'
static_images_path = 'source_images'
audio_file = "audio/input_audio.wav"
# Constants
NUM_POSES = 4 # Number of poses per speaker
OUTPUT_WIDTH = 1024
OUTPUT_HEIGHT = 512
CLIP_WIDTH = OUTPUT_WIDTH // 2 # Each clip is half the output width
ADJUST_AUDIO = False  # If True, keep each chunk's own audio track instead of stripping it
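
# A sketch of the expected on-disk layout, inferred from the paths above and the
# filename patterns used further down (adjust if your layout differs):
#   diaraization/diaraization_index.rttm       index-prefixed RTTM diarization file
#   output_videos/chunk_<index>_<speaker>.mp4  one rendered chunk per speaking turn
#                                              (<speaker> is the lower-cased RTTM label)
#   source_images/<SPEAKER>_pose_<i>.png       idle poses, i in 0..NUM_POSES-1
#   audio/input_audio.wav                      full-length audio for the final video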
# Read the diarization file and extract events
events = []
with open(diarization_file, 'r') as f:
    for line in f:
        tokens = line.strip().split()
        if not tokens:
            continue  # skip blank lines
        index = int(tokens[0])
        start_time = float(tokens[4])
        duration = float(tokens[5])
        end_time = start_time + duration
        speaker = tokens[8]
        events.append({
            'index': index,
            'start_time': start_time,
            'duration': duration,
            'end_time': end_time,
            'speaker': speaker
        })
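
# The parsing above assumes an index-prefixed RTTM file, i.e. each line roughly
# follows this layout (illustrative example, not taken from the repo):
#   <index> SPEAKER <file-id> <channel> <start> <duration> <NA> <NA> <speaker> <NA> <NA>
#   3 SPEAKER input_audio 1 12.50 4.20 <NA> <NA> SPEAKER_01 <NA> <NA>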
# Determine total duration
total_duration = max(event['end_time'] for event in events)
# Build speaker intervals
speakers = set(event['speaker'] for event in events)
speaker_intervals = {speaker: [] for speaker in speakers}
for event in events:
    speaker_intervals[event['speaker']].append(event)
# Process each speaker
speaker_final_clips = {}
for speaker in speakers:
    # Sort intervals
    intervals = sorted(speaker_intervals[speaker], key=lambda x: x['start_time'])
    clips = []
    last_end = 0.0
    pose_counter = 0
    for event in intervals:
        start = event['start_time']
        end = event['end_time']
        duration = event['duration']
        index = event['index']
        speaker_lower = speaker.lower()
        # Handle non-speaking intervals
        if last_end < start:
            silence_duration = start - last_end
            pose_index = pose_counter % NUM_POSES
            pose_counter += 1
            image_path = os.path.join(static_images_path, f'{speaker}_pose_{pose_index}.png')
            image_clip = ImageClip(image_path, duration=silence_duration)
            fade_duration = min(0.5, silence_duration)
            image_clip = image_clip.fx(vfx.fadein, fade_duration)
            # Ensure the static image has silent audio
            image_clip = image_clip.set_audio(None)
            clips.append(image_clip)
        # Handle speaking intervals (video chunks)
        video_filename = f'chunk_{index}_{speaker_lower}.mp4'
        video_path = os.path.join(video_chunks_path, video_filename)
        if os.path.exists(video_path):
            video_clip = VideoFileClip(video_path)
            # Trim to the diarized duration, guarding against chunks that are
            # slightly shorter than the diarization interval
            video_clip = video_clip.subclip(0, min(duration, video_clip.duration))
            if not ADJUST_AUDIO:
                video_clip = video_clip.without_audio()
            clips.append(video_clip)
        else:
            print(f'Warning: Video file {video_path} not found.')
        last_end = end
    # Handle any remaining time after the last interval
    if last_end < total_duration:
        silence_duration = total_duration - last_end
        pose_index = pose_counter % NUM_POSES
        pose_counter += 1
        image_path = os.path.join(static_images_path, f'{speaker}_pose_{pose_index}.png')
        image_clip = ImageClip(image_path, duration=silence_duration)
        fade_duration = min(0.5, silence_duration)
        image_clip = image_clip.fx(vfx.fadein, fade_duration)
        image_clip = image_clip.set_audio(None)
        clips.append(image_clip)
    # Concatenate all clips for the speaker
    final_clip = concatenate_videoclips(clips, method='compose')
    final_clip = final_clip.resize((CLIP_WIDTH, OUTPUT_HEIGHT))
    speaker_final_clips[speaker] = final_clip
# Ensure both speaker clips have the same duration
durations = [clip.duration for clip in speaker_final_clips.values()]
max_duration = max(durations)
for speaker, clip in speaker_final_clips.items():
    if clip.duration < max_duration:
        # Extend the clip by freezing the last frame
        freeze_frame = clip.to_ImageClip(clip.duration - 1/clip.fps)
        freeze_clip = freeze_frame.set_duration(max_duration - clip.duration)
        freeze_clip = freeze_clip.set_audio(None)
        speaker_final_clips[speaker] = concatenate_videoclips([clip, freeze_clip])
# Arrange speakers side by side (assumes exactly two speakers)
speakers_sorted = sorted(speaker_final_clips.keys())
left_clip = speaker_final_clips[speakers_sorted[0]]
right_clip = speaker_final_clips[speakers_sorted[1]]
final_video = clips_array([[left_clip, right_clip]])
# Set the final video size
final_video = final_video.resize((OUTPUT_WIDTH, OUTPUT_HEIGHT))
# Load the main audio
try:
    main_audio = AudioFileClip(audio_file)
except Exception as e:
    print(f"Error loading main audio file: {e}")
    main_audio = None
# Set the audio on the combined video (skip if the audio failed to load)
if main_audio is not None:
    final_video = final_video.set_audio(main_audio)
# Write the output video file
final_video.write_videofile('final_video.mp4', codec='libx264', audio_codec='aac')
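
# Example invocation (assumes the inputs above are in place; this script uses the
# moviepy 1.x API, e.g. vfx.fadein and .subclip, so pin moviepy accordingly):
#   pip install "moviepy<2"
#   python combine_videos.py
# The result is written to final_video.mp4 in the current directory.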