-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.html
1233 lines (1021 loc) · 70.5 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>RIFLEx: A Free Lunch for Length Extrapolation in Video Diffusion Transformers</title>
<link rel="icon" type="image/png" href="images/logo_title.png" sizes="32x38">
<script>
  // gtag-style command queue: every gtag(...) call is pushed onto window.dataLayer.
  // NOTE(review): no gtag.js loader <script> is visible in this part of the file;
  // confirm whether the queue is actually consumed anywhere.
  window.dataLayer = window.dataLayer || [];
  function gtag() {
    dataLayer.push(arguments);
  }
  gtag('js', new Date());
  gtag('config', 'G-PYVRSFMDRL');

  // Collapsible table-of-contents menu: clicking the ".toc-header" element toggles
  // the "collapsed" class on #tocContent and flips the arrow shown in #toggleButton.
  document.addEventListener('DOMContentLoaded', function () {
    const toggleButton = document.getElementById('toggleButton');
    const tocContent = document.getElementById('tocContent');
    const tocHeader = document.querySelector('.toc-header');

    // The TOC markup visible in this file is commented out, so these lookups can
    // return null. Bail out instead of throwing a TypeError on every page load.
    if (!toggleButton || !tocContent || !tocHeader) {
      return;
    }

    function toggleMenu() {
      tocContent.classList.toggle('collapsed');
      toggleButton.textContent = tocContent.classList.contains('collapsed') ? '▶' : '▼';
    }

    tocHeader.addEventListener('click', toggleMenu);
  });
</script>
<script>
  // MathJax configuration, defined before the MathJax script loads.
  // Inline math delimiters: $...$ and \(...\).
  // The backslashes are doubled because this is a JavaScript string literal.
  MathJax = {
    tex: {
      inlineMath: [['$', '$'], ['\\(', '\\)']]
    }
  };
</script>
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Google+Sans&family=Noto+Sans&display=swap" rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="preload" href="./static/css/index.css" as="style">
<link rel="stylesheet" href="./static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script defer src="./static/js/bulma-carousel.min.js"></script>
<script defer src="./static/js/bulma-slider.min.js"></script>
<script defer src="./static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<div class="has-text-centered">
<img src="images/logo_cropped.png" alt="Logo" style="max-height: 50px;">
</div>
<!-- Page title, marked up as the document's top-level heading. -->
<h1 class="title is-3 publication-title">RIFLEx: A Free Lunch for Length Extrapolation in<br> Video Diffusion Transformers</h1>
<div class="is-size-6 publication-authors">
<span class="author-block">
<a href="https://gracezhao1997.github.io/">Min Zhao</a><sup>1,2</sup>,
</span>
<span class="author-block">
<a href="https://guandehe.github.io/">Guande He</a><sup>3</sup>,
</span>
<span class="author-block">
<a href="https://github.com/Chyxx">Yixiao Chen</a><sup>1,2</sup>,
</span>
<span class="author-block">
<a href="https://zhuhz22.github.io/">Hongzhou Zhu</a><sup>1,2</sup>,
</span>
<span class="author-block">
<a href="https://zhenxuan00.github.io/">Chongxuan Li</a><sup>4</sup>,
</span>
<span class="author-block">
<a href="https://ml.cs.tsinghua.edu.cn/~jun/index.shtml">Jun Zhu</a><sup>1,2,5</sup>
</span>
</div>
<br>
<div class="is-size-6 publication-authors">
<span class="author-block"><sup>1</sup>THU,</span>
<span class="author-block"><sup>2</sup>ShengShu,</span>
<span class="author-block"><sup>3</sup>UT-Austin,</span>
<span class="author-block"><sup>4</sup>RUC,</span>
<span class="author-block"><sup>5</sup>Pazhou Lab</span>
</div>
<br>
<!-- arXiv link button.
     NOTE(review): href is empty, so the link points back at the current page and
     opens it in a new tab. TODO: fill in the paper URL. -->
<span class="link-block">
<a href="" target="_blank" class="external-link ">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<span class="link-block">
<a href="" target="_blank" class="external-link ">
<span class="icon">
<svg class="svg-inline--fa fa-github fa-w-16" aria-hidden="true" focusable="false" data-prefix="fab" data-icon="github" role="img" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512" data-fa-i2svg=""><path fill="currentColor" d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"></path></svg><!-- <i class="fab fa-github"></i> Font Awesome fontawesome.com -->
</span>
<span>Code</span>
</a>
</span>
<br>
<br>
<div>
<!-- TL;DR banner: bold blue summary line with a black "TL;DR:" label and a monospace code snippet.
     The earlier duplicate "color:black" declaration was dead (overridden by the later color) and is removed. -->
<p style="font-size: 20px; font-weight: bold; color: rgb(25, 100, 201);">
<span style="color: black;">TL;DR:</span> Effortlessly extend your video with just one line of code:
<span style="font-family: 'Courier New', Courier, monospace;">freq[k]=(2*np.pi)/(L*s).</span>
</p>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- ********************************************************************Main video****************************************************************************** -->
<section class="hero teaser">
<div class="container is-max-desktop">
<div class="hero-body" id="demo">
<!-- <div style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; margin-top: 20px;"> -->
<iframe width="700" height="400" src="https://www.youtube.com/embed/taofoXDsKGk?si=bgYO83QyiBDGbxsC" title="YouTube video player"
frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin" allowfullscreen>
</iframe>
<!-- </div> -->
<p style="color:black;font-size: 1em; margin-top: 10px; text-align: center;">
(This webpage contains a lot of videos. We suggest using Chrome for the best experience.)
</p>
</div>
</div>
</section>
<!-- ********************************************************************Menu****************************************************************************** -->
<!-- <section>
<nav id="toc">
<div class="toc-header">
<span>Webpage Menu</span>
<button id="toggleButton">▶</button>
</div>
<ul id="tocContent" class="collapsed">
<li><a href="#demo">Top: Demo video</a></li>
<li><a href="#abstract">1. Abstract</a></li>
<li><a href="#overview">2. CausVid Method Overview</a></li>
<li><a href="#result">3. Quantitative Result</a></li>
<li><a href="#ui">4. Interactive UI</a></li>
<li><a href="#text2short">5. Text to 5-second Short Video Generation</a></li>
<li><a href="#text2long">6. Text to 30-second Long Video Generation</a></li>
<li><a href="#image2video">7. Zero-shot Image to Video Generation</a></li>
<li><a href="#text2shortcomparison">8. Text to Short Video Qualitative Comparison</a></li>
<li><a href="#text2longcomparison">9. Text to Long Video Qualitative Comparison</a></li>
<li><a href="#ablation">10. Ablation Studies</a></li>
<li><a href="#bidir">11. Comparison with Bidirectional Teacher</a></li>
</ul>
</nav>
</section> -->
<section class="hero is-small">
<div class="hero-body ">
<hr>
<div class="container " id="abstract">
<h2 id="obj-comparison" class="title is-4 has-text-centered">Overview</h2>
<!-- Overview figure, centered with flexbox and scaled to 70% of the container width. -->
<figure class="image is-centered" style="display: flex; justify-content: center; align-items: center;">
<img src="images/overview.png" alt="Overview figure for RIFLEx" style="width: 70%; height: auto;">
</figure>
<div class="content has-text-justified" style="font-size: 1.5rem; line-height: 1.8;">
<p style="color:black;">
<br>
<span style="color: rgb(25, 100, 201);font-weight: bold">Motivation:</span>
Recent advancements in video generation allow models to create high-quality videos,
but fixed sequence lengths limit their ability to extend content.
In this paper, we explore length extrapolation techniques that <span style="font-weight: bold">generate new and temporally coherent content
without longer training videos</span>. Current extrapolation strategies lead to <span style="font-weight: bold">temporal repetition</span> or <span style="font-weight: bold">slow motion</span>,
indicating a gap in understanding how positional encodings affect video extrapolation.
<br>
<br>
<span style="color: rgb(25, 100, 201);font-weight: bold">Analysis:</span>
We systematically analyze the role of individual frequency components in positional encodings,
discovering that high frequencies capture rapid movements and short-term dependencies,
inducing temporal repetition, while low frequencies encode long-term dependencies with slow motion.
Furthermore, we surprisingly identify an <span style="font-weight: bold">intrinsic frequency</span> that primarily dictates repetition
patterns among all components during extrapolation.
<br>
<br>
<span style="color: rgb(25, 100, 201);font-weight: bold">Method:</span>
Building on these insights, we propose Reducing Intrinsic Frequency for Length Extrapolation (RIFLEx),
<span style="font-weight: bold">a minimal yet effective</span> solution that lowers the intrinsic frequency to ensure it remains within a single
cycle after extrapolation, without requiring additional modifications.
</p>
</div>
</div>
<br>
<br>
<hr>
<div class="container " id="text2short">
<h2 id="obj-comparison" class="title is-4 has-text-centered">Training-free 2× Temporal Extrapolation
</h2>
<!--****************************************************Introduction*******************************************************-->
<div class="content has-text-justified" style="font-size: 1.5rem; line-height: 1.8; ">
<p style="color:black;">
RIFLEx offers a true free lunch—achieving high-quality 2× extrapolation on SOTA video diffusion transformers in a completely
<span style="font-weight: bold">training-free</span> manner. In the following, we present videos extended from 129 to 261 frames at 24 fps.
</p>
</div>
</div>
<br>
<div id="results-carousel" class="carousel results-carousel">
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A person in a red tracksuit pours something out of a cup..mp4"
type="video/mp4">
</video>
<p class="prompt-text">A person in a red tracksuit pours something out of a cup.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/r049/seed42_A petri dish with a bamboo forest growing within it that has tiny red pandas running around..mp4"
type="video/mp4">
</video>
<p class="prompt-text">A petri dish with a bamboo forest growing within it that has tiny red pandas running around.</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/r049/seed42_An animated porcupine with a mix of brown and white fur and prominent quills is seen in a cozy, warm.mp4"
type="video/mp4">
</video>
<p class="prompt-text">An animated porcupine with a mix of brown and white fur and prominent quills is seen in a cozy, warmly lit interior setting, interacting with a green gift box with a yellow ribbon. The room is filled with wooden furniture and colorful wall decorations, suggesting a cheerful and domestic atmosphere. The porcupine's large eyes and expressive face convey a sense of lightheartedness and curiosity. The camera maintains a low angle, close to the ground, providing an intimate view of the character's actions without any movement, focusing on the playful and curious mood of the scene. The visual style is characteristic of contemporary 3D animation, with vibrant colors and smooth textures that create a polished and engaging look. The scene transitions to an outdoor environment, showcasing a sunny, verdant landscape with rocks, trees, and grass, indicating a natural, possibly forest-like setting. The presence of a small character in the final frame suggests the continuation of a narrative or the introduction of new characters.</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/r049/seed42_In a lush meadow, a teenage girl practices yoga on a blanket laid out among wildflowers. She begins .mp4"
type="video/mp4">
</video>
<p class="prompt-text">In a lush meadow, a teenage girl practices yoga on a blanket laid out among wildflowers. She begins with simple poses, stretching her arms upwards and bending backwards. Gradually, her routine intensifies; she transitions into more challenging positions, balancing on one leg and twisting her torso. Her breathing remains steady throughout, eyes closed, fully immersed in the moment.</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/r049/seed42_Two men and a woman engage in a conversation within a traditional Korean indoor setting, characteriz.mp4"
type="video/mp4">
</video>
<p class="prompt-text">Two men and a woman engage in a conversation within a traditional Korean indoor setting, characterized by wooden architecture and natural lighting. The men exhibit a variety of emotions, from shock to amusement, while the woman appears distressed. The camera captures their expressions in medium close-up shots, with a focus on their faces against a softly blurred background, creating an intimate and intense atmosphere. The realistic and cinematic visual style enhances the emotional gravity of the scene. </p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly .mp4"
type="video/mp4">
</video>
<p class="prompt-text">Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
</p>
</div>
</div>
<div id="results-carousel" class="carousel results-carousel">
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something.mp4"
type="video/mp4">
</video>
<p class="prompt-text">A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. the scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat’s orange fur. The shot is clear and sharp, with a shallow depth of field.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering t.mp4"
type="video/mp4">
</video>
<p class="prompt-text">An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/r049/seed42_A border collie named Max waits eagerly by the door, tail wagging furiously. As soon as the door ope.mp4"
type="video/mp4">
</video>
<p class="prompt-text">A border collie named Max waits eagerly by the door, tail wagging furiously. As soon as the door opens, he bounds out into the yard, paws skidding slightly on the smooth patio tiles. He races towards a frisbee thrown by his owner, leaping high into the air to catch it mid-flight. Landing gracefully, he trots back proudly, dropping the frisbee at his owner’s feet, ready for another round. </p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A person's hand is seen interacting with a black and white toy orca in a staged miniature aquatic en.mp4"
type="video/mp4">
</video>
<p class="prompt-text">A person's hand is seen interacting with a black and white toy orca in a staged miniature aquatic environment. The scene includes clear blue water, small plastic aquatic plants, and miniature flags with fish symbols, all set against a solid teal background. The playful and imaginative atmosphere is conveyed through the gentle manipulation of the toy, suggesting a storytelling or demonstration context. The camera remains fixed throughout, capturing the scene in a medium shot that focuses on the toy and its immediate surroundings. The visual style is clear and colorful, highlighting the details of the toy and the miniature aquatic setup.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_Animated characters, a rabbit and a mouse, are depicted in a perilous situation, first plummeting th.mp4"
type="video/mp4">
</video>
<p class="prompt-text">Animated characters, a rabbit and a mouse, are depicted in a perilous situation, first plummeting through a dark, undefined space, and then floating and swimming in a serene underwater environment. The characters are dressed in adventure gear, suggesting a narrative context. The camera closely follows their expressions and movements, capturing the tension and urgency of their situation. The medium and close-up shots emphasize their facial expressions, which convey fear and determination. The visual style is high-quality 3D animation with detailed textures and lighting, creating a cinematic feel.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A man with facial hair, dressed in a burgundy shirt, is seen knocking on a weathered wooden door wit.mp4"
type="video/mp4">
</video>
<p class="prompt-text">A man with facial hair, dressed in a burgundy shirt, is seen knocking on a weathered wooden door with a metal latch and a small window, set in a stone wall. The scene transitions to an indoor setting where the man, now wearing a blue shirt, speaks to the camera in a well-lit room furnished with a couch, a bookshelf, and various decorations. The video captures the man in a medium shot with a stationary camera, conveying a casual and friendly atmosphere in the indoor scene, contrasted with a neutral atmosphere in the outdoor scene. The visual style is realistic with natural lighting and color grading.
</p>
</div>
</div>
<div id="results-carousel" class="carousel results-carousel">
<div class="item">
<video id="berliner" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/r049/seed42_At a serene lakeside, an amateur photographer adjusts the settings on his camera, preparing to captu.mp4"
type="video/mp4">
</video>
<p class="prompt-text">At a serene lakeside, an amateur photographer adjusts the settings on his camera, preparing to capture the sunset. He starts by focusing on the distant horizon, snapping several shots. As the sun sinks lower, he moves closer to the water’s edge, crouching down to frame the reflection of the sky in the lake. Finally, standing up, he captures the final moments of daylight, the sky ablaze with color. </p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_In a forested area, a temporary camp is set up with tents, a dirt ground, and various equipment, inc.mp4"
type="video/mp4">
</video>
<p class="prompt-text">In a forested area, a temporary camp is set up with tents, a dirt ground, and various equipment, including a four-wheeled vehicle and barrels. A man in a white shirt appears distressed, holding his head, while a woman in a brown dress looks on with concern. The presence of military personnel and civilians suggests a situation of conflict or crisis. The mood is tense and somber, with an undercurrent of urgency or the aftermath of a significant event, as evidenced by the body lying on the ground. The camera maintains a steady, medium-long shot, capturing the expressions and movements of the characters, and the realistic, cinematic visual style enhances the gravity of the scene.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_In a serene and traditional Japanese environment at night, two animated characters鈥攁 male in traditi.mp4"
type="video/mp4">
</video>
<p class="prompt-text">In a serene and traditional Japanese environment at night, two animated characters—a male in traditional Japanese armor and a female with white hair in a blue dress—are the focal point of the scene. They are engaged in a calm interaction, with two other female characters in the background, one with blonde hair and another with white hair. The medium shot captures the characters from behind the male character, providing a clear view of their attire and the traditional Japanese architecture around them. The visual style is akin to Japanese anime with 3D elements, and the camera remains in a fixed position throughout the frames, emphasizing the character's dialogue and movements.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A fawn Pembroke Welsh Corgi walking in Times Square..mp4"
type="video/mp4">
</video>
<p class="prompt-text">A fawn Pembroke Welsh Corgi walking in Times Square.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt ro.mp4"
type="video/mp4">
</video>
<p class="prompt-text">The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from it’s tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A woman and a man are engaged in a dialogue or confrontation within an opulent indoor setting, sugge.mp4"
type="video/mp4">
</video>
<p class="prompt-text">A woman and a man are engaged in a dialogue or confrontation within an opulent indoor setting, suggested by the blurred background with bokeh light effects. The woman, dressed in a blue traditional outfit with intricate patterns, exudes elegance and poise, indicative of high social status, possibly royalty. Her makeup and hair are styled traditionally. The man, in contrast, has a stern expression, long dark hair, and is adorned with a black hat featuring a white pattern, which may signify his lower social status or warrior status. The close-up shots focus on their faces, capturing the tension and moderate emotional intensity of the scene. The camera remains in a fixed position, emphasizing the characters' expressions and the historical drama's visual style. </p>
</div>
</div>
<!-- <div id="results-carousel" class="carousel results-carousel">
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west high.mp4"
type="video/mp4">
</video>
<p class="prompt-text">The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot.
</p>
</div>
<div class="item">
<video id="berliner" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/r049/seed42_At a serene lakeside, an amateur photographer adjusts the settings on his camera, preparing to captu.mp4"
type="video/mp4">
</video>
<p class="prompt-text">At a serene lakeside, an amateur photographer adjusts the settings on his camera, preparing to capture the sunset. He starts by focusing on the distant horizon, snapping several shots. As the sun sinks lower, he moves closer to the water’s edge, crouching down to frame the reflection of the sky in the lake. Finally, standing up, he captures the final moments of daylight, the sky ablaze with color. </p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A person in a red tracksuit pours something out of a cup..mp4"
type="video/mp4">
</video>
<p class="prompt-text">A person in a red tracksuit pours something out of a cup.
</p>
</div>
<div class="item">
<video id="cereal" controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/free2/seed42_A man with facial hair, dressed in a plaid shirt, is seated playing an acoustic guitar and singing w.mp4"
type="video/mp4">
</video>
<p class="prompt-text">A man with facial hair, dressed in a plaid shirt, is seated playing an acoustic guitar and singing with visible enjoyment and passion. He is in an indoor setting with a simple gray background, accompanied by a guitar amplifier and a microphone stand to his left, and a stack of "Guitar World" books to his right. The camera maintains a fixed medium shot, capturing his expressive face and hand movements on the guitar strings. The visual quality is clear and unembellished, focusing on the performance without distractions.
</p>
</div>
</div> -->
<br>
<br>
<br>
<hr>
<div class="container " id="text2long">
<h2 id="obj-comparison" class="title is-4 has-text-centered"> 2× Temporal Extrapolation: Fine-tuning 1,000 Steps without Long Videos
</h2>
<!-- **************************************************** Introduction ******************************************************* -->
<div class="content has-text-justified" style="font-size: 1.5rem; line-height: 1.8;">
<p style="color:black;">
When fine-tuning is performed for only 1,000 steps with a batch size of 8 on <span style="font-weight: bold">original-length videos</span>, dynamic quality and visual quality are further improved. Below are videos extended from 129 to 261 frames at 24 fps.
</p>
</div>
</div>
<br>
<!-- Carousel of fine-tuned samples: every .item pairs one looping, muted, autoplaying
     video from videos/finetune2/ with the text prompt rendered beneath it.
     NOTE(review): id="results-carousel" and id="item" are repeated across this file,
     although ids should be document-unique. They are left untouched here because
     stylesheets or scripts outside this view may still hook onto them. -->
<div id="results-carousel" class="carousel results-carousel">
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_Animated characters, a rabbit and a mouse, are depicted in a perilous situation, first plummeting th.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">Animated characters, a rabbit and a mouse, are depicted in a perilous situation, first plummeting through a dark, undefined space, and then floating and swimming in a serene underwater environment. The characters are dressed in adventure gear, suggesting a narrative context. The camera closely follows their expressions and movements, capturing the tension and urgency of their situation. The medium and close-up shots emphasize their facial expressions, which convey fear and determination. The visual style is high-quality 3D animation with detailed textures and lighting, creating a cinematic feel.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_A person is seen through circular openings in a dark environment, suggesting a secretive or investig.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A person is seen through circular openings in a dark environment, suggesting a secretive or investigative scenario. The scene transitions to a nighttime setting where the same person is cautiously looking out of a window, illuminated by the warm glow of interior lighting against the cool darkness of the exterior. The atmosphere is tense and mysterious, with a moderate emotional intensity. The camera employs close-up shots with a shallow depth of field to focus on the subject, creating an intimate and intense viewing experience. The cinematic visual style is characterized by dramatic lighting and color grading that underscores the mood of the scene.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchan.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the c.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_A close-up of a musician's fingers skillfully moving over a violin's strings during a solo performan.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A close-up of a musician's fingers skillfully moving over a violin's strings during a solo performance.</p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_A dog wearing virtual reality goggles in sunset..mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A dog wearing virtual reality goggles in sunset.</p>
  </div>
</div>
<!-- Carousel of fine-tuned samples: every .item pairs one looping, muted, autoplaying
     video from videos/finetune2/ with the text prompt rendered beneath it.
     NOTE(review): id="results-carousel" and id="item" are repeated across this file,
     although ids should be document-unique. They are left untouched here because
     stylesheets or scripts outside this view may still hook onto them. -->
<div id="results-carousel" class="carousel results-carousel">
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_In a forested area, a temporary camp is set up with tents, a dirt ground, and various equipment, inc.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">In a forested area, a temporary camp is set up with tents, a dirt ground, and various equipment, including a four-wheeled vehicle and barrels. A man in a white shirt appears distressed, holding his head, while a woman in a brown dress looks on with concern. The presence of military personnel and civilians suggests a situation of conflict or crisis. The mood is tense and somber, with an undercurrent of urgency or the aftermath of a significant event, as evidenced by the body lying on the ground. The camera maintains a steady, medium-long shot, capturing the expressions and movements of the characters, and the realistic, cinematic visual style enhances the gravity of the scene.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_Animated characters are engaging in a magical interaction within a dark, cavernous environment. The .mp4" type="video/mp4">
    </video>
    <p class="prompt-text">Animated characters are engaging in a magical interaction within a dark, cavernous environment. The scene centers on a small, orange magical creature with a glowing heart, as well as two dragon-like creatures, one of which is holding a magical potion. The creature opens the potion, causing a transformation, which captures the attention of the dragons. Subsequently, two human characters with a torch discover the aftermath of the transformation, revealing a small, glowing creature resembling the one from earlier. The atmosphere is whimsical and magical, with a sense of curiosity and discovery. The camera remains static, offering medium shots that focus on the characters and their actions, while the visual style is traditional animation with smooth lines and vibrant colors.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_A man with slicked-back hair, dressed in a black period costume, is captured in a medium close-up sh.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A man with slicked-back hair, dressed in a black period costume, is captured in a medium close-up shot within an opulent interior setting. He wears a white half-face mask that covers the left side of his face, which remains expressionless. The scene is rich with historical or theatrical elements, including candlelight, red drapes, and period-style furniture. The atmosphere is dramatic and tense, with a moderate emotional intensity evident from the man's expressive eyes and the intimate lighting. The camera remains in a fixed position throughout, focusing on the man's upper body and facial expressions, which are central to the narrative. The cinematic visual style emphasizes the period detail and dramatic ambiance of the scene.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_A man in a formal black suit stands in the entrance of a traditional Korean house, characterized by .mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A man in a formal black suit stands in the entrance of a traditional Korean house, characterized by wooden architecture and a tiled floor. Household items such as shoes, pots, and a hanging basket are visible, suggesting a lived-in space. The man's expression is neutral, and the overall atmosphere is calm and mild. The camera remains in a fixed position, capturing the scene in a medium shot with natural lighting, indicative of a standard television drama style.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_A man in a suit is actively speaking and gesturing in front of an audience in an indoor setting, lik.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A man in a suit is actively speaking and gesturing in front of an audience in an indoor setting, likely a conference hall or auditorium. The audience members are seated in rows, with varying levels of attention and engagement. The speaker's enthusiastic demeanor suggests a formal and educational event. The camera work consists of medium shots of the speaker and long shots of the audience, with a fixed position and sharp focus. The visual style is standard, with no special effects or cinematic techniques, indicative of a lecture or seminar recording.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_A man with facial hair, dressed in a burgundy shirt, is seen knocking on a weathered wooden door wit.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A man with facial hair, dressed in a burgundy shirt, is seen knocking on a weathered wooden door with a metal latch and a small window, set in a stone wall. The scene transitions to an indoor setting where the man, now wearing a blue shirt, speaks to the camera in a well-lit room furnished with a couch, a bookshelf, and various decorations. The video captures the man in a medium shot with a stationary camera, conveying a casual and friendly atmosphere in the indoor scene, contrasted with a neutral atmosphere in the outdoor scene. The visual style is realistic with natural lighting and color grading.
    </p>
  </div>
</div>
<!-- Carousel of fine-tuned samples: every .item pairs one looping, muted, autoplaying
     video from videos/finetune2/ with the text prompt rendered beneath it.
     NOTE(review): id="results-carousel" and id="item" are repeated across this file,
     although ids should be document-unique. They are left untouched here because
     stylesheets or scripts outside this view may still hook onto them. -->
<div id="results-carousel" class="carousel results-carousel">
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/395ac65789dd99fff17a1bffbb76e593.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A man with unkempt brown hair, dressed in a brown jacket and a red neckerchief, is seen interacting with a woman inside a horse-drawn carriage. The setting is outdoors, with historical buildings in the background, suggesting a European town or city from a bygone era. The man's facial expressions convey a sense of urgency and distress, with moderate emotional intensity. The camera work includes close-up shots to emphasize the man's reactions and medium shots to show the interaction with the woman. The focus on the man's face and the coin he examines indicates their significance in the narrative. The visual style is characteristic of a historical drama, with natural lighting and a color scheme that enhances the period feel of the scene.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_A man with facial hair, dressed in a plaid shirt, is seated playing an acoustic guitar and singing w.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">A man with facial hair, dressed in a plaid shirt, is seated playing an acoustic guitar and singing with visible enjoyment and passion. He is in an indoor setting with a simple gray background, accompanied by a guitar amplifier and a microphone stand to his left, and a stack of "Guitar World" books to his right. The camera maintains a fixed medium shot, capturing his expressive face and hand movements on the guitar strings. The visual quality is clear and unembellished, focusing on the performance without distractions.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/seed42_In a virtual block-based environment, a group of blue sheep is contained within a wooden pen situate.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">In a virtual block-based environment, a group of blue sheep is contained within a wooden pen situated in a flat, grassy area under a clear sky. The sheep exhibit simple, blocky movements as they are herded and directed by an unseen player. The scene is captured from a fixed first-person perspective, providing a medium to long shot view of the activity. The playful and casual atmosphere is conveyed through the mild emotions of the virtual animal management activity. The visual style is characteristic of a block-based building game, with simple textures and a cubic world design.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_An animated character with white hair and a muscular build is shown in a close-up, displaying a ster.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">An animated character with white hair and a muscular build is shown in a close-up, displaying a stern and intense expression. The character is dressed in a red and gold outfit, suggesting a regal or powerful status. The scene transitions to reveal the character seated on a throne-like structure with ornate decorations, addressing a group of people who are standing in front of it. The atmosphere is serious and charged with emotion, indicating a moment of significance or decision-making. The camera focuses on the character's face before widening the shot to include the character's interaction with the group, using fixed position shots without any discernible camera movement. The visual style is characteristic of Japanese anime, with detailed character designs and vibrant coloring.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_Two animated characters are engaged in a tense interaction within an ornate indoor setting, possibly.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">Two animated characters are engaged in a tense interaction within an ornate indoor setting, possibly a throne room or chamber. The character with long red hair is dressed in a dark, regal outfit, exuding an air of authority or leadership. The other character, with long purple hair, is adorned in a dark, possibly magical or warrior-like attire, and displays a serious or confrontational demeanor. The camera remains fixed on medium shots, capturing the detailed expressions and gestures of the characters, emphasizing the dramatic tension of the scene. The visual style is characteristic of Japanese anime, with vibrant colors and dynamic character designs that contribute to the overall atmosphere of the video.
    </p>
  </div>
  <div class="item">
    <video id="item" controls preload="metadata" autoplay muted loop playsinline>
      <source src="videos/finetune2/others/seed42_Two men and a woman engage in a conversation within a traditional Korean indoor setting, characteriz.mp4" type="video/mp4">
    </video>
    <p class="prompt-text">Two men and a woman engage in a conversation within a traditional Korean indoor setting, characterized by wooden architecture and natural lighting. The men exhibit a variety of emotions, from shock to amusement, while the woman appears distressed. The camera captures their expressions in medium close-up shots, with a focus on their faces against a softly blurred background, creating an intimate and intense atmosphere. The realistic and cinematic visual style enhances the emotional gravity of the scene. </p>
  </div>
</div>
<br>
<br>
<br>
<hr>
<div class="container " id="text2long">
<h4 id="obj-comparison" class="title is-4 has-text-centered">Spatial and Joint Temporal-spatial Extrapolation
</h4>
<!-- **************************************************** Introduction ******************************************************* -->
<div class="content has-text-justified" style="font-size: 1.5rem; line-height: 1.8;">
<p style="color:black;">
RIFLEx can be extended to spatial resolution extrapolation, as well as to joint extrapolation of both video duration and resolution. Compared to position extrapolation (PE), we achieve superior visual quality by effectively <span style=" font-weight: bold">addressing repetition issues</span>. Notably, while PE struggles with joint temporal-spatial extrapolation, our RIFLEx still delivers high-quality results.</p>
</div>
</div>
<br>
<div class="columns is-centered has-text-centered">
<div class="column">
<p style="color:black;font-size: 20px;">Normal size        <br>480$\times$720        </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;">PE        <br>480$\times$1440        </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;"><span style=" font-weight: bold">RIFLEx (Ours)        </span><br>480$\times$1440        </p>
</div>
</div>
<!-- Side-by-side image comparison, left to right: normal size (480×720),
     PE (480×1440), RIFLEx (480×1440). Alt texts name the method and resolution
     so the three images are distinguishable without the column labels.
     NOTE(review): id="results-carousel" is reused by other rows in this file, although
     ids should be document-unique; kept as-is because CSS/JS outside this view may use it. -->
<div id="results-carousel"
  style="display: flex; justify-content: space-between; align-items: flex-start; gap: 20px;">
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2Wnormal.png" alt="Normal-size result at 480×720" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2Wbase.png" alt="Position extrapolation (PE) result at 480×1440" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2Wours.png" alt="RIFLEx (ours) result at 480×1440" style="width: 90%; height: auto;">
  </div>
</div>
<br>
<div class="columns is-centered has-text-centered">
<div class="column">
<p style="color:black;font-size: 20px;">Normal size        <br>480$\times$720        </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;">PE        <br>960$\times$720        </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;"><span style=" font-weight: bold">RIFLEx (Ours)        </span><br>960$\times$720        </p>
</div>
</div>
<!-- Side-by-side image comparison, left to right: normal size (480×720),
     PE (960×720), RIFLEx (960×720). Alt texts name the method and resolution
     so the three images are distinguishable without the column labels.
     NOTE(review): id="results-carousel" is reused by other rows in this file, although
     ids should be document-unique; kept as-is because CSS/JS outside this view may use it. -->
<div id="results-carousel"
  style="display: flex; justify-content: space-between; align-items: flex-start; gap: 20px;">
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2Hnormal.png" alt="Normal-size result at 480×720" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2Hbase.png" alt="Position extrapolation (PE) result at 960×720" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2Hours.png" alt="RIFLEx (ours) result at 960×720" style="width: 90%; height: auto;">
  </div>
</div>
<br>
<div class="columns is-centered has-text-centered">
<div class="column">
<p style="color:black;font-size: 20px;">Normal size        <br>480$\times$720        </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;">PE        <br>960$\times$1440       </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;"><span style=" font-weight: bold">RIFLEx (Ours)        </span><br>960$\times$1440       </p>
</div>
</div>
<!-- Side-by-side image comparison (first example), left to right: normal size (480×720),
     PE (960×1440), RIFLEx (960×1440). Alt texts name the method and resolution
     so the three images are distinguishable without the column labels.
     NOTE(review): id="results-carousel" is reused by other rows in this file, although
     ids should be document-unique; kept as-is because CSS/JS outside this view may use it. -->
<div id="results-carousel"
  style="display: flex; justify-content: space-between; align-items: flex-start; gap: 20px;">
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2HWnormal1.png" alt="Normal-size result at 480×720, example 1" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2HWbase1.png" alt="Position extrapolation (PE) result at 960×1440, example 1" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2HWours1.png" alt="RIFLEx (ours) result at 960×1440, example 1" style="width: 90%; height: auto;">
  </div>
</div>
<!-- Side-by-side image comparison (second example), left to right: normal size (480×720),
     PE (960×1440), RIFLEx (960×1440). Alt texts name the method and resolution
     so the three images are distinguishable without the column labels.
     NOTE(review): id="results-carousel" is reused by other rows in this file, although
     ids should be document-unique; kept as-is because CSS/JS outside this view may use it. -->
<div id="results-carousel"
  style="display: flex; justify-content: space-between; align-items: flex-start; gap: 20px;">
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2HWnormal2.png" alt="Normal-size result at 480×720, example 2" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2HWbase2.png" alt="Position extrapolation (PE) result at 960×1440, example 2" style="width: 90%; height: auto;">
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <img src="images/spatial/2HWours2.png" alt="RIFLEx (ours) result at 960×1440, example 2" style="width: 90%; height: auto;">
  </div>
</div>
<br>
<div class="columns is-centered has-text-centered">
<div class="column">
<p style="color:black;font-size: 20px;">Normal size        <br>480$\times$720$\times$49     </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;">PE        <br>960$\times$1440$\times$97    </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;"><span style=" font-weight: bold">RIFLEx (Ours)        </span><br>960$\times$1440$\times$97    </p>
</div>
</div>
<!-- Side-by-side video comparison (first example), left to right: normal size (480×720×49),
     PE (960×1440×97), RIFLEx (960×1440×97). Each video carries an aria-label so
     assistive technology can tell the three players apart.
     NOTE(review): id="results-carousel" is reused by other rows in this file, although
     ids should be document-unique; kept as-is because CSS/JS outside this view may use it. -->
<div id="results-carousel"
  style="display: flex; justify-content: space-between; align-items: flex-start; gap: 20px;">
  <div class="item" style="flex: 1; display: flex; flex-direction: column; justify-content: center; align-items: center;">
    <video controls preload="metadata" autoplay muted loop playsinline
      style="width: 90%; height: 90%; margin:0px;"
      aria-label="Normal-size video, 480×720×49, example 1">
      <source src="images/spatial/demo1_normal.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      style="width: 90%; height: 90%; margin: 0px;"
      aria-label="Position extrapolation (PE) video, 960×1440×97, example 1">
      <source src="images/spatial/demo1_baseline.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      style="width: 90%; height: 90%; margin:0px;"
      aria-label="RIFLEx (ours) video, 960×1440×97, example 1">
      <source src="images/spatial/demo1_ours.mp4" type="video/mp4">
    </video>
  </div>
</div>
<!-- Side-by-side video comparison (second example), left to right: normal size (480×720×49),
     PE (960×1440×97), RIFLEx (960×1440×97). Each video carries an aria-label so
     assistive technology can tell the three players apart.
     NOTE(review): id="results-carousel" is reused by other rows in this file, although
     ids should be document-unique; kept as-is because CSS/JS outside this view may use it. -->
<div id="results-carousel"
  style="display: flex; justify-content: space-between; align-items: flex-start; gap: 20px;">
  <div class="item" style="flex: 1; display: flex; flex-direction: column; justify-content: center; align-items: center;">
    <video controls preload="metadata" autoplay muted loop playsinline
      style="width: 90%; height: 90%; margin:0px;"
      aria-label="Normal-size video, 480×720×49, example 2">
      <source src="images/spatial/demo2_normal.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      style="width: 90%; height: 90%; margin: 0px;"
      aria-label="Position extrapolation (PE) video, 960×1440×97, example 2">
      <source src="images/spatial/demo2_baseline.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      style="width: 90%; height: 90%; margin:0px;"
      aria-label="RIFLEx (ours) video, 960×1440×97, example 2">
      <source src="images/spatial/demo2_ours.mp4" type="video/mp4">
    </video>
  </div>
</div>
</div>
<!-- ********************************************************************* Method comparison, part 1 ************************************************************************* -->
<br>
<br>
<br>
<hr>
<div class="container " id="text2shortcomparison">
<h4 id="obj-comparison" class="title is-4 has-text-centered">Temporal Extrapolation Comparisons</h4>
<!-- **************************************************************** Introduction ****************************************************************************** -->
<div class="content has-text-justified" style="font-size: 1.5rem; line-height: 1.8;">
<p style="color:black;">
We compare popular length extrapolation methods in LLMs and image diffusion transformers, which often introduce <span style="font-weight: bold">temporal repetition or motion deceleration</span>. In contrast, our approach generates new, temporally coherent content that evolves smoothly, delivering superior performance. Below, <span style="font-weight: bold">we highlight repeated frames with a red box</span>, which are identical to the initial frames of the video.
</p>
</div>
</div>
<br>
<div class="columns is-centered has-text-centered">
<div class="column">
<p style="color:black;font-size: 20px;"> PE </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;"> PI </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;">TASR</p>
</div>
<div class="column">
<p style="color:black;font-size: 20px;">YaRN </p>
</div>
<div class="column">
<p style="color:black;font-size: 20px; font-weight: bold">RIFLEx (Ours)</p>
</div>
</div>
<!-- Five-way temporal extrapolation comparison, left to right: PE, PI, TASR, YaRN,
     RIFLEx (ours). Each video carries an aria-label naming its method so assistive
     technology can tell the five players apart.
     NOTE(review): id="results-carousel" is reused by other rows in this file, although
     ids should be document-unique; kept as-is because CSS/JS outside this view may use it. -->
<div id="results-carousel"
  style="display: flex; justify-content: space-between; align-items: flex-start; gap: 0px;">
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      aria-label="PE result video">
      <source src="videos/comparison/PEnew.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      aria-label="PI result video">
      <source src="videos/comparison/PInew.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      aria-label="TASR result video">
      <source src="videos/comparison/TAnew.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      aria-label="YaRN result video">
      <source src="videos/comparison/Yarnnew.mp4" type="video/mp4">
    </video>
  </div>
  <div class="item" style="flex: 1; display: flex; flex-direction: column">
    <video controls preload="metadata" autoplay muted loop playsinline
      aria-label="RIFLEx (ours) result video">
      <source src="videos/comparison/oursnew.mp4" type="video/mp4">
    </video>
  </div>
</div>
<!--
<br>
<div id="results-carousel"
style="display: flex; justify-content: space-between; align-items: flex-start; gap: 5px;">
<div class="item" style="flex: 1; display: flex; flex-direction: column">
<video controls preload="metadata" autoplay muted loop playsinline>
<source
src="videos/comparison/PE2.mp4"
type="video/mp4">