[
"{'paper': {'id': '2412.10360', 'authors': [{'_id': '675f9d6d2eb87c3a1b120a5f', 'user': {'_id': '648c9605565e3a44f3c9bb7b', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/648c9605565e3a44f3c9bb7b/W5chvk17Zol6-2QSWkFVR.jpeg', 'isPro': True, 'fullname': 'Orr Zohar', 'user': 'orrzohar', 'type': 'user'}, 'name': 'Orr Zohar', 'status': 'extracted_pending', 'statusLastChangedAt': '2024-12-16T03:24:34.794Z', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a60', 'user': {'_id': '65703fab7f50602340d23704', 'avatarUrl': '/avatars/324c45f5fba9cd8c38a89b30427c06b4.svg', 'isPro': False, 'fullname': 'Xiaohan Wang', 'user': 'nicholswang', 'type': 'user'}, 'name': 'Xiaohan Wang', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:23:12.868Z', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a61', 'user': {'_id': '641362754e5305c14f233f08', 'avatarUrl': '/avatars/bce974ef60c507d22702cc7662033b28.svg', 'isPro': False, 'fullname': 'Yann Dubois', 'user': 'YannDubs', 'type': 'user'}, 'name': 'Yann Dubois', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T09:57:20.008Z', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a62', 'name': 'Nikhil Mehta', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a63', 'name': 'Tong Xiao', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a64', 'user': {'_id': '644eb923a8f384abd038a4f7', 'avatarUrl': '/avatars/3d1a29c6149a6ffc482842f1438ba55a.svg', 'isPro': False, 'fullname': 'Philippe I Hansen-Estruch', 'user': 'philippehansen', 'type': 'user'}, 'name': 'Philippe Hansen-Estruch', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-17T08:04:15.986Z', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a65', 'user': {'_id': '6372b1f9bd81fae2b3a712e8', 'avatarUrl': '/avatars/bcdf7b540cc94882e0ae6035bb07a8f2.svg', 'isPro': False, 'fullname': 'Licheng Yu', 'user': 'lichengyu', 'type': 'user'}, 'name': 'Licheng Yu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T09:58:30.800Z', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a66', 'user': {'_id': '6334c1b3762d2d0e9eb151ea', 'avatarUrl': '/avatars/45b3eb63f0515ab417aacd79aa9f8a41.svg', 'isPro': False, 'fullname': 'Xiaofang Wang', 'user': 'minione', 'type': 'user'}, 'name': 'Xiaofang Wang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T09:59:05.541Z', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a67', 'user': {'_id': '6417cf37dce1e4c0229f17b1', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6417cf37dce1e4c0229f17b1/7h-ZCB5f4wif7TsnF-B1M.jpeg', 'isPro': False, 'fullname': 'Felix Xu', 'user': 'katanaxu', 'type': 'user'}, 'name': 'Felix Juefei-Xu', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T16:55:18.689Z', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a68', 'name': 'Ning Zhang', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a69', 'name': 'Serena Yeung-Levy', 'hidden': False}, {'_id': '675f9d6d2eb87c3a1b120a6a', 'name': 'Xide Xia', 'hidden': False}], 'publishedAt': '2024-12-13T18:53:24.000Z', 'title': 'Apollo: An Exploration of Video Understanding in Large Multimodal Models', 'summary': 'Despite the rapid integration of video perception capabilities into Large\\nMultimodal Models (LMMs), the underlying mechanisms driving their video\\nunderstanding remain poorly understood. Consequently, many design decisions in\\nthis domain are made without proper justification or analysis. 
The high\\ncomputational cost of training and evaluating such models, coupled with limited\\nopen research, hinders the development of video-LMMs. To address this, we\\npresent a comprehensive study that helps uncover what effectively drives video\\nunderstanding in LMMs.\\n We begin by critically examining the primary contributors to the high\\ncomputational requirements associated with video-LMM research and discover\\nScaling Consistency, wherein design and training decisions made on smaller\\nmodels and datasets (up to a critical size) effectively transfer to larger\\nmodels. Leveraging these insights, we explored many video-specific aspects of\\nvideo-LMMs, including video sampling, architectures, data composition, training\\nschedules, and more. For example, we demonstrated that fps sampling during\\ntraining is vastly preferable to uniform frame sampling and which vision\\nencoders are the best for video representation.\\n Guided by these findings, we introduce Apollo, a state-of-the-art family of\\nLMMs that achieve superior performance across different model sizes. Our models\\ncan perceive hour-long videos efficiently, with Apollo-3B outperforming most\\nexisting 7B models with an impressive 55.1 on LongVideoBench. Apollo-7B is\\nstate-of-the-art compared to 7B LMMs with a 70.9 on MLVU, and 63.3 on\\nVideo-MME.', 'upvotes': 107, 'discussionId': '675f9d722eb87c3a1b120bb3'}, 'publishedAt': '2024-12-15T23:38:35.355Z', 'title': 'Apollo: An Exploration of Video Understanding in Large Multimodal Models', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.10360.png', 'numComments': 5, 'submittedBy': {'_id': '648c9605565e3a44f3c9bb7b', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/648c9605565e3a44f3c9bb7b/W5chvk17Zol6-2QSWkFVR.jpeg', 'fullname': 'Orr Zohar', 'name': 'orrzohar', 'type': 'user', 'isPro': True, 'isHf': False, 'isMod': False, 'followerCount': 8}}",
"{'paper': {'id': '2412.09624', 'authors': [{'_id': '675bcfacd0c3aeab39880d36', 'user': {'_id': '656a9b9f9496f21be8271f1b', 'avatarUrl': '/avatars/cfba9f835bf5eef80c6c5f52be69abd4.svg', 'isPro': False, 'fullname': 'TaiMing', 'user': 'TaiMingLu', 'type': 'user'}, 'name': 'Taiming Lu', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-13T08:56:00.579Z', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d37', 'user': {'_id': '618af79af281834c95311402', 'avatarUrl': '/avatars/58ba086eb2309657d823b5768616eafe.svg', 'isPro': False, 'fullname': 'Tianmin Shu', 'user': 'tshu', 'type': 'user'}, 'name': 'Tianmin Shu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T09:55:49.658Z', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d38', 'user': {'_id': '64b5ba6060274cbb296d6288', 'avatarUrl': '/avatars/67e0343954dda6e92ed3f6e7976f9f87.svg', 'isPro': False, 'fullname': 'Junfei Xiao', 'user': 'lambertxiao', 'type': 'user'}, 'name': 'Junfei Xiao', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:42:08.190Z', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d39', 'name': 'Luoxin Ye', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d3a', 'user': {'_id': '64cbf523e3cc4a476d8291b6', 'avatarUrl': '/avatars/825d7665db471e46921abad3319c2846.svg', 'isPro': False, 'fullname': 'Jiahao Wang', 'user': 'jiahaoplus', 'type': 'user'}, 'name': 'Jiahao Wang', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:42:10.694Z', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d3b', 'name': 'Cheng Peng', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d3c', 'name': 'Chen Wei', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d3d', 'user': {'_id': '5f6540c65e78cc6b0ed3199d', 'avatarUrl': '/avatars/0280d4df417855965a0964d22766c012.svg', 'isPro': False, 'fullname': 'Daniel Khashabi', 'user': 'danyaljj', 'type': 'user'}, 'name': 'Daniel Khashabi', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T09:55:06.806Z', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d3e', 'name': 'Rama Chellappa', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d3f', 'name': 'Alan Yuille', 'hidden': False}, {'_id': '675bcfacd0c3aeab39880d40', 'user': {'_id': '660c9ac4b202fcf3892f62fa', 'avatarUrl': '/avatars/7314fd5f3f642096d0e37d3194f1aa7e.svg', 'isPro': False, 'fullname': 'Jieneng Chen', 'user': 'jienengchen', 'type': 'user'}, 'name': 'Jieneng Chen', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T09:54:46.355Z', 'hidden': False}], 'publishedAt': '2024-12-12T18:59:57.000Z', 'title': 'GenEx: Generating an Explorable World', 'summary': 'Understanding, navigating, and exploring the 3D physical real world has long\\nbeen a central challenge in the development of artificial intelligence. In this\\nwork, we take a step toward this goal by introducing GenEx, a system capable of\\nplanning complex embodied world exploration, guided by its generative\\nimagination that forms priors (expectations) about the surrounding\\nenvironments. GenEx generates an entire 3D-consistent imaginative environment\\nfrom as little as a single RGB image, bringing it to life through panoramic\\nvideo streams. Leveraging scalable 3D world data curated from Unreal Engine,\\nour generative model is rounded in the physical world. It captures a continuous\\n360-degree environment with little effort, offering a boundless landscape for\\nAI agents to explore and interact with. 
GenEx achieves high-quality world\\ngeneration, robust loop consistency over long trajectories, and demonstrates\\nstrong 3D capabilities such as consistency and active 3D mapping. Powered by\\ngenerative imagination of the world, GPT-assisted agents are equipped to\\nperform complex embodied tasks, including both goal-agnostic exploration and\\ngoal-driven navigation. These agents utilize predictive expectation regarding\\nunseen parts of the physical world to refine their beliefs, simulate different\\noutcomes based on potential decisions, and make more informed choices. In\\nsummary, we demonstrate that GenEx provides a transformative platform for\\nadvancing embodied AI in imaginative spaces and brings potential for extending\\nthese capabilities to real-world exploration.', 'upvotes': 75, 'discussionId': '675bcfafd0c3aeab39880e37'}, 'publishedAt': '2024-12-15T22:25:23.139Z', 'title': 'GenEx: Generating an Explorable World', 'mediaUrls': ['https://cdn-uploads.huggingface.co/production/uploads/660c9ac4b202fcf3892f62fa/zMXBXszWTbZALUIUBrKUr.mp4'], 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09624.png', 'numComments': 2, 'submittedBy': {'_id': '660c9ac4b202fcf3892f62fa', 'avatarUrl': '/avatars/7314fd5f3f642096d0e37d3194f1aa7e.svg', 'fullname': 'Jieneng Chen', 'name': 'jienengchen', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 3}}",
"{'paper': {'id': '2412.09604', 'authors': [{'_id': '675d657d1c375f21ff6c008f', 'user': {'_id': '64b9033777ae61bcc80aa4f3', 'avatarUrl': '/avatars/408c335395c79f3df69fd9bf70abc312.svg', 'isPro': False, 'fullname': 'Hao Li', 'user': 'cpsxhao', 'type': 'user'}, 'name': 'Hao Li', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:41:44.798Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0090', 'user': {'_id': '64b7475efa7eabaae5f7ba94', 'avatarUrl': '/avatars/346e53b345ccd9e8557ab8d2ec17a8f3.svg', 'isPro': False, 'fullname': 'Changyao Tian', 'user': 'Changyao', 'type': 'user'}, 'name': 'Changyao Tian', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:23:50.435Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0091', 'user': {'_id': '644a2bccd9a3ae834104b065', 'avatarUrl': '/avatars/ee2caf787796cca438349d10089bdfd1.svg', 'isPro': False, 'fullname': 'Jie Shao', 'user': 'hehesang', 'type': 'user'}, 'name': 'Jie Shao', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:24:24.661Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0092', 'user': {'_id': '64ae2359179421d320b1694b', 'avatarUrl': '/avatars/c387a75191005bcaa473091de5383a10.svg', 'isPro': False, 'fullname': 'Xizhou Zhu', 'user': 'Einsiedler', 'type': 'user'}, 'name': 'Xizhou Zhu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:24:30.898Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0093', 'user': {'_id': '665d4b515fdfe8f923e347a7', 'avatarUrl': '/avatars/d114b24c02dadfca0a8aee104755a8ec.svg', 'isPro': False, 'fullname': 'Zhaokai Wang', 'user': 'wzk1015', 'type': 'user'}, 'name': 'Zhaokai Wang', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:41:42.930Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0094', 'name': 'Jinguo Zhu', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0095', 'user': {'_id': '66efe658de163a536aa84178', 'avatarUrl': '/avatars/fddc42450cabf41ca1ab2f70b185f51c.svg', 'isPro': False, 'fullname': 'dou wenhan', 'user': 'douwh', 'type': 'user'}, 'name': 'Wenhan Dou', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:24:50.436Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0096', 'user': {'_id': '66149831f9e58fe02b08f10e', 'avatarUrl': '/avatars/493636f2368e689c9241e53bc2e7b97e.svg', 'isPro': False, 'fullname': 'wangxiaogang', 'user': 'wangxiaogang', 'type': 'user'}, 'name': 'Xiaogang Wang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:25:01.045Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0097', 'user': {'_id': '65c04e9c27a5fdca81abcbd9', 'avatarUrl': '/avatars/12a155683c824fa23da4a9e2bed4f64e.svg', 'isPro': False, 'fullname': 'Hongsheng LI', 'user': 'hsli-cuhk', 'type': 'user'}, 'name': 'Hongsheng Li', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:25:17.271Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0098', 'user': {'_id': '65ead3ea908526a39082e641', 'avatarUrl': '/avatars/dcf870695fd56b06ca03d82f831e9019.svg', 'isPro': False, 'fullname': 'Lewei Lu', 'user': 'luotto', 'type': 'user'}, 'name': 'Lewei Lu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:25:24.403Z', 'hidden': False}, {'_id': '675d657d1c375f21ff6c0099', 'user': {'_id': '64686f7172d9180d4ac8b4e4', 'avatarUrl': '/avatars/db67dd6c4b2b41054ddcce5a18ade6f8.svg', 'isPro': False, 'fullname': 'Jifeng Dai', 'user': 'daijifeng', 'type': 'user'}, 'name': 'Jifeng Dai', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:25:31.520Z', 'hidden': False}], 'publishedAt': 
'2024-12-12T18:59:26.000Z', 'title': 'SynerGen-VL: Towards Synergistic Image Understanding and Generation with\\n Vision Experts and Token Folding', 'summary': 'The remarkable success of Large Language Models (LLMs) has extended to the\\nmultimodal domain, achieving outstanding performance in image understanding and\\ngeneration. Recent efforts to develop unified Multimodal Large Language Models\\n(MLLMs) that integrate these capabilities have shown promising results.\\nHowever, existing approaches often involve complex designs in model\\narchitecture or training pipeline, increasing the difficulty of model training\\nand scaling. In this paper, we propose SynerGen-VL, a simple yet powerful\\nencoder-free MLLM capable of both image understanding and generation. To\\naddress challenges identified in existing encoder-free unified MLLMs, we\\nintroduce the token folding mechanism and the vision-expert-based progressive\\nalignment pretraining strategy, which effectively support high-resolution image\\nunderstanding while reducing training complexity. After being trained on\\nlarge-scale mixed image-text data with a unified next-token prediction\\nobjective, SynerGen-VL achieves or surpasses the performance of existing\\nencoder-free unified MLLMs with comparable or smaller parameter sizes, and\\nnarrows the gap with task-specific state-of-the-art models, highlighting a\\npromising path toward future unified MLLMs. Our code and models shall be\\nreleased.', 'upvotes': 33, 'discussionId': '675d657f1c375f21ff6c0123'}, 'publishedAt': '2024-12-16T04:48:18.466Z', 'title': 'SynerGen-VL: Towards Synergistic Image Understanding and Generation with Vision Experts and Token Folding', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09604.png', 'numComments': 3, 'submittedBy': {'_id': '665d4b515fdfe8f923e347a7', 'avatarUrl': '/avatars/d114b24c02dadfca0a8aee104755a8ec.svg', 'fullname': 'Zhaokai Wang', 'name': 'wzk1015', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 2}}",
"{'paper': {'id': '2412.07769', 'authors': [{'_id': '675ee6efe6d71589c3c62d6e', 'user': {'_id': '62e23c7f555a866437a53cd0', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/62e23c7f555a866437a53cd0/UaAsYZQXuwb4NSG5WnvdG.jpeg', 'isPro': False, 'fullname': 'Sahal Shaji', 'user': 'sahalshajim', 'type': 'user'}, 'name': 'Sahal Shaji Mullappilly', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:03:51.461Z', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d6f', 'user': {'_id': '650289dbc130d99814b34dc5', 'avatarUrl': '/avatars/ff0cf5add144cd79c41a255f41f34efb.svg', 'isPro': False, 'fullname': 'K Mohammed Irfan', 'user': 'k-m-irfan', 'type': 'user'}, 'name': 'Mohammed Irfan Kurpath', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T13:25:34.125Z', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d70', 'user': {'_id': '63fe2f588b3c5087ff8721bf', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/1677602580834-noauth.jpeg', 'isPro': False, 'fullname': 'Sara Pieri', 'user': 'HuggingSara', 'type': 'user'}, 'name': 'Sara Pieri', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:02:01.391Z', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d71', 'name': 'Saeed Yahya Alseiari', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d72', 'name': 'Shanavas Cholakkal', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d73', 'name': 'Khaled Aldahmani', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d74', 'name': 'Fahad Khan', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d75', 'name': 'Rao Anwer', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d76', 'name': 'Salman Khan', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d77', 'name': 'Timothy Baldwin', 'hidden': False}, {'_id': '675ee6efe6d71589c3c62d78', 'user': {'_id': '654a5f4f9b8bd6406d45bb46', 'avatarUrl': '/avatars/ac0d7eef62cd98a280b162cf7896b1a2.svg', 'isPro': False, 'fullname': 'Hisham Cholakkal', 'user': 'hishamcholakkal', 'type': 'user'}, 'name': 'Hisham Cholakkal', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:00:42.925Z', 'hidden': False}], 'publishedAt': '2024-12-10T18:59:35.000Z', 'title': 'BiMediX2: Bio-Medical EXpert LMM for Diverse Medical Modalities', 'summary': 'This paper introduces BiMediX2, a bilingual (Arabic-English) Bio-Medical\\nEXpert Large Multimodal Model (LMM) with a unified architecture that integrates\\ntext and visual modalities, enabling advanced image understanding and medical\\napplications. BiMediX2 leverages the Llama3.1 architecture and integrates text\\nand visual capabilities to facilitate seamless interactions in both English and\\nArabic, supporting text-based inputs and multi-turn conversations involving\\nmedical images. The model is trained on an extensive bilingual healthcare\\ndataset consisting of 1.6M samples of diverse medical interactions for both\\ntext and image modalities, mixed in Arabic and English. We also propose the\\nfirst bilingual GPT-4o based medical LMM benchmark named BiMed-MBench. BiMediX2\\nis benchmarked on both text-based and image-based tasks, achieving\\nstate-of-the-art performance across several medical benchmarks. It outperforms\\nrecent state-of-the-art models in medical LLM evaluation benchmarks. Our model\\nalso sets a new benchmark in multimodal medical evaluations with over 9%\\nimprovement in English and over 20% in Arabic evaluations. 
Additionally, it\\nsurpasses GPT-4 by around 9% in UPHILL factual accuracy evaluations and excels\\nin various medical Visual Question Answering, Report Generation, and Report\\nSummarization tasks. The project page including source code and the trained\\nmodel, is available at https://github.com/mbzuai-oryx/BiMediX2.', 'upvotes': 25, 'discussionId': '675ee6f0e6d71589c3c62db4'}, 'publishedAt': '2024-12-16T02:36:29.735Z', 'title': 'BiMediX2: Bio-Medical EXpert LMM for Diverse Medical Modalities', 'mediaUrls': ['https://cdn-uploads.huggingface.co/production/uploads/62e23c7f555a866437a53cd0/BzYkwom8AHG07evceoVRp.gif'], 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.07769.png', 'numComments': 2, 'submittedBy': {'_id': '62e23c7f555a866437a53cd0', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/62e23c7f555a866437a53cd0/UaAsYZQXuwb4NSG5WnvdG.jpeg', 'fullname': 'Sahal Shaji', 'name': 'sahalshajim', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 1}}",
"{'paper': {'id': '2412.10047', 'authors': [{'_id': '675f9dce5dca7e3c5c1f0950', 'name': 'Lu Wang', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0951', 'name': 'Fangkai Yang', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0952', 'user': {'_id': '654dbac9938fbf1e696be8aa', 'avatarUrl': '/avatars/b3c4035c48169c1bfb04a439fce3499f.svg', 'isPro': False, 'fullname': 'Chaoyun Zhang', 'user': 'vyokky', 'type': 'user'}, 'name': 'Chaoyun Zhang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:13:55.597Z', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0953', 'user': {'_id': '6724f7b2f0e98abd46cb94d3', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/O47r1efbW0MYxGyhixw9k.png', 'isPro': False, 'fullname': 'lu junior ting', 'user': 'Lujunting', 'type': 'user'}, 'name': 'Junting Lu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:14:25.301Z', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0954', 'name': 'Jiaxu Qian', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0955', 'user': {'_id': '62c6df026a092eda1f1ab6e5', 'avatarUrl': '/avatars/d58fff1a157b189ce2617889ef5f6e2f.svg', 'isPro': False, 'fullname': 'Shilin He', 'user': 'shilhe', 'type': 'user'}, 'name': 'Shilin He', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:14:11.939Z', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0956', 'name': 'Pu Zhao', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0957', 'name': 'Bo Qiao', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0958', 'name': 'Ray Huang', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0959', 'name': 'Si Qin', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f095a', 'name': 'Qisheng Su', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f095b', 'name': 'Jiayi Ye', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f095c', 'user': {'_id': '662a5511be52f6c4d5dd0b64', 'avatarUrl': '/avatars/759686e916c36ebea5908e400d1cf25b.svg', 'isPro': False, 'fullname': 'Yudi Zhang', 'user': 'ReedZyd', 'type': 'user'}, 'name': 'Yudi Zhang', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-17T08:04:14.067Z', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f095d', 'user': {'_id': '6554338ba4de813482e91ad5', 'avatarUrl': '/avatars/bee879fa4a818fd6c070ee001769ce05.svg', 'isPro': False, 'fullname': 'Jian-Guang Lou', 'user': 'substill', 'type': 'user'}, 'name': 'Jian-Guang Lou', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:11:51.046Z', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f095e', 'user': {'_id': '652fc9f39bc50a6c0e435224', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/652fc9f39bc50a6c0e435224/70OBVDHHBsxG2giJ-E3_1.jpeg', 'isPro': False, 'fullname': 'Lin Qingwei', 'user': 'Eliblo1969', 'type': 'user'}, 'name': 'Qingwei Lin', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:11:45.046Z', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f095f', 'name': 'Saravan Rajmohan', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0960', 'user': {'_id': '66473d2c7abe6ad66e81a3dd', 'avatarUrl': '/avatars/82f40244806c06ffeaa1c4265e9725ea.svg', 'isPro': False, 'fullname': 'ZHANGDONGMEI', 'user': 'ZDM6426', 'type': 'user'}, 'name': 'Dongmei Zhang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:11:13.362Z', 'hidden': False}, {'_id': '675f9dce5dca7e3c5c1f0961', 'name': 'Qi Zhang', 'hidden': False}], 'publishedAt': '2024-12-13T11:19:56.000Z', 'title': 'Large Action Models: From Inception to Implementation', 'summary': 'As AI continues to advance, there is a growing 
demand for systems that go\\nbeyond language-based assistance and move toward intelligent agents capable of\\nperforming real-world actions. This evolution requires the transition from\\ntraditional Large Language Models (LLMs), which excel at generating textual\\nresponses, to Large Action Models (LAMs), designed for action generation and\\nexecution within dynamic environments. Enabled by agent systems, LAMs hold the\\npotential to transform AI from passive language understanding to active task\\ncompletion, marking a significant milestone in the progression toward\\nartificial general intelligence.\\n In this paper, we present a comprehensive framework for developing LAMs,\\noffering a systematic approach to their creation, from inception to deployment.\\nWe begin with an overview of LAMs, highlighting their unique characteristics\\nand delineating their differences from LLMs. Using a Windows OS-based agent as\\na case study, we provide a detailed, step-by-step guide on the key stages of\\nLAM development, including data collection, model training, environment\\nintegration, grounding, and evaluation. This generalizable workflow can serve\\nas a blueprint for creating functional LAMs in various application domains. We\\nconclude by identifying the current limitations of LAMs and discussing\\ndirections for future research and industrial deployment, emphasizing the\\nchallenges and opportunities that lie ahead in realizing the full potential of\\nLAMs in real-world applications.\\n The code for the data collection process utilized in this paper is publicly\\navailable at: https://github.com/microsoft/UFO/tree/main/dataflow, and\\ncomprehensive documentation can be found at\\nhttps://microsoft.github.io/UFO/dataflow/overview/.', 'upvotes': 24, 'discussionId': '675f9dcf5dca7e3c5c1f0a47'}, 'publishedAt': '2024-12-15T22:26:17.331Z', 'title': 'Large Action Models: From Inception to Implementation', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.10047.png', 'numComments': 5, 'submittedBy': {'_id': '654dbac9938fbf1e696be8aa', 'avatarUrl': '/avatars/b3c4035c48169c1bfb04a439fce3499f.svg', 'fullname': 'Chaoyun Zhang', 'name': 'vyokky', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 1}}",
"{'paper': {'id': '2412.10208', 'authors': [{'_id': '67603b58418a4b3626cf19c6', 'user': {'_id': '64c890ebca915ffd302bade2', 'avatarUrl': '/avatars/9febfaee743b805fcfd3aada2e7dd64b.svg', 'isPro': False, 'fullname': 'Jaehyeon Kim', 'user': 'jaywalnut310', 'type': 'user'}, 'name': 'Jaehyeon Kim', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-17T08:04:10.474Z', 'hidden': False}, {'_id': '67603b58418a4b3626cf19c7', 'user': {'_id': '63de7eb015266dd945f470a8', 'avatarUrl': '/avatars/a1bc57c6a25b88a3b2848b9549895af2.svg', 'isPro': False, 'fullname': 'Taehong Moon', 'user': 'jason-moon', 'type': 'user'}, 'name': 'Taehong Moon', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T16:55:03.692Z', 'hidden': False}, {'_id': '67603b58418a4b3626cf19c8', 'name': 'Keon Lee', 'hidden': False}, {'_id': '67603b58418a4b3626cf19c9', 'name': 'Jaewoong Cho', 'hidden': False}], 'publishedAt': '2024-12-13T15:31:17.000Z', 'title': 'Efficient Generative Modeling with Residual Vector Quantization-Based\\n Tokens', 'summary': 'We explore the use of Residual Vector Quantization (RVQ) for high-fidelity\\ngeneration in vector-quantized generative models. This quantization technique\\nmaintains higher data fidelity by employing more in-depth tokens. However,\\nincreasing the token number in generative models leads to slower inference\\nspeeds. To this end, we introduce ResGen, an efficient RVQ-based discrete\\ndiffusion model that generates high-fidelity samples without compromising\\nsampling speed. Our key idea is a direct prediction of vector embedding of\\ncollective tokens rather than individual ones. Moreover, we demonstrate that\\nour proposed token masking and multi-token prediction method can be formulated\\nwithin a principled probabilistic framework using a discrete diffusion process\\nand variational inference. We validate the efficacy and generalizability of the\\nproposed method on two challenging tasks across different modalities:\\nconditional image generation} on ImageNet 256x256 and zero-shot text-to-speech\\nsynthesis. Experimental results demonstrate that ResGen outperforms\\nautoregressive counterparts in both tasks, delivering superior performance\\nwithout compromising sampling speed. Furthermore, as we scale the depth of RVQ,\\nour generative models exhibit enhanced generation fidelity or faster sampling\\nspeeds compared to similarly sized baseline models. The project page can be\\nfound at https://resgen-genai.github.io', 'upvotes': 18, 'discussionId': '67603b59418a4b3626cf19f1'}, 'publishedAt': '2024-12-16T20:21:03.178Z', 'title': 'Efficient Generative Modeling with Residual Vector Quantization-Based Tokens', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.10208.png', 'numComments': 2, 'submittedBy': {'_id': '64c890ebca915ffd302bade2', 'avatarUrl': '/avatars/9febfaee743b805fcfd3aada2e7dd64b.svg', 'fullname': 'Jaehyeon Kim', 'name': 'jaywalnut310', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 1}}",
"{'paper': {'id': '2412.09283', 'authors': [{'_id': '675ba7108e73e58d6574019e', 'user': {'_id': '65927f3b754092f6b1e187a7', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/65927f3b754092f6b1e187a7/gUrNvIQHmsl1vLwSUxpmL.jpeg', 'isPro': False, 'fullname': 'tiehan fan', 'user': 'AnonMegumi', 'type': 'user'}, 'name': 'Tiehan Fan', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-13T15:22:20.639Z', 'hidden': False}, {'_id': '675ba7108e73e58d6574019f', 'user': {'_id': '66271e50f82ffd771a2720ee', 'avatarUrl': '/avatars/34f0f5f954a8c8ff41f86320813b034a.svg', 'isPro': False, 'fullname': 'nankepan', 'user': 'nankepan', 'type': 'user'}, 'name': 'Kepan Nan', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:05:10.506Z', 'hidden': False}, {'_id': '675ba7108e73e58d657401a0', 'user': {'_id': '660a7ecf14cfe4973e0acfe1', 'avatarUrl': '/avatars/e488058397f2b7a617515a4f721a9a00.svg', 'isPro': False, 'fullname': 'Rui Xie', 'user': 'SherryX', 'type': 'user'}, 'name': 'Rui Xie', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:42:20.696Z', 'hidden': False}, {'_id': '675ba7108e73e58d657401a1', 'name': 'Penghao Zhou', 'hidden': False}, {'_id': '675ba7108e73e58d657401a2', 'user': {'_id': '6421183b69a2c2933882d652', 'avatarUrl': '/avatars/66813a8fa22915087cccd4dbfb945ca7.svg', 'isPro': False, 'fullname': 'Zhenheng Yang', 'user': 'zhenheny', 'type': 'user'}, 'name': 'Zhenheng Yang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:05:24.220Z', 'hidden': False}, {'_id': '675ba7108e73e58d657401a3', 'user': {'_id': '653b268cd1041ca9188954da', 'avatarUrl': '/avatars/2529a821580f5d87ee46867c5dc5b1e5.svg', 'isPro': False, 'fullname': 'BradyFU', 'user': 'BradyFU', 'type': 'user'}, 'name': 'Chaoyou Fu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:06:26.985Z', 'hidden': False}, {'_id': '675ba7108e73e58d657401a4', 'name': 'Xiang Li', 'hidden': False}, {'_id': '675ba7108e73e58d657401a5', 'name': 'Jian Yang', 'hidden': False}, {'_id': '675ba7108e73e58d657401a6', 'user': {'_id': '65734004769f3ee9bde1af10', 'avatarUrl': '/avatars/d6310ed861972fd691687d8f47413f33.svg', 'isPro': False, 'fullname': 'Ying Tai', 'user': 'yingtai', 'type': 'user'}, 'name': 'Ying Tai', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:42:22.452Z', 'hidden': False}], 'publishedAt': '2024-12-12T13:48:40.000Z', 'title': 'InstanceCap: Improving Text-to-Video Generation via Instance-aware\\n Structured Caption', 'summary': 'Text-to-video generation has evolved rapidly in recent years, delivering\\nremarkable results. Training typically relies on video-caption paired data,\\nwhich plays a crucial role in enhancing generation performance. However,\\ncurrent video captions often suffer from insufficient details, hallucinations\\nand imprecise motion depiction, affecting the fidelity and consistency of\\ngenerated videos. In this work, we propose a novel instance-aware structured\\ncaption framework, termed InstanceCap, to achieve instance-level and\\nfine-grained video caption for the first time. Based on this scheme, we design\\nan auxiliary models cluster to convert original video into instances to enhance\\ninstance fidelity. Video instances are further used to refine dense prompts\\ninto structured phrases, achieving concise yet precise descriptions.\\nFurthermore, a 22K InstanceVid dataset is curated for training, and an\\nenhancement pipeline that tailored to InstanceCap structure is proposed for\\ninference. 
Experimental results demonstrate that our proposed InstanceCap\\nsignificantly outperform previous models, ensuring high fidelity between\\ncaptions and videos while reducing hallucinations.', 'upvotes': 18, 'discussionId': '675ba7118e73e58d65740237'}, 'publishedAt': '2024-12-15T23:23:39.719Z', 'title': 'InstanceCap: Improving Text-to-Video Generation via Instance-aware Structured Caption', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09283.png', 'numComments': 3, 'submittedBy': {'_id': '65927f3b754092f6b1e187a7', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/65927f3b754092f6b1e187a7/gUrNvIQHmsl1vLwSUxpmL.jpeg', 'fullname': 'tiehan fan', 'name': 'AnonMegumi', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False}}",
"{'paper': {'id': '2412.09626', 'authors': [{'_id': '675f980c4d67dd90e85ecc6c', 'user': {'_id': '63023b6ab002e9a4a2152890', 'avatarUrl': '/avatars/cae8ba0a8d61fb4e576934431f43991b.svg', 'isPro': False, 'fullname': 'Haonan Qiu', 'user': 'MoonQiu', 'type': 'user'}, 'name': 'Haonan Qiu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:14:47.092Z', 'hidden': False}, {'_id': '675f980c4d67dd90e85ecc6d', 'user': {'_id': '63a1200045edac9f7508bae9', 'avatarUrl': '/avatars/e84f0b045f32c5b8b4da43458650b925.svg', 'isPro': False, 'fullname': 'Shiwei Zhang', 'user': 'StevenZhang', 'type': 'user'}, 'name': 'Shiwei Zhang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:15:06.661Z', 'hidden': False}, {'_id': '675f980c4d67dd90e85ecc6e', 'user': {'_id': '637f70d6fab5db9101c3dfc8', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/637f70d6fab5db9101c3dfc8/NgkYNXWLDavLbrnCby2Fl.jpeg', 'isPro': False, 'fullname': 'Yujie Wei', 'user': 'weilllllls', 'type': 'user'}, 'name': 'Yujie Wei', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:16:14.731Z', 'hidden': False}, {'_id': '675f980c4d67dd90e85ecc6f', 'user': {'_id': '642e3bcb958faf258a40e89c', 'avatarUrl': '/avatars/213501def37dc53032cee17e37fcc4c1.svg', 'isPro': False, 'fullname': 'Ruihang Chu', 'user': 'Ruihang', 'type': 'user'}, 'name': 'Ruihang Chu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:15:58.665Z', 'hidden': False}, {'_id': '675f980c4d67dd90e85ecc70', 'user': {'_id': '649d54b314afbb10ce2a9eeb', 'avatarUrl': '/avatars/15c325d8c2273ff63569f23015e98486.svg', 'isPro': False, 'fullname': 'Hangjie Yuan', 'user': 'JacobYuan', 'type': 'user'}, 'name': 'Hangjie Yuan', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:15:52.570Z', 'hidden': False}, {'_id': '675f980c4d67dd90e85ecc71', 'user': {'_id': '65fca775fa59bdf4737b1a84', 'avatarUrl': '/avatars/a161b510bde8f57e7686cbb0b4aa6a52.svg', 'isPro': False, 'fullname': 'Xiang Wang', 'user': 'xiangwang1223', 'type': 'user'}, 'name': 'Xiang Wang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:15:46.971Z', 'hidden': False}, {'_id': '675f980c4d67dd90e85ecc72', 'name': 'Yingya Zhang', 'hidden': False}, {'_id': '675f980c4d67dd90e85ecc73', 'user': {'_id': '62ab1ac1d48b4d8b048a3473', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/1656826685333-62ab1ac1d48b4d8b048a3473.png', 'isPro': False, 'fullname': 'Ziwei Liu', 'user': 'liuziwei7', 'type': 'user'}, 'name': 'Ziwei Liu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:15:16.895Z', 'hidden': False}], 'publishedAt': '2024-12-12T18:59:59.000Z', 'title': 'FreeScale: Unleashing the Resolution of Diffusion Models via Tuning-Free\\n Scale Fusion', 'summary': 'Visual diffusion models achieve remarkable progress, yet they are typically\\ntrained at limited resolutions due to the lack of high-resolution data and\\nconstrained computation resources, hampering their ability to generate\\nhigh-fidelity images or videos at higher resolutions. Recent efforts have\\nexplored tuning-free strategies to exhibit the untapped potential\\nhigher-resolution visual generation of pre-trained models. However, these\\nmethods are still prone to producing low-quality visual content with repetitive\\npatterns. 
The key obstacle lies in the inevitable increase in high-frequency\\ninformation when the model generates visual content exceeding its training\\nresolution, leading to undesirable repetitive patterns deriving from the\\naccumulated errors. To tackle this challenge, we propose FreeScale, a\\ntuning-free inference paradigm to enable higher-resolution visual generation\\nvia scale fusion. Specifically, FreeScale processes information from different\\nreceptive scales and then fuses it by extracting desired frequency components.\\nExtensive experiments validate the superiority of our paradigm in extending the\\ncapabilities of higher-resolution visual generation for both image and video\\nmodels. Notably, compared with the previous best-performing method, FreeScale\\nunlocks the generation of 8k-resolution images for the first time.', 'upvotes': 17, 'discussionId': '675f980f4d67dd90e85ecd9b'}, 'publishedAt': '2024-12-15T22:05:52.515Z', 'title': 'FreeScale: Unleashing the Resolution of Diffusion Models via Tuning-Free Scale Fusion', 'mediaUrls': ['https://cdn-uploads.huggingface.co/production/uploads/63023b6ab002e9a4a2152890/Bm1rw2co68KEsacwdDAP2.png', 'https://cdn-uploads.huggingface.co/production/uploads/63023b6ab002e9a4a2152890/Av6UKCvIjv9zogt8QgNgr.png'], 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09626.png', 'numComments': 2, 'submittedBy': {'_id': '63023b6ab002e9a4a2152890', 'avatarUrl': '/avatars/cae8ba0a8d61fb4e576934431f43991b.svg', 'fullname': 'Haonan Qiu', 'name': 'MoonQiu', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 2}}",
"{'paper': {'id': '2412.08645', 'authors': [{'_id': '675b1648634bd80c4c1d84f8', 'user': {'_id': '675fccea42bc7570939a3642', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/x081h2YwSs9kIUgA5me8z.png', 'isPro': False, 'fullname': 'Daniel Winter', 'user': 'danielwinter', 'type': 'user'}, 'name': 'Daniel Winter', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:16:31.261Z', 'hidden': False}, {'_id': '675b1648634bd80c4c1d84f9', 'name': 'Asaf Shul', 'hidden': False}, {'_id': '675b1648634bd80c4c1d84fa', 'name': 'Matan Cohen', 'hidden': False}, {'_id': '675b1648634bd80c4c1d84fb', 'name': 'Dana Berman', 'hidden': False}, {'_id': '675b1648634bd80c4c1d84fc', 'name': 'Yael Pritch', 'hidden': False}, {'_id': '675b1648634bd80c4c1d84fd', 'name': 'Alex Rav-Acha', 'hidden': False}, {'_id': '675b1648634bd80c4c1d84fe', 'user': {'_id': '646cfc3b4220471ca0c56b20', 'avatarUrl': '/avatars/19d6ab141ec2cd25c1c3b45fd8f69910.svg', 'isPro': False, 'fullname': 'Yedid Hoshen', 'user': 'yedid', 'type': 'user'}, 'name': 'Yedid Hoshen', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:42:31.672Z', 'hidden': False}], 'publishedAt': '2024-12-11T18:59:53.000Z', 'title': 'ObjectMate: A Recurrence Prior for Object Insertion and Subject-Driven\\n Generation', 'summary': \"This paper introduces a tuning-free method for both object insertion and\\nsubject-driven generation. The task involves composing an object, given\\nmultiple views, into a scene specified by either an image or text. Existing\\nmethods struggle to fully meet the task's challenging objectives: (i)\\nseamlessly composing the object into the scene with photorealistic pose and\\nlighting, and (ii) preserving the object's identity. We hypothesize that\\nachieving these goals requires large scale supervision, but manually collecting\\nsufficient data is simply too expensive. The key observation in this paper is\\nthat many mass-produced objects recur across multiple images of large unlabeled\\ndatasets, in different scenes, poses, and lighting conditions. We use this\\nobservation to create massive supervision by retrieving sets of diverse views\\nof the same object. This powerful paired dataset enables us to train a\\nstraightforward text-to-image diffusion architecture to map the object and\\nscene descriptions to the composited image. We compare our method, ObjectMate,\\nwith state-of-the-art methods for object insertion and subject-driven\\ngeneration, using a single or multiple references. Empirically, ObjectMate\\nachieves superior identity preservation and more photorealistic composition.\\nDifferently from many other multi-reference methods, ObjectMate does not\\nrequire slow test-time tuning.\", 'upvotes': 11, 'discussionId': '675b164c634bd80c4c1d85e6'}, 'publishedAt': '2024-12-16T04:37:52.464Z', 'title': 'ObjectMate: A Recurrence Prior for Object Insertion and Subject-Driven Generation', 'mediaUrls': ['https://cdn-uploads.huggingface.co/production/uploads/646cfc3b4220471ca0c56b20/CIeYSl_7BPhSBo5RejxTH.mp4'], 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.08645.png', 'numComments': 2, 'submittedBy': {'_id': '646cfc3b4220471ca0c56b20', 'avatarUrl': '/avatars/19d6ab141ec2cd25c1c3b45fd8f69910.svg', 'fullname': 'Yedid Hoshen', 'name': 'yedid', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False}}",
"{'paper': {'id': '2412.07517', 'authors': [{'_id': '675b9172aeae790a46d985b1', 'user': {'_id': '66be27e39284c8209f5e82fe', 'avatarUrl': '/avatars/5c59b74a364107654f343c3904bcfae4.svg', 'isPro': False, 'fullname': 'dengyingying', 'user': 'yingying87', 'type': 'user'}, 'name': 'Yingying Deng', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:20:24.256Z', 'hidden': False}, {'_id': '675b9172aeae790a46d985b2', 'user': {'_id': '6422430fe61513ec1c687165', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6422430fe61513ec1c687165/zxJRw2kDVegk7kyhjITLu.jpeg', 'isPro': False, 'fullname': 'Xiangyu He', 'user': 'MagicBag', 'type': 'user'}, 'name': 'Xiangyu He', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-13T15:22:40.083Z', 'hidden': False}, {'_id': '675b9172aeae790a46d985b3', 'name': 'Changwang Mei', 'hidden': False}, {'_id': '675b9172aeae790a46d985b4', 'user': {'_id': '626f98528a894872cfbf620c', 'avatarUrl': '/avatars/fe31d20313e6ca85e96bc249424c5383.svg', 'isPro': False, 'fullname': 'Peisong Wang', 'user': 'duke1852022', 'type': 'user'}, 'name': 'Peisong Wang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:19:47.517Z', 'hidden': False}, {'_id': '675b9172aeae790a46d985b5', 'name': 'Fan Tang', 'hidden': False}], 'publishedAt': '2024-12-10T13:56:26.000Z', 'title': 'FireFlow: Fast Inversion of Rectified Flow for Image Semantic Editing', 'summary': 'Though Rectified Flows (ReFlows) with distillation offers a promising way for\\nfast sampling, its fast inversion transforms images back to structured noise\\nfor recovery and following editing remains unsolved. This paper introduces\\nFireFlow, a simple yet effective zero-shot approach that inherits the startling\\ncapacity of ReFlow-based models (such as FLUX) in generation while extending\\nits capabilities to accurate inversion and editing in 8 steps. We first\\ndemonstrate that a carefully designed numerical solver is pivotal for ReFlow\\ninversion, enabling accurate inversion and reconstruction with the precision of\\na second-order solver while maintaining the practical efficiency of a\\nfirst-order Euler method. This solver achieves a 3times runtime speedup\\ncompared to state-of-the-art ReFlow inversion and editing techniques, while\\ndelivering smaller reconstruction errors and superior editing results in a\\ntraining-free mode. The code is available at\\nhttps://github.com/HolmesShuan/FireFlow{this URL}.', 'upvotes': 10, 'discussionId': '675b9176aeae790a46d986fc'}, 'publishedAt': '2024-12-15T21:43:07.244Z', 'title': 'FireFlow: Fast Inversion of Rectified Flow for Image Semantic Editing', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.07517.png', 'numComments': 3, 'submittedBy': {'_id': '6422430fe61513ec1c687165', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6422430fe61513ec1c687165/zxJRw2kDVegk7kyhjITLu.jpeg', 'fullname': 'Xiangyu He', 'name': 'MagicBag', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 4}}",
"{'paper': {'id': '2412.09856', 'authors': [{'_id': '675f9f0bcd95ed0ab888fa28', 'user': {'_id': '63c8d37c9cf77a4393d4e16c', 'avatarUrl': '/avatars/58a44bdad26718c3814fcdfa589aed1e.svg', 'isPro': False, 'fullname': 'Hongjie Wang', 'user': 'hongjiewang', 'type': 'user'}, 'name': 'Hongjie Wang', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:16:27.354Z', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa29', 'name': 'Chih-Yao Ma', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa2a', 'name': 'Yen-Cheng Liu', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa2b', 'name': 'Ji Hou', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa2c', 'name': 'Tao Xu', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa2d', 'name': 'Jialiang Wang', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa2e', 'user': {'_id': '6417cf37dce1e4c0229f17b1', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6417cf37dce1e4c0229f17b1/7h-ZCB5f4wif7TsnF-B1M.jpeg', 'isPro': False, 'fullname': 'Felix Xu', 'user': 'katanaxu', 'type': 'user'}, 'name': 'Felix Juefei-Xu', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T16:55:17.348Z', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa2f', 'name': 'Yaqiao Luo', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa30', 'name': 'Peizhao Zhang', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa31', 'name': 'Tingbo Hou', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa32', 'name': 'Peter Vajda', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa33', 'name': 'Niraj K. Jha', 'hidden': False}, {'_id': '675f9f0bcd95ed0ab888fa34', 'name': 'Xiaoliang Dai', 'hidden': False}], 'publishedAt': '2024-12-13T04:55:10.000Z', 'title': 'LinGen: Towards High-Resolution Minute-Length Text-to-Video Generation\\n with Linear Computational Complexity', 'summary': 'Text-to-video generation enhances content creation but is highly\\ncomputationally intensive: The computational cost of Diffusion Transformers\\n(DiTs) scales quadratically in the number of pixels. This makes minute-length\\nvideo generation extremely expensive, limiting most existing models to\\ngenerating videos of only 10-20 seconds length. We propose a Linear-complexity\\ntext-to-video Generation (LinGen) framework whose cost scales linearly in the\\nnumber of pixels. For the first time, LinGen enables high-resolution\\nminute-length video generation on a single GPU without compromising quality. It\\nreplaces the computationally-dominant and quadratic-complexity block,\\nself-attention, with a linear-complexity block called MATE, which consists of\\nan MA-branch and a TE-branch. The MA-branch targets short-to-long-range\\ncorrelations, combining a bidirectional Mamba2 block with our token\\nrearrangement method, Rotary Major Scan, and our review tokens developed for\\nlong video generation. The TE-branch is a novel TEmporal Swin Attention block\\nthat focuses on temporal correlations between adjacent tokens and medium-range\\ntokens. The MATE block addresses the adjacency preservation issue of Mamba and\\nimproves the consistency of generated videos significantly. Experimental\\nresults show that LinGen outperforms DiT (with a 75.6% win rate) in video\\nquality with up to 15times (11.5times) FLOPs (latency) reduction.\\nFurthermore, both automatic metrics and human evaluation demonstrate our\\nLinGen-4B yields comparable video quality to state-of-the-art models (with a\\n50.5%, 52.1%, 49.1% win rate with respect to Gen-3, LumaLabs, and Kling,\\nrespectively). 
This paves the way to hour-length movie generation and real-time\\ninteractive video generation. We provide 68s video generation results and more\\nexamples in our project website: https://lineargen.github.io/.', 'upvotes': 8, 'discussionId': '675f9f0dcd95ed0ab888fab3'}, 'publishedAt': '2024-12-16T11:29:19.319Z', 'title': 'LinGen: Towards High-Resolution Minute-Length Text-to-Video Generation with Linear Computational Complexity', 'mediaUrls': ['https://cdn-uploads.huggingface.co/production/uploads/63c8d37c9cf77a4393d4e16c/SJjKH6yrDDt5FNx0OOJfh.mp4'], 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09856.png', 'numComments': 3, 'submittedBy': {'_id': '63c8d37c9cf77a4393d4e16c', 'avatarUrl': '/avatars/58a44bdad26718c3814fcdfa589aed1e.svg', 'fullname': 'Hongjie Wang', 'name': 'hongjiewang', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 1}}",
"{'paper': {'id': '2412.09611', 'authors': [{'_id': '675ca1bf2905333f65f374c6', 'user': {'_id': '65454d7c117ecae648892170', 'avatarUrl': '/avatars/83a7091a24bf86801176ca85234b417a.svg', 'isPro': False, 'fullname': 'Yusuf Dalva', 'user': 'ydalva', 'type': 'user'}, 'name': 'Yusuf Dalva', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:41:55.826Z', 'hidden': False}, {'_id': '675ca1bf2905333f65f374c7', 'user': {'_id': '6760ac76f39b78ef539b236a', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/jHIjeR9sf1tnuT4rYllfc.png', 'isPro': False, 'fullname': 'Kavana Venkatesh', 'user': 'Kavanavnlp', 'type': 'user'}, 'name': 'Kavana Venkatesh', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-17T08:04:19.865Z', 'hidden': False}, {'_id': '675ca1bf2905333f65f374c8', 'name': 'Pinar Yanardag', 'hidden': False}], 'publishedAt': '2024-12-12T18:59:40.000Z', 'title': 'FluxSpace: Disentangled Semantic Editing in Rectified Flow Transformers', 'summary': 'Rectified flow models have emerged as a dominant approach in image\\ngeneration, showcasing impressive capabilities in high-quality image synthesis.\\nHowever, despite their effectiveness in visual generation, rectified flow\\nmodels often struggle with disentangled editing of images. This limitation\\nprevents the ability to perform precise, attribute-specific modifications\\nwithout affecting unrelated aspects of the image. In this paper, we introduce\\nFluxSpace, a domain-agnostic image editing method leveraging a representation\\nspace with the ability to control the semantics of images generated by\\nrectified flow transformers, such as Flux. By leveraging the representations\\nlearned by the transformer blocks within the rectified flow models, we propose\\na set of semantically interpretable representations that enable a wide range of\\nimage editing tasks, from fine-grained image editing to artistic creation. This\\nwork offers a scalable and effective image editing approach, along with its\\ndisentanglement capabilities.', 'upvotes': 8, 'discussionId': '675ca1c12905333f65f375bf'}, 'publishedAt': '2024-12-16T05:22:02.573Z', 'title': 'FluxSpace: Disentangled Semantic Editing in Rectified Flow Transformers', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09611.png', 'numComments': 2, 'submittedBy': {'_id': '65454d7c117ecae648892170', 'avatarUrl': '/avatars/83a7091a24bf86801176ca85234b417a.svg', 'fullname': 'Yusuf Dalva', 'name': 'ydalva', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False}}",
"{'paper': {'id': '2412.10319', 'authors': [{'_id': '675f918be24babdf1816c11e', 'user': {'_id': '63d00710645dd8d34ea9bcc6', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/63d00710645dd8d34ea9bcc6/E6YsIsXH57OACL-NZ52fB.jpeg', 'isPro': False, 'fullname': 'Yucheng', 'user': 'liyucheng', 'type': 'user'}, 'name': 'Yucheng Li', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:23:35.424Z', 'hidden': False}, {'_id': '675f918be24babdf1816c11f', 'user': {'_id': '6278bd42541f3d2dfa77ea70', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6278bd42541f3d2dfa77ea70/ejn49eapnB3UXQckAYdTd.jpeg', 'isPro': True, 'fullname': 'Huiqiang Jiang', 'user': 'iofu728', 'type': 'user'}, 'name': 'Huiqiang Jiang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:22:52.093Z', 'hidden': False}, {'_id': '675f918be24babdf1816c120', 'user': {'_id': '63ef330b1e695b35aa484e11', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/63ef330b1e695b35aa484e11/bXwpGy0dl8JXeJwJ--ilr.jpeg', 'isPro': False, 'fullname': 'Qianhui WU', 'user': 'qianhuiwu', 'type': 'user'}, 'name': 'Qianhui Wu', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:22:43.253Z', 'hidden': False}, {'_id': '675f918be24babdf1816c121', 'user': {'_id': '64b750a2fdb702b3d8619514', 'avatarUrl': '/avatars/f09181c0825763dff692c4bc65effc4c.svg', 'isPro': False, 'fullname': 'Xufang Luo', 'user': 'luoxufang', 'type': 'user'}, 'name': 'Xufang Luo', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:22:37.186Z', 'hidden': False}, {'_id': '675f918be24babdf1816c122', 'name': 'Surin Ahn', 'hidden': False}, {'_id': '675f918be24babdf1816c123', 'user': {'_id': '64646896884f2e3e1ced3cd5', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/64646896884f2e3e1ced3cd5/86-t8V8LGMNaPQRXnADiD.png', 'isPro': False, 'fullname': 'Zhang', 'user': 'Chengruidong', 'type': 'user'}, 'name': 'Chengruidong Zhang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:22:07.437Z', 'hidden': False}, {'_id': '675f918be24babdf1816c124', 'name': 'Amir H. Abdi', 'hidden': False}, {'_id': '675f918be24babdf1816c125', 'user': {'_id': '6524f13821af4b07309015aa', 'avatarUrl': '/avatars/f97d6a041428d48305568df7afd83cc8.svg', 'isPro': False, 'fullname': 'dongsheng li', 'user': 'dongshengli', 'type': 'user'}, 'name': 'Dongsheng Li', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:21:34.011Z', 'hidden': False}, {'_id': '675f918be24babdf1816c126', 'user': {'_id': '641904caf9d6f1d772ec7af7', 'avatarUrl': '/avatars/4a63eac71eb30f70b1a0e9d4708f26c1.svg', 'isPro': False, 'fullname': 'Jianfeng Gao', 'user': 'wyngjf', 'type': 'user'}, 'name': 'Jianfeng Gao', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:21:06.943Z', 'hidden': False}, {'_id': '675f918be24babdf1816c127', 'user': {'_id': '65c0de12efbb14b39c97f78e', 'avatarUrl': '/avatars/18485f79427a35bd9e19f71b67c88dce.svg', 'isPro': False, 'fullname': 'Yuqing Yang', 'user': 'ayyyq', 'type': 'user'}, 'name': 'Yuqing Yang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:20:56.316Z', 'hidden': False}, {'_id': '675f918be24babdf1816c128', 'name': 'Lili Qiu', 'hidden': False}], 'publishedAt': '2024-12-13T17:59:52.000Z', 'title': 'SCBench: A KV Cache-Centric Analysis of Long-Context Methods', 'summary': 'Long-context LLMs have enabled numerous downstream applications but also\\nintroduced significant challenges related to computational and memory\\nefficiency. 
To address these challenges, optimizations for long-context\\ninference have been developed, centered around the KV cache. However, existing\\nbenchmarks often evaluate in single-request, neglecting the full lifecycle of\\nthe KV cache in real-world use. This oversight is particularly critical, as KV\\ncache reuse has become widely adopted in LLMs inference frameworks, such as\\nvLLM and SGLang, as well as by LLM providers, including OpenAI, Microsoft,\\nGoogle, and Anthropic. To address this gap, we introduce\\nSCBench(SharedContextBench), a comprehensive benchmark for evaluating\\nlong-context methods from a KV cachecentric perspective: 1) KV cache\\ngeneration, 2) KV cache compression, 3) KV cache retrieval, 4) KV cache\\nloading. Specifically, SCBench uses test examples with shared context, ranging\\n12 tasks with two shared context modes, covering four categories of\\nlong-context capabilities: string retrieval, semantic retrieval, global\\ninformation, and multi-task. With it, we provide an extensive KV cache-centric\\nanalysis of eight categories long-context solutions, including Gated Linear\\nRNNs, Mamba-Attention hybrids, and efficient methods such as sparse attention,\\nKV cache dropping, quantization, retrieval, loading, and prompt compression.\\nThe evaluation is conducted on 8 long-context LLMs. Our findings show that\\nsub-O(n) memory methods suffer in multi-turn scenarios, while sparse encoding\\nwith O(n) memory and sub-O(n^2) pre-filling computation perform robustly.\\nDynamic sparsity yields more expressive KV caches than static patterns, and\\nlayer-level sparsity in hybrid architectures reduces memory usage with strong\\nperformance. Additionally, we identify attention distribution shift issues in\\nlong-generation scenarios. https://aka.ms/SCBench.', 'upvotes': 8, 'discussionId': '675f918be24babdf1816c14b'}, 'publishedAt': '2024-12-15T21:51:54.437Z', 'title': 'SCBench: A KV Cache-Centric Analysis of Long-Context Methods', 'mediaUrls': ['https://cdn-uploads.huggingface.co/production/uploads/6278bd42541f3d2dfa77ea70/ZKm0LHpcYyY-49S5G9za1.png'], 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.10319.png', 'numComments': 2, 'submittedBy': {'_id': '6278bd42541f3d2dfa77ea70', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6278bd42541f3d2dfa77ea70/ejn49eapnB3UXQckAYdTd.jpeg', 'fullname': 'Huiqiang Jiang', 'name': 'iofu728', 'type': 'user', 'isPro': True, 'isHf': False, 'isMod': False, 'followerCount': 7}}",
"{'paper': {'id': '2412.09428', 'authors': [{'_id': '675fc58aa733b7f21793a3c9', 'user': {'_id': '6433a54982ca403c44e0560c', 'avatarUrl': '/avatars/66ca5c1ba2351909b1b8d3a2a774e186.svg', 'isPro': False, 'fullname': 'baisen wang', 'user': 'wbs2788', 'type': 'user'}, 'name': 'Baisen Wang', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:17:48.195Z', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3ca', 'user': {'_id': '6358a167f56b03ec9147074d', 'avatarUrl': '/avatars/e54ea7bf0c240cf76d538296efb3976c.svg', 'isPro': False, 'fullname': 'Le Zhuo', 'user': 'JackyZhuo', 'type': 'user'}, 'name': 'Le Zhuo', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:17:53.867Z', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3cb', 'user': {'_id': '665d4b515fdfe8f923e347a7', 'avatarUrl': '/avatars/d114b24c02dadfca0a8aee104755a8ec.svg', 'isPro': False, 'fullname': 'Zhaokai Wang', 'user': 'wzk1015', 'type': 'user'}, 'name': 'Zhaokai Wang', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-16T09:16:18.569Z', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3cc', 'user': {'_id': '64265b32ad1e3b0e6e91d009', 'avatarUrl': '/avatars/f1f9971bf804eb1665d157fc3d9b5209.svg', 'isPro': False, 'fullname': 'chenxi', 'user': 'baochenxi', 'type': 'user'}, 'name': 'Chenxi Bao', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:18:05.888Z', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3cd', 'name': 'Wu Chengjing', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3ce', 'name': 'Xuecheng Nie', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3cf', 'name': 'Jiao Dai', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3d0', 'name': 'Jizhong Han', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3d1', 'user': {'_id': '670f827bb94a3734d270f707', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/D6qCPBMJAUgozfG7YTwky.png', 'isPro': False, 'fullname': 'Yue Liao', 'user': 'morninghaze', 'type': 'user'}, 'name': 'Yue Liao', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:18:50.052Z', 'hidden': False}, {'_id': '675fc58aa733b7f21793a3d2', 'name': 'Si Liu', 'hidden': False}], 'publishedAt': '2024-12-12T16:33:21.000Z', 'title': 'Multimodal Music Generation with Explicit Bridges and Retrieval\\n Augmentation', 'summary': 'Multimodal music generation aims to produce music from diverse input\\nmodalities, including text, videos, and images. Existing methods use a common\\nembedding space for multimodal fusion. Despite their effectiveness in other\\nmodalities, their application in multimodal music generation faces challenges\\nof data scarcity, weak cross-modal alignment, and limited controllability. This\\npaper addresses these issues by using explicit bridges of text and music for\\nmultimodal alignment. We introduce a novel method named Visuals Music Bridge\\n(VMB). Specifically, a Multimodal Music Description Model converts visual\\ninputs into detailed textual descriptions to provide the text bridge; a\\nDual-track Music Retrieval module that combines broad and targeted retrieval\\nstrategies to provide the music bridge and enable user control. Finally, we\\ndesign an Explicitly Conditioned Music Generation framework to generate music\\nbased on the two bridges. We conduct experiments on video-to-music,\\nimage-to-music, text-to-music, and controllable music generation tasks, along\\nwith experiments on controllability. 
The results demonstrate that VMB\\nsignificantly enhances music quality, modality, and customization alignment\\ncompared to previous methods. VMB sets a new standard for interpretable and\\nexpressive multimodal music generation with applications in various multimedia\\nfields. Demos and code are available at https://github.com/wbs2788/VMB.', 'upvotes': 5, 'discussionId': '675fc58fa733b7f21793a712'}, 'publishedAt': '2024-12-16T01:16:48.363Z', 'title': 'Multimodal Music Generation with Explicit Bridges and Retrieval Augmentation', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09428.png', 'numComments': 4, 'submittedBy': {'_id': '6358a167f56b03ec9147074d', 'avatarUrl': '/avatars/e54ea7bf0c240cf76d538296efb3976c.svg', 'fullname': 'Le Zhuo', 'name': 'JackyZhuo', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 3}}",
"{'paper': {'id': '2412.09722', 'authors': [{'_id': '67607fea899f4731eb7dc446', 'name': 'Sarkar Snigdha Sarathi Das', 'hidden': False}, {'_id': '67607fea899f4731eb7dc447', 'user': {'_id': '60591896102f61b42f65ae37', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/60591896102f61b42f65ae37/PSHcLoEeNnwvzAPOd6-6i.jpeg', 'isPro': False, 'fullname': 'Ryo Kamoi', 'user': 'ryokamoi', 'type': 'user'}, 'name': 'Ryo Kamoi', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-17T08:04:07.183Z', 'hidden': False}, {'_id': '67607fea899f4731eb7dc448', 'name': 'Bo Pang', 'hidden': False}, {'_id': '67607fea899f4731eb7dc449', 'name': 'Yusen Zhang', 'hidden': False}, {'_id': '67607fea899f4731eb7dc44a', 'name': 'Caiming Xiong', 'hidden': False}, {'_id': '67607fea899f4731eb7dc44b', 'name': 'Rui Zhang', 'hidden': False}], 'publishedAt': '2024-12-12T20:59:43.000Z', 'title': 'GReaTer: Gradients over Reasoning Makes Smaller Language Models Strong\\n Prompt Optimizers', 'summary': 'The effectiveness of large language models (LLMs) is closely tied to the\\ndesign of prompts, making prompt optimization essential for enhancing their\\nperformance across a wide range of tasks. Many existing approaches to\\nautomating prompt engineering rely exclusively on textual feedback, refining\\nprompts based solely on inference errors identified by large, computationally\\nexpensive LLMs. Unfortunately, smaller models struggle to generate high-quality\\nfeedback, resulting in complete dependence on large LLM judgment. Moreover,\\nthese methods fail to leverage more direct and finer-grained information, such\\nas gradients, due to operating purely in text space. To this end, we introduce\\nGReaTer, a novel prompt optimization technique that directly incorporates\\ngradient information over task-specific reasoning. By utilizing task loss\\ngradients, GReaTer enables self-optimization of prompts for open-source,\\nlightweight language models without the need for costly closed-source LLMs.\\nThis allows high-performance prompt optimization without dependence on massive\\nLLMs, closing the gap between smaller models and the sophisticated reasoning\\noften needed for prompt refinement. Extensive evaluations across diverse\\nreasoning tasks including BBH, GSM8k, and FOLIO demonstrate that GReaTer\\nconsistently outperforms previous state-of-the-art prompt optimization methods,\\neven those reliant on powerful LLMs. Additionally, GReaTer-optimized prompts\\nfrequently exhibit better transferability and, in some cases, boost task\\nperformance to levels comparable to or surpassing those achieved by larger\\nlanguage models, highlighting the effectiveness of prompt optimization guided\\nby gradients over reasoning. Code of GReaTer is available at\\nhttps://github.com/psunlpgroup/GreaTer.', 'upvotes': 3, 'discussionId': '67607feb899f4731eb7dc497'}, 'publishedAt': '2024-12-16T14:34:19.689Z', 'title': 'GReaTer: Gradients over Reasoning Makes Smaller Language Models Strong Prompt Optimizers', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09722.png', 'numComments': 3, 'submittedBy': {'_id': '63c707ec656e7822e23af13c', 'avatarUrl': '/avatars/50d2cfe1e55fe8bd2e0f09e2ca5715ea.svg', 'fullname': 'Sarkar Snigdha Sarathi Das', 'name': 'sarathismg', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False}}",
"{'paper': {'id': '2412.08347', 'authors': [{'_id': '675ac92733f6ac95ba7d6336', 'user': {'_id': '63fc685a4c57549ad5e5c419', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/63fc685a4c57549ad5e5c419/0oySJfCD7BUVD59kXyhsV.jpeg', 'isPro': False, 'fullname': 'Sultan Alrashed', 'user': 'SultanR', 'type': 'user'}, 'name': 'Sultan Alrashed', 'status': 'claimed_verified', 'statusLastChangedAt': '2024-12-12T13:41:00.392Z', 'hidden': False}], 'publishedAt': '2024-12-11T12:41:36.000Z', 'title': 'SmolTulu: Higher Learning Rate to Batch Size Ratios Can Lead to Better\\n Reasoning in SLMs', 'summary': \"We present SmolTulu-1.7b-Instruct, referenced in this report as\\nSmolTulu-DPO-1130, an instruction-tuned language model that adapts AllenAI's\\nTulu 3 post-training pipeline to enhance Huggingface's SmolLM2-1.7B base model.\\nThrough comprehensive empirical analysis using a 135M parameter model, we\\ndemonstrate that the relationship between learning rate and batch size\\nsignificantly impacts model performance in a task-dependent manner. Our\\nfindings reveal a clear split: reasoning tasks like ARC and GSM8K benefit from\\nhigher learning rate to batch size ratios, while pattern recognition tasks such\\nas HellaSwag and IFEval show optimal performance with lower ratios. These\\ninsights informed the development of SmolTulu, which achieves state-of-the-art\\nperformance among sub-2B parameter models on instruction following, scoring\\n67.7% on IFEval (Delta11%), and mathematical reasoning with 51.6% on GSM8K\\n(Delta3.4%), with an alternate version achieving scoring 57.1% on ARC\\n(Delta5.4%). We release our model, training recipes, and ablation studies to\\nfacilitate further research in efficient model alignment, demonstrating that\\ncareful adaptation of optimization dynamics can help bridge the capability gap\\nbetween small and large language models.\", 'upvotes': 3, 'discussionId': '675ac92833f6ac95ba7d634f'}, 'publishedAt': '2024-12-16T03:27:09.717Z', 'title': 'SmolTulu: Higher Learning Rate to Batch Size Ratios Can Lead to Better Reasoning in SLMs', 'mediaUrls': ['https://cdn-uploads.huggingface.co/production/uploads/63fc685a4c57549ad5e5c419/VEcQv3o_5VTWLZKTT315f.png'], 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.08347.png', 'numComments': 2, 'submittedBy': {'_id': '63fc685a4c57549ad5e5c419', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/63fc685a4c57549ad5e5c419/0oySJfCD7BUVD59kXyhsV.jpeg', 'fullname': 'Sultan Alrashed', 'name': 'SultanR', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 2}}",
"{'paper': {'id': '2412.10345', 'authors': [{'_id': '6760512bda45b7d6f71baef6', 'name': 'Ruijie Zheng', 'hidden': False}, {'_id': '6760512bda45b7d6f71baef7', 'name': 'Yongyuan Liang', 'hidden': False}, {'_id': '6760512bda45b7d6f71baef8', 'name': 'Shuaiyi Huang', 'hidden': False}, {'_id': '6760512bda45b7d6f71baef9', 'name': 'Jianfeng Gao', 'hidden': False}, {'_id': '6760512bda45b7d6f71baefa', 'name': 'Hal Daumé III', 'hidden': False}, {'_id': '6760512bda45b7d6f71baefb', 'name': 'Andrey Kolobov', 'hidden': False}, {'_id': '6760512bda45b7d6f71baefc', 'name': 'Furong Huang', 'hidden': False}, {'_id': '6760512bda45b7d6f71baefd', 'name': 'Jianwei Yang', 'hidden': False}], 'publishedAt': '2024-12-13T18:40:51.000Z', 'title': 'TraceVLA: Visual Trace Prompting Enhances Spatial-Temporal Awareness for\\n Generalist Robotic Policies', 'summary': \"Although large vision-language-action (VLA) models pretrained on extensive\\nrobot datasets offer promising generalist policies for robotic learning, they\\nstill struggle with spatial-temporal dynamics in interactive robotics, making\\nthem less effective in handling complex tasks, such as manipulation. In this\\nwork, we introduce visual trace prompting, a simple yet effective approach to\\nfacilitate VLA models' spatial-temporal awareness for action prediction by\\nencoding state-action trajectories visually. We develop a new TraceVLA model by\\nfinetuning OpenVLA on our own collected dataset of 150K robot manipulation\\ntrajectories using visual trace prompting. Evaluations of TraceVLA across 137\\nconfigurations in SimplerEnv and 4 tasks on a physical WidowX robot demonstrate\\nstate-of-the-art performance, outperforming OpenVLA by 10% on SimplerEnv and\\n3.5x on real-robot tasks and exhibiting robust generalization across diverse\\nembodiments and scenarios. To further validate the effectiveness and generality\\nof our method, we present a compact VLA model based on 4B Phi-3-Vision,\\npretrained on the Open-X-Embodiment and finetuned on our dataset, rivals the 7B\\nOpenVLA baseline while significantly improving inference efficiency.\", 'upvotes': 2, 'discussionId': '6760512dda45b7d6f71baf72'}, 'publishedAt': '2024-12-16T11:13:00.613Z', 'title': 'TraceVLA: Visual Trace Prompting Enhances Spatial-Temporal Awareness for Generalist Robotic Policies', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.10345.png', 'numComments': 2, 'submittedBy': {'_id': '653b24ef8f8d60f204872f0a', 'avatarUrl': '/avatars/45a55219e8a78be53fd32e96ba460282.svg', 'fullname': 'Ruijie Zheng', 'name': 'rzheng12', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 1}}",
"{'paper': {'id': '2412.09910', 'authors': [{'_id': '675fdfecdd3e9eeed63d8093', 'user': {'_id': '650392bb694f6b620fca36e5', 'avatarUrl': '/avatars/35d8f1a0b96835bca81457ff6f361de7.svg', 'isPro': False, 'fullname': 'yasamin medghalchi', 'user': 'yasimed', 'type': 'user'}, 'name': 'Yasamin Medghalchi', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:26:08.678Z', 'hidden': False}, {'_id': '675fdfecdd3e9eeed63d8094', 'user': {'_id': '61ba19bf6122a4fd29049371', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/1639586194527-noauth.jpeg', 'isPro': False, 'fullname': 'Moein Heidari', 'user': 'moein99', 'type': 'user'}, 'name': 'Moein Heidari', 'status': 'admin_assigned', 'statusLastChangedAt': '2024-12-16T10:26:03.231Z', 'hidden': False}, {'_id': '675fdfecdd3e9eeed63d8095', 'name': 'Clayton Allard', 'hidden': False}, {'_id': '675fdfecdd3e9eeed63d8096', 'name': 'Leonid Sigal', 'hidden': False}, {'_id': '675fdfecdd3e9eeed63d8097', 'name': 'Ilker Hacihaliloglu', 'hidden': False}], 'publishedAt': '2024-12-13T06:56:12.000Z', 'title': 'Prompt2Perturb (P2P): Text-Guided Diffusion-Based Adversarial Attacks on\\n Breast Ultrasound Images', 'summary': 'Deep neural networks (DNNs) offer significant promise for improving breast\\ncancer diagnosis in medical imaging. However, these models are highly\\nsusceptible to adversarial attacks--small, imperceptible changes that can\\nmislead classifiers--raising critical concerns about their reliability and\\nsecurity. Traditional attacks rely on fixed-norm perturbations, misaligning\\nwith human perception. In contrast, diffusion-based attacks require pre-trained\\nmodels, demanding substantial data when these models are unavailable, limiting\\npractical use in data-scarce scenarios. In medical imaging, however, this is\\noften unfeasible due to the limited availability of datasets. Building on\\nrecent advancements in learnable prompts, we propose Prompt2Perturb (P2P), a\\nnovel language-guided attack method capable of generating meaningful attack\\nexamples driven by text instructions. During the prompt learning phase, our\\napproach leverages learnable prompts within the text encoder to create subtle,\\nyet impactful, perturbations that remain imperceptible while guiding the model\\ntowards targeted outcomes. In contrast to current prompt learning-based\\napproaches, our P2P stands out by directly updating text embeddings, avoiding\\nthe need for retraining diffusion models. Further, we leverage the finding that\\noptimizing only the early reverse diffusion steps boosts efficiency while\\nensuring that the generated adversarial examples incorporate subtle noise, thus\\npreserving ultrasound image quality without introducing noticeable artifacts.\\nWe show that our method outperforms state-of-the-art attack techniques across\\nthree breast ultrasound datasets in FID and LPIPS. Moreover, the generated\\nimages are both more natural in appearance and more effective compared to\\nexisting adversarial attacks. 
Our code will be publicly available at\nhttps://github.com/yasamin-med/P2P.', 'upvotes': 1, 'discussionId': '675fdfeedd3e9eeed63d80f9'}, 'publishedAt': '2024-12-16T03:09:42.284Z', 'title': 'Prompt2Perturb (P2P): Text-Guided Diffusion-Based Adversarial Attacks on Breast Ultrasound Images', 'thumbnail': 'https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2412.09910.png', 'numComments': 2, 'submittedBy': {'_id': '61ba19bf6122a4fd29049371', 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/1639586194527-noauth.jpeg', 'fullname': 'Moein Heidari', 'name': 'moein99', 'type': 'user', 'isPro': False, 'isHf': False, 'isMod': False, 'followerCount': 2}}"
]