diff --git a/.github/workflows/build-ci-docker-images.yml b/.github/workflows/build-ci-docker-images.yml
index a07b99af65d..9d947684ee8 100644
--- a/.github/workflows/build-ci-docker-images.yml
+++ b/.github/workflows/build-ci-docker-images.yml
@@ -74,4 +74,4 @@ jobs:
           slack_channel: "#transformers-ci-circleci-images"
           title: 🤗 New docker images for CircleCI are pushed.
           status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
\ No newline at end of file
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 3f28753f3a7..d6ef15d71b7 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -930,6 +930,7 @@
         "is_tokenizers_available",
         "is_torch_available",
         "is_torch_mlu_available",
+        "is_torch_musa_available",
         "is_torch_neuroncore_available",
         "is_torch_npu_available",
         "is_torch_tpu_available",
@@ -5706,6 +5707,7 @@
         is_tokenizers_available,
         is_torch_available,
         is_torch_mlu_available,
+        is_torch_musa_available,
         is_torch_neuroncore_available,
         is_torch_npu_available,
         is_torch_tpu_available,
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index a5a0487d305..7db33ab5bd1 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -45,6 +45,7 @@
     is_torch_cuda_available,
     is_torch_mlu_available,
     is_torch_mps_available,
+    is_torch_musa_available,
     is_torch_npu_available,
     is_torch_xpu_available,
     logging,
@@ -873,6 +874,8 @@ def __init__(
                 self.device = torch.device("cpu")
             elif is_torch_mlu_available():
                 self.device = torch.device(f"mlu:{device}")
+            elif is_torch_musa_available():
+                self.device = torch.device(f"musa:{device}")
             elif is_torch_cuda_available():
                 self.device = torch.device(f"cuda:{device}")
             elif is_torch_npu_available():
@@ -1042,6 +1045,9 @@ def device_placement(self):
             elif self.device.type == "mlu":
                 with torch.mlu.device(self.device):
                     yield
+            elif self.device.type == "musa":
+                with torch.musa.device(self.device):
+                    yield
             else:
                 yield
 
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 094f0584690..fe1f3f8e4bb 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -164,6 +164,7 @@
     is_torch_compile_available,
     is_torch_mlu_available,
     is_torch_mps_available,
+    is_torch_musa_available,
     is_torch_neuroncore_available,
     is_torch_npu_available,
     is_torch_xla_available,
@@ -2894,6 +2895,17 @@ def _load_rng_state(self, checkpoint):
                         f"Didn't manage to set back the RNG states of the MLU because of the following error:\n {e}"
                         "\nThis won't yield the same results as if the training had not been interrupted."
                     )
+        if is_torch_musa_available():
+            if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
+                torch.musa.set_rng_state_all(checkpoint_rng_state["musa"])
+            else:
+                try:
+                    torch.musa.set_rng_state(checkpoint_rng_state["musa"])
+                except Exception as e:
+                    logger.info(
+                        f"Didn't manage to set back the RNG states of the MUSA because of the following error:\n {e}"
+                        "\nThis won't yield the same results as if the training had not been interrupted."
+                    )
 
     def _save_checkpoint(self, model, trial, metrics=None):
         # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
@@ -2982,6 +2994,12 @@ def _save_rng_state(self, output_dir):
             else:
                 rng_states["mlu"] = torch.mlu.random.get_rng_state()
 
+        if is_torch_musa_available():
+            if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
+                rng_states["musa"] = torch.musa.get_rng_state_all()
+            else:
+                rng_states["musa"] = torch.musa.get_rng_state()
+
         # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may
         # not yet exist.
         os.makedirs(output_dir, exist_ok=True)
@@ -3351,6 +3369,8 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor,
                 torch.xpu.empty_cache()
             elif is_torch_mlu_available():
                 torch.mlu.empty_cache()
+            elif is_torch_musa_available():
+                torch.musa.empty_cache()
             elif is_torch_npu_available():
                 torch.npu.empty_cache()
             elif is_torch_mps_available(min_version="2.0"):
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index b9e1cf46611..8974340f753 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -37,6 +37,7 @@
     is_torch_cuda_available,
     is_torch_mlu_available,
     is_torch_mps_available,
+    is_torch_musa_available,
     is_torch_npu_available,
     is_torch_xla_available,
     is_torch_xpu_available,
@@ -108,6 +109,8 @@ def set_seed(seed: int, deterministic: bool = False):
             torch.use_deterministic_algorithms(True)
     if is_torch_mlu_available():
         torch.mlu.manual_seed_all(seed)
+    if is_torch_musa_available():
+        torch.musa.manual_seed_all(seed)
     if is_torch_npu_available():
         torch.npu.manual_seed_all(seed)
     if is_torch_xpu_available():
@@ -464,7 +467,7 @@ def __init__(self, skip_memory_metrics=False):
 
         import psutil  # noqa
 
-        if is_torch_cuda_available() or is_torch_mlu_available():
+        if is_torch_cuda_available() or is_torch_mlu_available() or is_torch_musa_available():
             import torch
 
             self.torch = torch
@@ -540,6 +543,9 @@ def start(self):
         elif is_torch_mlu_available():
             self.torch.mlu.reset_peak_memory_stats()
             self.torch.mlu.empty_cache()
+        elif is_torch_musa_available():
+            self.torch.musa.reset_peak_memory_stats()
+            self.torch.musa.empty_cache()
         elif is_torch_xpu_available():
             self.torch.xpu.reset_peak_memory_stats()
             self.torch.xpu.empty_cache()
@@ -555,6 +561,8 @@ def start(self):
             self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
         elif is_torch_mlu_available():
             self.gpu_mem_used_at_start = self.torch.mlu.memory_allocated()
+        elif is_torch_musa_available():
+            self.gpu_mem_used_at_start = self.torch.musa.memory_allocated()
         elif is_torch_xpu_available():
             self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
         elif is_torch_npu_available():
@@ -588,6 +596,8 @@ def stop(self, stage):
             self.torch.cuda.empty_cache()
         elif is_torch_mlu_available():
             self.torch.mlu.empty_cache()
+        elif is_torch_musa_available():
+            self.torch.musa.empty_cache()
         elif is_torch_xpu_available():
             self.torch.xpu.empty_cache()
         elif is_torch_npu_available():
@@ -608,6 +618,9 @@ def stop(self, stage):
         elif is_torch_mlu_available():
             self.gpu_mem_used_now = self.torch.mlu.memory_allocated()
             self.gpu_mem_used_peak = self.torch.mlu.max_memory_allocated()
+        elif is_torch_musa_available():
+            self.gpu_mem_used_now = self.torch.musa.memory_allocated()
+            self.gpu_mem_used_peak = self.torch.musa.max_memory_allocated()
         elif is_torch_xpu_available():
             self.gpu_mem_used_now = self.torch.xpu.memory_allocated()
             self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated()
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 1058b8356b9..ca6f32279fa 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -49,6 +49,7 @@
     is_torch_bf16_gpu_available,
     is_torch_mlu_available,
     is_torch_mps_available,
+    is_torch_musa_available,
     is_torch_neuroncore_available,
     is_torch_npu_available,
     is_torch_tf32_available,
@@ -1090,7 +1091,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": "The backend to be used for distributed training",
-            "choices": ["nccl", "gloo", "mpi", "ccl", "hccl", "cncl"],
+            "choices": ["nccl", "gloo", "mpi", "ccl", "hccl", "cncl", "mccl"],
         },
     )
     tpu_num_cores: Optional[int] = field(
@@ -2201,6 +2202,9 @@ def _setup_devices(self) -> "torch.device":
         elif is_torch_mlu_available():
             device = torch.device("mlu:0")
             torch.mlu.set_device(device)
+        elif is_torch_musa_available():
+            device = torch.device("musa:0")
+            torch.musa.set_device(device)
         elif is_torch_npu_available():
             device = torch.device("npu:0")
             torch.npu.set_device(device)
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index a8aa670c07a..546a69132d8 100755
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -201,6 +201,7 @@
     is_torch_fx_proxy,
     is_torch_mlu_available,
     is_torch_mps_available,
+    is_torch_musa_available,
     is_torch_neuroncore_available,
     is_torch_npu_available,
     is_torch_sdpa_available,
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 97d3a5501b3..8eae679501e 100755
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -677,6 +677,29 @@ def is_torch_mlu_available(check_device=False):
     return hasattr(torch, "mlu") and torch.mlu.is_available()
 
 
+@lru_cache()
+def is_torch_musa_available(check_device=False):
+    "Checks if `torch_musa` is installed and potentially if a MUSA is in the environment"
+    if not _torch_available or importlib.util.find_spec("torch_musa") is None:
+        return False
+
+    import torch
+    import torch_musa  # noqa: F401
+
+    torch_musa_min_version = "0.33.0"
+    if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_musa_min_version):
+        return False
+
+    if check_device:
+        try:
+            # Will raise a RuntimeError if no MUSA is found
+            _ = torch.musa.device_count()
+            return torch.musa.is_available()
+        except RuntimeError:
+            return False
+    return hasattr(torch, "musa") and torch.musa.is_available()
+
+
 def is_torchdynamo_available():
     if not is_torch_available():
         return False
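
For reference, a minimal usage sketch of the paths this diff wires up. It is not part of the change itself; it assumes a PyTorch build with the `torch_musa` extension installed and a recent `accelerate`, and the model checkpoint name is only an example:

```python
# Sketch: exercises the MUSA backend detection, seeding, and pipeline device
# placement added in this diff. Assumes torch_musa is installed; the checkpoint
# below is just a placeholder for any sequence-classification model.
from transformers import is_torch_musa_available, pipeline, set_seed

if is_torch_musa_available():
    # set_seed() now also seeds the MUSA RNG via torch.musa.manual_seed_all(seed)
    set_seed(42)
    # With a MUSA backend detected, device=0 resolves to torch.device("musa:0")
    pipe = pipeline(
        "text-classification",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        device=0,
    )
    print(pipe("MUSA device detected and used."))
else:
    print("torch_musa not available; existing CUDA/MLU/NPU/CPU behavior is unchanged.")
```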