diff --git a/README.md b/README.md
index 98e9571a44..f7b2b71ca9 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Llama 3 & 3.1 | 8B, 70B, 405B | Meta AI | [Meta AI 2024](https://github.com/meta-llama/llama3) |
 | Code Llama | 7B, 13B, 34B, 70B | Meta AI | [Rozière et al. 2023](https://arxiv.org/abs/2308.12950) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
-| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | CodeGemma | 7B | Google | [Google Team, Google Deepmind](https://ai.google.dev/gemma/docs/codegemma) |
 | Gemma 2 | 2B, 9B, 27B | Google | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf) |
 | Phi 3 | 3.8B | Microsoft | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
@@ -129,7 +129,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
-| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
 | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
diff --git a/litgpt/config.py b/litgpt/config.py
index ebc74b4c3b..940238c607 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1756,6 +1756,26 @@ def norm_class(self) -> Type:
         intermediate_size=14336,
     )
 )
+configs.append(
+    # https://huggingface.co/mistralai/Mistral-Large-Instruct-2407/blob/main/config.json
+    dict(
+        name="Mistral-Large-Instruct-2407",
+        hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2407"),
+        padded_vocab_size=32768,
+        block_size=32768,
+        n_layer=88,
+        n_head=96,
+        n_embd=12288,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        norm_eps=1e-05,
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=28672,
+    )
+)
 
 
 ############
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 6d82df8889..a3cf633734 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -23,7 +23,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Mathstral | 7B | Mistral AI | [Mistral AI 2024](https://mistral.ai/news/mathstral/) |
 | MicroLlama | 300M | Ken Wang | [MicroLlama repo](https://github.com/keeeeenw/MicroLlama) |
 | Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
-| Mistral | 7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
+| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
 | Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
 | OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
@@ -136,7 +136,10 @@ microsoft/Phi-3-mini-4k-instruct
 mistralai/mathstral-7B-v0.1
 mistralai/Mistral-7B-Instruct-v0.1
 mistralai/Mistral-7B-Instruct-v0.2
+mistralai/Mistral-7B-Instruct-v0.3
 mistralai/Mistral-7B-v0.1
+mistralai/Mistral-7B-v0.3
+mistralai/Mistral-Large-Instruct-2407
 mistralai/Mixtral-8x7B-Instruct-v0.1
 mistralai/Mixtral-8x7B-v0.1
 NousResearch/Nous-Hermes-13b
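For reviewers, a minimal smoke-test sketch of the new config entry (not part of the patch): it assumes the `litgpt/config.py` change above is applied and uses litgpt's existing `Config.from_name` lookup; the asserted values are copied from the added dict.

```python
# Hypothetical verification snippet, not part of the patch: assumes the
# config.py change above is applied and litgpt is importable.
from litgpt import Config

config = Config.from_name("Mistral-Large-Instruct-2407")
assert config.n_layer == 88 and config.n_embd == 12288  # values from the added dict
assert config.n_query_groups == 8  # grouped-query attention, per the HF config.json
```

Once merged, the checkpoint itself would be fetched with the documented CLI, e.g. `litgpt download mistralai/Mistral-Large-Instruct-2407`, matching the new entry in `tutorials/download_model_weights.md`.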