Add Qwen1.5-72B-chat (OpenCSGs#44)

* push image to opencsg registry * fix bug pad token * remove cpu_nums and add time log * add deep code model * update * update OpenCSG model name * update deepseek parameters * add Qwen1.5 * add Qwen 1.5 72B --------- Co-authored-by: haihwang <[email protected]>
utopic-dev · Mar 25, 2024 · 600bdc1 · 600bdc1
1 parent ada574d
commit 600bdc1
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 1 deletion.
diff --git a/llmserve/backend/server/config.py b/llmserve/backend/server/config.py
@@ -131,7 +131,8 @@
     "opencsg/opencsg-starcoder-v0.1": "./models/text-generation--opencsg--opencsg-starcoder-15B-v0.1-pipeline.yaml",
     "OpenCSG/opencsg-starcoder-v0.1": "./models/text-generation--opencsg--opencsg-starcoder-15B-v0.1-pipeline.yaml",
     "opencsg/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml",
-    "OpenCSG/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml"
+    "OpenCSG/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml",
+    "Qwen/Qwen1.5-72B-Chat": "./models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml"
 }
 
 SERVE_RUN_HOST = "0.0.0.0"

diff --git a/models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml b/models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml
@@ -0,0 +1,49 @@
+deployment_config:
+  autoscaling_config:
+    min_replicas: 1
+    initial_replicas: 1
+    max_replicas: 1
+    target_num_ongoing_requests_per_replica: 1.0
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 1.0
+    downscale_delay_s: 300.0
+    upscale_delay_s: 90.0
+  ray_actor_options:
+    num_cpus: 2    # for a model deployment, we have 3 actor created, 1 and 2 will cost 0.1 cpu, and the model infrence will cost 6(see the setting in the end of the file)
+model_config:
+  warmup: False
+  model_task: text-generation
+  model_id: Qwen/Qwen1.5-72B-Chat
+  max_input_words: 800
+  initialization:
+    s3_mirror_config:
+      bucket_uri: /data/models/Qwen1.5-72B-Chat/
+    initializer:
+      type: DeviceMap
+      dtype: float16
+      from_pretrained_kwargs:
+        use_cache: true
+        trust_remote_code: true
+      # use_kernel: true   # for deepspped type only
+      # max_tokens: 1536   # for deepspped type only
+    pipeline: defaulttransformers
+    # pipeline: default
+  generation:
+    max_batch_size: 1
+    generate_kwargs:
+      bos_token_id: 151643,
+      # pad_token_id: 151643,
+      # eos_token_id: [151645, 151643],
+      do_sample: false
+      max_new_tokens: 512
+      repetition_penalty: 1.05
+      temperature: 0.7
+      top_p: 0.8
+      top_k: 20
+    prompt_format: "'role': 'user', 'content': {instruction}"
+    # stopping_sequences: ["### Response:", "### End"]
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 7
+  num_cpus_per_worker: 32  # for inference