# models/text-generation--llama-7b-GGUF.yaml

# Deployment-level settings: replica autoscaling plus the resources reserved
# by each lightweight deployment actor.
# NOTE(review): the autoscaling key names match Ray Serve's autoscaling
# options — confirm against the framework that consumes this file.
deployment_config:
  autoscaling_config:
    # Replica bounds: start with 1 replica, allow between 0 and 8.
    min_replicas: 0
    initial_replicas: 1
    max_replicas: 8
    target_num_ongoing_requests_per_replica: 1.0
    # Keys ending in `_s` are presumably durations in seconds — TODO confirm.
    metrics_interval_s: 10.0
    look_back_period_s: 30.0
    smoothing_factor: 1.0
    downscale_delay_s: 300.0
    upscale_delay_s: 90.0
  ray_actor_options:
    # A model deployment creates three actors. The first two each reserve the
    # 0.1 CPU set here; the third runs model inference and reserves the CPUs
    # given by scaling_config.num_cpus_per_worker at the end of this file.
    num_cpus: 0.1
# Model definition: which weights to load, how they are initialized, which
# pipeline serves them, and the default generation parameters.
model_config:
  warmup: true
  model_task: text-generation
  model_id: TheBloke/Llama-2-7B-GGUF
  max_input_words: 800
  initialization:
    # Disabled S3 mirror settings, kept for reference.
    # s3_mirror_config:
      # endpoint_url: http://39.107.108.170:9000
      # bucket_uri: /Users/hub/models/llama-2-7b-gguf/
    initializer:
      type: LlamaCpp
      # GGUF weights file to load for model_id above.
      model_filename: llama-2-7b.Q5_K_S.gguf
      # NOTE(review): `test: true` looks like a leftover debugging kwarg —
      # confirm the LlamaCpp initializer actually expects it.
      model_init_kwargs:
        test: true

      # use_kernel: true   # for deepspeed type only
      # max_tokens: 1536   # for deepspeed type only
    # Alternative pipelines (disabled); llamacpp is the active one.
    # pipeline: defaulttransformers
    # pipeline: default
    pipeline: llamacpp
  generation:
    max_batch_size: 2
    batch_wait_timeout_s: 0
    # Default sampling parameters for generation.
    generate_kwargs:
      # do_sample: true
      max_tokens: 128
      temperature: 0.7
      top_p: 0.8
      top_k: 50
      echo: false
    # No prompt template is active; the Alpaca-style one below is disabled.
    # prompt_format: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n"
    stopping_sequences: ["### Response:", "### End"]
# Resources for the inference worker(s): one CPU-only worker
# (num_gpus_per_worker is 0).
scaling_config:
  num_workers: 1
  num_gpus_per_worker: 0
  # CPUs reserved per worker for model inference.
  num_cpus_per_worker: 8