forked from SciSharp/LLamaSharp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIContextParams.cs
127 lines (102 loc) · 3.24 KB
/
IContextParams.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
using System.Text;
using LLama.Native;
namespace LLama.Abstractions;
/// <summary>
/// The parameters for initializing a LLama context from a model.
/// </summary>
/// <remarks>
/// Every member is a read-only (getter-only) property. Where a property is nullable,
/// its summary states what a <see langword="null"/> value means.
/// </remarks>
public interface IContextParams
{
/// <summary>
/// Model context size (n_ctx)
/// </summary>
uint? ContextSize { get; }
/// <summary>
/// Maximum batch size that can be submitted at once (must be &gt;= 32 to use BLAS) (n_batch)
/// </summary>
uint BatchSize { get; }
/// <summary>
/// Physical batch size
/// </summary>
uint UBatchSize { get; }
/// <summary>
/// Maximum number of sequences (i.e. distinct states for recurrent models)
/// </summary>
uint SeqMax { get; }
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
uint? Seed { get; }
/// <summary>
/// If true, extract embeddings (together with logits).
/// </summary>
bool Embeddings { get; }
/// <summary>
/// RoPE base frequency (null to fetch from the model)
/// </summary>
float? RopeFrequencyBase { get; }
/// <summary>
/// RoPE frequency scaling factor (null to fetch from the model)
/// </summary>
float? RopeFrequencyScale { get; }
/// <summary>
/// The encoding to use for models
/// </summary>
Encoding Encoding { get; }
/// <summary>
/// Number of threads (null = autodetect) (n_threads)
/// </summary>
uint? Threads { get; }
// NOTE(review): the "(n_threads)" tag below is identical to the one on Threads; the native
// field for batch processing is presumably named differently - confirm against the native header.
/// <summary>
/// Number of threads to use for batch processing (null = autodetect) (n_threads)
/// </summary>
uint? BatchThreads { get; }
/// <summary>
/// YaRN extrapolation mix factor (null = from model)
/// </summary>
float? YarnExtrapolationFactor { get; }
/// <summary>
/// YaRN magnitude scaling factor (null = from model)
/// </summary>
float? YarnAttentionFactor { get; }
/// <summary>
/// YaRN low correction dim (null = from model)
/// </summary>
float? YarnBetaFast { get; }
/// <summary>
/// YaRN high correction dim (null = from model)
/// </summary>
float? YarnBetaSlow { get; }
/// <summary>
/// YaRN original context length (null = from model)
/// </summary>
uint? YarnOriginalContext { get; }
/// <summary>
/// YaRN scaling method to use.
/// </summary>
RopeScalingType? YarnScalingType { get; }
/// <summary>
/// Override the type of the K cache
/// </summary>
GGMLType? TypeK { get; }
/// <summary>
/// Override the type of the V cache
/// </summary>
GGMLType? TypeV { get; }
/// <summary>
/// Whether to disable offloading the KQV cache to the GPU
/// </summary>
bool NoKqvOffload { get; }
/// <summary>
/// Whether to use flash attention
/// </summary>
bool FlashAttention { get; }
/// <summary>
/// Defragment the KV cache if holes/size &gt; defrag_threshold. Set to <see langword="null"/> or &lt; 0 to disable (default)
/// </summary>
float? DefragThreshold { get; }
/// <summary>
/// How to pool (sum) embedding results by sequence id (ignored if no pooling layer)
/// </summary>
LLamaPoolingType PoolingType { get; }
}