forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Schema.fbs
430 lines (368 loc) · 14.9 KB
/
Schema.fbs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
/// Logical types, vector layouts, and schemas
namespace org.apache.arrow.flatbuf;
/// Versions of the Arrow metadata format.
///
/// The numeric values are spelled out explicitly; they are the same values the
/// implicit 0-based numbering assigns.
enum MetadataVersion : short {
  /// 0.1.0 (October 2016).
  V1 = 0,

  /// 0.2.0 (February 2017). Non-backwards compatible with V1.
  V2 = 1,

  /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
  V3 = 2,

  /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
  V4 = 3,

  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
  /// metadata and IPC messages). Implementations are recommended to provide a
  /// V4 compatibility mode with V5 format changes disabled.
  ///
  /// Incompatible changes between V4 and V5:
  /// - Union buffer layout has changed. In V5, Unions don't have a validity
  ///   bitmap buffer.
  V5 = 4,
}
/// Represents Arrow Features that might not have full support
/// within implementations. This is intended to be used in
/// two scenarios:
///  1.  A mechanism for readers of Arrow Streams
///      and files to understand that the stream or file makes
///      use of a feature that isn't supported or unknown to
///      the implementation (and therefore can meet the Arrow
///      forward compatibility guarantees).
///  2.  A means of negotiating between a client and server
///      what features a stream is allowed to use. The enum
///      values here are intended to represent higher level
///      features; additional details may be negotiated
///      with key-value pairs specific to the protocol.
///
/// Enums added to this list should be assigned power-of-two values
/// to facilitate exchanging and comparing bitmaps for supported
/// features.
enum Feature : long {
  /// Needed to make flatbuffers happy.
  UNUSED = 0,

  /// The stream makes use of multiple full dictionaries with the
  /// same ID and assumes clients implement dictionary replacement
  /// correctly.
  DICTIONARY_REPLACEMENT = 1,

  /// The stream makes use of compressed bodies as described
  /// in Message.fbs.
  COMPRESSED_BODY = 2
}
/// Null type marker; this table declares no fields.
///
/// These (this table and the type tables that follow) are stored in the
/// flatbuffer in the Type union below.
table Null {}
/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
/// (according to the physical memory layout). The name Struct_ is used here
/// because Struct is a reserved word in Flatbuffers.
table Struct_ {}
/// List type marker; this table declares no fields of its own.
/// As noted on Field.children, nested types such as List describe their
/// contents through the children of the enclosing Field.
table List {}
/// Same as List, but with 64-bit offsets, allowing extremely large data
/// values to be represented.
table LargeList {}
/// A list type in which every value holds the same number of items.
table FixedSizeList {
  /// Number of list items per value
  listSize: int;
}
/// A Map is a logical nested type that is represented as
///
/// List<entries: Struct<key: K, value: V>>
///
/// In this layout, the keys and values are each respectively contiguous. We do
/// not constrain the key and value types, so the application is responsible
/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
/// may be set in the metadata for this field.
///
/// In a field with Map type, the field has a child Struct field, which then
/// has two children: the first is the key type and the second the value type.
/// The names of the child fields may be respectively "entries", "key", and
/// "value", but this is not enforced.
///
/// Map
/// ```text
///   - child[0] entries: Struct
///     - child[0] key: K
///     - child[1] value: V
/// ```
/// Neither the "entries" field nor the "key" field may be nullable.
///
/// The metadata is structured so that Arrow systems without special handling
/// for Map can make Map an alias for List. The "layout" attribute for the Map
/// field must have the same contents as a List.
table Map {
  /// Set to true if the keys within each value are sorted
  keysSorted: bool;
}
/// Mode of a Union type (see Union.mode).
enum UnionMode : short {
  Sparse = 0,
  Dense = 1
}
/// A union is a complex type with children in Field.
/// By default, ids in the type vector refer to the offsets in the children.
/// Optionally, typeIds provides an indirection between the child offset and
/// the type id: for each child, `typeIds[offset]` is the id used in the type
/// vector.
table Union {
  mode: UnionMode;

  // Optional; describes the type id of each child.
  typeIds: [int];
}
/// Integer type, described by its bit width and signedness.
table Int {
  // Restricted to 8, 16, 32, and 64 in v1.
  bitWidth: int;

  is_signed: bool;
}
/// Precision choices for the FloatingPoint type (see FloatingPoint.precision).
enum Precision : short {
  HALF = 0,
  SINGLE = 1,
  DOUBLE = 2
}
/// Floating-point type; `precision` selects one of the Precision values.
table FloatingPoint {
  precision: Precision;
}
/// Unicode text with UTF-8 encoding. This table declares no fields.
table Utf8 {}
/// Opaque binary data. This table declares no fields.
table Binary {}
/// Same as Utf8, but with 64-bit offsets, allowing extremely large data
/// values to be represented.
table LargeUtf8 {}
/// Same as Binary, but with 64-bit offsets, allowing extremely large data
/// values to be represented.
table LargeBinary {}
/// Binary data in which every value has the same width in bytes.
table FixedSizeBinary {
  /// Number of bytes per value
  byteWidth: int;
}
/// Bool type marker; this table declares no fields.
table Bool {}
/// Exact decimal value represented as an integer value in two's
/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
/// are used. The representation uses the endianness indicated
/// in the Schema.
table Decimal {
  /// Total number of decimal digits
  precision: int;

  /// Number of digits after the decimal point "."
  scale: int;

  /// Number of bits per value. The only accepted widths are 128 and 256.
  /// We use bitWidth for consistency with Int::bitWidth.
  bitWidth: int = 128;
}
/// Unit in which a Date value is stored (see Date.unit).
enum DateUnit : short {
  DAY = 0,
  MILLISECOND = 1
}
/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
/// epoch (1970-01-01), stored in either of two units:
///
/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
///   leap seconds), where the values are evenly divisible by 86400000
/// * Days (32 bits) since the UNIX epoch
table Date {
  /// Storage unit; MILLISECOND when not set explicitly.
  unit: DateUnit = MILLISECOND;
}
/// Unit of resolution used by the Time, Timestamp and Duration types.
enum TimeUnit : short {
  SECOND = 0,
  MILLISECOND = 1,
  MICROSECOND = 2,
  NANOSECOND = 3
}
/// Time type. The physical storage type depends on the unit:
/// - SECOND and MILLISECOND: 32 bits
/// - MICROSECOND and NANOSECOND: 64 bits
table Time {
  /// Resolution of the value; MILLISECOND when not set explicitly.
  unit: TimeUnit = MILLISECOND;

  /// Number of bits per value; 32 when not set explicitly.
  bitWidth: int = 32;
}
/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding
/// leap seconds, as a 64-bit integer. Note that UNIX time does not include
/// leap seconds.
///
/// Date & time libraries often have multiple different data types for temporal
/// data. In order to ease interoperability between different implementations the
/// Arrow project has some recommendations for encoding these types into a Timestamp
/// column.
///
/// An "instant" represents a single moment in time that has no meaningful time zone
/// or the time zone is unknown. A column of instants can also contain values from
/// multiple time zones. To encode an instant, set the timezone string to "UTC".
///
/// A "zoned date-time" represents a single moment in time that has a meaningful
/// reference time zone. To encode a zoned date-time as a Timestamp, set the timezone
/// string to the name of the timezone. There is some ambiguity between an instant
/// and a zoned date-time with the UTC time zone. Both of these are stored the same.
/// Typically, this distinction does not matter. If it does, then an application should
/// use custom metadata or an extension type to distinguish between the two cases.
///
/// An "offset date-time" represents a single moment in time combined with a meaningful
/// offset from UTC. To encode an offset date-time as a Timestamp, set the timezone string
/// to the numeric time zone offset string (e.g. "+03:00").
///
/// A "local date-time" does not represent a single moment in time. It represents a wall
/// clock time combined with a date. Because of daylight saving time there may be multiple
/// instants that correspond to a single local date-time in any given time zone. A
/// local date-time is often stored as a struct or a Date32/Time64 pair. However, it can
/// also be encoded into a Timestamp column. To do so, the value should be the time
/// elapsed from the Unix epoch so that a wall clock in UTC would display the desired time.
/// The timezone string should be set to null or the empty string.
table Timestamp {
/// Resolution of the stored 64-bit values.
unit: TimeUnit;
/// The time zone is a string indicating the name of a time zone, one of:
///
/// * As used in the Olson time zone database (the "tz database" or
///   "tzdata"), such as "America/New_York"
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
///
/// Whether a timezone string is present indicates different semantics about
/// the data:
///
/// * If the time zone is null or an empty string, the data is a local date-time
///   and does not represent a single moment in time. Instead it represents a wall clock
///   time and care should be taken to avoid interpreting it semantically as an instant.
///
/// * If the time zone is set to a valid value, values can be displayed as
///   "localized" to that time zone, even though the underlying 64-bit
///   integers are identical to the same data stored in UTC. Converting
///   between time zones is a metadata-only operation and does not change the
///   underlying values.
timezone: string;
}
/// Unit of an Interval value (see Interval.unit).
enum IntervalUnit : short {
  YEAR_MONTH = 0,
  DAY_TIME = 1
}
// A "calendar" interval which models types that don't necessarily
// have a precise duration without the context of a base timestamp (e.g.
// days can differ in length during daylight saving time transitions).
//
//   YEAR_MONTH - Indicates the number of elapsed whole months, stored as
//     4-byte integers.
//   DAY_TIME - Indicates the number of elapsed days and milliseconds,
//     stored as 2 contiguous 32-bit integers (8 bytes in total). Support
//     of this IntervalUnit is not required for full Arrow compatibility.
table Interval {
  unit: IntervalUnit;
}
// An absolute length of time unrelated to any calendar artifacts.
//
// For the purposes of Arrow implementations, adding this value to a Timestamp
// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
// though in some cases the resulting Timestamp ("t2") would not account for
// leap seconds during the elapsed time between "t1" and "t2". Similarly,
// representing the difference between two Unix timestamps is acceptable, but
// would yield a value that is possibly a few seconds off from the true elapsed
// time.
//
// The resolution defaults to millisecond, but can be any of the other
// supported TimeUnit values as with Timestamp and Time types. This type is
// always represented as an 8-byte integer.
table Duration {
  unit: TimeUnit = MILLISECOND;
}
/// ----------------------------------------------------------------------
/// Top-level Type value, enabling extensible type-specific metadata. We can
/// add new logical types to Type without breaking backwards compatibility.
///
/// Each member is one of the type tables declared earlier in this file; a
/// Field selects its type through its `type: Type` member.
// NOTE(review): the order of the members below presumably determines the
// union discriminant values, so new members should be appended at the end
// rather than inserted - confirm against the FlatBuffers schema evolution
// rules before editing this list.
union Type {
Null,
Int,
FloatingPoint,
Binary,
Utf8,
Bool,
Decimal,
Date,
Time,
Timestamp,
Interval,
List,
Struct_,
Union,
FixedSizeBinary,
FixedSizeList,
Map,
Duration,
LargeBinary,
LargeUtf8,
LargeList,
}
/// ----------------------------------------------------------------------
/// User-defined key-value pairs used to add custom metadata to Arrow.
/// Key namespacing is the responsibility of the user.
table KeyValue {
  key: string;
  value: string;
}
/// ----------------------------------------------------------------------
/// Dictionary encoding metadata
/// Maintained for forwards compatibility: in the future,
/// dictionaries might be explicit maps between integers and values,
/// allowing for non-contiguous index values.
enum DictionaryKind : short {
  DenseArray = 0
}
/// Describes how a field is dictionary encoded. It is referenced from
/// Field.dictionary, which is present only for dictionary-encoded fields.
table DictionaryEncoding {
/// The known dictionary id in the application where this data is used. In
/// the file or streaming formats, the dictionary ids are found in the
/// DictionaryBatch messages
id: long;
/// The dictionary indices are constrained to be non-negative integers. If
/// this field is null, the indices must be signed int32. To maximize
/// cross-language compatibility and performance, implementations are
/// recommended to prefer signed integer types over unsigned integer types
/// and to avoid uint64 indices unless they are required by an application.
indexType: Int;
/// By default, dictionaries are not ordered, or the order does not have
/// semantic meaning. In some statistical applications, dictionary encoding
/// is used to represent ordered categorical data, and we provide a way to
/// preserve that metadata here.
isOrdered: bool;
/// Kind of dictionary. DenseArray is the only DictionaryKind value declared
/// in this file.
dictionaryKind: DictionaryKind;
}
/// ----------------------------------------------------------------------
/// A field represents a named column in a record / row batch or child of a
/// nested type.
table Field {
/// Name is not required (e.g. in a List).
name: string;
/// Whether or not this field can contain nulls. Should be true in general.
nullable: bool;
/// This is the type of the decoded value if the field is dictionary encoded.
type: Type;
/// Present only if the field is dictionary encoded.
dictionary: DictionaryEncoding;
/// Children apply only to nested data types like Struct, List and Union. For
/// primitive types children will have length 0.
children: [ Field ];
/// User-defined metadata
custom_metadata: [ KeyValue ];
}
/// ----------------------------------------------------------------------
/// Endianness of the platform producing the data
enum Endianness : short {
  Little = 0,
  Big = 1
}
/// ----------------------------------------------------------------------
/// A Buffer represents a single contiguous memory segment
struct Buffer {
  /// The relative offset into the shared memory page where the bytes for this
  /// buffer start
  offset: long;

  /// The absolute length (in bytes) of the memory buffer. The memory is found
  /// from offset (inclusive) to offset + length (non-inclusive). When building
  /// messages using the encapsulated IPC message, padding bytes may be written
  /// after a buffer, but such padding bytes do not need to be accounted for in
  /// the size here.
  length: long;
}
/// ----------------------------------------------------------------------
/// A Schema describes the columns in a row batch
table Schema {
/// Endianness of the buffer.
/// It is Little Endian by default.
/// If endianness doesn't match the underlying system, then the vectors need
/// to be converted.
endianness: Endianness=Little;
/// The fields (columns) described by this schema.
fields: [Field];
/// User-defined metadata
custom_metadata: [ KeyValue ];
/// Features used in the stream/file.
features : [ Feature ];
}
root_type Schema;