Created
June 26, 2024 14:12
-
-
Save mzaks/1e1a11bc2bcc65fff7a193f32061c496 to your computer and use it in GitHub Desktop.
Arrow Schema
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@always_inline
fn indirect(buf: DTypePointer[DType.uint8], pos: Int) -> Int32:
    """Load the 32-bit signed integer stored at byte `pos` of `buf`.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the value inside `buf`.

    Returns:
        The `Int32` found at that position.
    """
    var word_ptr = buf.offset(pos).bitcast[DType.int32]()
    return word_ptr[0]
@always_inline
fn read[T: DType](buf: DTypePointer[DType.uint8], pos: Int) -> Scalar[T]:
    """Load one scalar of dtype `T` stored at byte `pos` of `buf`.

    Parameters:
        T: Element type to reinterpret the bytes as.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the value inside `buf`.

    Returns:
        The scalar found at that position.
    """
    var typed_ptr = buf.offset(pos).bitcast[T]()
    return typed_ptr[0]
fn field[T: DType](buf: DTypePointer[DType.uint8], pos: Int, field_offset: Int, default: Scalar[T]) -> Scalar[T]:
    """Read a scalar field stored inline in the table at `pos`.

    Parameters:
        T: Element type of the field.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the table inside `buf`.
        field_offset: Byte offset of the field's slot inside the vtable.
        default: Value returned when the vtable slot holds 0.

    Returns:
        The stored scalar, or `default` when the slot holds 0.
    """
    var value_offset = _relative_field_offset(buf, pos, field_offset)
    if value_offset != 0:
        # The value lives inside the table, `value_offset` bytes after its start.
        return buf.offset(pos + value_offset).bitcast[T]()[0]
    return default
fn field_table(buf: DTypePointer[DType.uint8], pos: Int, field_offset: Int) -> Optional[Int32]:
    """Resolve the position of a sub-table referenced by the table at `pos`.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the parent table inside `buf`.
        field_offset: Byte offset of the field's slot inside the vtable.

    Returns:
        The byte position of the referenced table, or `None` when the vtable
        slot holds 0.
    """
    var value_offset = _relative_field_offset(buf, pos, field_offset)
    if value_offset == 0:
        return None
    # The slot stores an offset that is relative to the slot's own location,
    # not to the start of the parent table, so the slot position is the base.
    var slot_pos = pos + value_offset
    var target_pos = slot_pos + int(buf.offset(slot_pos).bitcast[DType.int32]()[0])
    return Int32(target_pos)
fn field_struct(buf: DTypePointer[DType.uint8], pos: Int, field_offset: Int) -> Optional[Int32]:
    """Locate a field whose value is stored inline in the table at `pos`.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the table inside `buf`.
        field_offset: Byte offset of the field's slot inside the vtable.

    Returns:
        `pos` plus the offset read from the vtable slot, or `None` when the
        slot holds 0.
    """
    var value_offset = _relative_field_offset(buf, pos, field_offset)
    if value_offset != 0:
        return Int32(pos + value_offset)
    return None
fn field_vector(buf: DTypePointer[DType.uint8], pos: Int, field_offset: Int) -> Int:
    """Resolve the position of the first element of a vector field.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the table inside `buf`.
        field_offset: Byte offset of the field's slot inside the vtable.

    Returns:
        The byte position just past the vector's 4-byte length prefix, or 0
        when the vtable slot holds 0.
    """
    var value_offset = _relative_field_offset(buf, pos, field_offset)
    if value_offset == 0:
        return 0
    # The slot stores an offset that is relative to the slot's own location,
    # not to the start of the table, so the slot position is the base.
    var slot_pos = pos + value_offset
    var vector_pos = slot_pos + int(buf.offset(slot_pos).bitcast[DType.int32]()[0])
    # Skip the 32-bit element count that precedes the elements.
    return vector_pos + 4
fn field_vector_len(buf: DTypePointer[DType.uint8], pos: Int, field_offset: Int) -> Int:
    """Read the element count of a vector field.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the table inside `buf`.
        field_offset: Byte offset of the field's slot inside the vtable.

    Returns:
        The 32-bit count stored at the start of the vector, or 0 when the
        vtable slot holds 0.
    """
    var value_offset = _relative_field_offset(buf, pos, field_offset)
    if value_offset == 0:
        return 0
    # The slot stores an offset that is relative to the slot's own location,
    # not to the start of the table, so the slot position is the base.
    var slot_pos = pos + value_offset
    var vector_pos = slot_pos + int(buf.offset(slot_pos).bitcast[DType.int32]()[0])
    return int(buf.offset(vector_pos).bitcast[DType.int32]()[0])
fn field_string(buf: DTypePointer[DType.uint8], pos: Int, field_offset: Int) -> StringRef:
    """Read a string field as a `StringRef` pointing into `buf`.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the table inside `buf`.
        field_offset: Byte offset of the field's slot inside the vtable.

    Returns:
        A `StringRef` over the string bytes (no copy is made here), or an
        empty string when the vtable slot holds 0.
    """
    var value_offset = _relative_field_offset(buf, pos, field_offset)
    if value_offset == 0:
        return ""
    # The slot stores an offset that is relative to the slot's own location,
    # not to the start of the table, so the slot position is the base.
    var slot_pos = pos + value_offset
    var str_pos = slot_pos + int(buf.offset(slot_pos).bitcast[DType.int32]()[0])
    # A 32-bit byte length precedes the character data.
    var length = buf.offset(str_pos).bitcast[DType.int32]()[0]
    return StringRef(buf.offset(str_pos + 4), int(length))
@always_inline
fn _relative_field_offset(buf: DTypePointer[DType.uint8], pos: Int, field_offset: Int) -> Int:
    """Look up a field's offset inside its table through the table's vtable.

    Args:
        buf: Start of the serialized buffer.
        pos: Byte position of the table inside `buf`.
        field_offset: Byte offset of the field's slot inside the vtable.

    Returns:
        The 16-bit value stored in the vtable slot (the field's offset from
        the table start), or 0 when the slot lies outside the vtable.
    """
    # The table begins with a signed 32-bit value; subtracting it from the
    # table position yields the vtable position.
    var vtable_pos = pos - int(indirect(buf, pos))
    # The first 16-bit entry of a vtable is the vtable's own size in bytes.
    # A slot at or past that size was not written (e.g. data produced with an
    # older schema), so the field is reported as absent.
    var vtable_size = int(buf.offset(vtable_pos).bitcast[DType.uint16]()[0])
    if field_offset >= vtable_size:
        return 0
    # Do all arithmetic in bytes *before* reinterpreting the pointer: calling
    # offset() on a uint16 pointer would advance two bytes per unit.
    return int(buf.offset(vtable_pos + field_offset).bitcast[DType.uint16]()[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# automatically generated by the FlatBuffers compiler, do not modify | |
import flatbuffers | |
@value
struct MetadataVersion:
    """Arrow metadata format version, held as a 16-bit integer.

    Each alias is the numeric value of one known version.
    """

    var _value: Int16

    # V1: 0.1.0 (October 2016).
    alias V1 = 0

    # V2: 0.2.0 (February 2017). Not backwards compatible with V1.
    alias V2 = 1

    # V3: 0.3.0 -> 0.7.1 (May - December 2017). Not backwards compatible
    # with V2.
    alias V3 = 2

    # V4: >= 0.8.0 (December 2017). Not backwards compatible with V3.
    alias V4 = 3

    # V5: >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can
    # read V4 metadata and IPC messages). Implementations are recommended to
    # provide a V4 compatibility mode with V5 format changes disabled.
    #
    # Incompatible changes between V4 and V5:
    # - Union buffer layout has changed. In V5, Unions don't have a validity
    #   bitmap buffer.
    alias V5 = 4
# Represents Arrow Features that might not have full support | |
# within implementations. This is intended to be used in | |
# two scenarios: | |
# 1. A mechanism for readers of Arrow Streams | |
# and files to understand that the stream or file makes | |
# use of a feature that isn't supported or unknown to | |
# the implementation (and therefore can meet the Arrow | |
# forward compatibility guarantees). | |
# 2. A means of negotiating between a client and server | |
# what features a stream is allowed to use. The enums | |
# values here are intended to represent higher level
# features, additional details may be negotiated
# with key-value pairs specific to the protocol. | |
# | |
# Enums added to this list should be assigned power-of-two values | |
# to facilitate exchanging and comparing bitmaps for supported | |
# features. | |
@value
struct Feature:
    """Arrow feature flag, held as a 64-bit integer.

    Each alias is the numeric value of one known feature.
    """

    var _value: Int64

    # Placeholder entry; needed to make flatbuffers happy.
    alias UNUSED = 0

    # The stream makes use of multiple full dictionaries with the same ID
    # and assumes clients implement dictionary replacement correctly.
    alias DICTIONARY_REPLACEMENT = 1

    # The stream makes use of compressed bodies as described in Message.fbs.
    alias COMPRESSED_BODY = 2
@value
struct UnionMode:
    """Layout mode of a union type, held as a 16-bit integer."""

    var _value: Int16

    alias Sparse = 0
    alias Dense = 1
@value
struct Precision:
    """Floating-point precision selector, held as a 16-bit integer."""

    var _value: Int16

    alias HALF = 0
    alias SINGLE = 1
    alias DOUBLE = 2
@value
struct DateUnit:
    """Unit in which a date value is counted, held as a 16-bit integer."""

    var _value: Int16

    alias DAY = 0
    alias MILLISECOND = 1
@value
struct TimeUnit:
    """Unit in which a time value is counted, held as a 16-bit integer."""

    var _value: Int16

    alias SECOND = 0
    alias MILLISECOND = 1
    alias MICROSECOND = 2
    alias NANOSECOND = 3
@value
struct IntervalUnit:
    """Unit in which an interval value is expressed, held as a 16-bit integer."""

    var _value: Int16

    alias YEAR_MONTH = 0
    alias DAY_TIME = 1
    alias MONTH_DAY_NANO = 2
# ---------------------------------------------------------------------- | |
# Top-level Type value, enabling extensible type-specific metadata. We can | |
# add new logical types to Type without breaking backwards compatibility | |
@value
struct Type:
    """Discriminant of the top-level Type union, held as an unsigned byte.

    Each alias is the numeric tag of one union member; `NONE` is tag 0.
    """

    var _value: UInt8

    alias NONE = 0
    alias Null = 1
    alias Int_ = 2
    alias FloatingPoint = 3
    alias Binary = 4
    alias Utf8 = 5
    alias Bool_ = 6
    alias Decimal = 7
    alias Date = 8
    alias Time = 9
    alias Timestamp = 10
    alias Interval = 11
    alias List = 12
    alias Struct_ = 13
    alias Union = 14
    alias FixedSizeBinary = 15
    alias FixedSizeList = 16
    alias Map = 17
    alias Duration = 18
    alias LargeBinary = 19
    alias LargeUtf8 = 20
    alias LargeList = 21
    alias RunEndEncoded = 22
    alias BinaryView = 23
    alias Utf8View = 24
    alias ListView = 25
    alias LargeListView = 26
# ---------------------------------------------------------------------- | |
# Dictionary encoding metadata | |
# Maintained for forwards compatibility, in the future | |
# Dictionaries might be explicit maps between integers and values | |
# allowing for non-contiguous index values | |
@value
struct DictionaryKind:
    """Kind of dictionary encoding, held as a 16-bit integer."""

    var _value: Int16

    alias DenseArray = 0
# ---------------------------------------------------------------------- | |
# Endianness of the platform producing the data | |
@value
struct Endianness:
    """Byte order of the platform that produced the data, held as a 16-bit integer."""

    var _value: Int16

    alias Little = 0
    alias Big = 1
# These are stored in the flatbuffer in the Type union below | |
@value
struct Null:
    """Accessor for a `Null` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsNull(buf: DTypePointer[DType.uint8]) -> Null:
    """Create a `Null` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Null(buf, root_pos)
# A Struct_ in the flatbuffer metadata is the same as an Arrow Struct | |
# (according to the physical memory layout). We used Struct_ here as | |
# Struct is a reserved word in Flatbuffers | |
@value
struct Struct_:
    """Accessor for a `Struct_` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsStruct_(buf: DTypePointer[DType.uint8]) -> Struct_:
    """Create a `Struct_` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Struct_(buf, root_pos)
@value
struct List:
    """Accessor for a `List` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsList(buf: DTypePointer[DType.uint8]) -> List:
    """Create a `List` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return List(buf, root_pos)
# Same as List, but with 64-bit offsets, allowing to represent | |
# extremely large data values. | |
@value
struct LargeList:
    """Accessor for a `LargeList` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsLargeList(buf: DTypePointer[DType.uint8]) -> LargeList:
    """Create a `LargeList` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return LargeList(buf, root_pos)
# Represents the same logical types that List can, but contains offsets and | |
# sizes allowing for writes in any order and sharing of child values among | |
# list values. | |
@value
struct ListView:
    """Accessor for a `ListView` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsListView(buf: DTypePointer[DType.uint8]) -> ListView:
    """Create a `ListView` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return ListView(buf, root_pos)
# Same as ListView, but with 64-bit offsets and sizes, allowing to represent | |
# extremely large data values. | |
@value
struct LargeListView:
    """Accessor for a `LargeListView` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsLargeListView(buf: DTypePointer[DType.uint8]) -> LargeListView:
    """Create a `LargeListView` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return LargeListView(buf, root_pos)
@value
struct FixedSizeList:
    """Accessor for a `FixedSizeList` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn listSize(self) -> Int32:
        """Number of list items per value (vtable slot 4, default 0)."""
        var table_pos = int(self._pos)
        return flatbuffers.field[DType.int32](self._buf, table_pos, 4, 0)


fn GetRootAsFixedSizeList(buf: DTypePointer[DType.uint8]) -> FixedSizeList:
    """Create a `FixedSizeList` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return FixedSizeList(buf, root_pos)
# A Map is a logical nested type that is represented as | |
# | |
# List<entries: Struct<key: K, value: V>> | |
# | |
# In this layout, the keys and values are each respectively contiguous. We do | |
# not constrain the key and value types, so the application is responsible | |
# for ensuring that the keys are hashable and unique. Whether the keys are sorted | |
# may be set in the metadata for this field. | |
# | |
# In a field with Map type, the field has a child Struct field, which then | |
# has two children: the first the key type and the second the value type. The names of the
# child fields may be respectively "entries", "key", and "value", but this is | |
# not enforced. | |
# | |
# Map | |
# ```text | |
# - child[0] entries: Struct | |
# - child[0] key: K | |
# - child[1] value: V | |
# ``` | |
# Neither the "entries" field nor the "key" field may be nullable. | |
# | |
# The metadata is structured so that Arrow systems without special handling | |
# for Map can make Map an alias for List. The "layout" attribute for the Map | |
# field must have the same contents as a List. | |
@value
struct Map:
    """Accessor for a `Map` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn keysSorted(self) -> Scalar[DType.bool]:
        """True if the keys within each value are sorted (vtable slot 4, default false)."""
        # The flag is read as one signed byte; compare against zero so the
        # result really is a boolean scalar, matching the declared return type.
        var raw = flatbuffers.field[DType.int8](self._buf, int(self._pos), 4, 0)
        return raw != 0


fn GetRootAsMap(buf: DTypePointer[DType.uint8]) -> Map:
    """Create a `Map` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    return Map(buf, flatbuffers.indirect(buf, 0))
# A union is a complex type with children in Field | |
# By default ids in the type vector refer to the offsets in the children | |
# optionally typeIds provides an indirection between the child offset and the type id | |
# for each child `typeIds[offset]` is the id used in the type vector | |
@value
struct Union:
    """Accessor for a `Union` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn mode(self) -> UnionMode:
        """Union mode (vtable slot 4, default `UnionMode.Sparse`)."""
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, UnionMode.Sparse)
        return UnionMode(raw)

    fn typeIds(self, i: Int) -> Int32:
        """Element `i` of the `typeIds` vector (vtable slot 6).

        No bounds check is performed here; callers are expected to stay
        below `typeIds_length()`.
        """
        var first_element = flatbuffers.field_vector(self._buf, int(self._pos), 6)
        # Elements are 32-bit integers, hence the stride of 4 bytes.
        return flatbuffers.read[DType.int32](self._buf, first_element + i * 4)

    fn typeIds_length(self) -> Int:
        """Number of elements in the `typeIds` vector (0 when the slot holds 0)."""
        var table_pos = int(self._pos)
        return flatbuffers.field_vector_len(self._buf, table_pos, 6)


fn GetRootAsUnion(buf: DTypePointer[DType.uint8]) -> Union:
    """Create a `Union` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Union(buf, root_pos)
@value
struct Int_:
    """Accessor for an `Int_` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn bitWidth(self) -> Int32:
        """Bit width of the integer type (vtable slot 4, default 0)."""
        return flatbuffers.field[DType.int32](self._buf, int(self._pos), 4, 0)

    fn is_signed(self) -> Scalar[DType.bool]:
        """Whether the integer type is signed (vtable slot 6, default false)."""
        # The flag is read as one signed byte; compare against zero so the
        # result really is a boolean scalar, matching the declared return type.
        var raw = flatbuffers.field[DType.int8](self._buf, int(self._pos), 6, 0)
        return raw != 0


fn GetRootAsInt_(buf: DTypePointer[DType.uint8]) -> Int_:
    """Create an `Int_` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    return Int_(buf, flatbuffers.indirect(buf, 0))
@value
struct FloatingPoint:
    """Accessor for a `FloatingPoint` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn precision(self) -> Precision:
        """Precision of the type (vtable slot 4, default `Precision.HALF`)."""
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, Precision.HALF)
        return Precision(raw)


fn GetRootAsFloatingPoint(buf: DTypePointer[DType.uint8]) -> FloatingPoint:
    """Create a `FloatingPoint` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return FloatingPoint(buf, root_pos)
# Unicode with UTF-8 encoding | |
@value
struct Utf8:
    """Accessor for a `Utf8` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsUtf8(buf: DTypePointer[DType.uint8]) -> Utf8:
    """Create a `Utf8` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Utf8(buf, root_pos)
# Opaque binary data | |
@value
struct Binary:
    """Accessor for a `Binary` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsBinary(buf: DTypePointer[DType.uint8]) -> Binary:
    """Create a `Binary` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Binary(buf, root_pos)
# Same as Utf8, but with 64-bit offsets, allowing to represent | |
# extremely large data values. | |
@value
struct LargeUtf8:
    """Accessor for a `LargeUtf8` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsLargeUtf8(buf: DTypePointer[DType.uint8]) -> LargeUtf8:
    """Create a `LargeUtf8` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return LargeUtf8(buf, root_pos)
# Same as Binary, but with 64-bit offsets, allowing to represent | |
# extremely large data values. | |
@value
struct LargeBinary:
    """Accessor for a `LargeBinary` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsLargeBinary(buf: DTypePointer[DType.uint8]) -> LargeBinary:
    """Create a `LargeBinary` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return LargeBinary(buf, root_pos)
# Logically the same as Utf8, but the internal representation uses a view | |
# struct that contains the string length and either the string's entire data | |
# inline (for small strings) or an inlined prefix, an index of another buffer, | |
# and an offset pointing to a slice in that buffer (for non-small strings). | |
# | |
# Since it uses a variable number of data buffers, each Field with this type | |
# must have a corresponding entry in `variadicBufferCounts`. | |
@value
struct Utf8View:
    """Accessor for a `Utf8View` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsUtf8View(buf: DTypePointer[DType.uint8]) -> Utf8View:
    """Create a `Utf8View` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Utf8View(buf, root_pos)
# Logically the same as Binary, but the internal representation uses a view | |
# struct that contains the string length and either the string's entire data | |
# inline (for small strings) or an inlined prefix, an index of another buffer, | |
# and an offset pointing to a slice in that buffer (for non-small strings). | |
# | |
# Since it uses a variable number of data buffers, each Field with this type | |
# must have a corresponding entry in `variadicBufferCounts`. | |
@value
struct BinaryView:
    """Accessor for a `BinaryView` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsBinaryView(buf: DTypePointer[DType.uint8]) -> BinaryView:
    """Create a `BinaryView` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return BinaryView(buf, root_pos)
@value
struct FixedSizeBinary:
    """Accessor for a `FixedSizeBinary` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn byteWidth(self) -> Int32:
        """Number of bytes per value (vtable slot 4, default 0)."""
        var table_pos = int(self._pos)
        return flatbuffers.field[DType.int32](self._buf, table_pos, 4, 0)


fn GetRootAsFixedSizeBinary(buf: DTypePointer[DType.uint8]) -> FixedSizeBinary:
    """Create a `FixedSizeBinary` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return FixedSizeBinary(buf, root_pos)
@value
struct Bool_:
    """Accessor for a `Bool_` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsBool_(buf: DTypePointer[DType.uint8]) -> Bool_:
    """Create a `Bool_` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Bool_(buf, root_pos)
# Contains two child arrays, run_ends and values. | |
# The run_ends child array must be a 16/32/64-bit integer array | |
# which encodes the indices at which the run with the value in | |
# each corresponding index in the values child array ends. | |
# Like list/struct types, the value array can be of any type. | |
@value
struct RunEndEncoded:
    """Accessor for a `RunEndEncoded` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32


fn GetRootAsRunEndEncoded(buf: DTypePointer[DType.uint8]) -> RunEndEncoded:
    """Create a `RunEndEncoded` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return RunEndEncoded(buf, root_pos)
# Exact decimal value represented as an integer value in two's | |
# complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers | |
# are used. The representation uses the endianness indicated | |
# in the Schema. | |
@value
struct Decimal:
    """Accessor for a `Decimal` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn precision(self) -> Int32:
        """Total number of decimal digits (vtable slot 4, default 0)."""
        var table_pos = int(self._pos)
        return flatbuffers.field[DType.int32](self._buf, table_pos, 4, 0)

    fn scale(self) -> Int32:
        """Number of digits after the decimal point "." (vtable slot 6, default 0)."""
        var table_pos = int(self._pos)
        return flatbuffers.field[DType.int32](self._buf, table_pos, 6, 0)

    fn bitWidth(self) -> Int32:
        """Number of bits per value (vtable slot 8, default 128).

        The only accepted widths are 128 and 256. The name `bitWidth` is
        used for consistency with Int::bitWidth.
        """
        var table_pos = int(self._pos)
        return flatbuffers.field[DType.int32](self._buf, table_pos, 8, 128)


fn GetRootAsDecimal(buf: DTypePointer[DType.uint8]) -> Decimal:
    """Create a `Decimal` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Decimal(buf, root_pos)
# Date is either a 32-bit or 64-bit signed integer type representing an | |
# elapsed time since UNIX epoch (1970-01-01), stored in either of two units: | |
# | |
# * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no | |
# leap seconds), where the values are evenly divisible by 86400000 | |
# * Days (32 bits) since the UNIX epoch | |
@value
struct Date:
    """Accessor for a `Date` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn unit(self) -> DateUnit:
        """Unit of the date values (vtable slot 4, default `DateUnit.MILLISECOND`)."""
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, DateUnit.MILLISECOND)
        return DateUnit(raw)


fn GetRootAsDate(buf: DTypePointer[DType.uint8]) -> Date:
    """Create a `Date` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Date(buf, root_pos)
# Time is either a 32-bit or 64-bit signed integer type representing an | |
# elapsed time since midnight, stored in either of four units: seconds, | |
# milliseconds, microseconds or nanoseconds. | |
# | |
# The integer `bitWidth` depends on the `unit` and must be one of the following: | |
# * SECOND and MILLISECOND: 32 bits | |
# * MICROSECOND and NANOSECOND: 64 bits | |
# | |
# The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds | |
# (exclusive), adjusted for the time unit (for example, up to 86400000 | |
# exclusive for the MILLISECOND unit). | |
# This definition doesn't allow for leap seconds. Time values from | |
# measurements with leap seconds will need to be corrected when ingesting | |
# into Arrow (for example by replacing the value 86400 with 86399). | |
@value
struct Time:
    """Accessor for a `Time` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn unit(self) -> TimeUnit:
        """Unit of the time values (vtable slot 4, default `TimeUnit.MILLISECOND`)."""
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, TimeUnit.MILLISECOND)
        return TimeUnit(raw)

    fn bitWidth(self) -> Int32:
        """Bit width of the time values (vtable slot 6, default 32)."""
        var table_pos = int(self._pos)
        return flatbuffers.field[DType.int32](self._buf, table_pos, 6, 32)


fn GetRootAsTime(buf: DTypePointer[DType.uint8]) -> Time:
    """Create a `Time` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Time(buf, root_pos)
# Timestamp is a 64-bit signed integer representing an elapsed time since a | |
# fixed epoch, stored in either of four units: seconds, milliseconds, | |
# microseconds or nanoseconds, and is optionally annotated with a timezone. | |
# | |
# Timestamp values do not include any leap seconds (in other words, all | |
# days are considered 86400 seconds long). | |
# | |
# Timestamps with a non-empty timezone | |
# ------------------------------------ | |
# | |
# If a Timestamp column has a non-empty timezone value, its epoch is | |
# 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone | |
# (the Unix epoch), regardless of the Timestamp's own timezone. | |
# | |
# Therefore, timestamp values with a non-empty timezone correspond to | |
# physical points in time together with some additional information about | |
# how the data was obtained and/or how to display it (the timezone). | |
# | |
# For example, the timestamp value 0 with the timezone string "Europe/Paris" | |
# corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the | |
# application may prefer to display it as "January 1st 1970, 01h00" in | |
# the Europe/Paris timezone (which is the same physical point in time). | |
# | |
# One consequence is that timestamp values with a non-empty timezone | |
# can be compared and ordered directly, since they all share the same | |
# well-known point of reference (the Unix epoch). | |
# | |
# Timestamps with an unset / empty timezone | |
# ----------------------------------------- | |
# | |
# If a Timestamp column has no timezone value, its epoch is | |
# 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. | |
# | |
# Therefore, timestamp values without a timezone cannot be meaningfully | |
# interpreted as physical points in time, but only as calendar / clock | |
# indications ("wall clock time") in an unspecified timezone. | |
# | |
# For example, the timestamp value 0 with an empty timezone string | |
# corresponds to "January 1st 1970, 00h00" in an unknown timezone: there | |
# is not enough information to interpret it as a well-defined physical | |
# point in time. | |
# | |
# One consequence is that timestamp values without a timezone cannot | |
# be reliably compared or ordered, since they may have different points of | |
# reference. In particular, it is *not* possible to interpret an unset | |
# or empty timezone as the same as "UTC". | |
# | |
# Conversion between timezones | |
# ---------------------------- | |
# | |
# If a Timestamp column has a non-empty timezone, changing the timezone | |
# to a different non-empty value is a metadata-only operation: | |
# the timestamp values need not change as their point of reference remains | |
# the same (the Unix epoch). | |
# | |
# However, if a Timestamp column has no timezone value, changing it to a | |
# non-empty value requires to think about the desired semantics. | |
# One possibility is to assume that the original timestamp values are | |
# relative to the epoch of the timezone being set; timestamp values should | |
# then be adjusted to the Unix epoch (for example, changing the timezone from
# empty to "Europe/Paris" would require converting the timestamp values | |
# from "Europe/Paris" to "UTC", which seems counter-intuitive but is | |
# nevertheless correct). | |
# | |
# Guidelines for encoding data from external libraries | |
# ---------------------------------------------------- | |
# | |
# Date & time libraries often have multiple different data types for temporal | |
# data. In order to ease interoperability between different implementations the | |
# Arrow project has some recommendations for encoding these types into a Timestamp | |
# column. | |
# | |
# An "instant" represents a physical point in time that has no relevant timezone | |
# (for example, astronomical data). To encode an instant, use a Timestamp with | |
# the timezone string set to "UTC", and make sure the Timestamp values | |
# are relative to the UTC epoch (January 1st 1970, midnight). | |
# | |
# A "zoned date-time" represents a physical point in time annotated with an | |
# informative timezone (for example, the timezone in which the data was | |
# recorded). To encode a zoned date-time, use a Timestamp with the timezone | |
# string set to the name of the timezone, and make sure the Timestamp values | |
# are relative to the UTC epoch (January 1st 1970, midnight). | |
# | |
# (There is some ambiguity between an instant and a zoned date-time with the | |
# UTC timezone. Both of these are stored the same in Arrow. Typically, | |
# this distinction does not matter. If it does, then an application should | |
# use custom metadata or an extension type to distinguish between the two cases.) | |
# | |
# An "offset date-time" represents a physical point in time combined with an | |
# explicit offset from UTC. To encode an offset date-time, use a Timestamp | |
# with the timezone string set to the numeric timezone offset string | |
# (e.g. "+03:00"), and make sure the Timestamp values are relative to | |
# the UTC epoch (January 1st 1970, midnight). | |
# | |
# A "naive date-time" (also called "local date-time" in some libraries) | |
# represents a wall clock time combined with a calendar date, but with | |
# no indication of how to map this information to a physical point in time. | |
# Naive date-times must be handled with care because of this missing | |
# information, and also because daylight saving time (DST) may make | |
# some values ambiguous or nonexistent. A naive date-time may be | |
# stored as a struct with Date and Time fields. However, it may also be | |
# encoded into a Timestamp column with an empty timezone. The timestamp | |
# values should be computed "as if" the timezone of the date-time values | |
# was UTC; for example, the naive date-time "January 1st 1970, 00h00" would | |
# be encoded as timestamp value 0. | |
@value
struct Timestamp:
    """Accessor for a `Timestamp` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn unit(self) -> TimeUnit:
        """Unit of the timestamp values (vtable slot 4, default `TimeUnit.SECOND`)."""
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, TimeUnit.SECOND)
        return TimeUnit(raw)

    fn timezone(self) -> StringRef:
        """Optional timezone name (vtable slot 6); empty when not set.

        The timezone is an optional string indicating the name of a
        timezone, one of:

        * As used in the Olson timezone database (the "tz database" or
          "tzdata"), such as "America/New_York".
        * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
          such as "+07:30".

        Whether a timezone string is present indicates different semantics
        about the data.
        """
        var table_pos = int(self._pos)
        return flatbuffers.field_string(self._buf, table_pos, 6)


fn GetRootAsTimestamp(buf: DTypePointer[DType.uint8]) -> Timestamp:
    """Create a `Timestamp` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Timestamp(buf, root_pos)
@value
struct Interval:
    """Accessor for an `Interval` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn unit(self) -> IntervalUnit:
        """Unit of the interval values (vtable slot 4, default `IntervalUnit.YEAR_MONTH`)."""
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, IntervalUnit.YEAR_MONTH)
        return IntervalUnit(raw)


fn GetRootAsInterval(buf: DTypePointer[DType.uint8]) -> Interval:
    """Create an `Interval` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Interval(buf, root_pos)
@value
struct Duration:
    """Accessor for a `Duration` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn unit(self) -> TimeUnit:
        """Unit of the duration values (vtable slot 4, default `TimeUnit.MILLISECOND`)."""
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, TimeUnit.MILLISECOND)
        return TimeUnit(raw)


fn GetRootAsDuration(buf: DTypePointer[DType.uint8]) -> Duration:
    """Create a `Duration` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return Duration(buf, root_pos)
# ---------------------------------------------------------------------- | |
# user defined key value pairs to add custom metadata to arrow | |
# key namespacing is the responsibility of the user | |
@value
struct KeyValue:
    """Accessor for a `KeyValue` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn key(self) -> StringRef:
        """Key string (vtable slot 4); empty when not set."""
        var table_pos = int(self._pos)
        return flatbuffers.field_string(self._buf, table_pos, 4)

    fn value(self) -> StringRef:
        """Value string (vtable slot 6); empty when not set."""
        var table_pos = int(self._pos)
        return flatbuffers.field_string(self._buf, table_pos, 6)


fn GetRootAsKeyValue(buf: DTypePointer[DType.uint8]) -> KeyValue:
    """Create a `KeyValue` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    var root_pos = flatbuffers.indirect(buf, 0)
    return KeyValue(buf, root_pos)
@value
struct DictionaryEncoding:
    """Accessor for a `DictionaryEncoding` table inside a serialized byte buffer."""

    # Start of the serialized buffer.
    var _buf: DTypePointer[DType.uint8]
    # Byte position of this table inside `_buf`.
    var _pos: Int32

    fn id(self) -> Int64:
        """Dictionary id (vtable slot 4, default 0).

        The known dictionary id in the application where this data is used.
        In the file or streaming formats, the dictionary ids are found in
        the DictionaryBatch messages.
        """
        return flatbuffers.field[DType.int64](self._buf, int(self._pos), 4, 0)

    fn indexType(self) -> Optional[Int_]:
        """Integer type of the dictionary indices (vtable slot 6), if present.

        The dictionary indices are constrained to be non-negative integers.
        If this field is null, the indices must be signed int32. To maximize
        cross-language compatibility and performance, implementations are
        recommended to prefer signed integer types over unsigned integer
        types and to avoid uint64 indices unless they are required by an
        application.
        """
        var o = flatbuffers.field_table(self._buf, int(self._pos), 6)
        if o:
            return Int_(self._buf, o.take())
        return None

    fn isOrdered(self) -> Scalar[DType.bool]:
        """Whether the dictionary order is meaningful (vtable slot 8, default false).

        By default, dictionaries are not ordered, or the order does not have
        semantic meaning. In some statistical applications, dictionary
        encoding is used to represent ordered categorical data, and this
        flag provides a way to preserve that metadata.
        """
        # The flag is read as one signed byte; compare against zero so the
        # result really is a boolean scalar, matching the declared return type.
        var raw = flatbuffers.field[DType.int8](self._buf, int(self._pos), 8, 0)
        return raw != 0

    fn dictionaryKind(self) -> DictionaryKind:
        """Kind of dictionary (vtable slot 10, default 0)."""
        return DictionaryKind(flatbuffers.field[DType.int16](self._buf, int(self._pos), 10, 0))


fn GetRootAsDictionaryEncoding(buf: DTypePointer[DType.uint8]) -> DictionaryEncoding:
    """Create a `DictionaryEncoding` positioned at the 32-bit offset stored at byte 0 of `buf`."""
    return DictionaryEncoding(buf, flatbuffers.indirect(buf, 0))
# ---------------------------------------------------------------------- | |
# A field represents a named column in a record / row batch or child of a | |
# nested type. | |
@value | |
struct Field: | |
var _buf: DTypePointer[DType.uint8] | |
var _pos: Int32 | |
# Name is not required, e.g. in a List
fn name(self) -> StringRef: | |
return flatbuffers.field_string(self._buf, int(self._pos), 4) | |
# Whether or not this field can contain nulls. Should be true in general. | |
fn nullable(self) -> Scalar[DType.bool]: | |
return flatbuffers.field[DType.int8](self._buf, int(self._pos), 6, 0) | |
fn type_type(self) -> Type: | |
return Type(flatbuffers.field[DType.uint8](self._buf, int(self._pos), 8, 0)) | |
# This is the type of the decoded value if the field is dictionary encoded. | |
fn type_as_Null(self) -> Null: | |
return Null(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Int(self) -> Int_: | |
return Int_(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_FloatingPoint(self) -> FloatingPoint: | |
return FloatingPoint(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Binary(self) -> Binary: | |
return Binary(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Utf8(self) -> Utf8: | |
return Utf8(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Bool(self) -> Bool_: | |
return Bool_(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Decimal(self) -> Decimal: | |
return Decimal(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Date(self) -> Date: | |
return Date(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Time(self) -> Time: | |
return Time(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Timestamp(self) -> Timestamp: | |
return Timestamp(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Interval(self) -> Interval: | |
return Interval(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_List(self) -> List: | |
return List(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Struct_(self) -> Struct_: | |
return Struct_(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_Union(self) -> Union: | |
return Union(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_FixedSizeBinary(self) -> FixedSizeBinary: | |
return FixedSizeBinary(self._buf, flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)) | |
fn type_as_FixedSizeList(self) -> FixedSizeList:
    """Wraps the table referenced at field offset 10 as a `FixedSizeList` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return FixedSizeList(self._buf, table_pos)
fn type_as_Map(self) -> Map:
    """Wraps the table referenced at field offset 10 as a `Map` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return Map(self._buf, table_pos)
fn type_as_Duration(self) -> Duration:
    """Wraps the table referenced at field offset 10 as a `Duration` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return Duration(self._buf, table_pos)
fn type_as_LargeBinary(self) -> LargeBinary:
    """Wraps the table referenced at field offset 10 as a `LargeBinary` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return LargeBinary(self._buf, table_pos)
fn type_as_LargeUtf8(self) -> LargeUtf8:
    """Wraps the table referenced at field offset 10 as a `LargeUtf8` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return LargeUtf8(self._buf, table_pos)
fn type_as_LargeList(self) -> LargeList:
    """Wraps the table referenced at field offset 10 as a `LargeList` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return LargeList(self._buf, table_pos)
fn type_as_RunEndEncoded(self) -> RunEndEncoded:
    """Wraps the table referenced at field offset 10 as a `RunEndEncoded` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return RunEndEncoded(self._buf, table_pos)
fn type_as_BinaryView(self) -> BinaryView:
    """Wraps the table referenced at field offset 10 as a `BinaryView` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return BinaryView(self._buf, table_pos)
fn type_as_Utf8View(self) -> Utf8View:
    """Wraps the table referenced at field offset 10 as a `Utf8View` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return Utf8View(self._buf, table_pos)
fn type_as_ListView(self) -> ListView:
    """Wraps the table referenced at field offset 10 as a `ListView` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return ListView(self._buf, table_pos)
fn type_as_LargeListView(self) -> LargeListView:
    """Wraps the table referenced at field offset 10 as a `LargeListView` view.

    Position 0 is used when `field_table` reports the field as absent.
    No type-tag check is made in this method.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 10).or_else(0)
    return LargeListView(self._buf, table_pos)
# Present only if the field is dictionary encoded. | |
fn dictionary(self) -> Optional[DictionaryEncoding]:
    """Returns the `DictionaryEncoding` table referenced at field offset 12.

    Returns:
        A `DictionaryEncoding` view over the same buffer, or `None` when
        `field_table` reports the field as absent.
    """
    var table_pos = flatbuffers.field_table(self._buf, int(self._pos), 12)
    # Guard first: an empty Optional means the field is not set.
    if not table_pos:
        return None
    return DictionaryEncoding(self._buf, table_pos.take())
# children apply only to nested data types like Struct, List and Union. For | |
# primitive types children will have length 0. | |
fn children(self, i: Int) -> Field:
    """Returns the `i`-th child `Field` from the vector at field offset 14.

    Args:
        i: Index into the vector; each slot is 4 bytes wide. No bounds check
            is performed here.

    Returns:
        A `Field` view over the same buffer.
    """
    # Byte position of the i-th 4-byte slot inside the vector.
    var slot = flatbuffers.field_vector(self._buf, int(self._pos), 14) + i * 4
    # NOTE(review): `flatbuffers.indirect` returns the Int32 stored at `slot`
    # unchanged; confirm this is the absolute table position `Field` expects.
    return Field(self._buf, flatbuffers.indirect(self._buf, slot))
fn children_length(self) -> Int:
    """Returns what `field_vector_len` reports for the vector at field offset 14.

    Presumably the number of child fields; see `flatbuffers.field_vector_len`.
    """
    var count = flatbuffers.field_vector_len(self._buf, int(self._pos), 14)
    return count
# User-defined metadata | |
fn custom_metadata(self, i: Int) -> KeyValue:
    """Returns the `i`-th `KeyValue` entry from the vector at field offset 16.

    Args:
        i: Index into the vector; each slot is 4 bytes wide. No bounds check
            is performed here.

    Returns:
        A `KeyValue` view over the same buffer.
    """
    # Byte position of the i-th 4-byte slot inside the vector.
    var slot = flatbuffers.field_vector(self._buf, int(self._pos), 16) + i * 4
    # NOTE(review): `flatbuffers.indirect` returns the Int32 stored at `slot`
    # unchanged; confirm this is the absolute table position `KeyValue` expects.
    return KeyValue(self._buf, flatbuffers.indirect(self._buf, slot))
fn custom_metadata_length(self) -> Int:
    """Returns what `field_vector_len` reports for the vector at field offset 16.

    Presumably the number of metadata entries; see `flatbuffers.field_vector_len`.
    """
    var count = flatbuffers.field_vector_len(self._buf, int(self._pos), 16)
    return count
fn GetRootAsField(buf: DTypePointer[DType.uint8]) -> Field:
    """Builds a `Field` view rooted at the position stored in the first 4 bytes of `buf`.

    Args:
        buf: Pointer to the start of the encoded buffer.

    Returns:
        A `Field` whose position is the Int32 read from byte 0 of `buf`.
    """
    var root_pos = flatbuffers.indirect(buf, 0)
    return Field(buf, root_pos)
# ---------------------------------------------------------------------- | |
# A Buffer represents a single contiguous memory segment | |
@value
struct Buffer:
    """View over a fixed-layout record made of two 64-bit integers.

    `_buf` is the raw byte buffer and `_pos` the byte position of the record;
    `offset` is read at `_pos` and `length` at `_pos + 8`.
    """

    var _buf: DTypePointer[DType.uint8]
    var _pos: Int32

    fn offset(self) -> Int64:
        """The relative offset into the shared memory page where the bytes
        for this buffer start."""
        var record_start = int(self._pos)
        return flatbuffers.read[DType.int64](self._buf, record_start)

    fn length(self) -> Int64:
        """The absolute length (in bytes) of the memory buffer.

        The memory is found from offset (inclusive) to offset + length
        (non-inclusive). When building messages using the encapsulated IPC
        message, padding bytes may be written after a buffer, but such padding
        bytes do not need to be accounted for in the size here.
        """
        var record_start = int(self._pos)
        # `length` follows the 8-byte `offset` value.
        return flatbuffers.read[DType.int64](self._buf, record_start + 8)
# ---------------------------------------------------------------------- | |
# A Schema describes the columns in a row batch | |
@value
struct Schema:
    """Read-only view over an encoded Schema table.

    `_buf` is the raw byte buffer and `_pos` the byte position of the table
    inside it. Every accessor reads directly from the buffer; nothing is
    copied or cached.
    """

    var _buf: DTypePointer[DType.uint8]
    var _pos: Int32

    fn endianness(self) -> Endianness:
        """Endianness of the buffer (field offset 4).

        Falls back to the value 0 when the field is absent; the schema
        describes the default as Little Endian. If the endianness does not
        match the underlying system, the vectors need to be converted.
        """
        var raw = flatbuffers.field[DType.int16](self._buf, int(self._pos), 4, 0)
        return Endianness(raw)

    fn fields(self, i: Int) -> Field:
        """Returns the `i`-th `Field` from the vector at field offset 6.

        Args:
            i: Index into the vector; each slot is 4 bytes wide. No bounds
                check is performed here.
        """
        var slot = flatbuffers.field_vector(self._buf, int(self._pos), 6) + i * 4
        # NOTE(review): `flatbuffers.indirect` returns the Int32 stored at
        # `slot` unchanged; confirm this is the absolute position `Field` expects.
        return Field(self._buf, flatbuffers.indirect(self._buf, slot))

    fn fields_length(self) -> Int:
        """Returns what `field_vector_len` reports for the vector at field offset 6."""
        return flatbuffers.field_vector_len(self._buf, int(self._pos), 6)

    fn custom_metadata(self, i: Int) -> KeyValue:
        """Returns the `i`-th `KeyValue` entry from the vector at field offset 8.

        Args:
            i: Index into the vector; each slot is 4 bytes wide. No bounds
                check is performed here.
        """
        var slot = flatbuffers.field_vector(self._buf, int(self._pos), 8) + i * 4
        # NOTE(review): same caveat as `fields` regarding `flatbuffers.indirect`.
        return KeyValue(self._buf, flatbuffers.indirect(self._buf, slot))

    fn custom_metadata_length(self) -> Int:
        """Returns what `field_vector_len` reports for the vector at field offset 8."""
        return flatbuffers.field_vector_len(self._buf, int(self._pos), 8)

    fn features(self, i: Int) -> Feature:
        """Features used in the stream/file (vector at field offset 10).

        Args:
            i: Index into the vector; each element is an 8-byte integer read
                in place. No bounds check is performed here.
        """
        var slot = flatbuffers.field_vector(self._buf, int(self._pos), 10) + i * 8
        # Wrap explicitly, as `endianness` does, instead of relying on an
        # implicit conversion from the raw Int64 to `Feature`.
        return Feature(flatbuffers.read[DType.int64](self._buf, slot))

    fn features_length(self) -> Int:
        """Returns what `field_vector_len` reports for the vector at field offset 10."""
        return flatbuffers.field_vector_len(self._buf, int(self._pos), 10)
fn GetRootAsSchema(buf: DTypePointer[DType.uint8]) -> Schema:
    """Builds a `Schema` view rooted at the position stored in the first 4 bytes of `buf`.

    Args:
        buf: Pointer to the start of the encoded buffer.

    Returns:
        A `Schema` whose position is the Int32 read from byte 0 of `buf`.
    """
    var root_pos = flatbuffers.indirect(buf, 0)
    return Schema(buf, root_pos)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment