-
Notifications
You must be signed in to change notification settings - Fork 161
wal-protocol: introduce v2 Envelope with bilrost encoding and lazy payloads #4695
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,28 +9,33 @@ | |
| // by the Apache License, Version 2.0. | ||
|
|
||
| use bytes::Bytes; | ||
|
|
||
| use restate_storage_api::fsm_table::{CurrentReplicaSetState, NextReplicaSetState}; | ||
| use restate_types::identifiers::{LeaderEpoch, PartitionId}; | ||
| use restate_types::logs::{Keys, Lsn, SequenceNumber}; | ||
| use restate_types::logs::{HasRecordKeys, Keys, Lsn, SequenceNumber}; | ||
| use restate_types::partitions::PartitionConfiguration; | ||
| use restate_types::partitions::state::{MemberState, ReplicaSetState}; | ||
| use restate_types::replication::{NodeSet, ReplicationProperty}; | ||
| use restate_types::schema::Schema; | ||
| use restate_types::sharding::KeyRange; | ||
| use restate_types::time::MillisSinceEpoch; | ||
| use restate_types::{GenerationalNodeId, SemanticRestateVersion, Version, Versioned}; | ||
| use restate_types::{ | ||
| GenerationalNodeId, SemanticRestateVersion, Version, Versioned, bilrost_storage_encode_decode, | ||
| flexbuffers_storage_encode_decode, | ||
| }; | ||
|
|
||
| /// Announcing a new leader. This message can be written by any component to make the specified | ||
| /// partition processor the leader. | ||
| #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] | ||
| #[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bilrost::Message)] | ||
| pub struct AnnounceLeader { | ||
| /// Sender of the announce leader message. | ||
| /// | ||
| /// This became non-optional in v1.5. Since it has always been set in previous versions, | ||
| /// it is safe to assume that it is always set. | ||
| #[bilrost(tag(1))] | ||
| pub node_id: GenerationalNodeId, | ||
| #[bilrost(tag(2))] | ||
| pub leader_epoch: LeaderEpoch, | ||
| #[bilrost(tag(3))] | ||
| pub partition_key_range: KeyRange, | ||
|
|
||
| /// Associated epoch metadata version | ||
|
|
@@ -40,25 +45,40 @@ pub struct AnnounceLeader { | |
| /// | ||
| /// *Since v1.6* | ||
| #[serde(default, skip_serializing_if = "Option::is_none")] | ||
| #[bilrost(tag(4))] | ||
| pub epoch_version: Option<Version>, | ||
| /// Current replica set configuration at the time of the announcement. | ||
| /// This field is optional for backward compatibility with older versions. | ||
| /// *Since v1.6* | ||
| #[serde(default, skip_serializing_if = "Option::is_none")] | ||
| #[bilrost(tag(5))] | ||
| pub current_config: Option<CurrentReplicaSetConfiguration>, | ||
| /// Next replica set configuration. | ||
| /// *Since v1.6* | ||
| #[serde(default, skip_serializing_if = "Option::is_none")] | ||
| #[bilrost(tag(6))] | ||
| pub next_config: Option<NextReplicaSetConfiguration>, | ||
| } | ||
|
|
||
| bilrost_storage_encode_decode!(AnnounceLeader); | ||
|
|
||
| impl HasRecordKeys for AnnounceLeader { | ||
| fn record_keys(&self) -> Keys { | ||
| Keys::RangeInclusive(self.partition_key_range.start()..=self.partition_key_range.end()) | ||
| } | ||
| } | ||
|
|
||
| #[serde_with::serde_as] | ||
| #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] | ||
| #[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bilrost::Message)] | ||
| pub struct CurrentReplicaSetConfiguration { | ||
| #[bilrost(tag = 1)] | ||
| pub version: Version, | ||
| #[bilrost(tag = 2)] | ||
| pub replica_set: NodeSet, | ||
| #[bilrost(tag = 3)] | ||
| pub modified_at: MillisSinceEpoch, | ||
| #[serde_as(as = "serde_with::DisplayFromStr")] | ||
| #[bilrost(tag = 4)] | ||
| pub replication: ReplicationProperty, | ||
| } | ||
|
|
||
|
|
@@ -87,9 +107,11 @@ impl CurrentReplicaSetConfiguration { | |
| } | ||
| } | ||
|
|
||
| #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] | ||
| #[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bilrost::Message)] | ||
| pub struct NextReplicaSetConfiguration { | ||
| #[bilrost(tag(1))] | ||
| pub version: Version, | ||
| #[bilrost(tag(2))] | ||
| pub replica_set: NodeSet, | ||
| } | ||
|
|
||
|
|
@@ -138,7 +160,7 @@ fn new_replica_set_state(version: Version, node_set: &NodeSet) -> ReplicaSetStat | |
| /// Readers before v1.4.0 will crash when reading this command. For v1.4.0+, the barrier defines the | ||
| /// minimum version of restate server that can progress after this command. It also updates the FSM | ||
| /// in case the command has been trimmed. | ||
| #[derive(Debug, Clone, PartialEq, Eq, bilrost::Message, serde::Serialize, serde::Deserialize)] | ||
| #[derive(Debug, Clone, bilrost::Message, serde::Serialize, serde::Deserialize)] | ||
| pub struct VersionBarrier { | ||
| /// The minimum version required (inclusive) to progress after this barrier. | ||
| pub version: SemanticRestateVersion, | ||
|
|
@@ -147,21 +169,44 @@ pub struct VersionBarrier { | |
| pub partition_key_range: Keys, | ||
| } | ||
|
|
||
| bilrost_storage_encode_decode!(VersionBarrier); | ||
|
|
||
| impl HasRecordKeys for VersionBarrier { | ||
| fn record_keys(&self) -> Keys { | ||
| self.partition_key_range.clone() | ||
| } | ||
| } | ||
|
|
||
| /// Updates the `PARTITION_DURABILITY` FSM variable to the given value. Note that durability | ||
| /// only applies to partitions with the same `partition_id`. At replay time, the partition will | ||
| /// ignore updates that are not targeted to its own ID. | ||
| /// | ||
| /// NOTE: The durability point is monotonically increasing. | ||
| /// | ||
| /// Since v1.4.2. | ||
| #[derive(Debug, Clone, PartialEq, Eq, bilrost::Message, serde::Serialize, serde::Deserialize)] | ||
| #[derive(Debug, Clone, bilrost::Message, serde::Serialize, serde::Deserialize)] | ||
| pub struct PartitionDurability { | ||
| #[bilrost(tag(1))] | ||
| pub partition_id: PartitionId, | ||
| /// The partition has applied this LSN durably to the replica-set and/or has been | ||
| /// persisted in a snapshot in the snapshot repository. | ||
| #[bilrost(tag(2))] | ||
| pub durable_point: Lsn, | ||
| /// Timestamp at which the durability point was updated | ||
| #[bilrost(tag(3))] | ||
| pub modification_time: MillisSinceEpoch, | ||
| /// partition key range | ||
| #[bilrost(tag(4))] | ||
| #[serde(default)] | ||
| pub partition_key_range: Keys, | ||
|
Comment on lines
+198
to
+201
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
In clusters upgrading with existing v1 WAL entries, Useful? React with 👍 / 👎.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's safe to |
||
| } | ||
|
|
||
| bilrost_storage_encode_decode!(PartitionDurability); | ||
|
|
||
| impl HasRecordKeys for PartitionDurability { | ||
| fn record_keys(&self) -> Keys { | ||
| self.partition_key_range.clone() | ||
| } | ||
| } | ||
|
|
||
| /// Consistently store schema across partition replicas. | ||
|
|
@@ -173,6 +218,14 @@ pub struct UpsertSchema { | |
| pub schema: Schema, | ||
| } | ||
|
|
||
| flexbuffers_storage_encode_decode!(UpsertSchema); | ||
|
|
||
| impl HasRecordKeys for UpsertSchema { | ||
| fn record_keys(&self) -> Keys { | ||
| self.partition_key_range.clone() | ||
| } | ||
| } | ||
|
|
||
| /// Consistently distribute the cluster-global rule book across partition | ||
| /// replicas. Each partition's leader observes a node-level cache of the | ||
| /// rule book stored in the metadata store and proposes this command when | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to avoid the extra allocation and serialize/deserialize directly into the indexmap?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will look into this one