rfc9722xml2.original.xml   rfc9722.xml 
<?xml version="1.0" encoding="US-ASCII"?> <?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE rfc [ <!DOCTYPE rfc [
<!ENTITY nbsp "&#160;"> <!ENTITY nbsp "&#160;">
<!ENTITY zwsp "&#8203;"> <!ENTITY zwsp "&#8203;">
<!ENTITY nbhy "&#8209;"> <!ENTITY nbhy "&#8209;">
<!ENTITY wj "&#8288;"> <!ENTITY wj "&#8288;">
]> ]>
<!-- used by XSLT processors --> <rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="std" docName="draft-ie
<?xml-stylesheet type='text/xsl' href='http://xml.resource.org/authoring/rfc2629 tf-bess-evpn-fast-df-recovery-12" number="9722" updates="8584" obsoletes="" cons
.xslt'?> ensus="true" submissionType="IETF" ipr="trust200902" tocInclude="true" tocDepth=
<!-- For a complete list and description of processing instructions (PIs), "4" symRefs="true" sortRefs="true" version="3" xml:lang="en">
please see http://xml.resource.org/authoring/README.html. -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space
(using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="std"
xmlns:xi="http://www.w3.org/2001/XInclude"
docName="draft-ietf-bess-evpn-fast-df-recovery-12"
updates="8584"
consensus="true"
submissionType="IETF"
ipr="trust200902">
<!-- ***** FRONT MATTER ***** -->
<front> <front>
<!-- The abbreviated title is used in the page header - it is only necessary <title abbrev="Fast Recovery for EVPN DF Election">Fast Recovery for EVPN Des
if the ignated Forwarder Election</title>
full title is longer than 39 characters --> <seriesInfo name="RFC" value="9722"/>
<title abbrev="Fast Recovery for EVPN DF-Election">Fast Recovery for EVPN Designated Forwarder Election</title>
<!-- add 'role="editor"' below for the editors if appropriate -->
<!-- Another author who claims to be an editor -->
<author fullname="Patrice Brissette" initials="P." surname="Brissette"> <author fullname="Patrice Brissette" initials="P." surname="Brissette">
<organization>Cisco</organization> <organization>Cisco</organization>
<address> <address>
<email>pbrisset@cisco.com</email> <email>pbrisset@cisco.com</email>
</address> </address>
</author> </author>
<author fullname="Ali Sajassi" initials="A." surname="Sajassi">
<author fullname="Ali Sajassi" initials="A." surname="Sajassi"> <organization>Cisco</organization>
<organization>Cisco</organization> <address>
<address> <email>sajassi@cisco.com</email>
<email>sajassi@cisco.com</email> </address>
</address> </author>
</author> <author fullname="Luc André Burdet" initials="LA." surname="Burdet" role="ed
itor">
<author fullname="Luc Andre Burdet" initials="LA." surname="Burdet" role="edit <organization>Cisco</organization>
or"> <address>
<organization>Cisco</organization> <email>lburdet@cisco.com</email>
<address> </address>
<email>lburdet@cisco.com</email> </author>
</address> <author fullname="John Drake" initials="J." surname="Drake">
</author> <organization>Independent</organization>
<address>
<author fullname="John Drake" initials="J." surname="Drake"> <email>je_drake@yahoo.com</email>
<organization>Independent</organization> </address>
<address> </author>
<email>je_drake@yahoo.com</email> <author fullname="Jorge Rabadan" initials="J." surname="Rabadan">
</address> <organization>Nokia</organization>
</author> <address>
<email>jorge.rabadan@nokia.com</email>
<author fullname="Jorge Rabadan" initials="J." surname="Rabadan"> </address>
<organization>Nokia</organization> </author>
<address> <date year="2025" month="April"/>
<email>jorge.rabadan@nokia.com</email>
</address>
</author>
<date year="2024" />
<!-- Meta-data Declarations -->
<area>General</area>
<workgroup>BESS Working Group</workgroup>
<!-- WG name at the upperleft corner of the doc,
IETF is fine for individual submissions.
If this element is not present, the default is "Network Working Group",
which is used by the RFC Editor as a nod to the history of the IETF. -->
<keyword>EVPN</keyword>
<keyword>Designated Forwarder</keyword>
<keyword>Convergence</keyword>
<keyword>Recovery</keyword>
<abstract>
<t>The Ethernet Virtual Private Network (EVPN) solution in RFC 7432 provides
Designated Forwarder (DF) election procedures for multihomed Ethernet Segments. These
procedures have been enhanced further by applying the Highest
Random Weight (HRW) algorithm for Designated Forwarder election
to avoid unnecessary DF status changes upon a failure.
This document improves these procedures by providing a fast Designated Forwarder
election upon recovery of the failed link or node associated
with the multihomed Ethernet Segment.
This document updates RFC 8584 by optionally introducing delays between
some of the events therein.</t>
<t>The solution is independent of the number of EVPN Instances (EVIs) associated with that Ethernet
Segment and it is performed via a simple signaling in BGP between the
recovered node and each of the other nodes in the multihoming group.</t>
</abstract>
</front> <area>RTG</area>
<workgroup>bess</workgroup>
<middle> <keyword>EVPN</keyword>
<section anchor="intro" title="Introduction"> <keyword>Designated Forwarder</keyword>
<t>The Ethernet Virtual Private Network (EVPN) solution <xref target="RFC74 <keyword>Convergence</keyword>
32"/> is <keyword>Recovery</keyword>
<abstract>
<t>The Ethernet Virtual Private Network (EVPN) solution in RFC 7432
provides Designated Forwarder (DF) election procedures for multihomed
Ethernet Segments. These procedures have been enhanced further by
applying the Highest Random Weight (HRW) algorithm for DF election to
avoid unnecessary DF status changes upon a failure. This document
improves these procedures by providing a fast DF election upon recovery
of the failed link or node associated with the multihomed Ethernet
Segment. This document updates RFC 8584 by optionally introducing
delays between some of the events therein.</t>
<t>The solution is independent of the number of EVPN Instances (EVIs)
associated with that Ethernet Segment, and it is performed via a simple
signaling in BGP between the recovered node and each of the other nodes
in the multihoming group.</t>
</abstract>
</front>
<middle>
<section anchor="intro">
<name>Introduction</name>
<t>The Ethernet Virtual Private Network (EVPN) solution <xref target="RFC7
432"/> is
widely used in data center (DC) applications for Network widely used in data center (DC) applications for Network
Virtualization Overlay (NVO) and DC interconnect (DCI) services, and Virtualization Overlay (NVO) and Data Center Interconnect (DCI) services an
in service provider (SP) applications for next generation virtual d
in service provider (SP) applications for next-generation virtual
private LAN services.</t> private LAN services.</t>
<t><xref target="RFC7432"/> describes Designated Forwarder (DF) election p
<t><xref target="RFC7432"/> describes Designated Forwarder (DF) election pr rocedures for
ocedures for
multihomed Ethernet Segments. These procedures are enhanced further in multihomed Ethernet Segments. These procedures are enhanced further in
<xref target="RFC8584"/> by applying the Highest Random Weight algorithm fo r DF <xref target="RFC8584"/> by applying the Highest Random Weight (HRW) algori thm for DF
election in order to avoid unnecessary DF status changes upon a link election in order to avoid unnecessary DF status changes upon a link
or node failure associated with the multihomed Ethernet Segment.</t> or node failure associated with the multihomed Ethernet Segment.</t>
<t>This document makes further improvements to the DF election procedures i n <t>This document makes further improvements to the DF election procedures in
<xref target="RFC8584"/> by providing an option for a fast DF election upon <xref target="RFC8584"/> by providing an option for a fast DF election upon
recovery of the failed link or node associated with the multihomed recovery of the failed link or node associated with the multihomed
Ethernet Segment. This DF election is achieved independent of the number Ethernet Segment. This DF election is achieved independent of the number
of EVPN Instances (EVIs) associated with that Ethernet Segment and it is pe rformed via of EVPN Instances (EVIs) associated with that Ethernet Segment, and it is p erformed via
straightforward signaling in BGP between the recovered node and each of the other nodes straightforward signaling in BGP between the recovered node and each of the other nodes
in the multihomed Ethernet Segment redundancy group.<br/> in the multihomed Ethernet Segment redundancy group.</t>
This document updates the DF Election Finite State Machine (FSM) described <t>This document updates the DF Election Finite State Machine (FSM)
in <relref target="RFC8584" section="2.1"/>, described in <xref target="RFC8584" section="2.1"/> by optionally
by optionally introducing delays between some events, as further detailed i introducing delays between some events, as further detailed in <xref
n <xref target="fsm_8584"/>. target="fsm_8584"/>. The solution is based on a simple one-way signaling
The solution is based on a simple one-way signaling mechanism.</t> mechanism.</t>
<section>
<section title="Requirements Language"> <name>Requirements Language</name>
<t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and
"OPTIONAL" in this document are to be interpreted as described in
BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only when,
they appear in all
capitals, as shown here.</t>
</section>
<section anchor="terminology" title="Terminology">
<t> <t>
<dl> The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", "<bcp14>REQU
<dt>PE:</dt><dd>Provider Edge device.</dd> IRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL
<dt>Designated Forwarder (DF):</dt><dd>A PE that is currently forward NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", "<bcp14>
ing RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>",
"<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are to
be interpreted as
described in BCP&nbsp;14 <xref target="RFC2119"/> <xref target="RFC8174"/>
when, and only when, they appear in all capitals, as shown here.
</t>
</section>
<section anchor="terminology">
<name>Terminology</name>
<dl>
<dt>PE:</dt>
<dd>Provider Edge</dd>
<dt>DF:</dt>
<dd>Designated Forwarder. A PE that is currently forwarding
(encapsulating/decapsulating) traffic for a given VLAN in and out of (encapsulating/decapsulating) traffic for a given VLAN in and out of
a site.</dd> a site.</dd>
<dt>NDF:</dt><dd>Non-Designated Forwarder, a PE that is currently blo <dt>NDF:</dt>
cking traffic (see <dd>Non-Designated Forwarder. A PE that is currently blocking traffic
(see
DF above).</dd> DF above).</dd>
<dt>EVI:</dt><dd>An EVPN instance spanning the Provider Edge (PE) dev <dt>EVI:</dt>
ices <dd>EVPN Instance. It spans the PE devices participating in that
participating in that EVPN.</dd> EVPN.</dd>
<dt>HRW:</dt><dd>Highest Random Weight algorithm, <xref target="HRW98 <dt>HRW:</dt>
"/> </dd> <dd>Highest Random Weight algorithm <xref target="HRW98"/></dd>
<dt>Service carving:</dt><dd>DF Election is also referred to as "serv <dt>Service carving:</dt>
ice carving" in <xref <dd>This refers to DF election, as defined in <xref target="RFC7432"/>
target="RFC7432"/></dd> .</dd>
<dt>SCT:</dt><dd>Service Carving Time, defined in this document, the <dt>SCT:</dt>
time at <dd>Service Carving Time. Defined in this document as the time at
which all nodes participating in an Ethernet Segment perform DF Elect ion.</dd> which all nodes participating in an Ethernet Segment perform DF Elect ion.</dd>
</dl> </dl>
</t> </section>
</section> <section anchor="challenges">
<name>Challenges with Existing Mechanism</name>
<section anchor="challenges" title="Challenges with Existing Mechanism"> <t>In EVPN technology, multiple PE devices encapsulate
<t>In EVPN technology, multiple Provider Edge (PE) devices encapsulate
and decapsulate data belonging to the same VLAN. Under certain condition s, this and decapsulate data belonging to the same VLAN. Under certain condition s, this
may cause duplicated Ethernet packets and potential loops if there is a momentary may cause duplicated Ethernet packets and potential loops if there is a momentary
overlap in forwarding roles between two or more PE devices, potentially also leading overlap in forwarding roles between two or more PE devices, potentially also leading
to broadcast storms of frames forwarded back into the VLAN.</t> to broadcast storms of frames forwarded back into the VLAN.</t>
<t>EVPN <xref target="RFC7432"/> currently specifies timer-based synchro nization among PE <t>EVPN <xref target="RFC7432"/> currently specifies timer-based synchro nization among PE
devices within an Ethernet Segment redundancy group. This approach can l ead to duplications and potential devices within an Ethernet Segment redundancy group. This approach can l ead to duplications and potential
loops due to multiple Designated Forwarders (DFs) if the timer interval loops due to multiple DFs if the timer interval is too short
is too short, or can lead to packet drops if the timer interval is too long.</t>
or to packet drops if the timer interval is too long.</t> <t>Split-horizon filtering, as described in <xref target="RFC7432" secti
on="8.3"/>,
<t>Split-horizon filtering, as described in <relref target="RFC7432" sec
tion="8.3"/>,
can prevent loops but does not address duplicates. can prevent loops but does not address duplicates.
However, if there are overlapping Designated Forwarders of two However, if there are overlapping DFs of two
different sites simultaneously for the same VLAN, the site identifier wi ll differ when the different sites simultaneously for the same VLAN, the site identifier wi ll differ when the
packet re-enters the Ethernet Segment. Consequently, the split-horizon c heck will fail, packet re-enters the Ethernet Segment. Consequently, the split-horizon c heck will fail,
resulting in layer-2 loops.</t> resulting in Layer 2 loops.</t>
<t>The updated DF procedures outlined in <xref target="RFC8584"/> <t>The updated DF procedures outlined in <xref target="RFC8584"/>
use the well-known use the well-known
Highest Random Weight&nbsp;(HRW) algorithm to prevent the reshuffling of VLANs among HRW algorithm to prevent the reshuffling of VLANs among
PE devices within the Ethernet Segment redundancy group during failure o r recovery events. This PE devices within the Ethernet Segment redundancy group during failure o r recovery events. This
approach minimizes the impact on VLANs not assigned to the failed or rec overed ports approach minimizes the impact on VLANs not assigned to the failed or rec overed ports
and eliminates the occurrence of loops or duplicates during such events. </t> and eliminates the occurrence of loops or duplicates during such events. </t>
<t>However, upon PE insertion or a port being newly added to a multihome d Ethernet Segment, <t>However, upon PE insertion or a port being newly added to a multihome d Ethernet Segment,
HRW cannot help either as a transfer of DF role to the new port must occ ur the HRW cannot help either, as a transfer of the DF role to the new port must occur
while the old DF is still active.</t> while the old DF is still active.</t>
<figure anchor="topology">
<figure anchor="topology" title="CE1 multihomed to PE1 and PE2."> <name>CE1 Multihomed to PE1 and PE2</name>
<artwork><![CDATA[ <artwork><![CDATA[
+---------+ +---------+
+-------------+ | | +-------------+ | |
| | | | | | | |
/ | PE1 |----| | +-------------+ / | PE1 |----| | +-------------+
/ | | | MPLS/ | | |---CE3 / | | | MPLS/ | | |---CE3
/ +-------------+ | VxLAN/ | | PE3 | / +-------------+ | VxLAN/ | | PE3 |
CE1 - | Cloud | | | CE1 - | Cloud | | |
\ +-------------+ | |---| | \ +-------------+ | |---| |
\ | | | | +-------------+ \ | | | | +-------------+
\ | PE2 |----| | \ | PE2 |----| |
| | | | | | | |
+-------------+ | | +-------------+ | |
+---------+ +---------+]]></artwork>
]]> </figure>
</artwork></figure>
<t>In <xref target="topology"/>, when PE2 is inserted in the Ethernet Se gment or its <t>In <xref target="topology"/>, when PE2 is inserted in the Ethernet Se gment or its
CE1-facing interface recovered, PE1 will transfer CE1-facing interface is recovered, PE1 will transfer
the DF role of some VLANs to PE2 to achieve load balancing. However, the DF role of some VLANs to PE2 to achieve load-balancing. However,
because there is no handshake mechanism between PE1 and PE2, because there is no handshake mechanism between PE1 and PE2,
overlapping of DF roles for a given VLAN is possible which leads to dupl overlapping of DF roles for a given VLAN is possible, which leads to dup
ication of lication of
traffic as well as layer-2 loops.</t> traffic as well as Layer 2 loops.</t>
<t>Current EVPN specifications <xref target="RFC7432"/> and <xref target ="RFC8584"/> <t>Current EVPN specifications <xref target="RFC7432"/> and <xref target ="RFC8584"/>
rely on a timer-based approach for transferring the DF role to the newly inserted device. rely on a timer-based approach for transferring the DF role to the newly inserted device.
This can cause the following issues: This can cause the following issues:</t>
<ul> <ul>
<li>Loops/Duplicates if the timer value is too short</li> <li>Loops and duplicates, if the timer value is too short</li>
<li>Prolonged Traffic Blackholing if the timer value is too long</li <li>Prolonged traffic loss, if the timer value is too long</li>
>
</ul> </ul>
</t> </section>
</section> <section anchor="advantages">
<name>Design Principles for a Solution</name>
<section anchor="advantages" title="Design Principles for a Solution">
<t>The clock-synchronization solution for fast DF recovery presented in this document <t>The clock-synchronization solution for fast DF recovery presented in this document
follows several design principles and offers follows several design principles and offers
multiple advantages, namely: multiple advantages, namely:
</t>
<ul> <ul>
<li>Complex handshake signaling mechanisms and state machines are <li>Complex handshake signaling mechanisms and state machines are
avoided in favor of a simple uni-directional signaling approach.</li avoided in favor of a simple unidirectional signaling approach.</li>
> <li>The fast DF recovery solution maintains backwards compatibility (s
<li>The fast DF recovery solution maintains backwards compatibility (s ee <xref target="ntpcompat"/>) by ensuring that PEs reject any unrecognized new
ee <xref BGP EVPN Extended Community.</li>
target="ntpcompat"/>) by ensuring that PEs reject any unrecognized new
BGP EVPN Extended Community.</li>
<li>Existing DF Election algorithms remain supported.</li> <li>Existing DF Election algorithms remain supported.</li>
<li>The fast DF recovery solution is independent of any BGP delays in propagation of Ethernet Segment <li>The fast DF recovery solution is independent of any BGP delays in propagation of Ethernet Segment
routes (Route Type 4)</li> routes (Route Type 4)</li>
<li>The fast DF recovery solution is agnostic of the actual time synch ronization mechanism <li>The fast DF recovery solution is agnostic of the actual time synch ronization mechanism
used; however, an NTP-based representation of time is used for EVPN si gnaling.</li> used; however, an NTP-based representation of time is used for EVPN si gnaling.</li>
</ul> </ul>
</t>
<t>The solution in this document relies on nodes in the topology, more s pecifically <t>The solution in this document relies on nodes in the topology, more s pecifically
the peering nodes of each Ethernet Segment, to be clock-synchronized and advertise Time the peering nodes of each Ethernet Segment, to be clock-synchronized and to advertise the Time
Synchronization capability. Synchronization capability.
When this is not the case, or clocks are badly desynchronized, network c When this is not the case, or when clocks are badly desynchronized, netw
onvergence and DF ork convergence and DF
Election is no worse than <xref target="RFC7432"/> due to the timestamp Election is no worse than that described in <xref target="RFC7432"/> due
range checking (<xref to the timestamp range checking (<xref target="timestamp_verification"/>).
target="timestamp_verification"/>).
</t> </t>
</section> </section>
</section>
</section> <section anchor="sync">
<name>DF Election Synchronization Solution</name>
<section anchor="sync" title="DF Election Synchronization Solution">
<t>The fast DF recovery solution relies on the concept of common clock ali gnment between partner PEs participating <t>The fast DF recovery solution relies on the concept of common clock ali gnment between partner PEs participating
in a common Ethernet Segment, i.e., PE1 and PE2 in <xref target="topology" />. The main idea is to have all peering PEs of that in a common Ethernet Segment, i.e., PE1 and PE2 in <xref target="topology" />. The main idea is to have all peering PEs of that
Ethernet Segment perform DF election and apply the result at the same prev Ethernet Segment perform DF election and apply the result at the same prev
iously-announced time. </t> iously announced time. </t>
<t>The DF Election procedure, as described in <xref target="RFC7432"/> and as optionally <t>The DF Election procedure, as described in <xref target="RFC7432"/> and as optionally
signaled in <xref target="RFC8584"/>, is applied. signaled in <xref target="RFC8584"/>, is applied.
All PEs attached to a given Ethernet Segment are clock-synchronized All PEs attached to a given Ethernet Segment are clock-synchronized
using a networking protocol for clock synchronization (e.g., NTP, PTP). using a networking protocol for clock synchronization (e.g., NTP, Precisio
Whenever possible, recovery activities for failed PEs SHOULD NOT be initia n Time Protocol (PTP)).
ted until after the Whenever possible, recovery activities for failed PEs <bcp14>SHOULD NOT</b
cp14> be initiated until after the
underlying clock synchronization protocol has converged to benefit from th is document's fast DF recovery underlying clock synchronization protocol has converged to benefit from th is document's fast DF recovery
procedures. procedures.
When a new PE is inserted in an Ethernet Segment or a failed PE of the Eth ernet When a new PE is inserted in an Ethernet Segment or when a failed PE of th e Ethernet
Segment recovers, that PE communicates to peering partners the current tim e plus the value of Segment recovers, that PE communicates to peering partners the current tim e plus the value of
the timer for partner discovery from step 2 in <relref target="RFC7432" se ction="8.5"/>. the timer for partner discovery from step 2 in <xref target="RFC7432" sect ion="8.5"/>.
This constitutes an "end time" or "absolute time" as seen from the local P E. This constitutes an "end time" or "absolute time" as seen from the local P E.
That absolute time is called the "Service Carving Time" (SCT).</t> That absolute time is called the Service Carving Time (SCT).</t>
<t>A new BGP EVPN Extended Community, the Service Carving Time, is adverti
<t>A new BGP EVPN Extended Community, the Service Carving Time is advertis sed along with
ed along with the Ethernet Segment Route Type 4 (RT-4) and communicates the SCT to other
the Ethernet Segment Route Type 4 (RT-4) and communicates the Service Carv
ing Time to other
partners to ensure an orderly transfer of forwarding duties.</t> partners to ensure an orderly transfer of forwarding duties.</t>
<t>Upon receipt of the new BGP EVPN Extended Community, partner PEs can de
<t>Upon receipt of the new BGP EVPN Extended Community, partner PEs can de termine the SCT
termine the service carving time
of the newly inserted PE. To eliminate any potential for duplicate traffic or loops, the of the newly inserted PE. To eliminate any potential for duplicate traffic or loops, the
concept of skew is introduced: a small time offset to ensure a controlled concept of "skew" is introduced: a small time offset to ensure a controlle
and orderly d and orderly
transition when multiple Provider Edge (PE) devices are involved. transition when multiple PE devices are involved.
The previously inserted PE(s) must perform service carving first for DF to NDF transitions. The previously inserted PE(s) must perform service carving first for DF to NDF transitions.
The receiving PEs subtract this skew (default = 10ms) from the Service Carving Time and apply DF The receiving PEs subtract this skew (default = 10 ms) from the Service Carving Time and apply DF
to NDF transitions first. This is followed shortly by the NDF to DF transitions on both PEs, after the skew delay. to NDF transitions first. This is followed shortly by the NDF to DF transitions on both PEs, after the skew delay.
On the recovering PE, all services are already in NDF state and no On the recovering PE, all services are already in NDF state, and no
skew for DF to NDF transitions is required.<br/> skew for DF to NDF transitions is required.</t>
This document proposes a default skew value of 10ms to allow completion of <t>This document proposes a default skew value of 10 ms to allow completio
programming the DF n of programming the DF
to NDF transitions, but implementations may make the skew larger (or confi gurable) taking to NDF transitions, but implementations may make the skew larger (or confi gurable) taking
into consideration scale, hardware capabilities and clock accuracy.</t> into consideration scale, hardware capabilities, and clock accuracy.</t>
<t>To summarize, all peering PEs perform service carving almost
<t>To summarize, all peering PEs perform service carving almost simultaneo simultaneously at the time announced by the newly added/recovered
usly at the time PE. The newly inserted PE initiates the SCT and triggers service carving
announced by the newly added/recovered PE. The newly inserted PE initiates immediately on its local timer expiry. The previously inserted PE(s)
the SCT, receiving Ethernet Segment route (RT-4) with an SCT BGP extended
and triggers service carving immediately on its local timer expiry. The pr community perform service carving shortly before the SCT for DF to NDF
eviously inserted PE(s) receiving Ethernet Segment route (RT-4) with an SCT BGP transitions and at the SCT for NDF to DF transitions.</t>
extended community, <section anchor="ntpencoding">
perform service carving shortly before Service Carving Time for DF to NDF <name>BGP Encoding</name>
transitions, and at
Service Carving Time for NDF to DF transitions.</t>
<section anchor="ntpencoding" title="BGP Encoding">
<t>A BGP extended community, with Type 0x06 and Sub-Type 0x0F, is define d to communicate the <t>A BGP extended community, with Type 0x06 and Sub-Type 0x0F, is define d to communicate the
Service Carving Time for each Ethernet Segment: SCT for each Ethernet Segment:</t>
<figure>
<figure title="Service Carving Time"><artwork><![CDATA[ <name>Service Carving Time</name>
<artwork><![CDATA[
1 2 3 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type = 0x06 | Sub-Type(0x0F)| Timestamp Seconds ~ | Type = 0x06 | Sub-Type(0x0F)| Timestamp Seconds ~
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
~ Timestamp Seconds | Timestamp Fractional Seconds | ~ Timestamp Seconds | Timestamp Fraction |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
]]> </figure>
</artwork></figure>
</t>
<t>
The timestamp exchanged uses the NTP prime epoch of January 1, 1900 <xref target="RFC5905"/>
and an adapted form of the 64-bit NTP Timestamp Format.<br/>
The 64-bit NTP Timestamp Format consists of a 32-bit part for Seconds and a 32-bit
part for Fraction, which are encoded in the Service Carving Time as follows:
<ul>
<li>Timestamp Seconds: 32-bit NTP seconds are encoded in this field.</li>
<li>Timestamp Fractional Seconds: the high order 16 bits of the NTP 'Fraction' field are encoded in this
field.</li>
</ul>
</t>
<t>When rebuilding a 64-bit NTP Timestamp Format using the values from a
received SCT BGP extended community, the lower order 16 bits of the
Fractional field are set to 0. The use of a 16-bit fractional seconds value yields adequate precision of 15 microseconds
(2^-16 s).</t>
<t>This document introduces a new flag called Time
Synchronization indicated by "T" in the DF Election Capabilities registry defined in <xref
target="RFC8584"/> for use in DF Election Extended Community.
<figure title="DF Election Extended Community"><artwork><![CDATA[ <t>The timestamp exchanged uses the NTP prime epoch of 0 h 1 January
1900 UTC <xref target="RFC5905"/> and an adapted form of the 64-bit NTP times
tamp format.</t>
<t>The 64-bit NTP timestamp format consists of a 32-bit unsigned seconds
field and a 32-bit fraction field, which are encoded in the
Service Carving Time as follows:</t>
<dl spacing="normal" newline="false">
<dt>Timestamp Seconds:</dt><dd>32-bit NTP seconds are encoded in this
field.</dd>
<dt>Timestamp Fraction:</dt><dd>The high-order 16 bits of
the NTP "Fraction" field are encoded in this field.</dd>
</dl>
<t>When rebuilding a 64-bit NTP timestamp format using the values from a
received SCT BGP extended community, the lower-order 16 bits of the
NTP "Fraction" field are set to 0. The use of a 16-bit fractional second
s value yields adequate precision of 15 microseconds
(2<sup>-16</sup> s).</t>
<t>The format of the DF Election Extended Community that is used in this
document is:</t>
<figure>
<name>DF Election Extended Community (RFC 8584)</name>
<artwork><![CDATA[
1 2 3 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type = 0x06 | Sub-Type(0x06)| RSV | DF Alg | Bitmap ~ | Type = 0x06 | Sub-Type(0x06)| RSV | DF Alg | Bitmap ~
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
~ Bitmap | Reserved | ~ Bitmap | Reserved |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
Figure 4: DF Election Extended Community <t>The Bitmap field (2 octets) encodes "capabilities" <xref target="RFC8
]]> 584"/>, where this
</artwork></figure> document introduces a new Time Synchronization capability indicated by "
T".</t>
<figure title="DF Election Capabilities"><artwork><![CDATA[ <figure>
<name>Bitmap Field in the DF Election Extended Community</name>
<artwork><![CDATA[
1 1 1 1
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |A| |T| | | |A| |T| |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
Figure 5: DF Election Capabilities
]]>
</artwork></figure>
</t>
<t>
<ul>
<li>Bit 3: Time Synchronization (corresponds to Bit 27 of the DF Electio
n Extended
Community). When set
to 1, it indicates the desire to use Time Synchronization capability
with the rest of the PEs in the Ethernet Segment.</li>
</ul>
</t>
<dl spacing="normal" newline="false">
<dt>Bit 3:</dt><dd>Time Synchronization (corresponds to Bit 27 of
the DF Election Extended Community). When set to 1, it indicates the
desire to use the Time Synchronization capability with the rest of the
PEs in the Ethernet Segment.</dd>
</dl>
<t> <t>
This capability is utilized in conjunction with the agreed-upon DF Elect ion Type. This capability is utilized in conjunction with the agreed-upon DF Elect ion Type.
For instance, if all the PE devices in the Ethernet Segment indicate the desire to use the For instance, if all the PE devices in the Ethernet Segment indicate the desire to use the
Time Synchronization capability and request the DF Election Type to be H Time Synchronization capability and request the DF Election Type to be t
ighest Random Weight (HRW), he HRW,
then the HRW algorithm is used in conjunction with this capability. A PE then the HRW algorithm is used in conjunction with this capability. A PE
which does not that does not
support the procedures set out in this document, or receives a route fro support the procedures set out in this document or that receives a route
m another PE in from another PE in
which the capability is not set, MUST NOT delay Designated Forwarder ele which the capability is not set <bcp14>MUST NOT</bcp14> delay DF electio
ction as this could n as this could
lead to duplicate traffic in some instances (overlapping Designated Forw lead to duplicate traffic in some instances (overlapping DFs).</t>
arders).</t>
</section> </section>
<section anchor="timestamp_verification">
<section anchor="timestamp_verification" title="Timestamp Verification"> <name>Timestamp Verification</name>
<t>The NTP Era value is not exchanged and participating PEs may consider <t>The NTP Era value is not exchanged, and participating PEs may conside
the timestamps to be in the same Era as their local value. r the timestamps to be in the same Era as their local value.
A DF Election operation occurring exactly at the next Era transition wil A DF Election operation occurring exactly at the next Era transition wil
l be sometime on l be some time on
February&nbsp;7,&nbsp;2036. Implementors and operators may address credi February 7, 2036. Implementors and operators may address credible
ble cases of rollover ambiguity (adjacent Eras n and n+1) as well as the sec
cases of rollover ambiguity (adjacent Eras n and n+1), as well as the se urity issue of unreasonably
curity issue of unreasonably large or unreasonably small NTP timestamps in the following manner.</t>
large or unreasonably small NTP timestamps, in the following manner.</t> <t>The procedures in this document address implicitly what occurs with r
eceiving an SCT value
<t>The procedures in this document address implicitly what occurs with r
eceiving a SCT value
in the past. This would be a naturally occurring event with a large BGP propagation delay: in the past. This would be a naturally occurring event with a large BGP propagation delay:
the receiving PE treats the receiving PE treats
the DF Election at the peer as having occurred already and proceeds with the DF Election at the peer as having already occurred and proceeds with
out starting any out starting any
timer to further delay service carving, effectively falling back on <xre timer to further delay service carving, effectively falling back on beha
f target="RFC7432"/> behavior. vior as specified in <xref target="RFC7432"/>.
A PE which receives a SCT value smaller than its current time, MUST disc A PE that receives an SCT value smaller than its current time <bcp14>MUS
ard the Service Carving Time and SHALL treat the DF Election at T</bcp14> discard the Service Carving Time and <bcp14>SHALL</bcp14> treat the DF
Election at
the peer as having occurred already.</t> the peer as having occurred already.</t>
<t>The more problematic scenario is the PE in Era n+1 that receives an S
<t>The more problematic scenario is the PE in Era n+1 which receives a S CT advertised by
ervice Carving Time advertised by
the PE still in Era n, with a very large SCT value. To address this Era rollover as well as the PE still in Era n, with a very large SCT value. To address this Era rollover as well as
the large values attack vector, implementations MUST validate the receiv the large values attack vector, implementations <bcp14>MUST</bcp14> vali
ed SCT against date the received SCT against
an upper-bound.<br/> an upper bound.</t>
It is left to implementations to decide what constitutes an "unreasonabl <t>It is left to implementations to decide what constitutes an "unreason
y large" SCT value. ably large" SCT value.
A recommended approach, however, is to compare the received offset to th e local peering timer value. A recommended approach, however, is to compare the received offset to th e local peering timer value.
In practice, peering timer values are configured uniformly across Ethern In practice, peering timer values are configured uniformly across Ethern
et-Segment peers and et Segment peers and
may be treated as an upper-bound on the offset of received SCT values. may be treated as an upper bound on the offset of received SCT values.
A PE which receives an SCT representing an offset larger than the local A PE that receives an SCT representing an offset larger than the local p
peering timer MUST eering timer <bcp14>MUST</bcp14>
discard the Service Carving Time and SHALL treat discard the SCT and <bcp14>SHALL</bcp14> treat
the DF Election at the peer as having occurred already, as above.</t> the DF Election at the peer as having already occurred, as above.</t>
</section> </section>
<section anchor="fsm_8584">
<section anchor="fsm_8584" title="Updates to RFC8584"> <name>Updates to RFC 8584</name>
<t>This document introduces an additional delay to the events and <t>This document introduces an additional delay to the events and
transitions defined for the default DF election algorithm FSM in transitions defined for the default DF election algorithm FSM in
<relref target="RFC8584" section="2.1"/> without changing the FSM state or event definitions <xref target="RFC8584" section="2.1"/> without changing the FSM state or event definitions
themselves.</t> themselves.</t>
<t>Upon receiving an RCVD_ES message, the peering PE's FSM transitions
<t>Upon receiving a RCVD_ES message, the peering PE's Finite State Machi from the DF_DONE state (indicating the DF election process was complete)
ne (FSM) transitions to the DF_CALC state
from the DF_DONE (indicating the DF election process was complete) state (indicating that a new DF calculation is needed). Due to the
to the DF_CALC SCT included in the Ethernet Segment update, the completion of the DF_CA
(indicating that a new DF calculation is needed) state. Due to the Servi LC state and the
ce Carving Time
(SCT) included in the Ethernet-Segment update, the completion of the DF_
CALC state and the
subsequent transition back to the DF_DONE state are delayed. This delay ensures proper subsequent transition back to the DF_DONE state are delayed. This delay ensures proper
synchronization and prevents conflicts. Consequently, the accompanying f orwarding updates to synchronization and prevents conflicts. Consequently, the accompanying f orwarding updates to
the Designated Forwarder (DF) and Non-Designated Forwarder (NDF) states the DF and NDF states are also deferred.</t>
are also deferred.</t>
<t>Item 9. in <relref target="RFC8584" section="2.1"/>, the list "Corres
ponding actions when transitions
are performed or states are entered/exited" is changed as follows:</t>
<ol start="9">
<li>DF_CALC on CALCULATED: Mark the election result for the VLAN or
VLAN Bundle.
<ol type="9.%d">
<li>If an SCT timestamp is present during the RCVD_ES event of Action 11
, wait until the
time indicated by the SCT minus skew before proceeding to step 9.3.</li>
<li>If an SCT timestamp is present during the RCVD_ES event of Action 11
, wait until the
time indicated by the SCT before proceeding to step 9.4.</li>
<li>Assume the role of NDF for the local PE concerning the VLAN or VLAN
Bundle, and transition to the DF_DONE state.</li>
<li>Assume the role of DF for the local PE concerning the VLAN or VLAN B
undle, and transition to the DF_DONE state.</li>
</ol>
</li>
</ol>
<t>This revised approach ensures proper timing and synchronization in th <t>Item 9 in <xref target="RFC8584" section="2.1"/>, in the list "Corres
e DF election ponding actions when transitions
process, avoiding conflicts and ensuring accurate forwarding updates.</t are performed or states are entered/exited", is changed as follows:</t>
>
</section>
</section> <blockquote>
<ol start="9" spacing="normal">
<li><t>DF_CALC on CALCULATED: Mark the election result for the VLAN
or VLAN bundle.</t>
<section anchor="example" title="Synchronization Scenarios"> <ol type="9.%d" spacing="normal">
<li>If no Service Carving Time is present during the RCVD_ES event o
f Action 11,
proceed to step 9.4.</li>
<t>Consider <xref target="topology"/> as an example, where initially PE2 <li>If a Service Carving Time is present during the RCVD_ES event of
has failed and PE1 has taken over. Action 11, wait until the time indicated by the SCT minus skew befor
This scenario illustrates the problem with the DF-Election mechanism des e proceeding to step
cribed in <relref target="RFC7432" section="8.5"/>, 9.3.</li>
specifically in the context of the timer value configured for all PEs on <li>Assume the role of NDF for the local PE concerning the VLAN or V
the Ethernet LAN bundle.
Segment.</t> Wait the remaining skew time before proceeding to step 9.4.</li>
<t>Procedure based on <relref target="RFC7432" section="8.5"/> with the <li>Assume the election result's role (DF or NDF) for the local PE c
default 3-second timer in step 2: oncerning the VLAN or
<ol> VLAN bundle and transition to the DF_DONE state.</li>
<li>Initial state: PE1 is in a steady-state and PE2 is recovering.</li </ol>
> </li>
<li>Recovery: PE2 recovers at an absolute time of t=99.</li>
<li>Advertisement: PE2 advertises RT-4, sent at t=100, to partner PE1.
</li>
<li>Timer Start: PE2 starts a 3-second timer to allow the reception of
RT-4 from other PE
nodes.</li>
<li>Immediate carving: PE1 performs service carving immediately upon R
T-4 reception, i.e., t=100 plus some BGP propagation delay.</li>
<li>Delayed Carving: PE2 performs service carving at time t=103.</li>
</ol> </ol>
</t> </blockquote>
<t><xref target="RFC7432"/> favors traffic drops over duplicate traffic.
With the above procedure, traffic drops will occur as part of each PE
recovery sequence
since PE1 transitions some VLANs to Non-Designated Forwarder (NDF) immed
iately upon RT-4
reception.<br/>
The timer value (default = 3 seconds) directly affects the duration of t
he packet
drops. A shorter (or zero) timer may result in duplicate traffic or traf
fic loops.</t>
<t>Procedure based on the Service Carving Time (SCT) approach: <t>This revised approach ensures proper timing and synchronization in th
<ol> e DF election
<li>Initial state: PE1 is in a steady state, and PE2 is recovering.</l process, avoiding conflicts and ensuring accurate forwarding updates.</t
i> >
<li>Recovery: PE2 recovers at an absolute time of t=99.</li> </section>
<li>Timer Start: PE2 starts at t=100 a 3-second timer to allow the rec </section>
eption of RT-4 from other PE <section anchor="example">
<name>Synchronization Scenarios</name>
<t>Consider <xref target="topology"/> as an example, where initially PE2
has failed and PE1 has taken over. This scenario illustrates the
problem with the DF Election mechanism described in <xref
target="RFC7432" section="8.5"/>, specifically in the context of the
timer value configured for all PEs on the Ethernet Segment.</t>
<t>The following procedure is based on <xref target="RFC7432"
section="8.5"/> with the default 3-second timer in step 2. </t>
<ol spacing="normal">
<li>Initial state: PE1 is in a steady state and PE2 is recovering.</li>
<li>Recovery: PE2 recovers at an absolute time of t=99.</li>
<li>Advertisement: PE2 advertises RT-4, sent at t=100, to its partner (P
E1).</li>
<li>Timer Start: PE2 starts a 3-second timer to allow the reception of
RT-4 from other PE nodes.</li>
<li>Immediate carving: PE1 performs service carving immediately upon
RT-4 reception, i.e., t=100 plus some BGP propagation delay.</li>
<li>Delayed Carving: PE2 performs service carving at time t=103.</li>
</ol>
<t><xref target="RFC7432"/> favors traffic drops over duplicate traffic.
With the above procedure, traffic drops will occur as part of each PE
recovery sequence since PE1 transitions some VLANs to
an NDF immediately upon RT-4 reception. The timer value
(default = 3 seconds) directly affects the duration of the packet
drops. A shorter (or zero) timer may result in duplicate traffic or
traffic loops.</t>
<t>The following procedure is based on the SCT approach:
</t>
<ol spacing="normal">
<li>Initial state: PE1 is in a steady state, and PE2 is recovering.</li>
<li>Recovery: PE2 recovers at an absolute time of t=99.</li>
<li>Timer Start: PE2 starts at t=100 a 3-second timer to allow the recep
tion of RT-4 from other PE
nodes.</li> nodes.</li>
<li>Advertisement: PE2 advertises RT-4, sent at t=100, with a target S <li>Advertisement: PE2 advertises RT-4, sent at t=100, with a target SCT
CT value of t=103 to value of t=103 to
partner PE1.</li> its partner (PE1).</li>
<li>Service Carving Timer: PE1 starts the service carving timer, with <li>Service Carving Timer: PE1 starts the service carving timer, with th
the remaining time e remaining time
until t=103.</li> until t=103.</li>
<li>Simultaneous Carving: Both PE1 and PE2 carve at an absolute time o <li>Simultaneous Carving: Both PE1 and PE2 carve at an absolute time of
f t=103.</li> t=103.</li>
</ol> </ol>
</t> <t>
To maintain the preference for minimal loss over duplicate traffic, PE1
<t> <bcp14>SHOULD</bcp14> carve
To maintain the preference for minimal loss over duplicate traffic, PE1 slightly before PE2 (with skew). The recovering PE2 performs both DF-to-
SHOULD carve NDF and NDF-to-DF
slightly before PE2 (with skew). The recovering PE2 performs both DF to
NDF and NDF to DF
transitions per VLAN at the timer's expiry. The original PE1, which rece ived the SCT, applies the following: transitions per VLAN at the timer's expiry. The original PE1, which rece ived the SCT, applies the following:
<ul> </t>
<li>DF to NDF Transition(s): at t=SCT minus skew, where both PEs are <ul spacing="normal">
NDF for the skew duration.</li> <li>DF-to-NDF Transition(s): at t=SCT minus skew, where both PEs are NDF
<li>NDF to DF Transition(s): at t=SCT.</li> for the skew duration.</li>
</ul> <li>NDF-to-DF Transition(s): at t=SCT.</li>
This split-behavior ensures a smooth DF role transition with minimal los </ul>
s. <t>
</t> This split behavior ensures a smooth DF role transition with minimal los
s.
<t>Using the SCT approach, the negative effect of the timer to allow the </t>
reception of <t>The SCT approach mitigates the negative effect of requiring a timer for
Ethernet Segment RT-4 from other PE nodes is mitigated. Furthermore, the discovery of
BGP Ethernet Segment (ES) RT-4 from other PE nodes. Furthermore, the BGP
transmission delay (from PE2 to PE1) of the ES RT-4 becomes a non-issue. The SCT approach shortens the transmission delay (from PE2 to PE1) of the ES RT-4 becomes a non-issue. The SCT approach shortens the
3-second timer window to the order of milliseconds.</t> 3-second timer window to the order of milliseconds.</t>
<t>The peering timer is a configurable value where 3 seconds represents th
<t>The peering timer is a configurable value where 3 seconds represents e default.
the default.
Configuring a timer value of 0, or so small as to expire during propagat ion of the BGP Configuring a timer value of 0, or so small as to expire during propagat ion of the BGP
routes, is outside the scope of this document. routes, is outside the scope of this document.
In reality, the use of the SCT approach presented in this document encou rages the use of In reality, the use of the SCT approach presented in this document encou rages the use of
larger peering timer values to overcome any sort of BGP route propagatio n delays.</t> larger peering timer values to overcome any sort of BGP route propagatio n delays.</t>
<section anchor="concurrent">
<section anchor="concurrent" title="Concurrent Recoveries"> <name>Concurrent Recoveries</name>
<t>In the eventuality 2 or more PEs in a peering Ethernet Segment group <t>In the eventuality that two or more PEs in a peering Ethernet Segment
are recovering group are recovering
concurrently or roughly the same time, each will advertise a Service Car concurrently or roughly at the same time, each will advertise an SCT.
ving Time.
This SCT value would correspond to what each recovering PE considers the "end time" for DF This SCT value would correspond to what each recovering PE considers the "end time" for DF
Election. A similar situation arises in sequentially recovering PEs, whe n a second PE Election. A similar situation arises in sequentially recovering PEs, whe n a second PE
recovers approximately at the time of the first PE's advertised SCT expi ry, and with its own recovers approximately at the time of the first PE's advertised SCT expi ry and with its own
new SCT-2 outside of the initial SCT window.</t> new SCT-2 outside of the initial SCT window.</t>
<t>In the case of multiple concurrent DF elections, each initiated by on e of the recovering <t>In the case of multiple concurrent DF elections, each initiated by on e of the recovering
PEs, the SCTs must be ordered chronologically. All PEs SHALL execute onl y a single DF PEs, the SCTs must be ordered chronologically. All PEs <bcp14>SHALL</bcp 14> execute only a single DF
Election at the service carving time corresponding to the largest (lates t) received timestamp value. Election at the service carving time corresponding to the largest (lates t) received timestamp value.
This DF Election will lead peering PEs into a single co-ordinated DF Ele This DF Election will lead peering PEs into a single coordinated DF Elec
ction update.</t> tion update.</t>
<t>Example: <t>Example:
</t>
<ol> <ol>
<li>Initial State: PE1 is in a steady state, with services elected at PE1.</li> <li>Initial State: PE1 is in a steady state, with services elected at PE1.</li>
<li>Recovery of PE2: PE2 recovers at time t=100 and advertises RT-4 wi th a target SCT <li>Recovery of PE2: PE2 recovers at time t=100 and advertises RT-4 wi th a target SCT
value of t=103 to its partners (PE1).</li> value of t=103 to its partner (PE1).</li>
<li>Timer Initiation by PE2: PE2 starts a 3-second timer to allow the reception of RT-4 <li>Timer Initiation by PE2: PE2 starts a 3-second timer to allow the reception of RT-4
from other PE nodes.</li> from other PE nodes.</li>
<li>Timer Initiation by PE1: PE1 starts the service carving timer, wit h the remaining time <li>Timer Initiation by PE1: PE1 starts the service carving timer, wit h the remaining time
until t=103.</li> until t=103.</li>
<li>Recovery of PE3: PE3 recovers at time t=102 and advertises RT-4 wi th a target SCT <li>Recovery of PE3: PE3 recovers at time t=102 and advertises RT-4 wi th a target SCT
value of t=105 to its partners (PE1, PE2).</li> value of t=105 to its partners (PE1, PE2).</li>
<li>Timer Initiation by PE3: PE3 starts a 3-second timer to allow the reception of RT-4 <li>Timer Initiation by PE3: PE3 starts a 3-second timer to allow the reception of RT-4
from other PE nodes.</li> from other PE nodes.</li>
<li>Timer Update by PE2: PE2 cancels the running timer and starts the service carving <li>Timer Update by PE2: PE2 cancels the running timer and starts the service carving
timer with the remaining time until t=105.</li> timer with the remaining time until t=105.</li>
<li>Timer Update by PE1: PE1 updates its service carving timer, with t he remaining time <li>Timer Update by PE1: PE1 updates its service carving timer, with t he remaining time
until t=105.</li> until t=105.</li>
<li>Service Carving: PE1, PE2, and PE3 perform service carving at the absolute time of t=105.</li> <li>Service Carving: PE1, PE2, and PE3 perform service carving at the absolute time of t=105.</li>
</ol> </ol>
</t> <t>In the eventuality that a PE in an Ethernet Segment group recovers du
ring the discovery window
<t>In the eventuality a PE in an Ethernet Segment group recovers during specified in <xref target="RFC7432" section="8.5"/> and does not support
the discovery window or advertise the
specified in <relref target="RFC7432" section="8.5"/>, and does not supp T-bit, all PEs in the current peering sequence <bcp14>SHALL</bcp14> imme
ort or advertise the diately revert to the default
T-bit, then all PEs in the current peering sequence SHALL immediately re behavior described in <xref target="RFC7432"/>.</t>
vert to the default </section>
<xref target="RFC7432"/> behavior.</t> </section>
<section anchor="ntpcompat">
</section> <name>Backwards Compatibility</name>
</section> <t>For the DF election procedures to achieve global convergence and unanim
ity within a
<section anchor="ntpcompat" title="Backwards Compatibility">
<t>For the DF election procedures to achieve global convergence and un
animity within a
redundancy group, it is essential that all participating PEs agree on the DF election redundancy group, it is essential that all participating PEs agree on the DF election
algorithm to be employed. However, it is possible that some PEs may co ntinue to use the algorithm to be employed. However, it is possible that some PEs may co ntinue to use the
existing modulo-based DF election algorithm from <xref target="RFC7432 existing modulo-based DF election algorithm from <xref target="RFC7432
"/> and not utilize the new Service Carving Time "/> and not utilize the new
(SCT) BGP extended community. PEs that operate using the baseline DF e SCT BGP extended community. PEs that operate using the baseline DF ele
lection mechanism ction mechanism
will simply discard the new SCT BGP extended community as unrecognized .</t> will simply discard the new SCT BGP extended community as unrecognized .</t>
<t>A PE can indicate its willingness to support clock-synchronized carving
<t>A PE can indicate its willingness to support clock-synchronized car by signaling
ving by signaling the new "T" DF Election Capability and including the new SCT BGP exten
the new 'T' DF Election Capability and including the new SCT BGP exten ded community along
ded community along
with the Ethernet Segment Route Type 4. If one or more PEs attached to the Ethernet with the Ethernet Segment Route Type 4. If one or more PEs attached to the Ethernet
Segment do not signal T=1, then all PEs in the Ethernet Segment SHALL revert to the Segment do not signal T=1, then all PEs in the Ethernet Segment <bcp14 >SHALL</bcp14> revert to the
timer-based approach as specified in <xref target="RFC7432"/>. This re version is particularly crucial in timer-based approach as specified in <xref target="RFC7432"/>. This re version is particularly crucial in
preventing VLAN shuffling when more than two PEs are involved.</t> preventing VLAN shuffling when more than two PEs are involved.</t>
<t>In the event a new or extra RT-4 is received without the new "T" DF Ele
<t>In the event a new or extra RT-4 is received without the new 'T' DF ction
Election
Capability in the midst of an ongoing DF Election sequence, all SCT-ba sed delays are Capability in the midst of an ongoing DF Election sequence, all SCT-ba sed delays are
cancelled and the DF Election immediately applied as specified in <xre canceled, and the DF Election is immediately applied as specified in <
f xref target="RFC7432"/>, as if no SCT had been previously exchanged.</t>
target="RFC7432"/>, as if no SCT had been previously exchanged.</t> </section>
<section anchor="security">
</section> <name>Security Considerations</name>
<t>The mechanisms in this document use the EVPN control plane as defined
<section anchor="security" title="Security Considerations"> in <xref target="RFC7432"/>. Security considerations described in <xref
<t>The mechanisms in this document use the EVPN control plane as defined target="RFC7432"/> are equally applicable.</t>
in <t>For the new SCT Extended Community, attack vectors may be setting the
<xref target="RFC7432"/>. Security considerations described in value to zero, to a value in the past, or to large times in the
<xref target="RFC7432"/> are equally applicable.</t> future. Handling of this attack vector is addressed in <xref
target="timestamp_verification"/> alongside NTP Era rollover
<t>For the new SCT Extended Community, attack vectors may be setting the ambiguity.</t>
value to zero, to a <t>This document uses MPLS- and IP-based tunnel technologies to support
value in the past or to large times in the future. Handling of this atta data plane transport. Security considerations described in <xref
ck vector is target="RFC7432"/> and <xref target="RFC8365"/> are equally
addressed in <xref target="timestamp_verification"/> alongside NTP Era r applicable.</t>
ollover ambiguity.</t> </section>
<section anchor="IANA">
<t>This document uses MPLS and IP-based tunnel technologies to support d <name>IANA Considerations</name>
ata plane transport. <t>IANA has made the following assignment in the "EVPN Extended
Security considerations described in <xref target="RFC7432"/> and in <xr Community Sub-Types" registry set up by <xref target="RFC7153"/>.
ef target="RFC8365"/> are equally applicable.</t> </t>
</section> <table>
<name></name>
<section anchor="IANA" title="IANA Considerations"> <thead>
<tr>
<t>IANA maintains the "EVPN Extended Community Sub-Types" registry set <th>Sub-Type Value</th>
up by <xref target='RFC7153'/>, where the following assignment has been m <th>Name</th>
ade: <th>Reference</th>
<figure><artwork><![CDATA[ </tr>
Sub-Type Value Name Reference </thead>
-------------- ------------------------- ------------- <tbody>
0x0F Service Carving Time This document <tr>
]]></artwork></figure> <td>0x0F</td>
</t> <td>Service Carving Time</td>
<td>RFC 9722</td>
<t>IANA maintains the "DF Election Capabilities" registry set up by </tr>
<xref target="RFC8584"/>. IANA is requested to make the following assign </tbody>
ment from </table>
this registry:
<figure><artwork><![CDATA[ <t>IANA has made the following assignment in the "DF Election
Bit Name Reference Capabilities" registry set up by <xref target="RFC8584"/>.</t>
---- ---------------- -------------
3 Time Synchronization This document
]]></artwork></figure>
</t> <table>
</section> <name></name>
</middle> <thead>
<tr>
<th>Bit</th>
<th>Name</th>
<th>Reference</th>
</tr>
</thead>
<tbody>
<tr>
<td>3</td>
<td>Time Synchronization</td>
<td>RFC 9722</td>
</tr>
</tbody>
</table>
<!-- *****BACK MATTER ***** --> </section>
</middle>
<back> <back>
<!-- References split into informative and normative --> <references>
<references title="Normative References"> <name>References</name>
<references>
<name>Normative References</name>
<xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.2 119.xml"/> <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.2 119.xml"/>
<xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8 174.xml"/> <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8 174.xml"/>
<xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7 153.xml"/> <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7 153.xml"/>
<xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7 432.xml"/> <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7 432.xml"/>
<xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8 365.xml"/> <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8 365.xml"/>
<xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8 584.xml"/> <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8 584.xml"/>
<xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.5 905.xml"/> <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.5 905.xml"/>
</references> </references>
<!-- References split into informative and normative --> <references>
<references title="Informative References"> <name>Informative References</name>
<reference anchor="HRW98" target="https://www.microsoft.com/en-us/resear <reference anchor="HRW98" target="https://www.microsoft.com/en-us/resear
ch/wp-content/ ch/wp-content/uploads/2017/02/HRW98.pdf">
uploads/2017/02/HRW98.pdf">
<front> <front>
<title>Using Name-Based Mappings to Increase Hit Rates</title> <title>Using Name-Based Mappings to Increase Hit Rates</title>
<author initials="D" surname="Thaler"> <author initials="D" surname="Thaler">
<organization/> <organization/>
</author> </author>
<author initials="C" surname="Ravishankar"> <author initials="C" surname="Ravishankar">
<organization/> <organization/>
</author> </author>
<date year="1998"/> <date month="February" year="1998"/>
</front> </front>
<refcontent>IEEE/ACM Transactions on Networking, vol. 6, no. 1</refcont ent>
</reference> </reference>
</references>
</references> </references>
<section anchor="contributors" title="Contributors"> <section anchor="acknowledgements" numbered="false">
<t>In addition to the authors listed on the front page, the following co-aut <name>Acknowledgements</name>
hors <t>Authors would like to acknowledge helpful comments and contributions
have also contributed substantially to this document:</t> of <contact fullname="Satya Mohanty"/> and <contact fullname="Bharath
Vasudevan"/>. Also thank you to <contact fullname="Anoop Ghanwani"/>
and <contact fullname="Gunter van de Velde"/> for their thorough review
with valuable comments and corrections.</t>
</section>
<t>Gaurav Badoni<br/>Cisco</t> <section anchor="contributors" numbered="false">
<t>Email: gbadoni@cisco.com</t> <name>Contributors</name>
<t>In addition to the authors listed on the front page, the following
coauthors have also contributed substantially to this document:</t>
<t>Dhananjaya Rao<br/>Cisco</t> <contact fullname="Gaurav Badoni">
<t>Email: dhrao@cisco.com</t> <organization>Cisco</organization>
</section> <address>
<email>gbadoni@cisco.com</email>
</address>
</contact>
<contact fullname="Dhananjaya Rao">
<organization>Cisco</organization>
<address>
<email>dhrao@cisco.com</email>
</address>
</contact>
<section anchor="acknowledgements" title="Acknowledgements">
<t>Authors would like to acknowledge helpful comments
and contributions of Satya Mohanty and Bharath Vasudevan.
Also thank you to Anoop Ghanwani and Gunter van de Velde for their thoro
ugh review with valuable comments and
corrections.</t>
</section> </section>
</back> </back>
</rfc> </rfc>
 End of changes. 102 change blocks. 
596 lines changed or deleted 514 lines changed or added

This html diff was produced by rfcdiff 1.48.