<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
    which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
    There has to be one entity for each item to be referenced. 
    An alternate method (rfc include) is described in the references. -->
<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2629.xml">
<!ENTITY RFC3552 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3552.xml">
<!ENTITY I-D.narten-iana-considerations-rfc2434bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.narten-iana-considerations-rfc2434bis.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs), 
    please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
    (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space 
    (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="std" docName="draft-xu-idr-fare-in-mpson-00" ipr="trust200902">
  <front>
    <title abbrev="FARE in Multi-plane SON">Fully Adaptive Routing Ethernet in
    Multi-Plane Scale-Out Networks</title>

    <author fullname="Xiaohu Xu" initials="X." surname="Xu">
      <organization>China Mobile</organization>

      <address>
        <email>xuxiaohu_ietf@hotmail.com</email>
      </address>
    </author>

    <author fullname="Zongying He" initials="Z." surname="He">
      <organization>Broadcom</organization>

      <address>
        <email>zongying.he@broadcom.com</email>
      </address>
    </author>

    <author fullname="Nan Wang" initials="N." surname="Wang">
      <organization>Intel</organization>

      <address>
        <email>nan.wang@intel.com</email>
      </address>
    </author>

    <author fullname="Nan Wang" initials="N." surname="Wang">
      <organization>Hygon</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>wangn@hygon.cn</email>

        <uri/>
      </address>
    </author>

    <author fullname="Wei Wan" initials="W." surname="Wan">
      <organization>Sugon</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>wanwei@sugon.com</email>

        <uri/>
      </address>
    </author>

    <author fullname="Hua Wang" initials="H." surname="Wang">
      <organization>Moore Threads</organization>

      <address>
        <email>wh@mthreads.com</email>
      </address>
    </author>

    <author fullname="Jian Guo" initials="J." surname="Guo">
      <organization>Biren Technology</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>jguo@birentech.com</email>

        <uri/>
      </address>
    </author>

    <author fullname="Xiang Li" initials="X." surname="Li">
      <organization>Enflame Technology</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>xiang.li@enflame-tech.com</email>

        <uri/>
      </address>
    </author>

    <author fullname="Tianyou Zhou" initials="T." surname="Zhou">
      <organization>Resnics Technology</organization>

      <address>
        <email>tzhou@resnics.com</email>
      </address>
    </author>

    <author fullname="Yongtao Yang" initials="Y." surname="Yang">
      <organization>Centec</organization>

      <address>
        <email>yangyt@centec.com</email>
      </address>
    </author>

    <author fullname="Yinben Xia" initials="Y." surname="Xia">
      <organization>Tencent</organization>

      <address>
        <email>forestxia@tencent.com</email>
      </address>
    </author>

    <author fullname="Weifeng Zhang" initials="W." surname="Zhang">
      <organization>Tencent</organization>

      <address>
        <email>wikkizhang@tencent.com</email>
      </address>
    </author>

    <author fullname="Peilong Wang" initials="P." surname="Wang">
      <organization>Baidu</organization>

      <address>
        <email>wangpeilong01@baidu.com</email>
      </address>
    </author>

    <author fullname="Haibo Wang" initials="H." surname="Wang">
      <organization>Huawei Technologies</organization>

      <address>
        <email>rainsword.wang@huawei.com</email>
      </address>
    </author>

    <author fullname="Fajie Yang" initials="F." surname="Yang">
      <organization>Cloudnine Information Technologies</organization>

      <address>
        <email>yangfajie@cloudnineinfo.com</email>
      </address>
    </author>

    <author fullname="Chao Li" initials="C." surname="Li">
      <organization>Metanet Networking Technology</organization>

      <address>
        <email>lichao22@ieisystem.com</email>
      </address>
    </author>

    <author fullname="Xiaojun Wang" initials="X." surname="Wang">
      <organization>Ruijie Networks</organization>

      <address>
        <email>wxj@ruijie.com.cn</email>
      </address>
    </author>

    <author fullname="Roman Glebov" initials="R." surname="Glebov">
      <organization>Yandex</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>kitaro630@yandex.ru</email>

        <uri/>
      </address>
    </author>

    <author fullname="Wei Sun" initials="W." surname="Sun">
      <organization>Yunsilicon Technology</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>sunw@yunsilicon.com</email>

        <uri/>
      </address>
    </author>

    <author fullname="Guoqiang Ma" initials="G." surname="Ma">
      <organization>NebulaMatrix</organization>

      <address>
        <postal>
          <street/>

          <city/>

          <region/>

          <code/>

          <country/>
        </postal>

        <phone/>

        <facsimile/>

        <email>patrick.ma@nebula-matrix.com</email>

        <uri/>
      </address>
    </author>

    <!---->

    <date day="10" month="June" year="2026"/>

    <abstract>
      <t>FARE&nbhy;BGP enables weighted ECMP load balancing using a
      path&nbhy;bandwidth extended community. FARE&nbhy;in&nbhy;SUN extends
      this mechanism from switches to GPUs for scale&nbhy;up networks, which
      are typically multi&nbhy;plane. Large AI training clusters are
      increasingly adopting multi&nbhy;plane scale&nbhy;out network
      topologies. This document further extends FARE&nbhy;BGP from switches to
      RoCE NICs (RNICs) for such multi&nbhy;plane scale&nbhy;out networks. The
      document also presents two techniques to address route scalability
      concerns caused by the injection of numerous host routes.</t>
    </abstract>

    <note title="Requirements Language">
      <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
      "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
      document are to be interpreted as described in <xref
      target="RFC2119">RFC 2119</xref>.</t>
    </note>
  </front>

  <middle>
    <section title="Introduction">
      <t>Large AI training clusters (approaching or even exceeding 100,000
      GPUs) are increasingly using multi&nbhy;plane scale&nbhy;out network
      topologies (see Figure 1) to reduce the total number of switches and
      links. In such a network, each RNIC is partitioned into multiple
      interfaces at either port or sub&nbhy;port granularity (note that a port
      can be further split into multiple sub&nbhy;ports using breakout cables
      or shuffles), with each interface connected to an independent CLOS
      fabric (referred to as a "plane"). Because there are no links between
      planes, the RNIC itself must decide which plane to use for each packet
      or flow. In other words, the RNIC needs to determine the reachability
      and available bandwidth of each plane, and then perform global
      load-balancing across them.</t>

      <t><figure>
          <artwork align="center"><![CDATA[      
       
   =========================================
   #        +----+ +----+                  #
   #        | S1 | | S2 |        (Spine)   #
   #        +----+ +----+                  #
   #                              Plane-1  #
   # +----+ +----+ +----+ +----+           #
   # | L1 | | L2 | | L3 | | L4 | (Leaf)    #
   # +----+ +----+ +----+ +----+           #
   =========================================

   ===================================     ============
   # +-----+ +-----+ +-----+ +-----+ #     #          #
   # |RNIC1| |RNIC2| |RNIC3| |RNIC4| #     #          #
   # +-----+ +-----+ +-----+ +-----+ #     #          #
   #              Server-1           #     # Server-n #
   =================================== ... ============

   =========================================
   # +----+ +----+ +----+ +----+           #
   # | L1 | | L2 | | L3 | | L4 | (Leaf)    #
   # +----+ +----+ +----+ +----+           #
   #                              Plane-2  #
   #        +----+ +----+                  #
   #        | S1 | | S2 |        (Spine)   #
   #        +----+ +----+                  #
   =========================================          


                              Figure 1
]]></artwork>
        </figure></t>

      <t>(For simplicity, the diagram above omits the connections between
      RNICs and leaf switches, as well as the connections between leaf
      switches and spine switches within the same plane. In practice, each
      RNIC is multi&nbhy;homed to one leaf switch in every plane.
      Additionally, each leaf switch is connected to all spine switches of its
      own plane.)</t>

      <t>FARE&nbhy;in&nbhy;SUN <xref target="I-D.xu-rtgwg-fare-in-sun"/>
      extends the FARE&nbhy;BGP protocol <xref target="I-D.xu-idr-fare"/> from
      switches to GPUs for scale&nbhy;up networks, which are typically
      multi&nbhy;plane. Since multi&nbhy;plane scale&nbhy;out networks share
      the same architectural pattern, the adaptive routing approach defined in
      FARE&nbhy;in&nbhy;SUN is directly applicable to them. </t>

      <t>The solution described in this document is almost identical to that
      in FARE&nbhy;in&nbhy;SUN, with two essential differences. First,
      FARE&nbhy;BGP is extended from switches to RNICs rather than to GPUs.
      Second, in a scale&nbhy;up network, the number of route entries is small
      (typically a few hundred) and can be installed directly on GPUs. In
      contrast, consider an isolated multi&nbhy;plane scale&nbhy;out network
      with 100,000 GPUs (assuming a 1:1 GPU&nbhy;to&nbhy;RNIC ratio) and four
      planes. If the loopback addresses of RNICs are used for QP
      establishment, each plane MUST propagate up to 100,000 host routes for
      RNICs to avoid the blackholing issue associated with route aggregation.
      Even when interface addresses (with different prefixes configured for
      interfaces attached to different planes) are used instead of loopback
      addresses, it may still be desirable to propagate those host routes to
      speed up failover. However, storing all these routes on an RNIC is
      impractical, and maintaining such a large number of host routes on
      switches is also suboptimal. Therefore, routing tables on RNICs MUST be
      suppressed, and routing tables on switches SHOULD be suppressed as
      well.</t>
    </section>

    <section anchor="Abbreviations_Terminology" title="Terminology">
      <t>This memo makes use of the terms defined in <xref
      target="RFC2119"/>.</t>
    </section>

    <section title="WECMP Load-balancing across Planes">
      <t>In an isolated multi&nbhy;plane scale&nbhy;out network, an RNIC is
      connected to each plane and configured as a stub BGP speaker per plane.
      It MUST establish separate BGP sessions with the attached leaf switches
      of each plane. The BGP neighbor discovery mechanism <xref
      target="I-D.xu-idr-neighbor-autodiscovery"/> MAY be used to simplify
      configuration.</t>

      <t>Through these sessions, the RNIC learns routes to remote RNICs
      together with the path&nbhy;bandwidth extended community and then
      performs Weighted Equal&nbhy;Cost Multi&nbhy;Path (WECMP)
      load-balancing as defined in <xref target="I-D.xu-idr-fare"/>. In this
      manner, the RNIC provides almost the same WECMP load-balancing
      functionality as a FARE&nbhy;capable GPU as defined in <xref
      target="I-D.xu-rtgwg-fare-in-sun"/>, distributing traffic in proportion
      to the weight of each ECMP route.</t>

      <section title="Per-flow WECMP Load-balancing">
        <t>Per&nbhy;flow weighted load balancing is recommended when ordered
        packet delivery is essential. </t>

        <t>For per&nbhy;flow weighted load balancing, at least one Queue Pair
        (QP) per plane MUST be established between a pair of RNICs.
        Furthermore, the following requirements SHOULD be met:</t>

        <t><list style="symbols">
            <t>If QPs are established using the loopback address assigned to
            each RNIC, each QP SHOULD be assigned a unique UDP source port to
            differentiate traffic flows across all available planes between
            the RNIC pair. </t>

            <t>If QPs are established using the physical addresses assigned
            directly to interfaces, there is no need to assign a unique UDP
            source port for each QP, because the interface address inherently
            distinguishes traffic flows across all available planes between
            the RNIC pair.</t>
          </list></t>

        <t>Switches within each plane SHOULD also perform per&nbhy;flow
        weighted load balancing to ensure ordered packet delivery for all
        QPs.</t>
      </section>

      <section title="Per-packet WECMP Load-balancing">
        <t>Per&nbhy;packet weighted load balancing is recommended when disordered
        packet delivery is acceptable (e.g., through the Direct Data Placement
        mechanism <xref target="RFC7306"/>). </t>

        <t>For per&nbhy;packet weighted load balancing, a single QP per RNIC
        pair is sufficient. Therefore, it is RECOMMENDED to use the loopback
        address assigned to each RNIC for QP establishment. The traffic of
        that QP is distributed across all available planes according to the
        weight of each plane. </t>

        <t>Switches within each network plane are RECOMMENDED to perform
        per&nbhy;packet weighted load balancing, as disordered packet delivery
        is acceptable for all QPs. </t>
      </section>
    </section>

    <section title="Route Table Suppression">
      <t>In an isolated multi&nbhy;plane scale&nbhy;out network with 100,000
      GPUs and four planes, each plane may propagate up to 100,000 host routes
      &ndash; a total of 400,000 routes. Storing all these routes on an RNIC
      is impractical. Moreover, maintaining roughly 100,000 host routes on the
      switches of each plane is also suboptimal. Consequently, the following
      two complementary approaches can be employed to reduce the number of
      routes that both the RNIC and the switches need to store. </t>

      <section title="Route Aggregation with Unreachable Host Route Advertisement">
        <t>A straightforward approach is to aggregate host routes for RNICs,
        especially when advertising them from leaf switches to RNICs. However,
        naive aggregation can create route blackholes: if a remote RNIC
        becomes unreachable via a given plane, the aggregated route to that
        RNIC over that plane remains on the local RNIC. Consequently, traffic
        destined for that remote RNIC will be forwarded by the local RNIC to
        that plane and then dropped within the plane.</t>

        <t>To address this issue, when an RNIC becomes disconnected from a
        given plane, the switch in that plane that performs route aggregation
        for the RNIC's host route (e.g., the leaf switch to which the RNIC was
        previously connected) MUST explicitly advertise the unreachability of
        that RNIC within the plane, while keeping the aggregated route intact.
        </t>

        <t>Specifically, the switch SHOULD advertise this unreachability using
        one of the following two methods:</t>

        <t><list style="symbols">
            <t>Path Bandwidth value of 0: The leaf switch advertises the host
            route (NLRI) with the Path Bandwidth extended community set to 0.
            The RNIC interprets this as "unreachable".</t>

            <t>Specific BGP unreachability advertisement: The leaf switch
            sends a dedicated BGP unreachability advertisement message as
            defined in <xref target="I-D.wang-idr-bgp-upa"/> or <xref
            target="I-D.krierhorn-idr-upa"/>. This message is distinct from a
            standard BGP route withdrawal and explicitly marks the host as
            unreachable via that plane. </t>
          </list></t>

        <t>When the corresponding specific prefix becomes reachable again, the
        unreachability advertisement MUST be withdrawn immediately.</t>

        <t>Upon receiving such an unreachability advertisement, the RNIC
        updates its forwarding table as follows: </t>

        <t><list style="symbols">
            <t>It locates the longest&nbhy;matching aggregated route that
            covers the unreachable host (e.g., a default route or a subnet
            prefix route). </t>

            <t>From that aggregated route's set of next&nbhy;hops (which
            originally includes multiple planes), it removes the next&nbhy;hop
            associated with the plane over which the unreachability advertisement
            was received. </t>

            <t>It then installs a host&nbhy;specific route for the unreachable
            destination, using the remaining next&nbhy;hops from the
            aggregated route.</t>
          </list></t>

        <t>For example, suppose an RNIC has an aggregated route (a.b.c.0/24)
        with next&nbhy;hops pointing to planes A, B, C, and D. Host X
        (a.b.c.d/32) becomes unreachable via plane A. The RNIC receives an
        unreachability advertisement for X and then installs a host&nbhy;specific
        route for X with next&nbhy;hops set to {B, C, D} &mdash; i.e., the
        next&nbhy;hop set of the longest&nbhy;matching aggregate route minus
        the next&nbhy;hop associated with plane A. As a result, traffic
        destined for X is never sent to plane A, thereby avoiding
        blackholes.</t>

        <t>This technique dramatically reduces the routing table size on the
        RNIC: the RNIC needs to store only aggregated routes plus a small
        number of host routes for RNICs that are unreachable via some planes.
        The majority of RNICs reachable across all planes are covered by the
        aggregated routes and therefore require no host routes. This approach
        is especially effective when unreachability is rare, which is typical
        in well&nbhy;managed clusters.</t>

        <t>Switches within each plane do not need to install the unreachable
        host route into their FIB tables.</t>
      </section>

      <section title="Prefix&nbhy;ORF&nbhy;based Route Filtering">
        <t>Since a given RNIC communicates only with a limited subset of GPUs
        (due to collective communication patterns in distributed AI training,
        such as data, pipeline, and tensor parallelism), it can filter routes
        to retain only those it actually needs.</t>

        <t>The RNIC sends Address Prefix ORF <xref target="RFC5292"/> entries
        to its BGP peer (leaf switch) per plane. These entries indicate the
        host routes for remote RNICs that the local RNIC is interested in. The
        peer filters outbound route updates accordingly, sending only the
        requested routes. Thus, the RNIC stores only a limited number of
        routes. </t>

        <t>For switches, there is no need to install host routes for remote
        RNICs. Therefore, the FIB suppression mechanism as described in <xref
        target="I-D.ietf-grow-va-auto"/> can be leveraged. More specifically,
        upon receiving host routes from the attached RNICs, leaf switches MAY
        tag those routes with a "FIB-Suppress" Extended Community attribute as
        defined in Section 4.2.1.</t>

        <t>Compared to the approach described in Section 4.1, this method
        enables fine&nbhy;grained WECMP load balancing. For example, some
        modern transceivers with partial lane failures may continue operating,
        though at reduced capacity. In such cases, even though each RNIC
        remains multi&nbhy;homed to multiple planes at the same nominal
        interface speed, the actual available bandwidth can differ across
        planes. By obtaining host routes for the communicating RNICs along
        with their associated path&nbhy;bandwidth attributes,
        fine&nbhy;grained WECMP load balancing is achieved.</t>

        <section title="FIB-Suppress Extended Community">
          <t>The FIB-Suppress Extended Community indicates that the associated
          routes MAY be suppressed from the FIB (i.e., not installed in the
          forwarding table). It is a new AS&nbhy;Specific Extended Community
          and MUST be transitive. The low&nbhy;order octet of the Type field
          is to be assigned (TBD).</t>

          <t>The Value field consists of two sub-fields:</t>

          <t><list style="symbols">
              <t>Global Administrator sub-field: This sub-field contains the
              AS number of the advertising router that appends the
              FIB-Suppress Extended Community.</t>

              <t>Local Administrator sub-field: This sub-field contains the
              Router ID of the advertising router that appends the
              FIB-Suppress Extended Community.</t>
            </list></t>
        </section>
      </section>
    </section>

    <section anchor="Acknowledgements" title="Acknowledgements">
      <t>TBD.</t>

      <!---->
    </section>

    <section anchor="IANA" title="IANA Considerations">
      <t>IANA is requested to allocate a low-order octet value for the
      FIB-Suppress Extended Community from the registry of Transitive
      Two-Octet AS-Specific Extended Community Sub-Types. Upon allocation,
      IANA is requested to reference this document. </t>
    </section>

    <section anchor="Security" title="Security Considerations">
      <t>TBD.</t>

      <!---->
    </section>
  </middle>

  <back>
    <references title="Normative References">
      <?rfc include='reference.RFC.2119'?>

      <?rfc include='reference.RFC.5292'?>

      <?rfc include="reference.I-D.xu-idr-fare"?>

      <?rfc include="reference.I-D.xu-idr-neighbor-autodiscovery"?>

      <?rfc include="reference.I-D.wang-idr-bgp-upa"?>

      <?rfc include="reference.I-D.krierhorn-idr-upa"?>

      <!---->
    </references>

    <references title="Informative References">
      <?rfc include='reference.RFC.7306'?>

      <?rfc include="reference.I-D.xu-rtgwg-fare-in-sun"?>

      <?rfc include="reference.I-D.ietf-grow-va-auto"?>

      <!---->
    </references>
  </back>
</rfc>
