<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<!-- 2025-01-13 Mon 11:55 -->
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>D3425R1: Reducing operation-state sizes for subobject child operations</title>
<meta name="author" content="Lewis Baker" />
<meta name="generator" content="Org Mode" />
<style>
  #content { max-width: 60em; margin: auto; }
  .title  { text-align: center;
             margin-bottom: .2em; }
  .subtitle { text-align: center;
              font-size: medium;
              font-weight: bold;
              margin-top:0; }
  .todo   { font-family: monospace; color: red; }
  .done   { font-family: monospace; color: green; }
  .priority { font-family: monospace; color: orange; }
  .tag    { background-color: #eee; font-family: monospace;
            padding: 2px; font-size: 80%; font-weight: normal; }
  .timestamp { color: #bebebe; }
  .timestamp-kwd { color: #5f9ea0; }
  .org-right  { margin-left: auto; margin-right: 0px;  text-align: right; }
  .org-left   { margin-left: 0px;  margin-right: auto; text-align: left; }
  .org-center { margin-left: auto; margin-right: auto; text-align: center; }
  .underline { text-decoration: underline; }
  #postamble p, #preamble p { font-size: 90%; margin: .2em; }
  p.verse { margin-left: 3%; }
  pre {
    border: 1px solid #e6e6e6;
    border-radius: 3px;
    background-color: #f2f2f2;
    padding: 8pt;
    font-family: monospace;
    overflow: auto;
    margin: 1.2em;
  }
  pre.src {
    position: relative;
    overflow: auto;
  }
  pre.src:before {
    display: none;
    position: absolute;
    top: -8px;
    right: 12px;
    padding: 3px;
    color: #555;
    background-color: #f2f2f299;
  }
  pre.src:hover:before { display: inline; margin-top: 14px;}
  /* Languages per Org manual */
  pre.src-asymptote:before { content: 'Asymptote'; }
  pre.src-awk:before { content: 'Awk'; }
  pre.src-authinfo::before { content: 'Authinfo'; }
  pre.src-C:before { content: 'C'; }
  /* pre.src-C++ doesn't work in CSS */
  pre.src-clojure:before { content: 'Clojure'; }
  pre.src-css:before { content: 'CSS'; }
  pre.src-D:before { content: 'D'; }
  pre.src-ditaa:before { content: 'ditaa'; }
  pre.src-dot:before { content: 'Graphviz'; }
  pre.src-calc:before { content: 'Emacs Calc'; }
  pre.src-emacs-lisp:before { content: 'Emacs Lisp'; }
  pre.src-fortran:before { content: 'Fortran'; }
  pre.src-gnuplot:before { content: 'gnuplot'; }
  pre.src-haskell:before { content: 'Haskell'; }
  pre.src-hledger:before { content: 'hledger'; }
  pre.src-java:before { content: 'Java'; }
  pre.src-js:before { content: 'Javascript'; }
  pre.src-latex:before { content: 'LaTeX'; }
  pre.src-ledger:before { content: 'Ledger'; }
  pre.src-lisp:before { content: 'Lisp'; }
  pre.src-lilypond:before { content: 'Lilypond'; }
  pre.src-lua:before { content: 'Lua'; }
  pre.src-matlab:before { content: 'MATLAB'; }
  pre.src-mscgen:before { content: 'Mscgen'; }
  pre.src-ocaml:before { content: 'Objective Caml'; }
  pre.src-octave:before { content: 'Octave'; }
  pre.src-org:before { content: 'Org mode'; }
  pre.src-oz:before { content: 'OZ'; }
  pre.src-plantuml:before { content: 'Plantuml'; }
  pre.src-processing:before { content: 'Processing.js'; }
  pre.src-python:before { content: 'Python'; }
  pre.src-R:before { content: 'R'; }
  pre.src-ruby:before { content: 'Ruby'; }
  pre.src-sass:before { content: 'Sass'; }
  pre.src-scheme:before { content: 'Scheme'; }
  pre.src-screen:before { content: 'Gnu Screen'; }
  pre.src-sed:before { content: 'Sed'; }
  pre.src-sh:before { content: 'shell'; }
  pre.src-sql:before { content: 'SQL'; }
  pre.src-sqlite:before { content: 'SQLite'; }
  /* additional languages in org.el's org-babel-load-languages alist */
  pre.src-forth:before { content: 'Forth'; }
  pre.src-io:before { content: 'IO'; }
  pre.src-J:before { content: 'J'; }
  pre.src-makefile:before { content: 'Makefile'; }
  pre.src-maxima:before { content: 'Maxima'; }
  pre.src-perl:before { content: 'Perl'; }
  pre.src-picolisp:before { content: 'Pico Lisp'; }
  pre.src-scala:before { content: 'Scala'; }
  pre.src-shell:before { content: 'Shell Script'; }
  pre.src-ebnf2ps:before { content: 'ebfn2ps'; }
  /* additional language identifiers per "defun org-babel-execute"
       in ob-*.el */
  pre.src-cpp:before  { content: 'C++'; }
  pre.src-abc:before  { content: 'ABC'; }
  pre.src-coq:before  { content: 'Coq'; }
  pre.src-groovy:before  { content: 'Groovy'; }
  /* additional language identifiers from org-babel-shell-names in
     ob-shell.el: ob-shell is the only babel language using a lambda to put
     the execution function name together. */
  pre.src-bash:before  { content: 'bash'; }
  pre.src-csh:before  { content: 'csh'; }
  pre.src-ash:before  { content: 'ash'; }
  pre.src-dash:before  { content: 'dash'; }
  pre.src-ksh:before  { content: 'ksh'; }
  pre.src-mksh:before  { content: 'mksh'; }
  pre.src-posh:before  { content: 'posh'; }
  /* Additional Emacs modes also supported by the LaTeX listings package */
  pre.src-ada:before { content: 'Ada'; }
  pre.src-asm:before { content: 'Assembler'; }
  pre.src-caml:before { content: 'Caml'; }
  pre.src-delphi:before { content: 'Delphi'; }
  pre.src-html:before { content: 'HTML'; }
  pre.src-idl:before { content: 'IDL'; }
  pre.src-mercury:before { content: 'Mercury'; }
  pre.src-metapost:before { content: 'MetaPost'; }
  pre.src-modula-2:before { content: 'Modula-2'; }
  pre.src-pascal:before { content: 'Pascal'; }
  pre.src-ps:before { content: 'PostScript'; }
  pre.src-prolog:before { content: 'Prolog'; }
  pre.src-simula:before { content: 'Simula'; }
  pre.src-tcl:before { content: 'tcl'; }
  pre.src-tex:before { content: 'TeX'; }
  pre.src-plain-tex:before { content: 'Plain TeX'; }
  pre.src-verilog:before { content: 'Verilog'; }
  pre.src-vhdl:before { content: 'VHDL'; }
  pre.src-xml:before { content: 'XML'; }
  pre.src-nxml:before { content: 'XML'; }
  /* add a generic configuration mode; LaTeX export needs an additional
     (add-to-list 'org-latex-listings-langs '(conf " ")) in .emacs */
  pre.src-conf:before { content: 'Configuration File'; }

  table { border-collapse:collapse; }
  caption.t-above { caption-side: top; }
  caption.t-bottom { caption-side: bottom; }
  td, th { vertical-align:top;  }
  th.org-right  { text-align: center;  }
  th.org-left   { text-align: center;   }
  th.org-center { text-align: center; }
  td.org-right  { text-align: right;  }
  td.org-left   { text-align: left;   }
  td.org-center { text-align: center; }
  dt { font-weight: bold; }
  .footpara { display: inline; }
  .footdef  { margin-bottom: 1em; }
  .figure { padding: 1em; }
  .figure p { text-align: center; }
  .equation-container {
    display: table;
    text-align: center;
    width: 100%;
  }
  .equation {
    vertical-align: middle;
  }
  .equation-label {
    display: table-cell;
    text-align: right;
    vertical-align: middle;
  }
  .inlinetask {
    padding: 10px;
    border: 2px solid gray;
    margin: 10px;
    background: #ffffcc;
  }
  #org-div-home-and-up
   { text-align: right; font-size: 70%; white-space: nowrap; }
  textarea { overflow-x: auto; }
  .linenr { font-size: smaller }
  .code-highlighted { background-color: #ffff00; }
  .org-info-js_info-navigation { border-style: none; }
  #org-info-js_console-label
    { font-size: 10px; font-weight: bold; white-space: nowrap; }
  .org-info-js_search-highlight
    { background-color: #ffff00; color: #000000; font-weight: bold; }
  .org-svg { }
</style>
</head>
<body>
<div id="content" class="content">
<h1 class="title">D3425R1: Reducing operation-state sizes for subobject child operations</h1>
<div id="table-of-contents" role="doc-toc">
<h2>Table of Contents</h2>
<div id="text-table-of-contents" role="doc-toc">
<ul>
<li><a href="#orgb4e7ec9">1. Abstract</a></li>
<li><a href="#org5bd7446">2. Motivation</a>
<ul>
<li><a href="#orga182ef9">2.1. Example</a></li>
<li><a href="#orgba4f8f0">2.2. Example - Revisited</a></li>
</ul>
</li>
<li><a href="#org847f8ec">3. Proposal</a>
<ul>
<li><a href="#org3dc0536">3.1. The core protocol</a></li>
<li><a href="#org2d52c4a">3.2. Adding a helper for child operation-states (optional)</a></li>
<li><a href="#org3eb7d00">3.3. Implementing <code>make_receiver_for()</code></a></li>
<li><a href="#orge11b828">3.4. Adding a helper for parent operation-states (optional/future)</a></li>
<li><a href="#orga01213a">3.5. Applying this optimisation to standard-library sender algorithms</a></li>
</ul>
</li>
<li><a href="#org15e07c8">4. Design Discussion</a>
<ul>
<li><a href="#org6c6ca12">4.1. Naming of <code>inlinable_receiver</code> concept and <code>inlinable_operation_state</code></a></li>
</ul>
</li>
<li><a href="#org9e3d38a">5. Proposed Wording</a>
<ul>
<li><a href="#orgfcaebe3">5.1. <code>inlinable_receiver</code> concept wording</a></li>
<li><a href="#org93dc0ff">5.2. Changes to <i><code>basic-operation</code></i></a></li>
<li><a href="#org26c937d">5.3. Changes to <i><code>just</code></i>, <i><code>just_error</code></i>, and <i><code>just_stopped</code></i></a></li>
<li><a href="#org1f8192d">5.4. Changes to <i><code>read_env</code></i></a></li>
<li><a href="#org8cfe691">5.5. Changes to <i><code>schedule_from</code></i></a></li>
<li><a href="#org5ed4102">5.6. Changes to <i><code>then</code></i>, <i><code>upon_error</code></i>, <i><code>upon_stopped</code></i></a></li>
<li><a href="#orge3815b0">5.7. Changes to <i><code>let_value</code></i>, <i><code>let_error</code></i>, <i><code>let_stopped</code></i></a></li>
<li><a href="#org8412ee9">5.8. Changes to <i><code>bulk</code></i></a></li>
<li><a href="#orgb2eef78">5.9. Changes to <i><code>split</code></i></a></li>
<li><a href="#org159dba0">5.10. Changes to <i><code>when_all</code></i></a></li>
<li><a href="#orgc6422ff">5.11. Changes to <i><code>into_variant</code></i></a></li>
<li><a href="#org0eda1e7">5.12. Changes to <i><code>run_loop::run-loop-sender</code></i></a></li>
</ul>
</li>
<li><a href="#org716872d">6. References</a></li>
</ul>
</div>
</div>
<style>
  ins { background-color:#A0FFA0 }
  del { background-color:#FFA0A0 }
  expos { font-style:italic }
  exposonly {
    content: "// exposition only";
    font-style:italic;
  }
  div.section {
    counter-reset: paragraph;
    background-color:#A0FFA0;
  }
  div.wording > div.section {
    padding: 5px;
  }
  div.wording > div.section > h3 {
    margin-top: 5px;
  }
  div.wording {
    counter-reset: paragraph;
    margin-left: 50px
  }
  div.wording p.numbered::before {
    position: absolute;
    margin-left: -20px;
    margin-top: 2px;
    font-size: 0.75em;
    color: #CCC;
    content: counter(paragraph);
    counter-increment: paragraph;
  }
  div.wording ul {
    list-style-type: '\2014   ';
    counter-reset: paragraph;
  }
  div.wording ul li::before {
    position: absolute;
    margin-left: -75px;
    margin-top: 2px;
    font-size: 0.75em;
    color: #CCC;
    content: "(" counters(paragraph, ".") ")";
    counter-increment: paragraph;
  }
  div.wording ul ul li::before {
    margin-left: -125px;
  }
  div.wording ul ul ul li::before {
    margin-left: -175px;
  }
  div.wording ul ul ul ul li::before {
    margin-left: -225px;
  }
</style>

<table border="2" cellspacing="0" cellpadding="6" rules="groups" frame="hsides">


<colgroup>
<col  class="org-left" />

<col  class="org-left" />
</colgroup>
<tbody>
<tr>
<td class="org-left">Document</td>
<td class="org-left"><b>D3425R1</b></td>
</tr>

<tr>
<td class="org-left">Date</td>
<td class="org-left"><b>2024-11-15</b></td>
</tr>

<tr>
<td class="org-left">Reply To</td>
<td class="org-left"><b>Lewis Baker &lt;lewissbaker@gmail.com&gt;</b></td>
</tr>

<tr>
<td class="org-left">Audience</td>
<td class="org-left"><b>LEWG</b></td>
</tr>
</tbody>
</table>

<div id="outline-container-orgb4e7ec9" class="outline-2">
<h2 id="orgb4e7ec9"><span class="section-number-2">1.</span> Abstract</h2>
<div class="outline-text-2" id="text-1">
<p>
This paper proposes defining a standard protocol between parent operation-state and child-operation state
that permits the child-operation state to avoid needing to store the receiver in cases where the receiver
can be computed on-demand from the address of the child operation-state object.
</p>

<p>
Enabling this optimisation can save a pointer of storage for each nested operation-state, e.g. saving
40 bytes for a sender-expression 5 levels deep. Further, it can improve the performance of queries
to retrieve properties from the receiver's environment, by translating a series of pointer-dereferences
through successive parent-operation-state pointers into a series of constant-offsets from the child
operation-state address, allowing compilers to constant-fold such queries into a single pointer-dereference,
reducing code size and improving runtime performance.
</p>

<p>
While such a protocol is purely additive and could theoretically be added later, there are advantages to
adding it now as it permits us to specify that default implementations of standard-library algorithms
added by P2300 must incorporate this protocol in their implementation - something that may be difficult
to retrofit later, due to the potential for this causing breaking changes in type-layout.
</p>
</div>
</div>

<div id="outline-container-org5bd7446" class="outline-2">
<h2 id="org5bd7446"><span class="section-number-2">2.</span> Motivation</h2>
<div class="outline-text-2" id="text-2">
<p>
When a sender-algorithm composes one or more child operations, it typically connects the child senders
with a receiver that contains a pointer to the parent operation-state as its only data-member.
The operation-state returned from the call to <code>connect()</code> will need to store this receiver as a
data-member so that it can later call the completion-functions on the receiver to signal completion
of the operation.
</p>

<p>
This means that for every sender in a sender expression-tree we generally have a linked list of
operation-states, with pointers wrapped up inside receivers, going from leaf operations up to
the top-most operation. This bears similarities to a traditional call-stack, although in the
case of senders the links can form a tree rather than being limited to a linear stack.
</p>

<p>
For algorithms that have a statically-known number of child operations, it is common to have the child
operation-states stored as data-members of the parent operation-state as this allows them to avoid
dynamic memory allocations.
</p>

<p>
However, if we think about it, for cases where the child operation-state is a direct member of the
parent operation-state, the pointer to the parent operation-state stored in the receiver held by
the child is always going to be pointing to some constant offset relative to the child operation-state's
address.
</p>

<p>
In cases where we can compute the address of the parent operation-state from the address of the child
operation-state, we can potentially reduce the size of child operation-states by not storing the receiver
and instead just constructing the receiver on-demand from the address of the child operation-state
whenever the child operation-state needs it.
</p>

<p>
Doing this would have two main benefits:
</p>

<ol class="org-ol">
<li>It reduces the size of an overall operation-state tree by at least one pointer for each operation in
the tree (it might save more due to padding for some operation-states).</li>
<li>When querying the environment of the receiver it can turn a walk of a linked-list of operation-states
to get the address of the operation-state providing some query result into applying a series of
constant offsets from the current operation-state's address.</li>
</ol>

<p>
There are also some secondary benefits for composition of algorithms that adopt these optimisations:
</p>

<ol class="org-ol">
<li>It reduces the run-time overhead of composing existing algorithms to build new algorithms.</li>
<li>This in turn reduces the need to write your own sender algorithms from scratch as often
in order to get optimal performance.</li>
</ol>
</div>

<div id="outline-container-orga182ef9" class="outline-3">
<h3 id="orga182ef9"><span class="section-number-3">2.1.</span> Example</h3>
<div class="outline-text-3" id="text-2-1">
<p>
For example, consider the following sender-expression:
</p>
<div class="org-src-container">
<pre class="src src-c++">  when_all(
    then(                            // then_op#3
      then(                          // then_op#2
        then(                        // then_op#1
          schedule(thread_pool),
          f),
        g),
      h),
    then(                            // then_op#6
      then(                          // then_op#5
        then(                        // then_op#4
          schedule(thread_pool),
          a),
        b),
      c))
</pre>
</div>

<p>
With the status-quo, connecting the resulting sender to a receiver, <code>rcvr</code>, would result in the
following operation-state layout:
</p>

<pre class="example">
                          A
  +-----------------------|-----------------+
  | when_all_op           |  A              |
  | - rcvr (parent_op*) --'  |              |
  | - ref_count              |              |
  | - stop_source            |              |
  | - stop_callback          |              |
  | - result_tuple           |              |
  | +------------------------|------------+ |
  | | then_op#3              | A          | |
  | | - rcvr (when_all_op*) -' |          | |
  | | - h                      |          | |
  | | +------------------------|--------+ | |
  | | | then_op#2              | A      | | |
  | | | - rcvr (then_op#3*) ---' |      | | |
  | | | - g                      |      | | |
  | | | +------------------------|----+ | | |
  | | | | then_op#1              | A  | | | |
  | | | | - rcvr (then_op#2*) ---' |  | | | |
  | | | | - f                      |  | | | |
  | | | | +------------------------|+ | | | |
  | | | | | schedule_op            || | | | |
  | | | | | - rcvr (then_op#1*) ---'| | | | |
  | | | | | - thread_pool*          | | | | |
  | | | | | - stop_callback         | | | | |
  | | | | | - ...                   | | | | |
  | | | | +-------------------------+ | | | |
  | | | +-----------------------------+ | | |
  | | +---------------------------------+ | |
  | +-------------------------------------+ |
  | +-------------------------------------+ |
  | | then_op#6                           | |
  | | - rcvr (when_all_op*)               | |
  | | - ... (similar to above)            | |
  | +-------------------------------------+ |
  +-----------------------------------------+
</pre>

<p>
There are a few things worth noting here.
</p>

<p>
<b>Operation State Size</b>
</p>

<p>
The child operation states all hold a receiver that contains a pointer
to the parent operation-state. In total, this consists of 8x pointers to
parent operation-states (9x pointers if you include the one likely to be stored
in the receiver held by <code>when_all_op</code>).
</p>

<p>
Together, these contribute at least 64-72-bytes in total across the whole
operation-state hierarchy - possibly more depending on the size/alignment
of the function-objects passed to <code>then()</code>, e.g. if padding is required.
</p>

<p>
<b>Cost of Environment Queries</b>
</p>

<p>
The leaf <code>schedule_op</code> operations need to subscribe to a stop-callback
on the environment's current stop-token in order to support cancellation of
the operation - the <code>when_all()</code> algorithm can send a stop-request to children
if any of them fail.
</p>

<p>
However, in order to obtain the stop-token needed to register the stop-callback,
the <code>schedule_op</code> implementation needs to ask its receiver by calling
<code>std::get_stop_token(std::execution::get_env(rcvr))</code>.
</p>

<p>
The <code>get_stop_token</code> query on the receiver stored in the <code>schedule_op</code>
forwards the query to the receiver stored in the <code>then_op#1</code> object, which then
forwards the query to the receiver stored in the <code>then_op#2</code> object, which then
forwards the query to the receiver stored in the <code>then_op#3</code> object, which then
satisfies the query by calling <code>stop_source.get_token()</code> on the stop-source
stored in the <code>when_all_op</code> object.
</p>

<p>
This is 4x pointer dereferences needed to obtain the address of the stop-source
object required to obtain the stop-token.
Queries which propagate further up the stack of sender-operations might have to
do even more pointer dereferencing to get the query result.
</p>

<p>
Further, each query that is satisfied by a parent environment up the stack
will require its own walk through these pointers to the operation-state that
fulfils that particular query.
</p>

<p>
This stack-walking has a run-time cost - the successive loads from memory of the
pointer data cost possibly a few cycles if the data is in cache, but could be 10s or 100s
of cycles if several of the loads need to go to main memory. The successive loads
are all dependent on the prior loads and so the CPU cannot generally pipeline the
loads.
</p>

<p>
The stack-walking logic also has a code-size cost. The compiler needs to generate
a sequence of N mov/load instructions for evaluating each query, where N is the number
of levels up the stack the query needs to traverse in order to get to the
operation-state that satisfies the query.
</p>

<p>
<b>Cost of Completion</b>
</p>

<p>
Similar to the cost of pointer-walking for performing queries, calling the completion-function
on a receiver also often requires dereferencing the same set of pointers.
</p>

<p>
In the example above, when the schedule-operation completes it needs to load the pointer to
the <code>then_op#1</code> operation state from the <code>schedule_op</code> state in order to compute the address
of the function-object, <code>f</code>, to invoke. Then, when <code>f()</code> returns, it needs to load the
pointer to the <code>then_op#2</code> operation state from the <code>then_op#1</code> state in order to compute
the address of the function-object, <code>g</code>, to invoke, and so forth.
</p>

<p>
In a more extreme example, consider the case where a composition of nested operations
all just forward through the result to the parent receiver up N levels until the eventual
ancestor operation that handles the result. Even in this case, where there is no processing
of the result datums being done, we still need to follow the linked-list of operation-states
in order to compute the address of the final handler of the result.
</p>

<p>
<b>Cost of Composition</b>
</p>

<p>
The net result of all of the above costs is that there is a cost to composing these
operations.
</p>

<p>
If the code had, instead, been written with a single <code>then()</code> which took a function object
that composed <code>f</code>, <code>g</code> and <code>h</code> then the result would have less overhead than the expression
where each of these transformations is applied in a separate <code>then()</code> invocation.
</p>

<p>
For example, we could have written:
</p>
<div class="org-src-container">
<pre class="src src-c++">  when_all(
    then(schedule(thread_pool), [f, g, h] { return h(g(f())); }),
    then(schedule(thread_pool), [a, b, c] { return c(b(a())); }))
</pre>
</div>

<p>
and this would have fewer operation-state pointers and fewer pointer-indirections than
the original code above.
</p>

<p>
While, in some cases, this kind of manual flattening of composition is possible - it is not always possible.
</p>

<p>
This makes the cost of composition have non-zero runtime overhead.
</p>

<p>
This is likely to have the unfortunate side-effect of encouraging users to try to write
their code using as few layers of composition as possible - potentially making their code
more complex, or even having to write new sender algorithms that implement certain compositions
more efficiently.
</p>
</div>
</div>

<div id="outline-container-orgba4f8f0" class="outline-3">
<h3 id="orgba4f8f0"><span class="section-number-3">2.2.</span> Example - Revisited</h3>
<div class="outline-text-3" id="text-2-2">
<p>
If we look at the above example, but this time with the optimisations proposed in this paper
being applied, then the resulting operation-state will, instead, look something like this:
</p>

<pre class="example">

                     A
  +------------------|---+------A-----------+
  | when_all_op      |   |      |           |
  | - (maybe?) rcvr -'   |      |           |
  | - ref_count          |      |           |
  | - stop_source    &lt;---'      | -72 bytes |
  | - stop_callback   +16 bytes |           |
  | - result_tuple              |           |
  | +----------------------A----+---------+ |
  | | then_op#3            | -16 bytes    | |
  | | - h                  |              | |
  | | +-----------------A--+------------+ | |
  | | | then_op#2       | -4 bytes      | | |
  | | | - g             |               | | |
  | | | +-------------A-+-------------+ | | |
  | | | | then_op#1   | -8 bytes      | | | |
  | | | | - f         |               | | | |
  | | | | +-----------+-------------+ | | | |
  | | | | | schedule_op             | | | | |
  | | | | | - thread_pool*          | | | | |
  | | | | | - stop_callback         | | | | |
  | | | | | - ...                   | | | | |
  | | | | +-------------------------+ | | | |
  | | | +-----------------------------+ | | |
  | | +---------------------------------+ | |
  | +-------------------------------------+ |
  | +-------------------------------------+ |
  | | then_op#6                           | |
  | | - ... (similar to above)            | |
  | +-------------------------------------+ |
  +-----------------------------------------+
</pre>

<p>
In this case, each of the child operations knows how to compute the address of the parent-operation
state from the address of the child operation state - because the parent operation-state injects this
information in along with the receiver in the form of a static function on the receiver type.
</p>

<p>
So, when the <code>schedule_op</code> object goes to construct the <code>stop_callback</code> member and needs to get
the stop-token from the environment, the compiler sees a series of inlinable calls to compute the
parent receiver, each of which just subtracts an offset from the child operation-state.
</p>

<p>
The net result is that, in optimised compilation modes, the compiler can constant-fold all of these
offsets into a single offset from the <code>schedule_op</code> address and thus does not need to perform any
memory loads in order to obtain the stop-token (which is just initialized with the address of the
stop-source object).
</p>

<p>
For example, in the above operation-state layout diagram, the compiler would effectively lower this
code to the equivalent of the following (after inlining):
</p>
<div class="org-src-container">
<pre class="src src-c++">  void schedule_op::start() noexcept {
    // Evaluate:
    //  auto st = std::get_stop_token(std::get_env(this-&gt;get_receiver()));
    //
    // Lowers to equivalent to:
    auto* _op1 = reinterpret_cast&lt;then_op_1*&gt;(reinterpret_cast&lt;unsigned char*&gt;(this) - 8);
    auto* _op2 = reinterpret_cast&lt;then_op_2*&gt;(reinterpret_cast&lt;unsigned char*&gt;(_op1) - 4);
    auto* _op3 = reinterpret_cast&lt;then_op_3*&gt;(reinterpret_cast&lt;unsigned char*&gt;(_op2) - 16);
    auto* _when_all_op = reinterpret_cast&lt;when_all_op*&gt;(reinterpret_cast&lt;unsigned char*&gt;(_op3) - 72);
    auto st = _when_all_op-&gt;stop_source.get_token();
    // ...
  }
</pre>
</div>

<p>
Which, after constant-folding would result in a constant offset from <code>this</code>:
</p>
<div class="org-src-container">
<pre class="src src-c++">  void schedule_op::start() noexcept {
    // Evaluate:
    //  auto st = std::get_stop_token(std::get_env(this-&gt;get_receiver()));
    //
    // Lowers to equivalent to:
    auto&amp; _ss = *reinterpret_cast&lt;std::inplace_stop_source*&gt;(
                   reinterpret_cast&lt;unsigned char*&gt;(this) - 84);
    auto st = _ss.get_token();
    // ...
  }
</pre>
</div>

<p>
In addition to this being more optimisable by the compiler, the overall operation-state size has
now shrunk by at least 64-bytes due to not having to store the pointers to parent-operation states.
</p>

<p>
There is also now a reduction in code-size in the resulting binary.
There are no longer instructions needed to initialize the pointers to parent-operation-states.
There are no longer instructions needed to dereference the chain of pointers during query evaluation
or on completion.
</p>

<p>
The overall net result is that this optimisation permits a reduction in memory usage, an increase
in run-time performance and a reduction in code-size proportional to the depth of the sender
expression tree that can be inlined.
</p>

<p>
Further, this code is now more efficient than the hand-flattened version above that combined
the three nested invocations of <code>then()</code> into a single invocation of <code>then()</code>, reducing the
motivation for programmers to perform this sort of manual optimisation.
</p>
</div>
</div>
</div>

<div id="outline-container-org847f8ec" class="outline-2">
<h2 id="org847f8ec"><span class="section-number-2">3.</span> Proposal</h2>
<div class="outline-text-2" id="text-3">
<p>
The proposal includes two key parts which enable the optimisations mentioned above:
</p>
<ol class="org-ol">
<li>Defining the key protocol that allows a parent and child operation to negotiate to apply
the optimisation when both support it.</li>
<li>Applying this protocol to the sender-algorithms proposed by P2300R10.</li>
</ol>

<p>
This proposal also includes some utilities which can be used to make it easier for authors of
sender types to implement the above optimisation protocol correctly. These facilities could
be optionally included either now or later. If not included now the sender authors can still
implement the protocol, but will need to implement their own versions of these helpers in
the meantime.
</p>
</div>

<div id="outline-container-org3dc0536" class="outline-3">
<h3 id="org3dc0536"><span class="section-number-3">3.1.</span> The core protocol</h3>
<div class="outline-text-3" id="text-3-1">
<p>
The key, enabling part of this optimisation is providing a child operation with a way to
construct a receiver on-demand given the address of the child operation.
</p>

<p>
The mechanism proposed here for this is to allow receiver types to define a static factory
function that accepts a pointer to the child operation-state and that returns an instance
of that receiver type.
</p>

<p>
For example:
</p>
<div class="org-src-container">
<pre class="src src-c++">  struct some_receiver {
    // Factory-construct a receiver on-demand from the child operation-state address.
    static some_receiver make_receiver_for(child_op_state* op) noexcept;

    // Other receiver methods.
    void set_value(auto&amp;&amp;... vs) noexcept;
    void set_error(auto&amp;&amp; e) noexcept;
    void set_stopped() noexcept;
    some_env get_env() const noexcept;
  };
</pre>
</div>

<p>
If the receiver has such a factory function then the child operation is free to not store
the receiver passed to <code>connect()</code> and to, instead, just call this factory function to
obtain a new receiver object whenever the receiver is needed.
</p>

<p>
This requirement basically defines a new concept that subsumes the <code>receiver</code> concept
which can be written as follows:
</p>

<div class="org-src-container">
<pre class="src src-c++">  namespace std::execution
  {
    template&lt;typename T, typename ChildOp&gt;
    concept inlinable_receiver =
      receiver&lt;T&gt; &amp;&amp;
      requires(ChildOp* op) {
        { T::make_receiver_for(op) } noexcept -&gt; std::same_as&lt;T&gt;;
      };
  }
</pre>
</div>

<p>
Note that the concept does not check that <code>ChildOp</code> satisfies <code>operation_state</code> as the concept
needs to be usable at a point where the <code>ChildOp</code> type is still an incomplete type.
</p>

<p>
With this concept, a child operation-state type, <code>ChildOp</code>, can then specialise itself to either
hold the receiver as a data-member or not depending on whether the receiver type satisfies
the <code>inlinable_receiver&lt;ChildOp&gt;</code> concept.
</p>

<p>
For example:
</p>
<div class="org-src-container">
<pre class="src src-c++">  template&lt;typename Receiver&gt;
  class my_op_state {
  public:
    my_op_state(Receiver r) noexcept : rcvr_(std::move(r)) {}
    void start() noexcept;
  private:
    Receiver&amp; get_receiver() noexcept { return rcvr_; }
    Receiver rcvr_;
  };

  template&lt;typename Receiver&gt;
  requires inlinable_receiver&lt;Receiver, my_op_state&lt;Receiver&gt;&gt;
  class my_op_state&lt;Receiver&gt; {
  public:
    my_op_state([[maybe_unused]] Receiver r) noexcept {}
    void start() noexcept;
  private:
    Receiver get_receiver() noexcept { return Receiver::make_receiver_for(this); }
    // NOTE: No 'Receiver' data-member.
  };
</pre>
</div>

<p>
It is worth noting that the optimisation proposed here requires both the parent operation
and child operation to opt-in to the protocol for the optimisation to be applied.
If either the parent or child do not opt-in to the protocol then we need to still gracefully
revert back to the default behaviour of storing the receiver.
</p>

<p>
We can see how this would work by examining the code above:
</p>
<ul class="org-ul">
<li>If the specialisation for an <code>inlinable_receiver</code> was not present, as would be the case if the
child operation did not opt-in to the optimisation, then the child operation would just store
the receiver as normal.</li>
<li>If the parent operation-state did not provide a receiver to the child operation-state that
implemented the <code>inlinable_receiver</code> concept, then the child operation state would not
instantiate the specialisation and would instead fall back to instantiating the primary
template that just stores the receiver as normal.</li>
<li>If the parent operation-state provides a receiver that implements the <code>inlinable_receiver</code>
concept <i>and</i> the child operation implements the specialisation for <code>inlinable_receiver</code>
then we end up instantiating the child operation state specialisation that can avoid storing
the receiver.</li>
</ul>

<p>
Note that while it is optional for operation-state implementations to implement this protocol,
it is recommended that all operation-state implementations do so, in order to maximise the
effectiveness of the optimisation.
</p>
</div>
</div>

<div id="outline-container-org2d52c4a" class="outline-3">
<h3 id="org2d52c4a"><span class="section-number-3">3.2.</span> Adding a helper for child operation-states (optional)</h3>
<div class="outline-text-3" id="text-3-2">
<p>
When defining the operation-state for a sender, it would be overly verbose for the author
to have to duplicate their logic across two specialisations as defined above.
</p>

<p>
To allow encapsulating this optimisation and eliminating the duplication of code,
we can factor out this facility into a helper CRTP base-class which is responsible
for storing (or producing on demand) the receiver.
</p>

<p>
This paper proposes optionally adding the following helper class for operation-state
authors to use to enable the optimisation in their implementations:
</p>
<div class="org-src-container">
<pre class="src src-c++">  // In &lt;execution&gt; header
  namespace std::execution
  {
    template&lt;typename Derived, receiver Receiver&gt;
    class inlinable_operation_state {
    protected:
      explicit inlinable_operation_state(Receiver r)
        noexcept(std::is_nothrow_move_constructible_v&lt;Receiver&gt;)
        : rcvr_(std::move(r)) {}

      Receiver&amp; get_receiver() noexcept { return rcvr_; }

    private:
      Receiver rcvr_; // exposition-only
    };

    template&lt;typename Derived, receiver Receiver&gt;
    requires inlinable_receiver&lt;Receiver, Derived&gt;
    class inlinable_operation_state&lt;Derived, Receiver&gt; {
    protected:
      explicit inlinable_operation_state(Receiver r) noexcept {}

      Receiver get_receiver() noexcept {
        return Receiver::make_receiver_for(static_cast&lt;Derived*&gt;(this));
      }
    };
  }
</pre>
</div>

<p>
This class can then be used as a base-class of any operation-state that wants to
be able to opt-in to this optimisation.
</p>

<p>
For example, the above <code>my_op_state</code> class can now be written as a single primary template
by inheriting publicly from <code>inlinable_operation_state</code>:
</p>
<div class="org-src-container">
<pre class="src src-c++">  template&lt;typename Receiver&gt;
  class my_op_state : public inlinable_operation_state&lt;my_op_state&lt;Receiver&gt;, Receiver&gt; {
  public:
    my_op_state(Receiver r) noexcept
      : inlinable_operation_state&lt;my_op_state, Receiver&gt;(std::move(r))
    {}

    void start() noexcept {
      // Call this-&gt;get_receiver() to get the receiver from the base-class.
      auto st = std::get_stop_token(std::execution::get_env(this-&gt;get_receiver()));
      if (st.stop_possible()) {
        // ...
      }
    }
  };
</pre>
</div>

<p>
This facility will be useful for all sender implementations (basically any sender that might become
a child-operation of some sender-algorithm). This includes both leaf sender operations, which I
expect will be the majority of senders authored by users, as well as sender-algorithms that compose
other senders.
</p>

<p>
However, this facility is also fairly simple and straight-forward for users to write themselves
when authoring sender implementations. It is only 20 lines of code and so the benefit from having
such a facility in the standard library is one of convenience rather than one of abstracting away
something complex that would be difficult to write by-hand.
</p>
</div>
</div>

<div id="outline-container-org3eb7d00" class="outline-3">
<h3 id="org3eb7d00"><span class="section-number-3">3.3.</span> Implementing <code>make_receiver_for()</code></h3>
<div class="outline-text-3" id="text-3-3">
<p>
So, now that we have shown the child-operation part of the protocol and how it can
use this protocol to avoid storing the receiver, let's now turn to looking at how
we can actually implement this protocol from the parent-operation side.
</p>

<p>
This part of the protocol is considerably more involved, and there are a few pitfalls
that we need to be careful to avoid, lest we unintentionally invoke undefined behaviour.
</p>

<p>
A naive first approach might be to try something like the following which uses <code>offsetof</code>
to compute the address of the parent operation from the address of the child:
</p>
<div class="org-src-container">
<pre class="src src-c++">  template&lt;typename ParentReceiver, typename ChildSender&gt;
  class parent_op
    : public std::execution::inlinable_operation_state&lt;parent_op&lt;ParentReceiver, ChildSender&gt;, ParentReceiver&gt; {
  private:
    struct child_receiver {
      parent_op* op;

      template&lt;typename ChildOp&gt;
      static child_receiver make_receiver_for(ChildOp* child_op) noexcept {
        static_assert(std::same_as&lt;ChildOp, child_op_t&gt;);
        // KEY PART: Compute address of parent_op from address of child_op
        auto* parent = reinterpret_cast&lt;parent_op*&gt;(
            reinterpret_cast&lt;unsigned char*&gt;(child_op) - offsetof(parent_op, child_op_));
        return child_receiver{parent};
      }

      // ... other receiver methods omitted for brevity
    };

    using child_op_t = std::execution::connect_result_t&lt;ChildSender, child_receiver&gt;;
    child_op_t child_op_;

  public:
    parent_op(ChildSender&amp;&amp; child, ParentReceiver rcvr)
    : std::execution::inlinable_operation_state&lt;parent_op, ParentReceiver&gt;(std::move(rcvr))
    , child_op_(std::execution::connect(std::forward&lt;ChildSender&gt;(child), child_receiver{this}))
    {}

    void start() noexcept {
      std::execution::start(child_op_);
    }
  };
</pre>
</div>

<p>
However, while this approach may appear to work on some implementations, <i>it is actually undefined behaviour</i>
to do this.
</p>

<p>
It is not permitted to go from the address of a child data-member to the address of the parent class except
in very limited circumstances. This rule is there to permit, among other things, a compiler-optimisation
called "scalar replacement of aggregates", which allows the compiler to break up an aggregate type into
a set of separate stack-allocations for each of the data-members if the address of the parent object is
not aliased/observed.
</p>

<p>
The very limited circumstances in which we can go from the address of a sub-object to the address of
the parent-object are the following:
</p>
<ul class="org-ul">
<li>When the sub-object is a non-ambiguous base-class of parent-object (<a href="https://eel.is/c++draft/expr.static.cast#11">[expr.static.cast] p11</a>).
In this case, we can use <code>static_cast</code> to cast from pointer to base-class to the pointer to the derived parent-object.</li>
<li>When the parent-object and sub-object are "pointer-interconvertible" (<a href="https://eel.is/c++draft/basic.compound#5">[basic.compound] p5</a>).
In this case, we can use <code>reinterpret_cast</code> to cast from pointer to sub-object to pointer to parent-object.</li>
</ul>

<p>
Two objects are "pointer-interconvertible" only if:
</p>
<ul class="org-ul">
<li>the parent-object is a union and the sub-object is a non-static data-member of that union; or</li>
<li>the parent-object is a "standard layout" class object and the sub-object is the first non-static
data-member of the parent-object or any base-class sub-object of the parent-object; or</li>
<li>there exists an intermediate sub-object, <code>C</code>, such that the parent-object is pointer-interconvertible
with <code>C</code> and <code>C</code> is pointer-interconvertible with the sub-object (i.e. the relationship is transitive).</li>
</ul>

<p>
Note that there are a number of rules for types that are considered "standard layout" class types (<a href="https://eel.is/c++draft/class.prop#3">[class.prop] p3</a>).
I won't go into particular details here but, among other things, this doesn't allow types with virtual methods,
virtual base-classes, types with non-static data-members with different access control, or data-members
that are not also standard layout class types.
</p>

<p>
As child operation states in general are not going to all be standard layout types and since we also want
to support cases where a parent-operation has multiple child operations, we cannot just rely on being able
to convert the address of the first non-static data member to the address of the parent as a general
solution.
</p>

<p>
This means that we are going to need to make use of base-classes to allow going from address of a
sub-object to the address of a parent-object.
</p>

<p>
Further, there are also cases where we need to be able to defer construction of a child operation-state
until after the operation is started, or where we want to be able to destroy a child operation-state
before the parent operation-state is destroyed.
</p>

<p>
This means that, in general, we cannot just use the child operation-state as a direct base-class as
this would force the lifetimes of the child operation-state to be the same as the lifetime of the
parent operation-state.
Instead, we can define a base-class that has as its only data-member an array of bytes which is used
as storage for the child-operation state, into which we can placement-new the child operation-state
at the appropriate time.
</p>

<p>
This can also be used to emulate unions of operation-states, where there might be a set of possible
operation-state types that might need to be able to be constructed in that storage. For example,
consider the set of possible operation-states for the successor operation of <code>let_value()</code>, the type
of which may depend on what value completion-signature the predecessor completed with.
</p>

<p>
There are also some challenges with regards to avoiding circular dependencies when computing the
complete type for the child operation-state. This will generally require the receiver type to be
complete, but may also require the receiver's environment type to be complete if the child operation-state
depends on the types of query-results (e.g. if it contains a stop-callback data-member).
</p>

<p>
However, as the layout of the child operation-state needs to be known during instantiation of a base-class
of the parent operation-state type, the completeness of the receiver and its environment cannot depend
on anything defined in the interface of the parent operation-state class.
</p>

<p>
This means that the return-types of all environment queries need to be known, even if the body of the
query methods needs to access some state from the parent-operation-state (e.g. a stop-source).
This information about the environment, therefore, needs to be injected into the base-class somehow,
typically in the form of an additional template parameter.
</p>

<p>
Finally, since we might have multiple child operations which are constructed from the same sender
(consider the child operations of <code>when_all(just(1), just(2))</code>), we need some way to distinguish
different base-class child-objects so that we don't run into issues with duplicate base-classes,
which would either be ill-formed or make the down-cast we want to perform ambiguous.
</p>

<p>
So, therefore, as we want to have a generic helper class we can use for the base-class, we also
need to add some kind of 'tag' template parameter which can be passed something different for
each child-operation to ensure that each child-operation base-class is distinct.
</p>

<p>
So, putting all of this together, we end up with some helper-classes like the following:
</p>
<div class="org-src-container">
<pre class="src src-c++">  template&lt;typename Sender, typename Receiver&gt;
  inline constexpr bool is_nothrow_connectable_v =
    noexcept(std::execution::connect(std::declval&lt;Sender&gt;(), std::declval&lt;Receiver&gt;()));

  // Helper class for parent operations that want to manually manage the lifetime of
  // a child operation.
  template&lt;typename ParentOp, typename Tag, typename Env, typename ChildSender&gt;
  class manual_child_operation_state {
  private:
    class receiver {
    public:
      // Implement the protocol: reconstruct the receiver from the child operation-state address.
      template&lt;typename ChildOp&gt;
      static receiver make_receiver_for(ChildOp* child_op) noexcept {
        static_assert(std::same_as&lt;ChildOp, child_op_t&gt;);

        // Cast from 'child_op_t*' to  'unsigned char*' pointer to 'storage_' member.
        // - valid since we constructed at the storage address using placement-new.
        auto* storage = reinterpret_cast&lt;unsigned char*&gt;(child_op);

        // Cast from address of first member of 'manual_child_operation_state' to
        // address of 'manual_child_operation_state'.
        // Valid as 'manual_child_operation_state' is a standard-layout type.
        auto* self = reinterpret_cast&lt;manual_child_operation_state*&gt;(storage);

        // Cast from manual_child_operation_state address to address of 'ParentOp'
        // which inherits from manual_child_operation_state.
        auto* parent_op = static_cast&lt;ParentOp*&gt;(self);

        // Construct a receiver with the address of the parent operation-state.
        return receiver{parent_op};
      }

      // Forward following calls on the receiver to calls on the parent operation-state
      // object with the added 'Tag' object as the first argument.

      template&lt;typename... Vs&gt;
      void set_value(Vs&amp;&amp;... vs) noexcept {
        parent_op_-&gt;set_value(Tag{}, std::forward&lt;Vs&gt;(vs)...);
      }

      template&lt;typename E&gt;
      void set_error(E&amp;&amp; e) noexcept {
        parent_op_-&gt;set_error(Tag{}, std::forward&lt;E&gt;(e));
      }

      void set_stopped() noexcept {
        parent_op_-&gt;set_stopped(Tag{});
      }

      Env get_env() const noexcept {
        return parent_op_-&gt;get_env(Tag{});
      }

    private:
      friend manual_child_operation_state;
      explicit receiver(ParentOp* parent_op) noexcept : parent_op_(parent_op) {}
      ParentOp* parent_op_;
    };

    using child_op_t = std::execution::connect_result_t&lt;ChildSender, receiver&gt;;

  protected:
    // Trivial default constructor/destructor
    manual_child_operation_state() noexcept = default;
    ~manual_child_operation_state() = default;

    // Start execution of the child operation state.
    void start() noexcept {
      std::execution::start(get());
    }

    // Manually construct the child operation from the sender.
    void construct(ChildSender&amp;&amp; sender) noexcept(is_nothrow_connectable_v&lt;ChildSender, receiver&gt;) {
      auto* parent_op = static_cast&lt;ParentOp*&gt;(this);
      ::new (&amp;storage_) child_op_t(
          std::execution::connect(std::forward&lt;ChildSender&gt;(sender), receiver{parent_op}));
    }

    // Manually destruct the child operation from the sender.
    void destruct() noexcept {
      get().~child_op_t();
    }

  private:
    child_op_t&amp; get() noexcept {
      return *std::launder(reinterpret_cast&lt;child_op_t*&gt;(&amp;storage_));
    }

    alignas(child_op_t) unsigned char storage_[sizeof(child_op_t)];
  };

  // Helper class for parent operations that want a child operation with the same lifetime
  // as that of the parent operation.
  template&lt;typename ParentOp, typename Tag, typename Env, typename ChildSender&gt;
  class child_operation_state : public manual_child_operation_state&lt;ParentOp, Tag, Env, ChildSender&gt; {
  private:
    using base_t = manual_child_operation_state&lt;ParentOp, Tag, Env, ChildSender&gt;;
    using base_t::construct;
    using base_t::destruct;

  protected:
    explicit child_operation_state(ChildSender&amp;&amp; sender)
        noexcept(noexcept(base_t::construct(std::forward&lt;ChildSender&gt;(sender)))) {
      base_t::construct(std::forward&lt;ChildSender&gt;(sender));
    }

    ~child_operation_state() {
      base_t::destruct();
    }
  };
</pre>
</div>

<p>
Revisiting the <code>parent_op</code> example above, it can now be rewritten as follows:
</p>
<div class="org-src-container">
<pre class="src src-c++">  // A tag type used for identifying which child a completion signal comes from
  struct source_tag {};

  template&lt;typename ParentReceiver, typename ChildSender&gt;
  class parent_op
      : public std::execution::inlinable_operation_state&lt;
          parent_op&lt;ParentReceiver, ChildSender&gt;,
          ParentReceiver&gt;
      , public child_operation_state&lt;   // Inherit from 'child_operation_state'
          parent_op&lt;ParentReceiver, ChildSender&gt;,
          source_tag,
          std::execution::env_of_t&lt;ParentReceiver&gt;,
          ChildSender&gt; {
    using inline_base_t = std::execution::inlinable_operation_state&lt;parent_op, ParentReceiver&gt;;
    using env_t = std::execution::env_of_t&lt;ParentReceiver&gt;;
    using child_base_t = child_operation_state&lt;parent_op, source_tag, env_t, ChildSender&gt;;

  public:
    parent_op(ChildSender&amp;&amp; child, ParentReceiver rcvr)
      : inline_base_t(std::move(rcvr))
      , child_base_t(std::forward&lt;ChildSender&gt;(child))
    {}

    void start() noexcept {
      child_base_t::start();
    }

    //
    // Implement handling for signals coming from receiver passed to the
    // 'source_tag' child operation.
    //

    template&lt;typename... Vs&gt;
    void set_value(source_tag, Vs&amp;&amp;... vs) noexcept {
      // ...

      // Eventually... signal completion.
      std::execution::set_value(this-&gt;get_receiver(), the_result);
    }

    template&lt;typename E&gt;
    void set_error(source_tag, E&amp;&amp; e) noexcept {
      // ...
    }

    void set_stopped(source_tag) noexcept {
      // ...
    }

    env_t get_env(source_tag) noexcept {
      return std::execution::get_env(this-&gt;get_receiver());
    }
  };
</pre>
</div>

<p>
Some interesting points to note with this implementation:
</p>
<ul class="org-ul">
<li>There will be a separate base-class for each child operation that is stored inline
in the parent operation for which we want to be able to use this optimisation.</li>
<li>We no longer need to define our own <code>receiver</code> class to pass to the child sender's connect method.
This is all handled by the <code>child_operation_state</code> helper.</li>
<li>The use of <code>inlinable_operation_state</code> means that this class can avoid storing
the parent receiver if the parent operation state includes it as a sub-object,
and the use of <code>child_operation_state</code> means that the child of this operation
can avoid storing the receiver we pass to it if it uses the <code>inlinable_operation_state</code>
class to manage storing (or not storing) the receiver.
i.e. it implements the optimisation protocol both from the child-operation
and parent-operation perspectives.</li>
<li>All of the child completion signals are forwarded to methods on the operation-state, with
signals from different children differentiated by a tag parameter.</li>
<li>These methods need to be public to allow the <code>manual_child_operation_state::receiver</code> class
to call them without having to declare it as a friend.</li>
<li>This example just forwards through the parent environment to the child operation.
If you wanted to modify the environment in some way (e.g. by changing the stop-token)
then you'd need to define a separate environment class and pass that as the <code>Env</code> template
argument to <code>child_operation_state</code> instead.</li>
</ul>
</div>
</div>

<div id="outline-container-orge11b828" class="outline-3">
<h3 id="orge11b828"><span class="section-number-3">3.4.</span> Adding a helper for parent operation-states (optional/future)</h3>
<div class="outline-text-3" id="text-3-4">
<p>
As evidenced by the long description above, it is complicated to try to implement the
<code>make_receiver_for</code> function needed to enable the optimisation proposed by this paper.
</p>

<p>
There are a lot of subtle details that implementations of <code>make_receiver_for</code> need to
get right and it's easy to accidentally run into undefined behaviour or to create
accidental cyclic dependencies that result in inscrutable compiler-errors.
</p>

<p>
Therefore, there is a reasonably high value in abstracting a lot of this away
for users who want to write their own sender algorithms which implement the optimisation
protocol proposed by this paper.
</p>

<p>
All users that want to implement their own sender algorithms that compose a
known set of child operations would need such a facility if they wanted their algorithm
to be able to participate in this optimisation.
</p>

<p>
However, such a facility would also be largely just an implementation detail for
sender algorithms. The majority of users of the sender/receiver framework should be just
composing those algorithms and, other than benefiting from the resulting optimisations,
would not need to use such a facility directly.
</p>

<p>
As long as the implementers of sender-algorithms implement the protocol proposed by
this paper in <i>some</i> way then users will benefit from the optimisations that are
enabled by the protocol. Different libraries can use their own helper classes to
implement the protocol - we do not need to standardise a single set of helper classes for this purpose.
</p>
</div>
</div>

<div id="outline-container-orga01213a" class="outline-3">
<h3 id="orga01213a"><span class="section-number-3">3.5.</span> Applying this optimisation to standard-library sender algorithms</h3>
<div class="outline-text-3" id="text-3-5">
<p>
In order for the optimisations proposed by this paper to be effective in wider code-bases,
you generally want most of the algorithms you use to opt-in to the <code>inlinable_receiver</code>
protocols, where possible.
</p>

<p>
A sender-adapter algorithm that does not opt-into the optimisation (either as a child
or as a parent) will inhibit applying the optimisation at both the boundary with its
children and at the boundary with its parent. Thus it will result in potentially adding
two pointer-indirections in the middle of a sender expression.
</p>

<p>
So, as much as possible we want to make sure that standard-library senders all implement
this optimisation.
</p>

<p>
The proposal P2300R10, which was merged into the draft standard, includes a number of
sender factories and sender algorithms provided by the standard library.
</p>

<p>
Some of the algorithms have default implementations that are just compositions
of other algorithms and so don't need any changes.
These algorithms are:
</p>
<ul class="org-ul">
<li><code>starts_on()</code> - defined in terms of <code>let_value()</code> and <code>schedule()</code></li>
<li><code>continues_on()</code> - defined in terms of <code>schedule_from()</code></li>
<li><code>on()</code> - defined in terms of <code>write-env</code>, <code>continues_on</code> and <code>starts_on</code>.</li>
<li><code>stopped_as_optional()</code> - defined in terms of <code>let_stopped</code>, <code>then</code> and <code>just</code>.</li>
<li><code>stopped_as_error()</code> - defined in terms of <code>let_stopped</code>, and <code>just_error</code>.</li>
</ul>

<p>
The following algorithms are all of the algorithms which have some implementation
of a sender for the default version of the algorithm that is not just a composition
of other sender algorithms:
</p>
<ul class="org-ul">
<li><code>just</code></li>
<li><code>just_error</code></li>
<li><code>just_stopped</code></li>
<li><code>read_env</code></li>
<li><code>schedule_from</code></li>
<li><code>then</code></li>
<li><code>upon_error</code></li>
<li><code>upon_stopped</code></li>
<li><code>let_value</code></li>
<li><code>let_error</code></li>
<li><code>let_stopped</code></li>
<li><code>bulk</code></li>
<li><code>split</code></li>
<li><code>when_all</code></li>
<li><code>into_variant</code></li>
<li><code>run_loop::run-loop-sender</code></li>
</ul>

<p>
The design intent is to have each of these algorithms implement the optimisation
to avoid storing the receiver if the receiver connected to it satisfies <code>inlinable_receiver</code>.
i.e. when this sender is used as the child of another operation that stores the
child-operation as a sub-object.
</p>

<p>
Some of the above algorithms are leaf operations which do not have any children
and so do not need to implement the <code>inlinable_receiver</code> concept themselves.
These algorithms are: <code>just</code>, <code>just_error</code>, <code>just_stopped</code>, <code>read_env</code>, and <code>run_loop::run-loop-sender</code>.
</p>

<p>
The algorithms that do have children and thus would need to implement the parent
operation side of the protocol are all of the other algorithms listed above.
</p>

<p>
The <code>run_loop::run-loop-sender</code> will need some individual rework to support omitting
storage of the parent receiver, but this should be relatively straight-forward.
The other algorithms are defined in terms of the exposition-only <code>basic-operation</code>
and <code>basic-state</code> facilities and so should be able to have support added for
omitting storage of the receiver in a generic way.
</p>

<p>
There are currently some assumptions in the design of the <code>impls-for&lt;Tag&gt;</code> interface
that require the receiver object to exist for the duration of the <code>basic-state</code>
object which will require some rework. For example, the <code>get-state</code> of <code>schedule_from</code>
returns an object that holds a reference to the receiver. Similarly with <code>split</code>'s
<code>get-state</code> function.
</p>

<p>
Implementing the parent-side of the optimisation protocol will require changes
to move the child-operation states to be stored as base-classes rather than
as the <code>basic-operation::inner-ops</code> tuple-like data-member.
</p>

<p>
The <code>let_value</code>, <code>let_error</code> and <code>let_stopped</code> algorithms all have an additional
operation-state object stored in the object returned from <code>impls-for::get-state</code>.
This object would also need to be moved to a base-class of <code>basic-operation</code>,
but would need to have a manual lifetime and support being any of a set of possible
operation-state types.
</p>

<p>
The <code>split</code> algorithm has a child operation that is a child of the <code>shared-state&lt;Sndr&gt;</code>
structure. The child operation-state would need to be moved to a base-class and the
<code>split-receiver&lt;Sndr&gt;</code> would need to be updated to define the <code>make_receiver_for()</code>
static function.
</p>

<p>
All of this will need some major surgery to the specification machinery, but should
not change the semantics of any of the existing algorithms.
</p>
</div>
</div>
</div>

<div id="outline-container-org15e07c8" class="outline-2">
<h2 id="org15e07c8"><span class="section-number-2">4.</span> Design Discussion</h2>
<div class="outline-text-2" id="text-4">
</div>
<div id="outline-container-org6c6ca12" class="outline-3">
<h3 id="org6c6ca12"><span class="section-number-3">4.1.</span> Naming of <code>inlinable_receiver</code> concept and <code>inlinable_operation_state</code></h3>
<div class="outline-text-3" id="text-4-1">
<p>
The naming of the proposed concept <code>inlinable_receiver</code> and <code>inlinable_operation_state</code>
base-class for operation-states both use the <code>inlinable</code> adjective to indicate that this
is for operation-states which might be stored inline in their parent operation-state.
</p>

<p>
If we want to use a different name, for example because we don't want to use the term
<code>inlinable</code> in this context, the following are some alternatives which could be considered.
</p>

<p>
Since a receiver that supports this concept is reconstructible from the operation-state address,
it could use the name <code>reconstructible_receiver</code> or <code>reconstructible_receiver_from</code>, instead.
</p>

<p>
The other option is that we make the receiver concept exposition-only and only provide
the <code>inlinable_operation_state</code> class as this would likely be the facility that most
people would reach for rather than constraining their own class specializations on
the concept.
</p>

<p>
With regards to naming of the <code>inlinable_operation_state</code> helper class, we could also
choose a name that reflects better its purpose as a holder for the receiver by naming
it <code>receiver_holder_base&lt;Op, Rcvr&gt;</code>, or similar.
</p>
</div>
</div>
</div>

<div id="outline-container-org9e3d38a" class="outline-2">
<h2 id="org9e3d38a"><span class="section-number-2">5.</span> Proposed Wording</h2>
<div class="outline-text-2" id="text-5">
</div>
<div id="outline-container-orgfcaebe3" class="outline-3">
<h3 id="orgfcaebe3"><span class="section-number-3">5.1.</span> <code>inlinable_receiver</code> concept wording</h3>
<div class="outline-text-3" id="text-5-1">
<p>
Modify [execution.syn] as follows:
</p>

<div class="wording">
<pre>
  ...
  template&lt;class Sch&gt;
    concept scheduler = <i>see below</i>;

  // [exec.recv], receivers
  struct receiver_t {};

  template&lt;class Rcvr&gt;
    concept receiver = <i>see below</i>;

  template&lt;class Rcvr, class Completions&gt;
    concept receiver_of = <i>see below</i>;
  <ins>
  template&lt;class Rcvr, class ChildOp&gt;
    concept inlinable_receiver =
      receiver&lt;Rcvr&gt; &amp;&amp;
      requires (ChildOp* child) {
        { Rcvr::make_receiver_for(child) } noexcept -> same_as&lt;Rcvr&gt;;
      };
  </ins>
  struct set_value_t { unspecified };
  struct set_error_t { unspecified };
  struct set_stopped_t { unspecified };

  inline constexpr set_value_t set_value{};
  inline constexpr set_error_t set_error{};
  inline constexpr set_stopped_t set_stopped{};

  // [exec.opstate], operation states
  struct operation_state_t {};
  ...
</pre>
</div>

<p>
Add the following paragraph to [exec.recv.concepts] between p1 and p2:
</p>

<div class="wording" style="counter-set: paragraph 1">
<p class="numbered"><ins>The <code>inlinable_receiver</code> concept defines the requirements for a receiver that can be
reconstructed on-demand from a pointer to the operation-state object created when
the receiver was connected to a sender. Given a receiver object, <code>rcvr</code>, of type, <code>Rcvr</code>,
which was connected to a sender, producing an operation-state object, <code>op</code>, of type <code>Op</code>,
and where <code>Rcvr</code> models <code>inlinable_receiver&lt;Op&gt;</code>, then the expression,
<code>Rcvr::make_receiver_for(addressof(op))</code>, evaluates to a receiver that is equal to <code>rcvr</code>.</ins></p>
<p class="numbered"><ins><i>[Note: Such a receiver does not need to be stored as a data-member of <code>op</code> as it
can be recreated on demand - end note]</i></ins></p>
</div>
</div>
</div>

<div id="outline-container-org93dc0ff" class="outline-3">
<h3 id="org93dc0ff"><span class="section-number-3">5.2.</span> Changes to <i><code>basic-operation</code></i></h3>
<div class="outline-text-3" id="text-5-2">
<p>
Modify the synopsis in [exec.snd.expos] as follows:
</p>

<div class="wording">
<pre>
namespace std::execution {
  template&lt;class Tag&gt;
  concept <expos>completion-tag</expos> =                                      <expos>// exposition only</expos>
    same_as&lt;Tag, set_value_t&gt; || same_as&lt;Tag, set_error_t&gt; || same_as&lt;Tag, set_stopped_t&gt;;

  template&lt;template&lt;class...&gt; class T, class... Args&gt;
  concept <expos>valid-specialization</expos> =                                <expos>// exposition only</expos>
    requires { typename T&lt;Args...&gt;; };

  <ins>template&lt;size_t Id&gt;
  using <expos>indexed-tag</expos> = integral_constant&lt;size_t, Id&gt;;            <expos>// exposition only</expos></ins>

  struct <expos>default-<ins>state-</ins>impls</expos> {                                        <expos>// exposition only</expos>
    <del>static constexpr auto <expos>get-attrs</expos> = <i>see below</i>;              <expos>// exposition only</expos></del>
    <del>static constexpr auto <expos>get-env</expos> = <i>see below</i>;                <expos>// exposition only</expos></del>
    <ins>template&lt;class Self, size_t Id&gt;
    constexpr decltype(auto) <expos>get-env</expos>(this Self& self, <expos>indexed-tag</expos>&lt;Id&gt;) noexcept { // exposition only
      return get_env(self.get_receiver());
    }</ins>
    <del>static constexpr auto <expos>get-state</expos> = <i>see below</i>;              <expos>// exposition only</expos></del>
    <del>static constexpr auto <expos>start</expos> = <i>see below</i>;                  <expos>// exposition only</expos></del>
    <ins>template&lt;class Self&gt;
    constexpr void <expos>start</expos>(this Self& self) noexcept { // exposition only
      self.<expos>start-all</expos>();
    }</ins>
    <del>static constexpr auto <expos>complete</expos> = <i>see below</i>;               <expos>// exposition only</expos></del>
    <ins>template&lt;class Self, size_t Id, class CompletionTag, class... Datums&gt;
    constexpr void <expos>complete</expos>(this Self& self, <expos>indexed-tag</expos>&lt;Id&gt;,
        CompletionTag, Datums&&... datums) noexcept { //exposition only
      CompletionTag()(self.get_receiver(), std::forward&lt;Datums&gt;(datums)...);
    }</ins>
  };

  template&lt;class Tag&gt;
  struct <expos><ins>state-</ins>impls-for</expos> : <expos>default-<ins>state-</ins>impls</expos> {};       <expos>// exposition only</expos>

  <ins>template&lt;class Tag, class Data&gt;
  struct <expos>default-state</expos> : <expos>state-impls-for</expos>&lt;Tag&gt; { <expos> // exposition only</expos>
    Data <expos>data</expos>;       <expos>// exposition only</expos>
  };

  template&lt;class Sndr&gt;
  decltype(auto) <expos>get-data</expos>(Sndr&amp;&amp; sndr) noexcept {        <expos>// exposition only</expos>
    auto&amp; [tag, data, ...children] = sndr;
    return std::forward_like&lt;Sndr&gt;(data);
  };

  template&lt;class Sndr&gt;
  using <expos>data-type-t</expos> = decltype(<expos>get-data</expos>(declval&lt;Sndr&gt;()));

  template&lt;class T&gt;
  inline constexpr bool <expos>is-nothrow-decay-copyable-v</expos> =      <expos>// exposition only</expos>
      is_nothrow_constructible_v&lt;decay_t&lt;T&gt;, T&gt;;

  template&lt;class Sndr, class Rcvr&gt;
  inline constexpr bool <expos>is-nothrow-connectable-v</expos> =         <expos>// exposition only</expos>
      noexcept(execution::connect(declval&lt;Sndr&gt;(), declval&lt;Rcvr&gt;()));

  struct <expos>default-sender-impls</expos> {                          <expos>// exposition only</expos>
    template&lt;class Data, class... Children&gt;
    static decltype(auto) <expos>get-attrs</expos>(const Data& data,
                                    const Children&amp;... children) noexcept { <expos>// exposition only</expos>
      if constexpr (sizeof...(children) == 1) {
        return <expos>fwd-env</expos>(execution::get_env(children...[0]));
      } else {
        return empty_env{};
      }
    }

    template&lt;class Self&gt;
    <expos>default-state</expos>&lt;tag_of_t&lt;Self&gt;, decay_t&lt;<expos>data-type-t</expos>&lt;Self&gt;&gt;&gt; <expos>get-state</expos>(this Self&amp;&amp; self) <expos>// exposition only</expos>
        noexcept(<expos>is-nothrow-decay-copyable-v</expos>&lt;<expos>data-type-t</expos>&lt;Self&gt;&gt;) {
      return {{}, <expos>get-data</expos>(std::forward&lt;Self&gt;(self))};
    }
  };

  template&lt;class Tag&gt;
  struct <expos>sender-impls-for</expos> : <expos>default-sender-impls</expos> {}; <expos>// exposition only</expos>
  </ins>

  template&lt;class Sndr, class Rcvr&gt;                              <expos>// exposition only</expos>
  using <expos>state-type</expos> = decay_t&lt;<expos>call-result-t</expos>&lt;
    decltype(<expos><ins>sender-</ins>impls-for</expos>&lt;tag_of_t&lt;Sndr&gt;&gt;::<expos>get-state</expos>), Sndr, Rcvr&amp;&gt;&gt;;

  <del>template&lt;class Index, class Sndr, class Rcvr&gt;                 <expos>// exposition only</expos>
  using <expos>env-type</expos> = <expos>call-result-t</expos>&lt;
    decltype(<expos>impls-for</expos>&lt;tag_of_t&lt;Sndr&gt;&gt;::<expos>get-env</expos>), Index,
    <expos>state-type</expos>&lt;Sndr, Rcvr&gt;&amp;, const Rcvr&amp;&gt;;

  template&lt;class Sndr, size_t I = 0&gt;
  using <expos>child-type</expos> = decltype(declval&lt;Sndr&gt;().template get&lt;I+2&gt;());     <expos>// exposition only</expos>

  template&lt;class Sndr&gt;
  using <expos>indices-for</expos> = remove_reference_t&lt;Sndr&gt;::<expos>indices-for</expos>;           <expos>// exposition only</expos>

  template&lt;class Sndr, class Rcvr&gt;
  struct <expos>basic-state</expos> {                                          <expos>// exposition only</expos>
    <expos>basic-state</expos>(Sndr&amp;&amp; sndr, Rcvr&amp;&amp; rcvr) noexcept(see below)
      : <expos>rcvr</expos>(std::move(rcvr))
      , <expos>state</expos>(<expos>impls-for</expos>&lt;tag_of_t&lt;Sndr&gt;&gt;::<expos>get-state</expos>(std::forward&lt;Sndr&gt;(sndr), <expos>rcvr</expos>)) { }

    Rcvr <expos>rcvr</expos>;                                                  <expos>// exposition only</expos>
    <expos>state-type</expos>&lt;Sndr, Rcvr&gt; <expos>state</expos>;                               <expos>// exposition only</expos>
  };

  template&lt;class Sndr, class Rcvr, class Index&gt;
    requires <expos>valid-specialization</expos>&lt;env-type, Index, Sndr, Rcvr&gt;
  struct <expos>basic-receiver</expos> {                                       <expos>// exposition only</expos>
    using receiver_concept = receiver_t;

    using <expos>tag-t</expos> = tag_of_t&lt;Sndr&gt;;                               <expos>// exposition only</expos>
    using <expos>state-t</expos> = <expos>state-type</expos>&lt;Sndr, Rcvr&gt;;                     <expos>// exposition only</expos>
    static constexpr const auto&amp; <expos>complete</expos> = <expos>impls-for</expos>&lt;<expos>tag-t</expos>&gt;::<expos>complete</expos>;   <expos>// exposition only</expos>

    template&lt;class... Args&gt;
      requires <expos>callable</expos>&lt;decltype(<expos>complete</expos>), Index, <expos>state-t</expos>&amp;, Rcvr&amp;, set_value_t, Args...&gt;
    void set_value(Args&amp;&amp;... args) &amp;&amp; noexcept {
      <expos>complete</expos>(Index(), <expos>op</expos>-&gt;<expos>state</expos>, <expos>op</expos>-&gt;<expos>rcvr</expos>, set_value_t(), std::forward&lt;Args&gt;(args)...);
    }

    template&lt;class Error&gt;
      requires <expos>callable</expos>&lt;decltype(<expos>complete</expos>), Index, <expos>state-t</expos>&amp;, Rcvr&amp;, set_error_t, Error&gt;
    void set_error(Error&amp;&amp; err) &amp;&amp; noexcept {
      <expos>complete</expos>(Index(), <expos>op</expos>-&gt;<expos>state</expos>, <expos>op</expos>-&gt;<expos>rcvr</expos>, set_error_t(), std::forward&lt;Error&gt;(err));
    }

    void set_stopped() &amp;&amp; noexcept
      requires <expos>callable</expos>&lt;decltype(<expos>complete</expos>), Index, <expos>state-t</expos>&amp;, Rcvr&amp;, set_stopped_t&gt; {
      <expos>complete</expos>(Index(), <expos>op</expos>-&gt;<expos>state</expos>, <expos>op</expos>-&gt;<expos>rcvr</expos>, set_stopped_t());
    }

    auto get_env() const noexcept -&gt; <expos>env-type</expos>&lt;Index, Sndr, Rcvr&gt; {
      return <expos>impls-for</expos>&lt;<expos>tag-t</expos>&gt;::<expos>get-env</expos>(Index(), <expos>op</expos>-&gt;<expos>state</expos>, <expos>op</expos>-&gt;<expos>rcvr</expos>);
    }

    <expos>basic-state</expos>&lt;Sndr, Rcvr&gt;* <expos>op</expos>;                           <expos>// exposition only</expos>
  };

  constexpr auto <expos>connect-all</expos> = <i>see below</i>;                         <expos>// exposition only</expos>

  template&lt;class Sndr, class Rcvr&gt;
  using <expos>connect-all-result</expos> = <expos>call-result-t</expos>&lt;                     <expos>// exposition only</expos>
    decltype(<expos>connect-all</expos>), <expos>basic-state</expos>&lt;Sndr, Rcvr&gt;*, Sndr, <expos>indices-for</expos>&lt;Sndr&gt;&gt;;</del>

<ins>  template&lt;class Op, class Rcvr&gt;
  struct <expos>inlinable-operation-state</expos> {      <expos>// exposition only</expos>
    explicit <expos>inlinable-operation-state</expos>(Rcvr&& r) noexcept(is_nothrow_move_constructible_v&lt;Rcvr&gt;)
      : <expos>rcvr</expos>(std::move(r))
    {}

    Rcvr& <expos>get-receiver</expos>() noexcept { return rcvr; }      <expos>// exposition only</expos>

    Rcvr <expos>rcvr</expos>;   <expos>// exposition only</expos>
  };

  template&lt;class Op, class Rcvr&gt;
    requires inlinable_receiver&lt;Rcvr, Op&gt;
  struct <expos>inlinable-operation-state</expos>&lt;Op, Rcvr&gt; {    <expos>// exposition only</expos>
    explicit <expos>inlinable-operation-state</expos>(Rcvr&&) noexcept {}

    Rcvr <expos>get-receiver</expos>() noexcept {    <expos>// exposition only</expos>
      return Rcvr::make_receiver_for(static_cast&lt;Op*&gt;(this));
    }
  };

  template&lt;class ParentOp, class ChildTag, class ChildEnv, class ChildSndr&gt;
  class <expos>manual-child-operation</expos> {       <expos>// exposition only</expos>
    struct <expos>child-receiver</expos> {         <expos>// exposition only</expos>
      using receiver_concept = receiver_t;

      template&lt;class ChildOp&gt;
      static <expos>child-receiver</expos> make_receiver_for(ChildOp* child) noexcept {
        auto* parent = static_cast&lt;ParentOp*&gt;(
                         reinterpret_cast&lt;<expos>manual-child-operation</expos>*&gt;(
                           reinterpret_cast&lt;<expos>storage-t</expos>*&gt;(child)));
        return <expos>child-receiver</expos>{parent};
      }

      ChildEnv get_env() const noexcept {
        return <expos>parent</expos>-&gt;<expos>get-env</expos>(ChildTag{});
      }

      template&lt;class... Vs&gt;
      void set_value(Vs&amp;&amp;... vs) && noexcept {
        <expos>parent</expos>-&gt;<expos>complete</expos>(ChildTag{}, set_value_t{}, std::forward&lt;Vs&gt;(vs)...);
      }

      template&lt;class E&gt;
      void set_error(E&amp;&amp; e) && noexcept {
        <expos>parent</expos>-&gt;<expos>complete</expos>(ChildTag{}, set_error_t{}, std::forward&lt;E&gt;(e));
      }

      void set_stopped() && noexcept {
        <expos>parent</expos>-&gt;<expos>complete</expos>(ChildTag{}, set_stopped_t{});
      }

    private:
      friend <expos>manual-child-operation</expos>;

      explicit <expos>child-receiver</expos>(ParentOp* p) noexcept
      : <expos>parent</expos>(p) {}

      ParentOp* <expos>parent</expos>;      <expos>// exposition only</expos>
    };

    using <expos>child-op</expos> = connect_result_t&lt;ChildSndr, <expos>child-receiver</expos>&gt;; <expos>//exposition only</expos>

  protected:
    <expos>manual-child-operation</expos>() noexcept {}
    ~<expos>manual-child-operation</expos>() {}

  public:
    static constexpr bool <expos>is-nothrow-connectable</expos> =     <expos>// exposition only</expos>
        <expos>is-nothrow-connectable-v</expos>&lt;ChildSndr, <expos>child-receiver</expos>&gt;;

    void <expos>start-child</expos>() noexcept { <expos>// exposition only</expos>
      execution::start(this-&gt;<expos>get</expos>());
    }

    void <expos>construct</expos>(ChildSndr&amp;&amp; child) noexcept { <expos>// exposition only</expos>
      ::new (static_cast&lt;void*&gt;(addressof(<expos>storage</expos>))) <expos>child-op</expos>(
        execution::connect(std::forward&lt;ChildSndr&gt;(child),
                           <expos>child-receiver</expos>(static_cast&lt;ParentOp*&gt;(this))));
    }

    void <expos>destruct</expos>() noexcept {
      this-&gt;<expos>get</expos>().~<expos>child-op</expos>();
    }

  private:
    <expos>child-op</expos>&amp; <expos>get</expos>() noexcept {  <expos>// exposition only</expos>
      return *launder(reinterpret_cast&lt;<expos>child-op</expos>*&gt;(addressof(<expos>storage</expos>)));
    }

    using <expos>storage-t</expos> =     <expos>// exposition only</expos>
      conditional_t&lt;
        is_empty_v&lt;<expos>child-op</expos>&gt; &amp;&amp; is_standard_layout_v&lt;<expos>child-op</expos>&gt;,
        <expos>child-op</expos>,
        unsigned char[sizeof(<expos>child-op</expos>)]&gt;;

    union {
      alignas(<expos>child-op</expos>) <expos>storage-t</expos> <expos>storage</expos>;  <expos>// exposition only</expos>
    };
  };

  template&lt;class ParentOp, class ChildTag, class ChildEnv, class ChildSndr&gt;
  class <expos>child-operation</expos>        <expos>// exposition only</expos>
      : public <expos>manual-child-operation</expos>&lt;ParentOp, ChildTag, ChildEnv, ChildSndr&gt; {
    using <expos>base-t</expos> = <expos>manual-child-operation</expos>&lt;ParentOp, ChildTag, ChildEnv, ChildSndr&gt;;
    using <expos>base-t</expos>::<expos>construct</expos>;
    using <expos>base-t</expos>::<expos>destruct</expos>;

  protected:
    explicit <expos>child-operation</expos>(ChildSndr&amp;&amp; child)    <expos>// exposition only</expos>
        noexcept(<expos>base-t</expos>::<expos>is-nothrow-connectable</expos>) {
      <expos>base-t</expos>::<expos>construct</expos>(std::forward&lt;ChildSndr&gt;(child));
    }

    ~<expos>child-operation</expos>() {
      <expos>base-t</expos>::<expos>destruct</expos>();
    }
  };

  template&lt;
    class ParentOp,
    class ParentEnv,
    template&lt;class, size_t&gt; class ChildEnv,
    class Indices,
    class... Children&gt;
  class <expos>child-operations</expos>;    <expos>// exposition only</expos>

  template&lt;
    class ParentOp,
    class ParentEnv,
    template&lt;class, size_t&gt; class ChildEnv,
    size_t... Indices,
    class... Children&gt;
  class <expos>child-operations</expos>&lt;ParentOp, ParentEnv, ChildEnv, index_sequence&lt;Indices...&gt;, Children...&gt;   <expos>// exposition only</expos>
    : public <expos>child-operation</expos>&lt;ParentOp, <expos>indexed-tag</expos>&lt;Indices&gt;, ChildEnv&lt;ParentEnv, Indices&gt;, Children&gt;... {
  protected:
    template&lt;size_t Id&gt;
      using <expos>child-t</expos> =    <expos>//exposition only</expos>
        <expos>child-operation</expos>&lt;ParentOp, <expos>indexed-tag</expos>&lt;Id&gt;, ChildEnv&lt;ParentEnv, Id&gt;, Children...[Id]&gt;;

    explicit <expos>child-operations</expos>(Children&amp;&amp;... children)
      : <expos>child-t</expos>&lt;Indices&gt;(std::forward&lt;Children&gt;(children))...
    {}

    void <expos>start-all</expos>() noexcept {     <expos>// exposition only</expos>
      (<expos>child-t</expos>&lt;Indices&gt;::<expos>start-child</expos>(), ...);
    }
  };
</ins>
  template&lt;class <ins>Tag</ins><del>Sndr</del>, class Rcvr<ins>, class State, class... Children</ins>&gt;
    <del>requires <expos>valid-specialization</expos>&lt;<expos>state-type</expos>, Sndr, Rcvr&gt; &amp;&amp;
             <expos>valid-specialization</expos>&lt;<expos>connect-all-result</expos>, Sndr, Rcvr&gt;</del>
  struct <expos>basic-operation</expos><del> : <expos>basic-state</expos>&lt;Sndr, Rcvr&gt; {</del>                <expos>// exposition only</expos>
    <ins>: <expos>inlinable-operation-state</expos>&lt;<expos>basic-operation</expos>&lt;Tag, Rcvr, State, Children...&gt;, Rcvr&gt;
    , State
    , <expos>child-operations</expos>&lt;
        <expos>basic-operation</expos>&lt;Tag, Rcvr, State, Children...&gt;,
        env_of_t&lt;Rcvr&gt;,
        State::template <expos>env-type</expos>,
        index_sequence_for&lt;Children...&gt;,
        Children...&gt; {</ins>
    using operation_state_concept = operation_state_t;
    <ins>using <expos>rcvr-base</expos> = <expos>inlinable-operation-state</expos>&lt;<expos>basic-operation</expos>, Rcvr&gt;;   <expos>// exposition only</expos>
    using <expos>children-base</expos> =    <expos>//exposition only</expos>
      <expos>child-operations</expos>&lt;
        <expos>basic-operation</expos>,
        env_of_t&lt;Rcvr&gt;,
        State::template <expos>env-type</expos>,
        index_sequence_for&lt;Children...&gt;,
        Children...&gt;;</ins>
    <del>using <expos>tag-t</expos> = tag_of_t&lt;Sndr&gt;;                               <expos>// exposition only</expos>

    <expos>connect-all-result</expos>&lt;Sndr, Rcvr&gt; <expos>inner-ops</expos>;              <expos>// exposition only</expos>

    <expos>basic-operation</expos>(Sndr&amp;&amp; sndr, Rcvr&amp;&amp; rcvr) noexcept(<i>see below</i>)  <expos>// exposition only</expos>
      : <expos>basic-state</expos>&lt;Sndr, Rcvr&gt;(std::forward&lt;Sndr&gt;(sndr), std::move(rcvr)),
        <expos>inner-ops</expos>(<expos>connect-all</expos>(this, std::forward&lt;Sndr&gt;(sndr), <expos>indices-for</expos>&lt;Sndr&gt;()))
    {}</del>

    <ins>template&lt;class Data&gt;
    <expos>basic-operation</expos>(Rcvr&amp;&amp; rcvr, Data&amp;&amp; data, Children&&... children) noexcept(<i>see below</i>)
      : <expos>rcvr-base</expos>(std::move(rcvr))
      , State(std::forward&lt;Data&gt;(data))
      , <expos>children-base</expos>(std::forward&lt;Children&gt;(children)...)
    {}</ins>

    void start() &amp; noexcept {
      <del>auto&amp; [...ops] = <expos>inner-ops</expos>;
      <expos>impls-for</expos>&lt;<expos>tag-t</expos>&gt;::<expos>start</expos>(this-&gt;<expos>state</expos>, this-&gt;<expos>rcvr</expos>, ops...);</del>
      <ins>this-&gt;State::<expos>start</expos>();</ins>
    }
  };

  template&lt;class Sndr, class Env&gt;
  using <expos>completion-signatures-for</expos> = <i>see below</i>;                   <expos>// exposition only</expos>

  template&lt;class Tag, class Data, class... Child&gt;
  struct <expos>basic-sender</expos> : <expos>product-type</expos>&lt;Tag, Data, Child...&gt; {    <expos>// exposition only</expos>
    using sender_concept = sender_t;
    <del>using <expos>indices-for</expos> = index_sequence_for&lt;Child...&gt;;       <expos>// exposition only</expos></del>

    decltype(auto) get_env() const noexcept {
      auto&amp; [_, data, ...child] = *this;
      return <expos><ins>sender-</ins>impls-for</expos>&lt;Tag&gt;::<expos>get-attrs</expos>(data, child...);
    }

    template&lt;<expos>decays-to</expos>&lt;<expos>basic-sender</expos>&gt; Self, receiver Rcvr&gt;
    auto connect(this Self&amp;&amp; self, Rcvr rcvr) noexcept(<i>see below</i>)
      -&gt; <expos>basic-operation</expos>&lt;<ins>Tag</ins><del>Self</del>, Rcvr<ins>, <expos>state-type-t</expos>&lt;Tag, <expos>member-t</expos>&lt;Self, Data&gt;, <expos>member-t</expos>&lt;Self, Child&gt;...</ins>&gt; {
      <del>return {std::forward&lt;Self&gt;(self), std::move(rcvr)};</del>
      <ins>auto&amp; [_, data, ...child] = self;
      return {std::move(rcvr), std::forward&lt;decltype(data)&gt;(data), std::forward&lt;decltype(child)&gt;(child)...};</ins>
    }

    template&lt;<expos>decays-to</expos>&lt;<expos>basic-sender</expos>&gt; Self, class Env&gt;
    auto get_completion_signatures(this Self&amp;&amp; self, Env&amp;&amp; env) noexcept
      -&gt; <expos>completion-signatures-for</expos>&lt;Self, Env&gt; {
      return {};
    }
  };
}
</pre>
</div>

<p>
Strike paragraphs 34 through 38 from [exec.snd.expos]:
</p>

<div class="wording">

<del>The member default-impls ::get-attrs is initialized with a callable object equivalent to the following
lambda:</del>
<pre><del>[](const auto&, const auto&... child) noexcept -> decltype(auto) {
  if constexpr (sizeof...(child) == 1)
    return (FWD-ENV (get_env(child)), ...);
  else
    return env<>();
}
</del></pre>

<del>The member default-impls ::get-env is initialized with a callable object equivalent to the following
lambda:</del>
<pre><del>[](auto, auto&, const auto& rcvr) noexcept -> decltype(auto) {
  return FWD-ENV (get_env(rcvr));
}</del></pre>

<del>The member default-impls ::get-state is initialized with a callable object equivalent to the following
lambda:</del>
<pre><del>[]&lt;class Sndr, class Rcvr&gt;(Sndr&amp;&amp; sndr, Rcvr&amp; rcvr) noexcept -&gt; decltype(auto) {
  auto& [_, data, ...child] = sndr;
  return std::forward_like&lt;Sndr&gt;(data);
}</del></pre>

<del>The member default-impls ::start is initialized with a callable object equivalent to the following lambda:</del>
<pre><del>[](auto&, auto&, auto&... ops) noexcept -> void {
  (execution::start(ops), ...);
}</del></pre>

<del>The member default-impls ::complete is initialized with a callable object equivalent to the following
lambda:</del>
<pre><del>[]&lt;class Index, class Rcvr, class Tag, class... Args&gt;(
  Index, auto& state, Rcvr& rcvr, Tag, Args&&... args) noexcept
  -> void requires callable &lt;Tag, Rcvr, Args...&gt; {
  static_assert(Index::value == 0);
  Tag()(std::move(rcvr), std::forward&lt;Args&gt;(args)...);
}</del></pre>

</div>
</div>
</div>

<div id="outline-container-org26c937d" class="outline-3">
<h3 id="org26c937d"><span class="section-number-3">5.3.</span> Changes to <i><code>just</code></i>, <i><code>just_error</code></i>, and <i><code>just_stopped</code></i></h3>
<div class="outline-text-3" id="text-5-3">
<p>
Modify [exec.just] p2.3 as follows:
</p>

<div class="wording">
<pre>template&lt;&gt;
struct <expos><ins>state-</ins>impls-for</expos>&lt;<expos>decayed-typeof</expos>&lt;<expos>just-cpo</expos>&gt;&gt; : <expos>default-<ins>state-</ins>impls</expos> {
  <del>static constexpr auto <expos>start</expos> =
    [](auto& state, auto& rcvr) noexcept -> void {
      auto& [...ts] = state;
      <expos>set-cpo</expos>(std::move(rcvr), std::move(ts)...);
    };</del>
  <ins>template &lt;class Self&gt;
  void <expos>start</expos>(this Self& self) {
    auto& [...ts] = self.data;
    <expos>set-cpo</expos>(std::move(self.get_receiver()), std::move(ts)...);
  }</ins>
};</pre>
</div>
</div>
</div>

<div id="outline-container-org1f8192d" class="outline-3">
<h3 id="org1f8192d"><span class="section-number-3">5.4.</span> Changes to <i><code>read_env</code></i></h3>
<div class="outline-text-3" id="text-5-4">
<p>
Modify [exec.read.env] p3 as follows:
</p>

<div class="wording">
<pre>template&lt;&gt;
struct <expos><ins>state-</ins>impls-for</expos>&lt;<expos>decayed-typeof</expos>&lt;read_env&gt;&gt; : <expos>default-<ins>state-</ins>impls</expos> {
  <del>static constexpr auto start =
    [](auto query, auto& rcvr) noexcept -> void {
      TRY-SET-VALUE (rcvr, query(get_env(rcvr)));
    };</del>
  <ins>template &lt;class Self&gt;
  void <expos>start</expos>(this Self& self) {
    TRY-SET-VALUE(self.get_receiver(), self.data(get_env(self.get_receiver())));
  }</ins>
};</pre>
</div>
</div>
</div>

<div id="outline-container-org8cfe691" class="outline-3">
<h3 id="org8cfe691"><span class="section-number-3">5.5.</span> Changes to <i><code>schedule_from</code></i></h3>
<div class="outline-text-3" id="text-5-5">
<p>
Forthcoming.
</p>
</div>
</div>

<div id="outline-container-org5ed4102" class="outline-3">
<h3 id="org5ed4102"><span class="section-number-3">5.6.</span> Changes to <i><code>then</code></i>, <i><code>upon_error</code></i>, <i><code>upon_stopped</code></i></h3>
<div class="outline-text-3" id="text-5-6">
<p>
Modify [exec.then] p4 as follows:
</p>

<div class="wording">
<pre>template&lt;&gt;
struct <expos><ins>state-</ins>impls-for</expos>&lt;<expos>decayed-typeof</expos>&lt;<expos>then-cpo</expos>&gt;&gt; : <expos>default-<ins>state-</ins>impls</expos> {
  <del>static constexpr auto <expos>complete</expos> =
    []&lt;class Tag, class... Args&gt;
      (auto, auto& fn, auto& rcvr, Tag, Args&&... args) noexcept -> void {
      if constexpr (same_as&lt;Tag, <expos>decayed-typeof</expos>&lt;<expos>set-cpo</expos>&gt;&gt;) {
        TRY-SET-VALUE (rcvr,
          invoke(std::move(fn), std::forward&lt;Args&gt;(args)...));
      } else {
        Tag()(std::move(rcvr), std::forward&lt;Args&gt;(args)...);
      }
    };</del>
  <ins>template&lt;class Self, size_t Id, class CompletionTag, class... Datums&gt;
  constexpr void <expos>complete</expos>(this Self& self, <expos>indexed-tag</expos>&lt;Id&gt;,
    CompletionTag, Datums&&... datums) noexcept { //exposition only
      if constexpr (same_as&lt;CompletionTag, <expos>decayed-typeof</expos>&lt;<expos>set-cpo</expos>&gt;&gt;) {
        TRY-SET-VALUE (self.get_receiver(),
          invoke(self.data, std::forward&lt;Datums&gt;(datums)...));
      } else {
        CompletionTag()(self.get_receiver(), std::forward&lt;Datums&gt;(datums)...);
      }
  }</ins>
};</pre>
</div>
</div>
</div>

<div id="outline-container-orge3815b0" class="outline-3">
<h3 id="orge3815b0"><span class="section-number-3">5.7.</span> Changes to <i><code>let_value</code></i>, <i><code>let_error</code></i>, <i><code>let_stopped</code></i></h3>
<div class="outline-text-3" id="text-5-7">
<p>
Forthcoming.
</p>
</div>
</div>

<div id="outline-container-org8412ee9" class="outline-3">
<h3 id="org8412ee9"><span class="section-number-3">5.8.</span> Changes to <i><code>bulk</code></i></h3>
<div class="outline-text-3" id="text-5-8">
<p>
Modify [exec.bulk] p3 as follows:
</p>

<div class="wording">
<pre>template&lt;&gt;
struct <expos><ins>state-</ins>impls-for</expos>&lt;bulk_t&gt; : <expos>default-<ins>state-</ins>impls</expos> {
  <del>static constexpr auto <expos>complete</expos> = see below;</del>
  <ins>template&lt;class Self, size_t Id, class CompletionTag, class... Datums&gt;
  constexpr void <expos>complete</expos>(this Self&amp; self, <expos>indexed-tag</expos>&lt;Id&gt;,
    CompletionTag, Datums&amp;&amp;... datums) noexcept requires see below;</ins>
};</pre>
</div>

<p>
Modify [exec.bulk] p4 as follows:
</p>

<div class="wording">
The member <expos><ins>state-</ins>impls-for</expos>&lt;bulk_t&gt;::<expos>complete</expos> is <del>initialized with a callable object</del> equivalent to the following
<del>lambda</del>:

<pre><del>[]&lt;class Index, class State, class Rcvr, class Tag, class... Args&gt;
  (Index, State&amp; state, Rcvr&amp; rcvr, Tag, Args&amp;&amp;... args) noexcept -&gt; void requires see below {
  if constexpr (same_as&lt;Tag, set_value_t&gt;) {
    auto&amp; [shape, f] = state;
    constexpr bool nothrow = noexcept(f(auto(shape), args...));
    TRY-EVAL (rcvr, [&amp;]() noexcept(nothrow) {
      for (decltype(auto(shape)) i = 0; i &lt; shape; ++i) {
        f(auto(i), args...);
      }
      Tag()(std::move(rcvr), std::forward&lt;Args&gt;(args)...);
    }());
  } else {
    Tag()(std::move(rcvr), std::forward&lt;Args&gt;(args)...);
  }
}</del>
<ins>if constexpr (same_as&lt;CompletionTag, set_value_t&gt;) {
  auto&amp; [shape, f] = self.data;
  constexpr bool nothrow = noexcept(f(auto(shape), datums...));
  TRY-EVAL (self.get_receiver(), [&amp;]() noexcept(nothrow) {
    for (decltype(auto(shape)) i = 0; i &lt; shape; ++i) {
      f(auto(i), datums...);
    }
    CompletionTag()(self.get_receiver(), std::forward&lt;Datums&gt;(datums)...);
  }());
} else {
  CompletionTag()(self.get_receiver(), std::forward&lt;Datums&gt;(datums)...);
}</ins></pre>
</div>
</div>
</div>

<div id="outline-container-orgb2eef78" class="outline-3">
<h3 id="orgb2eef78"><span class="section-number-3">5.9.</span> Changes to <i><code>split</code></i></h3>
<div class="outline-text-3" id="text-5-9">
<p>
Forthcoming.
</p>
</div>
</div>

<div id="outline-container-org159dba0" class="outline-3">
<h3 id="org159dba0"><span class="section-number-3">5.10.</span> Changes to <i><code>when_all</code></i></h3>
<div class="outline-text-3" id="text-5-10">
<p>
Forthcoming.
</p>
</div>
</div>

<div id="outline-container-orgc6422ff" class="outline-3">
<h3 id="orgc6422ff"><span class="section-number-3">5.11.</span> Changes to <i><code>into_variant</code></i></h3>
<div class="outline-text-3" id="text-5-11">
<p>
Forthcoming.
</p>
</div>
</div>

<div id="outline-container-org0eda1e7" class="outline-3">
<h3 id="org0eda1e7"><span class="section-number-3">5.12.</span> Changes to <i><code>run_loop::run-loop-sender</code></i></h3>
<div class="outline-text-3" id="text-5-12">
<p>
Forthcoming.
</p>
</div>
</div>
</div>

<div id="outline-container-org716872d" class="outline-2">
<h2 id="org716872d"><span class="section-number-2">6.</span> References</h2>
<div class="outline-text-2" id="text-6">
<ul class="org-ul">
<li><a href="https://github.com/cplusplus/sender-receiver/issues/224">https://github.com/cplusplus/sender-receiver/issues/224</a>
Original GitHub issue describing the problem and the suggested solution that
this paper presents.</li>
</ul>
</div>
</div>
</div>
</body>
</html>
