<!doctype html><html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport">
  <title>P1629R0: Standard Text Encoding</title>
<style data-fill-with="stylesheet">/******************************************************************************
 *                   Style sheet for the W3C specifications                   *
 *
 * Special classes handled by this style sheet include:
 *
 * Indices
 *   - .toc for the Table of Contents (<ol class="toc">)
 *     + <span class="secno"> for the section numbers
 *   - #toc for the Table of Contents (<nav id="toc">)
 *   - ul.index for Indices (<a href="#ref">term</a><span>, in §N.M</span>)
 *   - table.index for Index Tables (e.g. for properties or elements)
 *
 * Structural Markup
 *   - table.data for general data tables
 *     -> use 'scope' attribute, <colgroup>, <thead>, and <tbody> for best results !
 *     -> use <table class='complex data'> for extra-complex tables
 *     -> use <td class='long'> for paragraph-length cell content
 *     -> use <td class='pre'> when manual line breaks/indentation would help readability
 *   - dl.switch for switch statements
 *   - ol.algorithm for algorithms (helps to visualize nesting)
 *   - .figure and .caption (HTML4) and figure and figcaption (HTML5)
 *     -> .sidefigure for right-floated figures
 *   - ins/del
 *
 * Code
 *   - pre and code
 *
 * Special Sections
 *   - .note       for informative notes             (div, p, span, aside, details)
 *   - .example    for informative examples          (div, p, pre, span)
 *   - .issue      for issues                        (div, p, span)
 *   - .assertion  for assertions                    (div, p, span)
 *   - .advisement for loud normative statements     (div, p, strong)
 *   - .annoying-warning for spec obsoletion notices (div, aside, details)
 *
 * Definition Boxes
 *   - pre.def   for WebIDL definitions
 *   - table.def for tables that define other entities (e.g. CSS properties)
 *   - dl.def    for definition lists that define other entities (e.g. HTML elements)
 *
 * Numbering
 *   - .secno for section numbers in .toc and headings (<span class='secno'>3.2</span>)
 *   - .marker for source-inserted example/figure/issue numbers (<span class='marker'>Issue 4</span>)
 *   - ::before styled for CSS-generated issue/example/figure numbers:
 *     -> Documents wishing to use this only need to add
 *        figcaption::before,
 *        .caption::before { content: "Figure "  counter(figure) " ";  }
 *        .example::before { content: "Example " counter(example) " "; }
 *        .issue::before   { content: "Issue "   counter(issue) " ";   }
 *
 * Header Stuff (ignore, just don't conflict with these classes)
 *   - .head for the header
 *   - .copyright for the copyright
 *
 * Miscellaneous
 *   - .overlarge for things that should be as wide as possible, even if
 *     that overflows the body text area. This can be used on an item or
 *     on its container, depending on the effect desired.
 *     Note that this styling basically doesn't help at all when printing,
 *     since A4 paper isn't much wider than the max-width here.
 *     It's better to design things to fit into a narrower measure if possible.
 *   - js-added ToC jump links (see fixup.js)
 *
 ******************************************************************************/

/******************************************************************************/
/*                                   Body                                     */
/******************************************************************************/

	body {
		counter-reset: example figure issue;

		/* Layout */
		max-width: 50em;               /* limit line length to 50em for readability   */
		margin: 0 auto;                /* center text within page                     */
		padding: 1.6em 1.5em 2em 50px; /* assume 16px font size for downlevel clients */
		padding: 1.6em 1.5em 2em calc(26px + 1.5em); /* leave space for status flag     */

		/* Typography */
		line-height: 1.5;
		font-family: sans-serif;
		widows: 2;
		orphans: 2;
		word-wrap: break-word;
		overflow-wrap: break-word;
		hyphens: auto;

		/* Colors */
		color: black;
		background: white top left fixed no-repeat;
		background-size: 25px auto;
	}


/******************************************************************************/
/*                         Front Matter & Navigation                          */
/******************************************************************************/

/** Header ********************************************************************/

	div.head { margin-bottom: 1em }
	div.head hr { border-style: solid; }

	div.head h1 {
		font-weight: bold;
		margin: 0 0 .1em;
		font-size: 220%;
	}

	div.head h2 { margin-bottom: 1.5em;}

/** W3C Logo ******************************************************************/

	.head .logo {
		float: right;
		margin: 0.4rem 0 0.2rem .4rem;
	}

	.head img[src*="logos/W3C"] {
		display: block;
		border: solid #1a5e9a;
		border-width: .65rem .7rem .6rem;
		border-radius: .4rem;
		background: #1a5e9a;
		color: white;
		font-weight: bold;
	}

	.head a:hover > img[src*="logos/W3C"],
	.head a:focus > img[src*="logos/W3C"] {
		opacity: .8;
	}

	.head a:active > img[src*="logos/W3C"] {
		background: #c00;
		border-color: #c00;
	}

	/* see also additional rules in Link Styling section */

/** Copyright *****************************************************************/

	p.copyright,
	p.copyright small { font-size: small }

/** Back to Top / ToC Toggle **************************************************/

	@media print {
		#toc-nav {
			display: none;
		}
	}
	@media not print {
		#toc-nav {
			position: fixed;
			z-index: 2;
			bottom: 0; left: 0;
			margin: 0;
			min-width: 1.33em;
			border-top-right-radius: 2rem;
			box-shadow: 0 0 2px;
			font-size: 1.5em;
			color: black;
		}
		/* Links that are direct children of the fixed #toc-nav widget:
		   each is a block-level, single-line click target on a white
		   background with a rounded top-right corner. */
		#toc-nav > a {
			display: block;
			white-space: nowrap;

			height: 1.33em;
			padding: .1em 0.3em;
			margin: 0;

			/* `background` was previously declared twice with the same value;
			   one declaration is sufficient. */
			background: white;
			box-shadow: 0 0 2px;
			border: none;
			border-top-right-radius: 1.33em;
		}
		#toc-nav > #toc-jump {
			padding-bottom: 2em;
			margin-bottom: -1.9em;
		}

		#toc-nav > a:hover,
		#toc-nav > a:focus {
			background: #f8f8f8;
		}
		#toc-nav > a:not(:hover):not(:focus) {
			color: #707070;
		}

		/* statusbar gets in the way on keyboard focus; remove once browsers fix */
		#toc-nav > a[href="#toc"]:not(:hover):focus:last-child {
			padding-bottom: 1.5rem;
		}

		#toc-nav:not(:hover) > a:not(:focus) > span + span {
			/* Ideally this uses :focus-within on #toc-nav */
			display: none;
		}
		#toc-nav > a > span + span {
			padding-right: 0.2em;
		}

		#toc-toggle-inline {
			vertical-align: 0.05em;
			font-size: 80%;
			color: gray;
			color: hsla(203,20%,40%,.7);
			border-style: none;
			background: transparent;
			position: relative;
		}
		#toc-toggle-inline:hover:not(:active),
		#toc-toggle-inline:focus:not(:active) {
			text-shadow: 1px 1px silver;
			top: -1px;
			left: -1px;
		}

		#toc-nav :active {
			color: #C00;
		}
	}

/** ToC Sidebar ***************************************************************/

	/* Floating sidebar */
	@media screen {
		body.toc-sidebar #toc {
			position: fixed;
			top: 0; bottom: 0;
			left: 0;
			width: 23.5em;
			max-width: 80%;
			max-width: calc(100% - 2em - 26px);
			overflow: auto;
			padding: 0 1em;
			padding-left: 42px;
			padding-left: calc(1em + 26px);
			background: inherit;
			background-color: #f7f8f9;
			z-index: 1;
			box-shadow: -.1em 0 .25em rgba(0,0,0,.1) inset;
		}
		body.toc-sidebar #toc h2 {
			margin-top: .8rem;
			font-variant: small-caps;
			font-variant: all-small-caps;
			text-transform: lowercase;
			font-weight: bold;
			color: gray;
			color: hsla(203,20%,40%,.7);
		}
		body.toc-sidebar #toc-jump:not(:focus) {
			width: 0;
			height: 0;
			padding: 0;
			position: absolute;
			overflow: hidden;
		}
	}
	/* Hide main scroller when only the ToC is visible anyway */
	@media screen and (max-width: 28em) {
		body.toc-sidebar {
			overflow: hidden;
		}
	}

	/* Sidebar with its own space */
	@media screen and (min-width: 78em) {
		body:not(.toc-inline) #toc {
			position: fixed;
			top: 0; bottom: 0;
			left: 0;
			width: 23.5em;
			overflow: auto;
			padding: 0 1em;
			padding-left: 42px;
			padding-left: calc(1em + 26px);
			background: inherit;
			background-color: #f7f8f9;
			z-index: 1;
			box-shadow: -.1em 0 .25em rgba(0,0,0,.1) inset;
		}
		body:not(.toc-inline) #toc h2 {
			margin-top: .8rem;
			font-variant: small-caps;
			font-variant: all-small-caps;
			text-transform: lowercase;
			font-weight: bold;
			color: gray;
			color: hsla(203,20%,40%,.7);
		}

		body:not(.toc-inline) {
			padding-left: 29em;
		}
		/* See also Overflow section at the bottom */

		body:not(.toc-inline) #toc-jump:not(:focus) {
			width: 0;
			height: 0;
			padding: 0;
			position: absolute;
			overflow: hidden;
		}
	}
	@media screen and (min-width: 90em) {
		body:not(.toc-inline) {
			margin: 0 4em;
		}
	}

/******************************************************************************/
/*                                Sectioning                                  */
/******************************************************************************/

/** Headings ******************************************************************/

	h1, h2, h3, h4, h5, h6, dt {
		page-break-after: avoid;
		page-break-inside: avoid;
		font: 100% sans-serif;   /* Reset all font styling to clear out UA styles */
		font-family: inherit;    /* Inherit the font family. */
		line-height: 1.2;        /* Keep wrapped headings compact */
		hyphens: manual;         /* Hyphenated headings look weird */
	}

	h2, h3, h4, h5, h6 {
		margin-top: 3rem;
	}

	h1, h2, h3 {
		color: #005A9C;
		background: transparent;
	}

	h1 { font-size: 170%; }
	h2 { font-size: 140%; }
	h3 { font-size: 120%; }
	h4 { font-weight: bold; }
	h5 { font-style: italic; }
	h6 { font-variant: small-caps; }
	dt { font-weight: bold; }

/** Subheadings ***************************************************************/

	h1 + h2,
	#subtitle {
		/* #subtitle is a subtitle in an H2 under the H1 */
		margin-top: 0;
	}
	h2 + h3,
	h3 + h4,
	h4 + h5,
	h5 + h6 {
		margin-top: 1.2em; /* = 1 x line-height */
	}

/** Section divider ***********************************************************/

	:not(.head) > hr {
		font-size: 1.5em;
		text-align: center;
		margin: 1em auto;
		height: auto;
		border: transparent solid 0;
		background: transparent;
	}
	:not(.head) > hr::before {
		content: "\2727\2003\2003\2727\2003\2003\2727";
	}

/******************************************************************************/
/*                            Paragraphs and Lists                            */
/******************************************************************************/

	p {
		margin: 1em 0;
	}

	dd > p:first-child,
	li > p:first-child {
		margin-top: 0;
	}

	ul, ol {
		margin-left: 0;
		padding-left: 2em;
	}

	li {
		margin: 0.25em 0 0.5em;
		padding: 0;
	}

	dl dd {
		margin: 0 0 .5em 2em;
	}

	.head dd + dd { /* compact for header */
		margin-top: -.5em;
	}

	/* Style for algorithms */
	ol.algorithm ol:not(.algorithm),
	.algorithm > ol ol:not(.algorithm) {
	 border-left: 0.5em solid #DEF;
	}

	/* Put nice boxes around each algorithm. */
	[data-algorithm]:not(.heading) {
	  padding: .5em;
	  border: thin solid #ddd; border-radius: .5em;
	  margin: .5em calc(-0.5em - 1px);
	}
	[data-algorithm]:not(.heading) > :first-child {
	  margin-top: 0;
	}
	[data-algorithm]:not(.heading) > :last-child {
	  margin-bottom: 0;
	}

	/* Style for switch/case <dl>s */
	dl.switch > dd > ol.only,
	dl.switch > dd > .only > ol {
	 margin-left: 0;
	}
	dl.switch > dd > ol.algorithm,
	dl.switch > dd > .algorithm > ol {
	 margin-left: -2em;
	}
	dl.switch {
	 padding-left: 2em;
	}
	dl.switch > dt {
	 text-indent: -1.5em;
	 margin-top: 1em;
	}
	dl.switch > dt + dt {
	 margin-top: 0;
	}
	dl.switch > dt::before {
	 content: '\21AA';
	 padding: 0 0.5em 0 0;
	 display: inline-block;
	 width: 1em;
	 text-align: right;
	 line-height: 0.5em;
	}

/** Terminology Markup ********************************************************/


/******************************************************************************/
/*                                 Inline Markup                              */
/******************************************************************************/

/** Terminology Markup ********************************************************/
	dfn   { /* Defining instance */
		font-weight: bolder;
	}
	a > i { /* Instance of term */
		font-style: normal;
	}
	dt dfn code, code.idl {
		font-size: medium;
	}
	dfn var {
		font-style: normal;
	}

/** Change Marking ************************************************************/

	del { color: red;  text-decoration: line-through; }
	ins { color: #080; text-decoration: underline;    }

/** Miscellaneous improvements to inline formatting ***************************/

	sup {
		vertical-align: super;
		font-size: 80%
	}

/******************************************************************************/
/*                                    Code                                    */
/******************************************************************************/

/** General monospace/pre rules ***********************************************/

	pre, code, samp {
		font-family: Menlo, Consolas, "DejaVu Sans Mono", Monaco, monospace;
		font-size: .9em;
		page-break-inside: avoid;
		hyphens: none;
		text-transform: none;
	}
	pre code,
	code code {
		font-size: 100%;
	}

	pre {
		margin-top: 1em;
		margin-bottom: 1em;
		overflow: auto;
	}

/** Inline Code fragments *****************************************************/

  /* Do something nice. */

/******************************************************************************/
/*                                    Links                                   */
/******************************************************************************/

/** General Hyperlinks ********************************************************/

	/* We hyperlink a lot, so make it less intrusive */
	a[href] {
		color: #034575;
		text-decoration: none;
		border-bottom: 1px solid #707070;
		/* Need a bit of extending for it to look okay */
		padding: 0 1px 0;
		margin: 0 -1px 0;
	}
	a:visited {
		border-bottom-color: #BBB;
	}

	/* Use distinguishing colors when user is interacting with the link */
	a[href]:focus,
	a[href]:hover {
		background: #f8f8f8;
		background: rgba(75%, 75%, 75%, .25);
		border-bottom-width: 3px;
		margin-bottom: -2px;
	}
	a[href]:active {
		color: #C00;
		border-color: #C00;
	}

	/* Backout above styling for W3C logo */
	.head .logo,
	.head .logo a {
		border: none;
		text-decoration: none;
		background: transparent;
	}

/******************************************************************************/
/*                                    Images                                  */
/******************************************************************************/

	img {
		border-style: none;
	}

	/* For autogen numbers, add
	   .caption::before, figcaption::before { content: "Figure " counter(figure) ". "; }
	*/

	figure, .figure, .sidefigure {
		page-break-inside: avoid;
		text-align: center;
		margin: 2.5em 0;
	}
	.figure img,    .sidefigure img,    figure img,
	.figure object, .sidefigure object, figure object {
		max-width: 100%;
		margin: auto;
	}
	.figure pre, .sidefigure pre, figure pre {
		text-align: left;
		display: table;
		margin: 1em auto;
	}
	.figure table, figure table {
		margin: auto;
	}
	@media screen and (min-width: 20em) {
		.sidefigure {
			float: right;
			width: 50%;
			margin: 0 0 0.5em 0.5em
		}
	}
	.caption, figcaption, caption {
		font-style: italic;
		font-size: 90%;
	}
	.caption::before, figcaption::before, figcaption > .marker {
		font-weight: bold;
	}
	.caption, figcaption {
		counter-increment: figure;
	}

	/* DL list is indented 2em, but figure inside it is not */
	dd > .figure, dd > figure { margin-left: -2em }

/******************************************************************************/
/*                             Colored Boxes                                  */
/******************************************************************************/

	.issue, .note, .example, .assertion, .advisement, blockquote {
		padding: .5em;
		border: .5em;
		border-left-style: solid;
		page-break-inside: avoid;
	}
	span.issue, span.note {
		padding: .1em .5em .15em;
		border-right-style: solid;
	}

	.issue,
	.note,
	.example,
	.advisement,
	.assertion,
	blockquote {
		margin: 1em auto;
	}
	.note  > p:first-child,
	.issue > p:first-child,
	blockquote > :first-child {
		margin-top: 0;
	}
	blockquote > :last-child {
		margin-bottom: 0;
	}

/** Blockquotes ***************************************************************/

	blockquote {
		border-color: silver;
	}

/** Open issue ****************************************************************/

	.issue {
		border-color: #E05252;
		background: #FBE9E9;
		counter-increment: issue;
		overflow: auto;
	}
	/* Label of an issue box: either the generated ::before content or an
	   explicit .marker child. Shown in upper case, dark red, with a gap
	   before the issue text. (A duplicated text-transform declaration
	   was removed; the computed style is unchanged.) */
	.issue::before, .issue > .marker {
		text-transform: uppercase;
		color: #AE1E1E;
		padding-right: 1em;
	}
	/* Add .issue::before { content: "Issue " counter(issue) " "; } for autogen numbers,
	   or use class="marker" to mark up the issue number in source. */

/** Example *******************************************************************/

	.example {
		border-color: #E0CB52;
		background: #FCFAEE;
		counter-increment: example;
		overflow: auto;
		clear: both;
	}
	.example::before, .example > .marker {
		text-transform: uppercase;
		color: #827017;
		min-width: 7.5em;
		display: block;
	}
	/* Add .example::before { content: "Example " counter(example) " "; } for autogen numbers,
	   or use class="marker" to mark up the example number in source. */

/** Non-normative Note ********************************************************/

	.note {
		border-color: #52E052;
		background: #E9FBE9;
		overflow: auto;
	}

	.note::before, .note > .marker,
	details.note > summary::before,
	details.note > summary > .marker {
		text-transform: uppercase;
		display: block;
		color: hsl(120, 70%, 30%);
	}
	/* Add .note::before { content: "Note"; } for autogen label,
	   or use class="marker" to mark up the label in source. */

	details.note > summary {
		display: block;
		color: hsl(120, 70%, 30%);
	}
	details.note[open] > summary {
		border-bottom: 1px silver solid;
	}

/** Assertion Box *************************************************************/
	/*  for assertions in algorithms */

	.assertion {
		border-color: #AAA;
		background: #EEE;
	}

/** Advisement Box ************************************************************/
	/*  for attention-grabbing normative statements */

	.advisement {
		border-color: orange;
		border-style: none solid;
		background: #FFEECC;
	}
	strong.advisement {
		display: block;
		text-align: center;
	}
	.advisement > .marker {
		color: #B35F00;
	}

/** Spec Obsoletion Notice ****************************************************/
	/* obnoxious obsoletion notice for older/abandoned specs. */

	details {
		display: block;
	}
	summary {
		font-weight: bolder;
	}

	.annoying-warning:not(details),
	details.annoying-warning:not([open]) > summary,
	details.annoying-warning[open] {
		background: #fdd;
		color: red;
		font-weight: bold;
		padding: .75em 1em;
		border: thick red;
		border-style: solid;
		border-radius: 1em;
	}
	.annoying-warning :last-child {
		margin-bottom: 0;
	}

@media not print {
	details.annoying-warning[open] {
		position: fixed;
		left: 1em;
		right: 1em;
		bottom: 1em;
		z-index: 1000;
	}
}

	details.annoying-warning:not([open]) > summary {
		text-align: center;
	}

/** Entity Definition Boxes ***************************************************/

	.def {
		padding: .5em 1em;
		background: #DEF;
		margin: 1.2em 0;
		border-left: 0.5em solid #8CCBF2;
	}

/******************************************************************************/
/*                                    Tables                                  */
/******************************************************************************/

	th, td {
		text-align: left;
		text-align: start;
	}

/** Property/Descriptor Definition Tables *************************************/

	table.def {
		/* inherits .def box styling, see above */
		width: 100%;
		border-spacing: 0;
	}

	table.def td,
	table.def th {
		padding: 0.5em;
		vertical-align: baseline;
		border-bottom: 1px solid #bbd7e9;
	}

	table.def > tbody > tr:last-child th,
	table.def > tbody > tr:last-child td {
		border-bottom: 0;
	}

	table.def th {
		font-style: italic;
		font-weight: normal;
		padding-left: 1em;
		width: 3em;
	}

	/* For when values are extra-complex and need formatting for readability */
	table td.pre {
		white-space: pre-wrap;
	}

	/* A footnote at the bottom of a def table */
	table.def           td.footnote {
		padding-top: 0.6em;
	}
	table.def           td.footnote::before {
		content: " ";
		display: block;
		height: 0.6em;
		width: 4em;
		border-top: thin solid;
	}

/** Data tables (and properly marked-up index tables) *************************/
	/*
		 <table class="data"> highlights structural relationships in a table
		 when correct markup is used (e.g. thead/tbody, th vs. td, scope attribute)

		 Use class="complex data" for particularly complicated tables --
		 (This will draw more lines: busier, but clearer.)

		 Use class="long" on table cells with paragraph-like contents
		 (This will adjust text alignment accordingly.)
		 Alternately use class="longlastcol" on tables, to have the last column assume "long".
	*/

	table {
		word-wrap: normal;
		overflow-wrap: normal;
		hyphens: manual;
	}

	table.data,
	table.index {
		margin: 1em auto;
		border-collapse: collapse;
		border: hidden;
		width: 100%;
	}
	table.data caption,
	table.index caption {
		max-width: 50em;
		margin: 0 auto 1em;
	}

	table.data td,  table.data th,
	table.index td, table.index th {
		padding: 0.5em 1em;
		border-width: 1px;
		border-color: silver;
		border-top-style: solid;
	}

	table.data thead td:empty {
		padding: 0;
		border: 0;
	}

	table.data  thead,
	table.index thead,
	table.data  tbody,
	table.index tbody {
		border-bottom: 2px solid;
	}

	table.data colgroup,
	table.index colgroup {
		border-left: 2px solid;
	}

	table.data  tbody th:first-child,
	table.index tbody th:first-child  {
		border-right: 2px solid;
		border-top: 1px solid silver;
		padding-right: 1em;
	}

	table.data th[colspan],
	table.data td[colspan] {
		text-align: center;
	}

	table.complex.data th,
	table.complex.data td {
		border: 1px solid silver;
		text-align: center;
	}

	table.data.longlastcol td:last-child,
	table.data td.long {
	 vertical-align: baseline;
	 text-align: left;
	}

	table.data img {
		vertical-align: middle;
	}


/*
Alternate table alignment rules

	table.data,
	table.index {
		text-align: center;
	}

	table.data  thead th[scope="row"],
	table.index thead th[scope="row"] {
		text-align: right;
	}

	table.data  tbody th:first-child,
	table.index tbody th:first-child  {
		text-align: right;
	}

Possible extra rowspan handling

	table.data  tbody th[rowspan]:not([rowspan='1']),
	table.index tbody th[rowspan]:not([rowspan='1']),
	table.data  tbody td[rowspan]:not([rowspan='1']),
	table.index tbody td[rowspan]:not([rowspan='1']) {
		border-left: 1px solid silver;
	}

	table.data  tbody th[rowspan]:first-child,
	table.index tbody th[rowspan]:first-child,
	table.data  tbody td[rowspan]:first-child,
	table.index tbody td[rowspan]:first-child{
		border-left: 0;
		border-right: 1px solid silver;
	}
*/

/******************************************************************************/
/*                                  Indices                                   */
/******************************************************************************/


/** Table of Contents *********************************************************/

	.toc a {
		/* More spacing; use padding to make it part of the click target. */
		padding-top: 0.1rem;
		/* Larger, more consistently-sized click target */
		display: block;
		/* Reverse color scheme */
		color: black;
		border-color: #3980B5;
		border-bottom-width: 3px !important;
		margin-bottom: 0px !important;
	}
	.toc a:visited {
		border-color: #054572;
	}
	.toc a:not(:focus):not(:hover) {
		/* Allow colors to cascade through from link styling */
		border-bottom-color: transparent;
	}

	.toc, .toc ol, .toc ul, .toc li {
		list-style: none; /* Numbers must be inlined into source */
		/* because generated content isn't search/selectable and markers can't do multilevel yet */
		margin:  0;
		padding: 0;
		line-height: 1.1rem; /* consistent spacing */
	}

	/* ToC not indented until third level, but font style & margins show hierarchy */
	.toc > li             { font-weight: bold;   }
	.toc > li li          { font-weight: normal; }
	.toc > li li li       { font-size:   95%;    }
	.toc > li li li li    { font-size:   90%;    }
	.toc > li li li li .secno { font-size: 85%; }
	.toc > li li li li li { font-size:   85%;    }
	.toc > li li li li li .secno { font-size: 100%; }

	/* @supports not (display:grid) { */
		.toc > li             { margin: 1.5rem 0;    }
		.toc > li li          { margin: 0.3rem 0;    }
		.toc > li li li       { margin-left: 2rem;   }

		/* Section numbers in a column of their own */
		.toc .secno {
			float: left;
			width: 4rem;
			white-space: nowrap;
		}

		.toc li {
			clear: both;
		}

		:not(li) > .toc              { margin-left:  5rem; }
		.toc .secno                  { margin-left: -5rem; }
		.toc > li li li .secno       { margin-left: -7rem; }
		.toc > li li li li .secno    { margin-left: -9rem; }
		.toc > li li li li li .secno { margin-left: -11rem; }

		/* Tighten up indentation in narrow ToCs */
		@media (max-width: 30em) {
			:not(li) > .toc              { margin-left:  4rem; }
			.toc .secno                  { margin-left: -4rem; }
			.toc > li li li              { margin-left:  1rem; }
			.toc > li li li .secno       { margin-left: -5rem; }
			.toc > li li li li .secno    { margin-left: -6rem; }
			.toc > li li li li li .secno { margin-left: -7rem; }
		}
	/* } */

	@supports (display:grid) {
		/* Use #toc over .toc to override non-@supports rules. */
		#toc {
			display: grid;
			align-content: start;
			grid-template-columns: auto 1fr;
			grid-column-gap: 1rem;
			column-gap: 1rem;
			grid-row-gap: .6rem;
			row-gap: .6rem;
		}
		#toc h2 {
			grid-column: 1 / -1;
			margin-bottom: 0;
		}
		#toc ol,
		#toc li,
		#toc a {
			display: contents;
			/* Switch <a> to subgrid when supported */
		}
		#toc span {
			margin: 0;
		}
		#toc > .toc > li > a > span {
			/* The spans of the top-level list,
			   comprising the first items of each top-level section. */
			margin-top: 1.1rem;
		}
		#toc#toc .secno { /* Ugh, need more specificity to override base.css */
			grid-column: 1;
			width: auto;
			margin-left: 0;
		}
		#toc .content {
			grid-column: 2;
			width: auto;
			margin-right: 1rem;
		}
		#toc .content:hover {
			background: rgba(75%, 75%, 75%, .25);
			border-bottom: 3px solid #054572;
			margin-bottom: -3px;
		}
		#toc li li li .content {
			margin-left: 1rem;
		}
		#toc li li li li .content {
			margin-left: 2rem;
		}
	}


/** Index *********************************************************************/

	/* Index Lists: Layout */
	ul.index       { margin-left: 0; columns: 15em; text-indent: 1em hanging; }
	ul.index li    { margin-left: 0; list-style: none; break-inside: avoid; }
	ul.index li li { margin-left: 1em }
	ul.index dl    { margin-top: 0; }
	ul.index dt    { margin: .2em 0 .2em 20px;}
	ul.index dd    { margin: .2em 0 .2em 40px;}
	/* Index Lists: Typography */
	ul.index ul,
	ul.index dl { font-size: smaller; }
	@media not print {
		ul.index li span {
			white-space: nowrap;
			color: transparent; }
		ul.index li a:hover + span,
		ul.index li a:focus + span {
			color: #707070;
		}
	}

/** Index Tables *****************************************************/
	/* See also the data table styling section, which this effectively subclasses */

	table.index {
		font-size: small;
		border-collapse: collapse;
		border-spacing: 0;
		text-align: left;
		margin: 1em 0;
	}

	table.index td,
	table.index th {
		padding: 0.4em;
	}

	table.index tr:hover td:not([rowspan]),
	table.index tr:hover th:not([rowspan]) {
		background: #f7f8f9;
	}

	/* The link in the first column in the property table (formerly a TD) */
	table.index th:first-child a {
		font-weight: bold;
	}

/******************************************************************************/
/*                                    Print                                   */
/******************************************************************************/

	@media print {
		/* Pages have their own margins. */
		html {
			margin: 0;
		}
		/* Serif for print. */
		body {
			font-family: serif;
		}
	}
	@page {
		margin: 1.5cm 1.1cm;
	}

/******************************************************************************/
/*                                    Legacy                                  */
/******************************************************************************/

	/* This rule is inherited from past style sheets. No idea what it's for. */
	.hide { display: none }



/******************************************************************************/
/*                             Overflow Control                               */
/******************************************************************************/

	.figure .caption, .sidefigure .caption, figcaption {
		/* in case figure is overlarge, limit caption to 50em */
		max-width: 50rem;
		margin-left: auto;
		margin-right: auto;
	}
	.overlarge > table {
		/* limit preferred width of table */
		max-width: 50em;
		margin-left: auto;
		margin-right: auto;
	}

	@media (min-width: 55em) {
		.overlarge {
			margin-left: calc(13px + 26.5rem - 50vw);
			margin-right: calc(13px + 26.5rem - 50vw);
			max-width: none;
		}
	}
	@media screen and (min-width: 78em) {
		body:not(.toc-inline) .overlarge {
			/* 30.5em body padding 50em content area */
			margin-left: calc(40em - 50vw) !important;
			margin-right: calc(40em - 50vw) !important;
		}
	}
	@media screen and (min-width: 90em) {
		body:not(.toc-inline) .overlarge {
			/* 4em html margin 30.5em body padding 50em content area */
			margin-left: 0 !important;
			margin-right: calc(84.5em - 100vw) !important;
		}
	}

	@media not print {
		.overlarge {
			overflow-x: auto;
			/* See Lea Verou's explanation background-attachment:
			 * http://lea.verou.me/2012/04/background-attachment-local/
			 *
			background: top left  / 4em 100% linear-gradient(to right,  #ffffff, rgba(255, 255, 255, 0)) local,
			            top right / 4em 100% linear-gradient(to left, #ffffff, rgba(255, 255, 255, 0)) local,
			            top left  / 1em 100% linear-gradient(to right,  #c3c3c5, rgba(195, 195, 197, 0)) scroll,
			            top right / 1em 100% linear-gradient(to left, #c3c3c5, rgba(195, 195, 197, 0)) scroll,
			            white;
			background-repeat: no-repeat;
			*/
		}
	}
</style>
<style type="text/css">
    /* Default table look: thin black rules, collapsed borders,
       cell content aligned to the top. */
    table, th, td {
      vertical-align: top;
      border-collapse: collapse;
      border: 1px solid black;
    }
    /* Cells drop their left and right borders and get horizontal padding.
       This rule must stay after the one above so that it wins the cascade. */
    th, td {
      padding: 0px 10px;
      border-right: none;
      border-left: none;
    }
    /* Header cells are centred. */
    th {
      text-align: center;
    }
  </style>
  <meta content="Bikeshed version 7d8ce2d953ffaca8e344c0dab76f68fc292738a6" name="generator">
  <link href="https://isocpp.org/favicon.ico" rel="icon">
  <meta content="0119e97cfd8363da68892ae3f65b2eb0d1ba5796" name="document-revision">
<style>
/* Remove the vertical margins of preformatted blocks. */
pre {
  margin-bottom: 0;
  margin-top: 0;
}

/* Inserted text: green on a light-green background, underlined. */
.ins,
ins,
ins *,
span.ins,
span.ins * {
  color: #008800;
  background-color: #c8fac8;
  text-decoration: underline;
}

/* Deleted text: red on a light-red background, struck through in red.
   The text-decoration shorthand must stay ahead of text-decoration-color,
   because the shorthand resets the colour. */
.del,
del,
del *,
span.del,
span.del * {
  color: #ff0000;
  background-color: #fac8c8;
  text-decoration: line-through;
  text-decoration-color: #ff0000;
}

/* Mathematical text: italic serif. */
math,
span.math {
  font-style: italic;
  font-family: serif;
}

/* Unordered lists use an em dash followed by a space as their marker. */
ul { list-style-type: "— "; }

/* Every blockquote restarts the `paragraph` counter. */
blockquote { counter-reset: paragraph; }
/* Numbered paragraphs: each block is indented by 2em so that its generated
   marker can sit in the space to its left. */
div.numbered, div.newnumbered {
  margin-left: 2em;
  margin-top: 1em;
  margin-bottom: 1em;
}
/* The marker is taken out of flow and pulled back by the same 2em.
   `display` replaces the former `display-style`, which is not a CSS property
   and was therefore discarded by browsers as an unknown declaration. */
div.numbered:before, div.newnumbered:before {
  position: absolute;
  margin-left: -2em;
  display: block;
}
/* Existing paragraphs show, and then advance, the `paragraph` counter. */
div.numbered:before {
  content: counter(paragraph);
  counter-increment: paragraph;
}
/* New paragraphs show a fixed placeholder instead of a number.
   NOTE(review): the placeholder is U+FFFD here; confirm that this is the
   intended glyph and not the residue of a mis-encoded character. */
div.newnumbered:before {
  content: "�";
}
/* Every list inside a numbered paragraph restarts the `list_item` counter. */
div.numbered ul, div.newnumbered ul {
  counter-reset: list_item;
}
div.numbered li, div.newnumbered li {
  margin-left: 3em;
}
/* List-item markers are positioned like the paragraph markers above; the
   same `display-style` -> `display` correction applies. */
div.numbered li:before, div.newnumbered li:before {
  position: absolute;
  margin-left: -4.8em;
  display: block;
}
/* Items render as "(paragraph.item)" and advance `list_item`. */
div.numbered li:before {
  content: "(" counter(paragraph) "." counter(list_item) ")";
  counter-increment: list_item;
}
/* Items of new paragraphs use the placeholder in place of the paragraph
   number. */
div.newnumbered li:before {
  content: "(�." counter(list_item) ")";
  counter-increment: list_item;
}
</style>
<style>/* style-md-lists */

/* Workaround kept from the generator: its handling of paragraphs and lists
   does not yet follow the CommonMark spec, so the outer margins of the first
   and last child of markdown-derived content are trimmed instead. */
[data-md] > :first-child { margin-top: 0; }
[data-md] > :last-child { margin-bottom: 0; }</style>
<style>/* style-counters */

/* Document-wide counters behind the generated "Issue N", "Example N" and
   "Figure N" markers. */
body {
    counter-reset: example figure issue;
}

/* Every .issue advances the counter; the marker text is suppressed when the
   element also carries .no-marker. */
.issue {
    counter-increment: issue;
}
.issue:not(.no-marker)::before {
    content: "Issue " counter(issue);
}

/* Examples follow the same pattern as issues. */
.example {
    counter-increment: example;
}
.example:not(.no-marker)::before {
    content: "Example " counter(example);
}
/* Invalid/illegal examples share the example counter but get their own
   label. The trailing space in the string separates the label from the
   number, matching "Issue N" and "Example N"; without it the marker
   rendered as "Invalid ExampleN". */
.invalid.example:not(.no-marker)::before,
.illegal.example:not(.no-marker)::before {
    content: "Invalid Example " counter(example);
}

/* Figure captions are numbered too; the marker ends with a space so that it
   does not run into the caption text. */
figcaption {
    counter-increment: figure;
}
figcaption:not(.no-marker)::before {
    content: "Figure " counter(figure) " ";
}</style>
<style>/* style-syntax-highlighting */

/* Highlighted code gets a tinted background, except for IDL blocks. */
.highlight:not(.idl) {
    background: hsl(24, 20%, 95%);
}
/* Inline highlighted code: slight padding and rounded corners. */
code.highlight {
    padding: 0.1em;
    border-radius: 0.3em;
}
/* Block-level highlighted code: padded, scrollable, square corners. */
pre.highlight,
pre > code.highlight {
    display: block;
    padding: 1em;
    margin: 0.5em 0;
    overflow: auto;
    border-radius: 0;
}

/* Token colours. Each selector matches a `c-` element by the attribute that
   names its token class; the rules are kept in their original order. */
c-[a]  { color: #905; }    /* Keyword.Declaration */
c-[b]  { color: #905; }    /* Keyword.Type */
c-[c]  { color: #708090; } /* Comment */
c-[d]  { color: #708090; } /* Comment.Multiline */
c-[e]  { color: #07a; }    /* Name.Attribute */
c-[f]  { color: #690; }    /* Name.Tag */
c-[g]  { color: #222; }    /* Name.Variable */
c-[k]  { color: #905; }    /* Keyword */
c-[l]  { color: #000; }    /* Literal */
c-[m]  { color: #000; }    /* Literal.Number */
c-[n]  { color: #07a; }    /* Name */
c-[o]  { color: #999; }    /* Operator */
c-[p]  { color: #999; }    /* Punctuation */
c-[s]  { color: #a67f59; } /* Literal.String */
c-[t]  { color: #a67f59; } /* Literal.String.Single */
c-[u]  { color: #a67f59; } /* Literal.String.Double */
c-[cp] { color: #708090; } /* Comment.Preproc */
c-[c1] { color: #708090; } /* Comment.Single */
c-[cs] { color: #708090; } /* Comment.Special */
c-[kc] { color: #905; }    /* Keyword.Constant */
c-[kn] { color: #905; }    /* Keyword.Namespace */
c-[kp] { color: #905; }    /* Keyword.Pseudo */
c-[kr] { color: #905; }    /* Keyword.Reserved */
c-[ld] { color: #000; }    /* Literal.Date */
c-[nc] { color: #07a; }    /* Name.Class */
c-[no] { color: #07a; }    /* Name.Constant */
c-[nd] { color: #07a; }    /* Name.Decorator */
c-[ni] { color: #07a; }    /* Name.Entity */
c-[ne] { color: #07a; }    /* Name.Exception */
c-[nf] { color: #07a; }    /* Name.Function */
c-[nl] { color: #07a; }    /* Name.Label */
c-[nn] { color: #07a; }    /* Name.Namespace */
c-[py] { color: #07a; }    /* Name.Property */
c-[ow] { color: #999; }    /* Operator.Word */
c-[mb] { color: #000; }    /* Literal.Number.Bin */
c-[mf] { color: #000; }    /* Literal.Number.Float */
c-[mh] { color: #000; }    /* Literal.Number.Hex */
c-[mi] { color: #000; }    /* Literal.Number.Integer */
c-[mo] { color: #000; }    /* Literal.Number.Oct */
c-[sb] { color: #a67f59; } /* Literal.String.Backtick */
c-[sc] { color: #a67f59; } /* Literal.String.Char */
c-[sd] { color: #a67f59; } /* Literal.String.Doc */
c-[se] { color: #a67f59; } /* Literal.String.Escape */
c-[sh] { color: #a67f59; } /* Literal.String.Heredoc */
c-[si] { color: #a67f59; } /* Literal.String.Interpol */
c-[sx] { color: #a67f59; } /* Literal.String.Other */
c-[sr] { color: #a67f59; } /* Literal.String.Regex */
c-[ss] { color: #a67f59; } /* Literal.String.Symbol */
c-[vc] { color: #07a; }    /* Name.Variable.Class */
c-[vg] { color: #07a; }    /* Name.Variable.Global */
c-[vi] { color: #07a; }    /* Name.Variable.Instance */
c-[il] { color: #000; }    /* Literal.Number.Integer.Long */
</style>
<style>/* style-selflinks */

/* Elements that can hold a self-link become the positioning context for the
   absolutely positioned anchor below. */
.heading, .issue, .note, .example, li, dt {
    position: relative;
}
/* Default self-link: a borderless, half-transparent anchor placed to the left
   of its container. The negative `left` offset equals the anchor's width. */
a.self-link {
    position: absolute;
    top: 0;
    left: calc(-1 * (3.5rem - 26px));
    width: calc(3.5rem - 26px);
    height: 2em;
    text-align: center;
    border: none;
    transition: opacity .2s;
    opacity: .5;
}
/* Fully opaque while hovered. */
a.self-link:hover {
    opacity: 1;
}
.heading > a.self-link {
    font-size: 83%;
}
/* Self-links of list items are shifted a further 2em to the left. */
li > a.self-link {
    left: calc(-1 * (3.5rem - 26px) - 2em);
}
/* Self-links inside a <dfn> drop the left-gutter placement (top/left: auto)
   and are invisible until the <dfn> is hovered. */
dfn > a.self-link {
    top: auto;
    left: auto;
    opacity: 0;
    width: 1.5em;
    height: 1.5em;
    background: gray;
    color: white;
    font-style: normal;
    transition: opacity .2s, background-color .2s, color .2s;
}
dfn:hover > a.self-link {
    opacity: 1;
}
dfn > a.self-link:hover {
    color: black;
}

/* The anchors have no text of their own; the visible glyph is generated
   content and depends on where the link sits. */
a.self-link::before            { content: "¶"; }
.heading > a.self-link::before { content: "§"; }
dfn > a.self-link::before      { content: "#"; }</style>
<style>/* style-autolinks */

/* The doubled class selectors (.css.css, ...) match the same elements as the
   single class would, with higher specificity. */
.css.css, .property.property, .descriptor.descriptor {
    color: #005a9c;
    font-size: inherit;
    font-family: inherit;
}
/* Wrap CSS snippets, property names and descriptor names in curly single
   quotes using generated content. */
.css::before, .property::before, .descriptor::before {
    content: "‘";
}
.css::after, .property::after, .descriptor::after {
    content: "’";
}
.property, .descriptor {
    /* Don't wrap property and descriptor names */
    white-space: nowrap;
}
.type { /* CSS value <type> */
    font-style: italic;
}
/* No generated quotes around property names inside <pre> blocks. */
pre .property::before, pre .property::after {
    content: "";
}
/* Autolinks of these link types get the same opening/closing quotes. */
[data-link-type="property"]::before,
[data-link-type="propdesc"]::before,
[data-link-type="descriptor"]::before,
[data-link-type="value"]::before,
[data-link-type="function"]::before,
[data-link-type="at-rule"]::before,
[data-link-type="selector"]::before,
[data-link-type="maybe"]::before {
    content: "‘";
}
[data-link-type="property"]::after,
[data-link-type="propdesc"]::after,
[data-link-type="descriptor"]::after,
[data-link-type="value"]::after,
[data-link-type="function"]::after,
[data-link-type="at-rule"]::after,
[data-link-type="selector"]::after,
[data-link-type="maybe"]::after {
    content: "’";
}

/* Productions, and autolinks inside .prod, drop the quotes again. These
   rules come after the quote-adding rules above; keep that order. */
[data-link-type].production::before,
[data-link-type].production::after,
.prod [data-link-type]::before,
.prod [data-link-type]::after {
    content: "";
}

/* Element and element-attribute links are set in a monospace font; element
   links are additionally wrapped in angle brackets. */
[data-link-type=element],
[data-link-type=element-attr] {
    font-family: Menlo, Consolas, "DejaVu Sans Mono", monospace;
    font-size: .9em;
}
[data-link-type=element]::before { content: "<" }
[data-link-type=element]::after  { content: ">" }

/* Bibliography links preserve their whitespace and do not wrap. */
[data-link-type=biblio] {
    white-space: pre;
}</style>
 <body class="h-entry">
  <div class="head">
   <p data-fill-with="logo"></p>
   <h1 class="p-name no-ref" id="title">P1629R0<br>Standard Text Encoding</h1>
   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Published Proposal, <time class="dt-updated" datetime="2019-06-17">2019-06-17</time></span></h2>
   <div data-fill-with="spec-metadata">
    <dl>
     <dt>Author:
     <dd class="editor p-author h-card vcard"><a class="p-name fn u-email email" href="mailto:phdofthehouse@gmail.com">JeanHeyd Meneide</a>
     <dt>Audience:
     <dd>EWG, LEWG
     <dt>Project:
     <dd>ISO/IEC JTC1/SC22/WG21 14882: Programming Language — C++
     <dt>Latest:
     <dd><a href="https://thephd.github.io/vendor/future_cxx/papers/d1629.html">https://thephd.github.io/vendor/future_cxx/papers/d1629.html</a>
    </dl>
   </div>
   <div data-fill-with="warning"></div>
   <hr title="Separator for header">
  </div>
  <div class="p-summary" data-fill-with="abstract">
   <h2 class="no-num no-toc no-ref heading settled" id="abstract"><span class="content">Abstract</span></h2>
   <p>The standard lacks facilities for transliterating and transcoding text from one form into another, leaving a serious barrier to entry for individuals who want to process text in any sensible manner in the Standard Library. This paper explores and proposes a static interface for encoding that can be used and built upon for the creation of higher-level abstractions.</p>
  </div>
  <nav data-fill-with="table-of-contents" id="toc">
   <h2 class="no-num no-toc no-ref" id="contents">Table of Contents</h2>
   <ol class="toc" role="directory">
    <li>
     <a href="#changelog"><span class="secno">1</span> <span class="content">Revision History</span></a>
     <ol class="toc">
      <li><a href="#changelog-r1"><span class="secno">1.1</span> <span class="content">Revision 0 - June 17th, 2019</span></a>
     </ol>
    <li><a href="#motivation"><span class="secno">2</span> <span class="content">Motivation</span></a>
    <li>
     <a href="#design"><span class="secno">3</span> <span class="content">Design</span></a>
     <ol class="toc">
      <li><a href="#design-high-level"><span class="secno">3.1</span> <span class="content">High Level</span></a>
      <li>
       <a href="#design-low-level"><span class="secno">3.2</span> <span class="content">Low-Level</span></a>
       <ol class="toc">
        <li><a href="#design-low-level-error_codes"><span class="secno">3.2.1</span> <span class="content">Error Codes</span></a>
        <li>
         <a href="#design-low-level-results"><span class="secno">3.2.2</span> <span class="content">Result Types</span></a>
         <ol class="toc">
          <li><a href="#design-low-level-results-ranges"><span class="secno">3.2.2.1</span> <span class="content">Input and Output Ranges</span></a>
          <li><a href="#design-low-level-results-reconstructible"><span class="secno">3.2.2.2</span> <span class="content">Implementation Challenge: Ranges are not the Sum of their Parts</span></a>
          <li><a href="#design-low-level-results-error_handler"><span class="secno">3.2.2.3</span> <span class="content">Error Handling: Allow All The Options</span></a>
         </ol>
        <li>
         <a href="#design-low-level-encodings"><span class="secno">3.2.3</span> <span class="content">The Encoding Object</span></a>
         <ol class="toc">
          <li><a href="#design-low-level-encodings-standard"><span class="secno">3.2.3.1</span> <span class="content">Encodings Provided by the Standard</span></a>
          <li><a href="#design-low-level-encodings-variant"><span class="secno">3.2.3.2</span> <span class="content">UTF Encodings: variants?</span></a>
          <li><a href="#design-low-level-encodings-encoding_scheme"><span class="secno">3.2.3.3</span> <span class="content">Encoding Schemes: Byte-Based</span></a>
         </ol>
        <li>
         <a href="#design-low-level-encodings-stateful"><span class="secno">3.2.4</span> <span class="content">Stateful Objects, or Stateful Parameters?</span></a>
         <ol class="toc">
          <li><a href="#design-low-level-encodings-state-synchronizing"><span class="secno">3.2.4.1</span> <span class="content">Self-Synchronizing State</span></a>
          <li><a href="#design-low-level-encodings-state-empty"><span class="secno">3.2.4.2</span> <span class="content">Empty State and learning from Minimal Allocators</span></a>
         </ol>
       </ol>
      <li>
       <a href="#design-speed"><span class="secno">3.3</span> <span class="content">The Need for Speed</span></a>
       <ol class="toc">
        <li><a href="#design-speed-interop"><span class="secno">3.3.1</span> <span class="content">Transcoding Compatibility</span></a>
        <li><a href="#design-speed-eager"><span class="secno">3.3.2</span> <span class="content">Eager, Fast Functions with Customizability</span></a>
       </ol>
     </ol>
    <li><a href="#implementation"><span class="secno">4</span> <span class="content">Implementation</span></a>
    <li><a href="#acknowledgements"><span class="secno">5</span> <span class="content">Acknowledgements</span></a>
    <li>
     <a href="#references"><span class="secno"></span> <span class="content">References</span></a>
     <ol class="toc">
      <li><a href="#informative"><span class="secno"></span> <span class="content">Informative References</span></a>
     </ol>
   </ol>
  </nav>
  <main>
   <h2 class="heading settled" data-level="1" id="changelog"><span class="secno">1. </span><span class="content">Revision History</span><a class="self-link" href="#changelog"></a></h2>
   <h3 class="heading settled" data-level="1.1" id="changelog-r1"><span class="secno">1.1. </span><span class="content">Revision 0 - June 17th, 2019</span><a class="self-link" href="#changelog-r1"></a></h3>
   <ul>
    <li data-md>
     <p>Initial release of exploratory paper.</p>
   </ul>
   <h2 class="heading settled" data-level="2" id="motivation"><span class="secno">2. </span><span class="content">Motivation</span><a class="self-link" href="#motivation"></a></h2>
   <p>It’s 2019 and Unicode is still barely supported in both the C and C++ standards.</p>
   <p>From the POSIX standard requiring a single-byte encoding by default, heavy limitations placed in <code class="highlight"><c- n>codecvt</c-></code> facets in C and C++, and the utter lack of UTF8/16/32 multi-unit conversion functions by the standard, the programming languages that have shaped the face of development in operating systems, embedded devices and mobile applications have pushed forward a world that is incredibly unfriendly to a world of text beyond ASCII English. Developers frequently roll their own solutions, and almost every major codebase -- from Chrome to Firefox, Qt to Copperspice, and more -- all have their own variations of hand-crafted text processing. With no standard implementation in C++ and libraries split between various third party implementations plus ICU, it is increasingly difficult and error-prone to handle what is the basic means of communication between people on the planet using C++.</p>
   <p>This paper aims to explore the design space for both extremely high performing transcoding (encoding and decoding) as well as a flexible one-by-one interface for more careful and meticulous text processing. This proposal arises from industry experience in large codebases and best-practice open source explorations with <a data-link-type="biblio" href="#biblio-libogonek">[libogonek]</a>, <a data-link-type="biblio" href="#biblio-icu">[icu]</a>, <a data-link-type="biblio" href="#biblio-boosttext">[boost.text]</a> and <a data-link-type="biblio" href="#biblio-text_view">[text_view]</a> while also building on the concepts and design choices found in both <a data-link-type="biblio" href="#biblio-range-v3">[range-v3]</a> and provably fast text processing such as Windows’s WideCharToMultiByte interfaces, *nix utility iconv, and more.</p>
   <p>The ultimate goal is to allow an interface that is correct by default but capable of being fast, both through Standard Library implementer efforts and through program specialization-friendly free functions. It will produce an interface for both encoding and decoding.</p>
   <h2 class="heading settled" data-level="3" id="design"><span class="secno">3. </span><span class="content">Design</span><a class="self-link" href="#design"></a></h2>
   <p>The current design has been the culmination of a few years of collaborative and independent research, starting with the earliest papers from Mark Boyall’s <a data-link-type="biblio" href="#biblio-n3574">[n3574]</a>, Tom Honermann’s <a data-link-type="biblio" href="#biblio-p0244r2">[p0244r2]</a>, study of ICU’s interface, and finally the musings, experience and work of R. Martinho Fernandes in <a data-link-type="biblio" href="#biblio-libogonek">[libogonek]</a>. Current and future optimizations are considered to ensure that fast paths are not blocked in the interface proposed for standardization. With <a data-link-type="biblio" href="#biblio-boosttext">[boost.text]</a> hammering down the internally used encoding to be UTF8, Markus Scherer’s participation in SG16 meetings, and Bob Steagall’s <a data-link-type="biblio" href="#biblio-fast-utf8">work in writing a fast UTF8 decoder</a>, this paper absorbs a wealth of knowledge to reach a flexible interface that enables high throughput.</p>
   <p>In reading, implementing, working with and consuming all of these designs, the author of this paper, independent implementers, and several SG16 members have come to the following core tenets:</p>
   <ul>
    <li data-md>
     <p>strong types for code units allow selecting proper default encodings for these interfaces;</p>
    <li data-md>
     <p>iterators and ranges are a huge interface win for working with text, but cannot provide the fastest possible way to encode/decode/transcode text;</p>
    <li data-md>
     <p>and, avoid creating new vocabulary: improve working with original containers and impose well-formedness constraints upon them rather than designing new containers from the ground up.</p>
   </ul>
   <p>Given these tenets, the following interface choices have arisen for this paper. Each section will describe a piece of the interface, its goals, and how it works. We start with the high-level interface, then move on to the low-level encoding interface with its plumbing and core types.</p>
   <h3 class="heading settled" data-level="3.1" id="design-high-level"><span class="secno">3.1. </span><span class="content">High Level</span><a class="self-link" href="#design-high-level"></a></h3>
   <p>Working with the lower level facilities for text processing is not a pretty sight. <a href="#design-low-level">Consider the usage of the low-level facilities laid out below</a>:</p>
<pre class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>utf8</c-> <c- n>encoding</c-><c- p>;</c->

<c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>unicode_code_point</c-> <c- n>array_output</c-><c- p>[</c-><c- mi>41</c-><c- p>]{};</c->
<c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>utf8</c-><c- o>::</c-><c- n>state</c-> <c- n>encoding_state</c-><c- p>{};</c->
<c- n>std</c-><c- o>::</c-><c- n>u8string_view</c-> <c- n>input</c-> <c- o>=</c-> u8<c- s>"𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸."</c-><c- p>;</c->
<c- n>std</c-><c- o>::</c-><c- n>u8string_view</c-> <c- n>working_input</c-> <c- o>=</c-> <c- n>input</c-><c- p>;</c->
<c- n>std</c-><c- o>::</c-><c- n>span</c-> <c- n>working_output</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>span</c-><c- p>(</c-><c- n>array_output</c-><c- p>);</c->
<c- k>for</c-> <c- p>(;;)</c-> <c- p>{</c->
	<c- k>auto</c-> <c- n>result</c-> <c- o>=</c-> <c- n>encoding</c-><c- p>.</c-><c- n>decode</c-><c- p>(</c-><c- n>working_input</c-><c- p>,</c-> <c- n>working_output</c-><c- p>,</c-> 
		<c- n>encoding_state</c-><c- p>,</c-> <c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>default_error_handler</c-><c- p>{});</c->
	<c- k>if</c-> <c- p>(</c-><c- n>result</c-><c- p>.</c-><c- n>error_code</c-> <c- o>!=</c-> <c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encoding_errc</c-><c- o>::</c-><c- n>ok</c-><c- p>)</c-> <c- p>{</c->
		<c- k>break</c-><c- p>;</c->
	<c- p>}</c->
	<c- k>if</c-> <c- p>(</c-><c- n>std</c-><c- o>::</c-><c- n>empty</c-><c- p>(</c-><c- n>result</c-><c- p>.</c-><c- n>input</c-><c- p>))</c-> <c- p>{</c->
		<c- k>break</c-><c- p>;</c->
	<c- p>}</c->
	<c- n>working_input</c->  <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>move</c-><c- p>(</c-><c- n>result</c-><c- p>.</c-><c- n>input</c-><c- p>);</c->
	<c- n>working_output</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>move</c-><c- p>(</c-><c- n>result</c-><c- p>.</c-><c- n>output</c-><c- p>);</c->
<c- p>}</c->
<c- n>assert</c-><c- p>(</c-><c- n>std</c-><c- o>::</c-><c- n>u32string_view</c-><c- p>(</c-><c- n>array_output</c-><c- p>)</c-> <c- o>==</c-> U<c- s>"𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸."</c-><c- p>);</c->
</pre>
   <p>These low-level facilities -- while powerful and customizable -- do not represent what the average user will -- or should -- be wrangling with. Therefore, the higher-level facilities become incredibly pressing to make these interfaces palatable and sustainable for developers in both the short and long term. Consider the same encoding functionality, boiled down to something far easier to use:</p>
<pre class="highlight"><c- n>std</c-><c- o>::</c-><c- n>u32string</c-> <c- n>output</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>decode</c-><c- p>(</c->u8<c- s>"𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸."</c-><c- p>);</c->
<c- n>assert</c-><c- p>(</c-><c- n>output</c-> <c- o>==</c-> U<c- s>"𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸."</c-><c- p>);</c->
</pre>
   <p>This is much simpler and does exactly the same as the above, without all the setup and boilerplate. Of course, taking only the input and giving the output is too <strong>much</strong> of a simplification, so there are a few overloads and variants that will be offered. Namely:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Input</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Output</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Encoding</c-><c- p>,</c-> 
		<c- k>typename</c-> <c- n>State</c-><c- p>,</c-> <c- k>typename</c-> <c- n>ErrorHandler</c-><c- o>></c->
	<c- k>constexpr</c-> <c- b>void</c-> <c- n>decode_into</c-><c- p>(</c-><c- n>Input</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>Output</c-><c- o>&amp;&amp;</c-> <c- n>output</c-><c- p>,</c-> 
		<c- n>Encoding</c-><c- o>&amp;&amp;</c-> <c- n>encoding</c-><c- p>,</c-> <c- n>State</c-><c- o>&amp;</c-> <c- n>state</c-><c- p>,</c-> <c- n>ErrorHandler</c-><c- o>&amp;&amp;</c-> <c- n>error_handler</c-><c- p>);</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Input</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Encoding</c-><c- p>,</c-> 
		<c- k>typename</c-> <c- n>State</c-><c- p>,</c-> <c- k>typename</c-> <c- n>ErrorHandler</c-><c- o>></c->
	<c- k>auto</c-> <c- n>decode</c-><c- p>(</c-><c- n>Input</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>Encoding</c-><c- o>&amp;&amp;</c-> <c- n>encoding</c-><c- p>,</c-> 
		<c- n>State</c-><c- o>&amp;</c-> <c- n>state</c-><c- p>,</c-> <c- n>ErrorHandler</c-><c- o>&amp;&amp;</c-> <c- n>error_handler</c-><c- p>);</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Input</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Encoding</c-><c- p>,</c-> 
		<c- k>typename</c-> <c- n>ErrorHandler</c-><c- o>></c->
	<c- k>auto</c-> <c- n>decode</c-><c- p>(</c-><c- n>Input</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>Encoding</c-><c- o>&amp;&amp;</c-> <c- n>encoding</c-><c- p>,</c-> 
		<c- n>ErrorHandler</c-><c- o>&amp;&amp;</c-> <c- n>error_handler</c-><c- p>);</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Input</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Encoding</c-><c- o>></c->
	<c- k>auto</c-> <c- n>decode</c-><c- p>(</c-><c- n>Input</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>Encoding</c-><c- o>&amp;&amp;</c-> <c- n>encoding</c-><c- p>);</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Input</c-><c- o>></c->
	<c- k>auto</c-> <c- n>decode</c-><c- p>(</c-><c- n>Input</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>);</c->

<c- p>}}</c->
</pre>
   <p>Similarly named functions for encoding (<code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encode</c-></code>) and transcoding (<code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>transcode</c-></code>) will be provided. The point of these functions is eager transformation of the source to the destination. They will also convert all available code points, meaning that they will only stop if the <a href="#design-low-level-results-error_handler">error_handler parameter forces them to stop</a>. On top of eagerly consuming free functions, there need to be views that allow a person to walk some view of storage with a specified encoding. These encoding views will be called <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text_view</c-></code>. Their goal is to provide encoding-agnostic iterators and comparison, as well as some degree of text normalization as a baseline:</p>
<pre class="highlight"><c- n>std</c-><c- o>::</c-><c- n>u8text_view</c-> <c- n>my_utf8_text</c-><c- p>(</c->
	u8<c- s>"தேமதுரத் தமிழோசை உலகமெலாம் பரவும்வகை செய்தல் வேண்டும்."</c->
<c- p>);</c->
<c- n>std</c-><c- o>::</c-><c- n>u16text_view</c-> <c- n>my_utf16_text</c-><c- p>(</c->
	u<c- s>"தேமதுரத் தமிழோசை உலகமெலாம் பரவும்வகை செய்தல் வேண்டும்."</c->
<c- p>);</c->
<c- n>std</c-><c- o>::</c-><c- n>u32text_view</c-> <c- n>my_utf32_text</c-><c- p>(</c->
	U<c- s>"தேமதுரத் தமிழோசை உலகமெலாம் பரவும்வகை செய்தல் வேண்டும்."</c->
<c- p>);</c->
<c- n>assert</c-><c- p>(</c-><c- n>my_utf8_text</c-> <c- o>==</c-> <c- n>my_utf16_text</c-><c- p>);</c->
<c- n>assert</c-><c- p>(</c-><c- n>my_utf16_text</c-> <c- o>==</c-> <c- n>my_utf32_text</c-><c- p>);</c->
<c- n>assert</c-><c- p>(</c-><c- n>my_utf32_text</c-> <c- o>==</c-> <c- n>my_utf8_text</c-><c- p>);</c->
</pre>
   <p>But how do we build these higher-level functions and views? The answer to that question is going to be the primary exploration of this paper: the low-level details of creating the above higher-level functions and views for encoding. Following sufficient progress of encodings, this paper will then address the needs of normalization.</p>
   <h3 class="heading settled" data-level="3.2" id="design-low-level"><span class="secno">3.2. </span><span class="content">Low-Level</span><a class="self-link" href="#design-low-level"></a></h3>
   <p>The high-level interfaces must be built on <em>something</em>: it cannot be magically willed into existence. There is quite a bit of plumbing that goes into the low-level interfaces, most of which will be boilerplate to users but will serve keen use and importance to several library developers and standard library implementers.</p>
   <h4 class="heading settled" data-level="3.2.1" id="design-low-level-error_codes"><span class="secno">3.2.1. </span><span class="content">Error Codes</span><a class="self-link" href="#design-low-level-error_codes"></a></h4>
   <p>There is some boilerplate that needs to be taken care of before we begin building our encoding, decoding, transcoding and similar functionality. First and foremost are the error codes and result types that will go in and out of our encoding functions. The error code enumeration is <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encoding_errc</c-></code>. It lists all the reasons an encoding or decoding operation can fail:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>enum</c-> <c- k>class</c-> <c- nc>encoding_errc</c-> <c- o>:</c-> <c- b>int</c-> <c- p>{</c->
		<c- c1>// just fine</c->
		<c- n>ok</c-> <c- o>=</c-> <c- mh>0x00</c-><c- p>,</c->
		<c- c1>// input contains ill-formed sequences</c->
		<c- n>invalid_sequence</c-> <c- o>=</c-> <c- mh>0x01</c-><c- p>,</c->
		<c- c1>// input contains incomplete sequences</c->
		<c- n>incomplete_sequence</c-> <c- o>=</c-> <c- mh>0x02</c-><c- p>,</c->
		<c- c1>// input contains overlong encoding sequence </c->
		<c- c1>// (e.g. for utf8)</c->
		<c- n>overlong_sequence</c-> <c- o>=</c-> <c- mh>0x03</c-><c- p>,</c->
		<c- c1>// output cannot receive all the completed </c->
		<c- c1>// code units</c->
		<c- n>insufficient_output_space</c-> <c- o>=</c-> <c- mh>0x04</c-><c- p>,</c->
		<c- c1>// sequence can be encoded but resulting </c->
		<c- c1>// code point is invalid (e.g., encodes a lone surrogate)</c->
		<c- n>invalid_output</c-> <c- o>=</c-> <c- mh>0x05</c-><c- p>,</c->
		<c- c1>// leading code unit is wrong</c->
		<c- n>invalid_leading_sequence</c-> <c- o>=</c-> <c- mh>0x06</c-><c- p>,</c->
		<c- c1>// leading code units were correct, trailing </c->
		<c- c1>// code units were wrong</c->
		<c- n>invalid_trailing_sequence</c-> <c- o>=</c-> <c- mh>0x07</c->
	<c- p>};</c->

<c- p>}}</c->
</pre>
   <p>The comments give a few examples of what each one means. The reason 0 is used to signal success is very simple: the next part of the API creates an encoding_error_category class and hooks up the machinery for a <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>error_condition</c-></code>:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c->

	<c- k>template</c-> <c- o>&lt;></c->
	<c- k>struct</c-> <c- nc>is_error_condition_enum</c-><c- o>&lt;</c-> <c- n>text</c-><c- o>::</c-><c- n>encoding_errc</c-> <c- o>></c-> <c- o>:</c-> <c- n>true_type</c-> <c- p>{};</c->

	<c- k>class</c-> <c- nc>encoding_error_category</c-> <c- o>:</c-> <c- k>public</c-> <c- n>error_category</c-> <c- p>{</c->
	<c- k>public</c-><c- o>:</c->
		<c- k>constexpr</c-> <c- n>encoding_error_category</c-><c- p>()</c-> <c- k>noexcept</c-><c- p>;</c->

		<c- k>virtual</c-> <c- k>const</c-> <c- b>char</c-><c- o>*</c-> <c- nf>name</c-><c- p>()</c-> <c- k>const</c-> <c- k>noexcept</c-> <c- k>override</c-><c- p>;</c->
		<c- k>virtual</c-> <c- n>string</c-> <c- nf>message</c-><c- p>(</c-><c- b>int</c-> <c- n>condition</c-><c- p>)</c-> <c- k>const</c-> <c- k>override</c-><c- p>;</c->
	<c- p>};</c->

<c- p>}</c->
</pre>
   <p>This allows the creation of a <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>error_condition</c-></code>, which is used to signal platform-independent error codes.</p>
   <h4 class="heading settled" data-level="3.2.2" id="design-low-level-results"><span class="secno">3.2.2. </span><span class="content">Result Types</span><a class="self-link" href="#design-low-level-results"></a></h4>
   <p>The result types are the glue that helps users of the low-level interface loop through their text properly. They carry updated ranges of both the input and output to indicate how far things have been moved along, on top of an error_code and whether or not the result came from an error being handled:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Input</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Output</c-><c- p>,</c-> <c- k>typename</c-> <c- n>State</c-><c- o>></c->
	<c- k>class</c-> <c- nc>encode_result</c-> <c- p>{</c->
		<c- n>Input</c-> <c- n>input</c-><c- p>;</c->
		<c- n>Output</c-> <c- n>output</c-><c- p>;</c->
		<c- n>State</c-><c- o>&amp;</c-> <c- n>state</c-><c- p>;</c->
		<c- n>encoding_errc</c-> <c- n>error_code</c-><c- p>;</c->
		<c- b>bool</c-> <c- n>handled_error</c-><c- p>;</c->

		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>InRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>OutRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>EncodingState</c-><c- o>></c->
		<c- k>constexpr</c-> <c- n>encode_result</c-><c- p>(</c-><c- n>InRange</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>OutRange</c-><c- o>&amp;&amp;</c-> <c- n>output</c-><c- p>,</c-> 
			<c- n>EncodingState</c-><c- o>&amp;&amp;</c-> <c- n>state</c-><c- p>,</c-> <c- n>encoding_errc</c-> <c- n>error_code</c-> <c- o>=</c-> <c- n>encoding_errc</c-><c- o>::</c-><c- n>ok</c-><c- p>);</c->

		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>InRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>OutRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>EncodingState</c-><c- o>></c->
		<c- k>constexpr</c-> <c- n>encode_result</c-><c- p>(</c-><c- n>InRange</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>OutRange</c-><c- o>&amp;&amp;</c-> <c- n>output</c-><c- p>,</c-> 
			<c- n>EncodingState</c-><c- o>&amp;&amp;</c-> <c- n>state</c-><c- p>,</c-> <c- n>encoding_errc</c-> <c- n>error_code</c-><c- p>,</c-> <c- b>bool</c-> <c- n>handled_error</c-><c- p>);</c->

		<c- k>constexpr</c-> <c- n>std</c-><c- o>::</c-><c- n>error_condition</c-> <c- n>error</c-><c- p>()</c-> <c- k>const</c-><c- p>;</c->
	<c- p>};</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Input</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Output</c-><c- p>,</c-> <c- k>typename</c-> <c- n>State</c-><c- o>></c->
	<c- k>class</c-> <c- nc>decode_result</c-> <c- p>{</c->
		<c- n>Input</c-> <c- n>input</c-><c- p>;</c->
		<c- n>Output</c-> <c- n>output</c-><c- p>;</c->
		<c- n>State</c-><c- o>&amp;</c-> <c- n>state</c-><c- p>;</c->
		<c- n>encoding_errc</c-> <c- n>error_code</c-><c- p>;</c->
		<c- b>bool</c-> <c- n>handled_error</c-><c- p>;</c->

		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>InRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>OutRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>EncodingState</c-><c- o>></c->
		<c- k>constexpr</c-> <c- n>decode_result</c-><c- p>(</c-><c- n>InRange</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>OutRange</c-><c- o>&amp;&amp;</c-> <c- n>output</c-><c- p>,</c-> 
			<c- n>EncodingState</c-><c- o>&amp;&amp;</c-> <c- n>state</c-><c- p>,</c-> <c- n>encoding_errc</c-> <c- n>error_code</c-> <c- o>=</c-> <c- n>encoding_errc</c-><c- o>::</c-><c- n>ok</c-><c- p>);</c->

		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>InRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>OutRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>EncodingState</c-><c- o>></c->
		<c- k>constexpr</c-> <c- n>decode_result</c-><c- p>(</c-><c- n>InRange</c-><c- o>&amp;&amp;</c-> <c- n>input</c-><c- p>,</c-> <c- n>OutRange</c-><c- o>&amp;&amp;</c-> <c- n>output</c-><c- p>,</c-> 
			<c- n>EncodingState</c-><c- o>&amp;&amp;</c-> <c- n>state</c-><c- p>,</c-> <c- n>encoding_errc</c-> <c- n>error_code</c-><c- p>,</c-> <c- b>bool</c-> <c- n>handled_error</c-><c- p>);</c->

		<c- k>constexpr</c-> <c- n>std</c-><c- o>::</c-><c- n>error_condition</c-> <c- n>error</c-><c- p>()</c-> <c- k>const</c-><c- p>;</c->
	<c- p>};</c->

<c- p>}}</c->
</pre>
   <p>There is a lot to unpack here. There are two essentially identical structures: <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encode_result</c-></code> and <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>decode_result</c-></code>. These contain the input range, the output range, a reference to the encoding’s current state, the error code and whether or not the error handler was invoked. The <code class="highlight"><c- b>bool</c-> <c- n>handled_error</c-></code> is important because some error handlers may change the <code class="highlight"><c- n>error_code</c-></code> member to <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encoding_errc</c-><c- o>::</c-><c- n>ok</c-></code>, indicating that things are fine (e.g., a replacement character was successfully inserted into the output stream to replace a bad character).</p>
   <p>Having 2 differently-named types with much the same interface is paramount to allow an <code class="highlight"><c- n>error_handler</c-></code> callable to know how to interpret some errors and whether to try to insert code units into the output stream or code points into the output stream (encoding means code units into output, decoding means code points into the output). If the structures were merged, we would lose this information at compile-time and have to attempt to coerce that information out by examining the <code class="highlight"><c- n>value_type</c-></code> and <code class="highlight"><c- n>reference</c-></code> types of the output range. Unfortunately, that is not foolproof because neither the input range nor the output range needs to dereference to exactly the <code class="highlight"><c- n>Encoding</c-><c- o>::</c-><c- n>code_unit</c-></code> or <code class="highlight"><c- n>Encoding</c-><c- o>::</c-><c- n>code_point</c-></code> types, just things convertible to / from them.</p>
   <p>To start, let’s examine the <code class="highlight"><c- n>input</c-></code> and <code class="highlight"><c- n>output</c-></code> ranges.</p>
   <h5 class="heading settled" data-level="3.2.2.1" id="design-low-level-results-ranges"><span class="secno">3.2.2.1. </span><span class="content">Input and Output Ranges</span><a class="self-link" href="#design-low-level-results-ranges"></a></h5>
   <p>These are essentially the ranges moved forward as much or as little as the encoding needed to for reading from the input, converting, and writing to the output. It also solves the problem of obtaining maximal speed based on checking if the destination is filled or if the input is exhausted: <code class="highlight"><c- n>unbounded_view</c-></code> works well since its comparison sentinel always returns the literal "false" bool on comparison, meaning that any compiler beyond the typical <code class="highlight"><c- o>-</c-><c- n>O0</c-></code> / <code class="highlight"><c- o>/</c-><c- n>Od</c-></code> / etc. levels of optimization will cull those branches of code out.</p>
   <p>The decoding result and encoding result types both return the input and output ranges specified in the structure itself. These represent the changed ranges. Unfortunately, problems arise when one assumes that a range can be reconstructed from its <code class="highlight"><c- n>begin</c-><c- p>(</c-><c- n>rng</c-><c- p>)</c-></code> and <code class="highlight"><c- n>end</c-><c- p>(</c-><c- n>rng</c-><c- p>)</c-></code> iterators.</p>
   <h5 class="heading settled" data-level="3.2.2.2" id="design-low-level-results-reconstructible"><span class="secno">3.2.2.2. </span><span class="content">Implementation Challenge: Ranges are not the Sum of their Parts</span><a class="self-link" href="#design-low-level-results-reconstructible"></a></h5>
   <p>Ranges do not offer a generic way to reconstruct themselves from their bits. If you deconstruct a range with <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>ranges</c-><c- o>::</c-><c- n>begin</c-></code> and <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>ranges</c-><c- o>::</c-><c- n>end</c-></code>, the two resulting iterators cannot be put back together again for all ranges. Even ranges which can conceptually handle this are <a data-link-type="biblio" href="#biblio-range-v3-sentinel-issue">missing constructors which allow for this</a>, and since it is not part of the general interface there is no generic way to do this. However, it would be somewhat silly to lose some of the interface members and properties of the original class if it does indeed contain a way to handle it: thusly, later iterations of this proposal will likely introduce a <code class="highlight"><c- n>ReconstructibleRange</c-></code> concept, which will specify that <code class="highlight"><c- n>Range</c-><c- p>(</c-><c- n>Iterator</c-><c- o>&lt;</c-><c- n>Range</c-><c- o>></c-><c- p>,</c-> <c- n>Sentinel</c-><c- o>&lt;</c-><c- n>Range</c-><c- o>></c-><c- p>)</c-></code> is a valid expression. In cases where it is not, this paper will either check for <code class="highlight"><c- n>Range</c-><c- p>(</c-><c- n>Iterator</c-><c- o>&lt;</c-><c- n>Range</c-><c- o>></c-><c- p>)</c-></code> being a valid expression and otherwise fallback to <code class="highlight"><c- n>sub_view</c-><c- o>&lt;</c-><c- n>Iterator</c-><c- o>&lt;</c-><c- n>Range</c-><c- o>></c-><c- p>,</c-> <c- n>Sentinel</c-><c- o>&lt;</c-><c- n>Range</c-><c- o>>></c-><c- p>(</c-><c- n>Iterator</c-><c- o>&lt;</c-><c- n>Range</c-><c- o>></c-><c- p>,</c-> <c- n>Sentinel</c-><c- o>&lt;</c-><c- n>Range</c-><c- o>></c-><c- p>)</c-></code> as the type that goes into the <code class="highlight"><c- n>decode_result</c-></code>.</p>
   <h5 class="heading settled" data-level="3.2.2.3" id="design-low-level-results-error_handler"><span class="secno">3.2.2.3. </span><span class="content">Error Handling: Allow All The Options</span><a class="self-link" href="#design-low-level-results-error_handler"></a></h5>
   <p>This is a low-level interface. As such, we need a way to accommodate different error handling strategies. There are several ways to report errors used in both the C and C++ standard libraries, from throwing errors, to <code class="highlight"><c- n>error_code</c-></code> out parameters, to integral return values and even complex return structures. Choosing a scheme here is difficult given the large breadth and depth of error handling history in C++, and while the standard library shows a clear bias towards throwing exceptions it would not be prudent to throw all the time: it may exclude hard and soft real-time programming environments wherein these encoding structures will be needed.</p>
   <p>Error reporting will be done through an error handler, which can be any type of callable that matches the desired interface, in order to accommodate multiple error handling strategies. The standard will provide 3 of these error handlers:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>class</c-> <c- nc>replacement_character_handler</c-><c- p>;</c->
	<c- k>class</c-> <c- nc>throw_handler</c-><c- p>;</c->
	<c- k>class</c-> <c- nc>assume_valid_handler</c-><c- p>;</c->

	<c- k>using</c-> <c- n>default_error_handler</c-> <c- o>=</c-> <c- n>replacement_character_handler</c-><c- p>;</c->

<c- p>}}</c->
</pre>
   <p>The interface for an error handler will look as such:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>class</c-> <c- nc>an_error_handler</c-> <c- p>{</c->
		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Encoding</c-><c- p>,</c-> <c- k>typename</c-> <c- n>InputRange</c-><c- p>,</c-> 
		<c- k>typename</c-> <c- n>OutputRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>State</c-><c- o>></c->
		<c- k>constexpr</c-> <c- k>auto</c-> <c- k>operator</c-><c- p>()(</c-><c- k>const</c-> <c- n>Encoding</c-><c- o>&amp;</c-> <c- n>encoding</c-><c- p>,</c-> 
		<c- n>encode_result</c-><c- o>&lt;</c-><c- n>InputRange</c-><c- p>,</c-> <c- n>OutputRange</c-><c- p>,</c-> <c- n>State</c-><c- o>></c-> <c- n>result</c-><c- p>)</c-> <c- k>const</c-> <c- p>{</c->
			<c- d>/* morph result or throw error */</c->
			<c- k>return</c-> <c- n>result</c-><c- p>;</c->
		<c- p>}</c->

		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>Encoding</c-><c- p>,</c-> <c- k>typename</c-> <c- n>InputRange</c-><c- p>,</c-> 
		<c- k>typename</c-> <c- n>OutputRange</c-><c- p>,</c-> <c- k>typename</c-> <c- n>State</c-><c- o>></c->
		<c- k>constexpr</c-> <c- k>auto</c-> <c- k>operator</c-><c- p>()(</c-><c- k>const</c-> <c- n>Encoding</c-><c- o>&amp;</c-> <c- n>encoding</c-><c- p>,</c-> 
		<c- n>decode_result</c-><c- o>&lt;</c-><c- n>InputRange</c-><c- p>,</c-> <c- n>OutputRange</c-><c- p>,</c-> <c- n>State</c-><c- o>></c-> <c- n>result</c-><c- p>)</c-> <c- k>const</c-> <c- p>{</c->
			<c- d>/* morph result or throw error */</c->
			<c- k>return</c-> <c- n>result</c-><c- p>;</c->
		<c- p>}</c->
	<c- p>};</c->

<c- p>}}</c->
</pre>
   <p>The implementation is a value-based one, wherein the <code class="highlight"><c- n>result</c-></code> is taken from the implementation of the <code class="highlight"><c- n>encode</c-></code> or <code class="highlight"><c- n>decode</c-></code> function on the encoding object, which puts together its current progress in the form of the current state of the forward-moved input range, the current state of the forward-moved output range, a reference to the current state, and the type of error encountered according to <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encoding_errc</c-></code>. The error handler is then responsible for performing any modifications it wants to the result, before returning the modified result to be propagated back by the encoding interface.</p>
   <p>There are a few things that can be done in the <code class="highlight"><c- d>/* morph result or throw error */</c-></code> part of that example error handler definition. First and foremost is that someone could look at <code class="highlight"><c- n>result</c-><c- p>.</c-><c- n>error</c-><c- p>()</c-></code> and simply throw a hand-tailored exception. This would bubble out of the function and let the caller decide what to do. Throwing is <strong>explicitly not recommended by default</strong> by prominent vendors and implementers (Mozilla, Apple, the Unicode Consortium, WHATWG, etc.). The recommendation is a good one, because ill-formed text is common and is also the most frequent kind of user input. It is extremely easy to provoke a Denial of Service Attack (DoS Attack) if an application throws an error on malformed input that the application author did not consider.</p>
   <p>The default selection of error handler will be the <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>replacement_character_handler</c-></code>. The <code class="highlight"><c- n>replacement_character_handler</c-></code> will look inside <code class="highlight"><c- n>Encoding</c-></code> to see if the expression <code class="highlight"><c- n>Encoding</c-><c- o>::</c-><c- n>replacement_code_point</c-></code> or <code class="highlight"><c- n>Encoding</c-><c- o>::</c-><c- n>replacement_code_unit</c-></code> is well-formed. If so, the handler will attempt to insert that character into the <code class="highlight"><c- n>output</c-></code> range, and the error code on the result will be corrected to say "everything is fine" (<code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encoding_errc</c-><c- o>::</c-><c- n>ok</c-></code>) before the result is returned from the function. This follows the explicit recommendations of the Unicode Consortium and many, many vendors.</p>
   <p>For performance reasons and flexibility, the error callable must have a way to ensure that the user and implementation can agree on whether or not we invoke Undefined Behavior and assume that the text is valid. <a data-link-type="biblio" href="#biblio-libogonek">[libogonek]</a> made an object of type <code class="highlight"><c- n>assume_valid_t</c-></code>. This paper provides the same here: an error handler of <code class="highlight"><c- n>assume_valid_handler</c-></code> means that the implementation will eliminate all of its checks and subsequent calls to the error handling interface. A trait will be provided to check if an error handler is ignorable: <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>is_ignorable_error_handler_v</c-><c- o>&lt;</c-><c- n>Handler</c-><c- o>></c-></code>. A user can opt into this, but it will not be the default and will require explicit passing of such an error handler to use.</p>
   <h4 class="heading settled" data-level="3.2.3" id="design-low-level-encodings"><span class="secno">3.2.3. </span><span class="content">The Encoding Object</span><a class="self-link" href="#design-low-level-encodings"></a></h4>
   <p>Given our result types and error handlers, we can now define the interface for the encoding object itself. Here is the example encoding:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- c1>// NOTE: exemplary encoding</c->
	<c- c1>// for expository purposes</c->
	<c- c1>// containing all the types</c->
	<c- k>class</c-> <c- nc>example_locale_encoding</c-> <c- p>{</c->
		<c- k>class</c-> <c- nc>__ex_state</c-> <c- p>{</c->
			<c- n>std</c-><c- o>::</c-><c- b>mbstate_t</c-> <c- n>multibyte_state</c-><c- p>;</c->
		<c- p>};</c->
		<c- k>using</c-> <c- n>code_point</c-> <c- o>=</c-> <c- b>char32_t</c-><c- p>;</c->
		<c- k>using</c-> <c- n>code_unit</c-> <c- o>=</c-> <c- b>char</c-><c- p>;</c->
		<c- k>using</c-> <c- n>state</c-> <c- o>=</c-> <c- n>__ex_state</c-><c- p>;</c->
		<c- k>static</c-> <c- k>constexpr</c-> <c- b>size_t</c-> <c- n>max_code_unit_sequence</c-> <c- o>=</c-> <c- n>MB_LEN_MAX</c-><c- p>;</c->
		<c- k>static</c-> <c- k>constexpr</c-> <c- b>size_t</c-> <c- n>max_code_point_sequence</c-> <c- o>=</c-> <c- mi>1</c-><c- p>;</c->

		<c- c1>// optional</c->
		<c- k>using</c-> <c- n>is_encoding_injective</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>false_type</c-><c- p>;</c->
		<c- c1>// optional</c->
		<c- k>using</c-> <c- n>is_decoding_injective</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>true_type</c-><c- p>;</c->
		<c- c1>// optional</c->
		<c- n>code_point</c-> <c- n>replacement_code_point</c-> <c- o>=</c-> <c- mh>0xFFFD</c-><c- p>;</c->
		<c- c1>// optional</c->
		<c- n>code_unit</c-> <c- n>replacement_code_unit</c-> <c- o>=</c-> <c- sc>'?'</c-><c- p>;</c->

		<c- c1>// encodes exactly one full code point sequence</c->
		<c- c1>// into one full code unit sequence</c->
		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>In</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Out</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Handler</c-><c- o>></c->
		<c- n>encode_result</c-><c- o>&lt;</c-><c- n>In</c-><c- p>,</c-> <c- n>Out</c-><c- p>,</c-> <c- n>state</c-><c- o>></c-> <c- n>encode</c-><c- p>(</c->
			<c- n>In</c-><c- o>&amp;&amp;</c-> <c- n>in_range</c-><c- p>,</c-> 
			<c- n>Out</c-><c- o>&amp;&amp;</c-> <c- n>out_range</c-><c- p>,</c->
			<c- n>state</c-><c- o>&amp;</c-> <c- n>current_state</c-><c- p>,</c->
			<c- n>Handler</c-><c- o>&amp;&amp;</c-> <c- n>handler</c->
		<c- p>);</c->

		<c- c1>// decodes exactly one full code unit sequence</c->
		<c- c1>// into one full code point sequence</c->
		<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>In</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Out</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Handler</c-><c- o>></c->
		<c- n>decode_result</c-><c- o>&lt;</c-><c- n>In</c-><c- p>,</c-> <c- n>Out</c-><c- p>,</c-> <c- n>state</c-><c- o>></c-> <c- n>decode</c-><c- p>(</c->
			<c- n>In</c-><c- o>&amp;&amp;</c-> <c- n>in_range</c-><c- p>,</c-> 
			<c- n>Out</c-><c- o>&amp;&amp;</c-> <c- n>out_range</c-><c- p>,</c->
			<c- n>state</c-><c- o>&amp;</c-> <c- n>current_state</c-><c- p>,</c->
			<c- n>Handler</c-><c- o>&amp;&amp;</c-> <c- n>handler</c->
		<c- p>);</c->

		<c- k>static</c-> <c- b>void</c-> <c- nf>reset</c-><c- p>(</c-><c- n>state</c-><c- o>&amp;</c-><c- p>);</c->
	<c- p>};</c->
<c- p>}}</c->
</pre>
   <p>There are many pieces of this encoding object. Some of them fit the purposes explained above. As an overview:</p>
   <ul>
    <li data-md>
     <p>The <code class="highlight"><c- n>code_unit</c-></code> and <code class="highlight"><c- n>code_point</c-></code> type definitions let us know what an Encoding’s inputs and outputs will be from its functions. They also help us tell if 2 encodings can be transcoded to one another by having at least the <code class="highlight"><c- n>code_point</c-></code> type in common.</p>
    <li data-md>
     <p><code class="highlight"><c- n>is_encoding_injective</c-></code> and <code class="highlight"><c- n>is_decoding_injective</c-></code> tell us whether or not the encode or decode operations provide a lossless map from the code_point to code_unit or vice-versa, respectively.</p>
    <li data-md>
     <p><code class="highlight"><c- n>state</c-></code> allows a user to instantiate the type and control its parameters. At the very least, <code class="highlight"><c- n>Encoding</c-><c- o>::</c-><c- n>state</c-></code> must be default-constructible and its associated encoding must have a <code class="highlight"><c- n>reset</c-></code> function which will put the <code class="highlight"><c- n>state</c-></code> in its original, unprocessed internal representation.</p>
    <li data-md>
     <p><code class="highlight"><c- n>max_code_unit_sequence</c-></code> and <code class="highlight"><c- n>max_code_point_sequence</c-></code> represent integral values which inform users of the encoding of the necessary size of a buffer to handle at least one full, encoded sequence of code units and one full, decoded sequence of code points. In most cases, <code class="highlight"><c- n>max_code_point_sequence</c-></code> will be <code class="highlight"><c- mi>1</c-></code>. If the <code class="highlight"><c- n>code_point</c-></code> type names a Unicode type, it will be <code class="highlight"><c- mi>1</c-></code>.</p>
   </ul>
   <h5 class="heading settled" data-level="3.2.3.1" id="design-low-level-encodings-standard"><span class="secno">3.2.3.1. </span><span class="content">Encodings Provided by the Standard</span><a class="self-link" href="#design-low-level-encodings-standard"></a></h5>
   <p>The primary reason for the standard to provide an encoding is to ensure that it produces a way for applications to communicate with one another. As a baseline, the standard should support all the encodings it ships with its string literal types. On top of that, there is an important base-level optimization when working with strictly ASCII text that can be implemented with UTF8, which most library implementers would be interested in shipping. This means that the following encodings will be shipped by the standard library:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>class</c-> <c- nc>ascii</c-><c- p>;</c->
	<c- k>class</c-> <c- nc>utf8</c-><c- p>;</c->
	<c- k>class</c-> <c- nc>utf16</c-><c- p>;</c->
	<c- k>class</c-> <c- nc>utf32</c-><c- p>;</c->
	<c- k>class</c-> <c- nc>narrow_execution</c-><c- p>;</c->
	<c- k>class</c-> <c- nc>wide_execution</c-><c- p>;</c->

<c- p>}}</c->
</pre>
   <p>The first four structures correspond directly to what they name. The last two structures are specific, key encodings for interoperating with locale-dependent narrow execution encoding data as well as locale-dependent wide execution encoding data. It is imperative the standard ships these because only the implementation knows the runtime execution encoding. The case is similar for the wide execution encoding. These 6 are the bare minimum that must be shipped with the standard. <code class="highlight"><c- n>ascii</c-></code> holds a special place here because it is a direct subset of <code class="highlight"><c- n>utf8</c-></code>. If an individual knows their text is purely ASCII ahead of time and they work in UTF8, this information can be used to bit-blast (<code class="highlight"><c- n>memcpy</c-></code>) the data from UTF8 to ASCII.</p>
   <h5 class="heading settled" data-level="3.2.3.2" id="design-low-level-encodings-variant"><span class="secno">3.2.3.2. </span><span class="content">UTF Encodings: variants?</span><a class="self-link" href="#design-low-level-encodings-variant"></a></h5>
   <p>There are many variants of encodings like UTF8 and UTF16. These include <a data-link-type="biblio" href="#biblio-wtf8">[wtf8]</a> and <a data-link-type="biblio" href="#biblio-cesu8">[cesu8]</a> and are useful for internal processing and interoperability with certain systems, like direct interfacing with Java or communication with an Oracle database. However, almost none of these are publicly recommended as interchange formats: both CESU-8 and WTF-8 are documented and used internally for legacy reasons. In some cases, they also represent security vulnerabilities if they are used for interchange on the internet. This makes them less and less desirable to provide via the standard. However, it is worth acknowledging that supporting WTF-8 and CESU-8 as encodings will ease the burden on individuals who need to roll such encodings for their applications.</p>
   <p>More pressingly, there is a wide body of code that operates with <code class="highlight"><c- b>char</c-></code> as the code unit for its UTF8 encodings. This is also subtly wrong, because on a handful of systems <code class="highlight"><c- b>char</c-></code> is not unsigned, but signed. Math and bit characteristics for these types are wrong for the typical operations performed in UTF8 encoders and decoders (and many people -- including Markus Scherer, who spends a lot of time with ICU -- just wish <code class="highlight"><c- b>char</c-></code> was unsigned since it would have saved a lot of time lost to bugs). On one hand, providing variants that allow someone to pick something like the code unit for UTF16 or UTF8 would make it easier to have text types which play nice with the Windows APIs or existing code bases. The interface would look something like this...</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>CharT</c-><c- p>,</c-> <c- b>bool</c-> <c- n>encode_null</c-><c- p>,</c-> <c- b>bool</c-> <c- n>encode_lone_surrogates</c-><c- o>></c->
	<c- k>class</c-> <c- nc>basic_utf8</c-><c- p>;</c->

	<c- k>using</c-> <c- n>utf8</c-> <c- o>=</c-> <c- n>basic_utf8</c-><c- o>&lt;</c-><c- n>char8_t</c-><c- p>,</c-> false<c- p>,</c-> false<c- o>></c-><c- p>;</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- k>typename</c-> <c- n>CharT</c-><c- p>,</c-> <c- b>bool</c-> <c- n>allow_lone_surrogates</c-><c- o>></c->
	<c- k>class</c-> <c- nc>basic_utf16</c-><c- p>;</c->

	<c- k>using</c-> <c- n>utf16</c-> <c- o>=</c-> <c- n>basic_utf16</c-><c- o>&lt;</c-><c- b>char16_t</c-><c- p>,</c-> false<c- o>></c-><c- p>;</c->

<c- p>}}</c->
</pre>
   <p>And externally, libraries and applications could add their own using statements and type definitions for the purposes of internal interoperation:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>my_app</c-> <c- p>{</c->

	<c- k>using</c-> <c- n>compat_utf8</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>basic_utf8</c-><c- o>&lt;</c-><c- b>char</c-><c- o>></c-><c- p>;</c->
	<c- k>using</c-> <c- n>filesystem16</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>basic_utf16</c-><c- o>&lt;</c-><c- b>wchar_t</c-><c- p>,</c-> true<c- o>></c-><c- p>;</c->

<c- p>}</c->
</pre>
   <p>There is clear utility that can be had here. But, this is not going to be looked into too deeply for the first iteration of this proposal.</p>
   <h5 class="heading settled" data-level="3.2.3.3" id="design-low-level-encodings-encoding_scheme"><span class="secno">3.2.3.3. </span><span class="content">Encoding Schemes: Byte-Based</span><a class="self-link" href="#design-low-level-encodings-encoding_scheme"></a></h5>
   <p>Unicode specifies what are called Encoding Schemes for the encodings whose code unit size exceeds a single byte. This is essentially UTF16 and UTF32, of which there is UTF16 Little Endian (UTF16-LE), UTF16 Big Endian (UTF16-BE), UTF32 Little Endian (UTF32-LE), and UTF32 Big Endian (UTF32-BE). Encoding schemes can be generically handled without creating extremely specific encodings by creating an <code class="highlight"><c- n>encoding_scheme</c-><c- o>&lt;</c-><c- p>...</c-><c- o>></c-></code> template. It will look much like so:</p>
<pre class="highlight"><c- k>namespace</c-> <c- n>std</c-> <c- p>{</c-> <c- k>namespace</c-> <c- n>text</c-> <c- p>{</c->

	<c- k>template</c-> <c- o>&lt;</c-><c- n>std</c-><c- o>::</c-><c- n>endian</c-> <c- n>endianness</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Encoding</c-><c- p>,</c-> <c- k>typename</c-> <c- n>Byte</c-> <c- o>=</c-> <c- n>std</c-><c- o>::</c-><c- n>byte</c-><c- o>></c->
	<c- k>class</c-> <c- nc>encoding_scheme</c-><c- p>;</c->

<c- p>}}</c->
</pre>
   <p>This is a transformative encoding type that takes the source (network) endianness and translates it to the native (host) endianness. It has an identical interface to the <code class="highlight"><c- n>Encoding</c-></code> type passed in, with the caveat that the <code class="highlight"><c- n>code_unit</c-></code> member type is the same as <code class="highlight"><c- n>Byte</c-></code>. Really, all it does is call the same <code class="highlight"><c- n>encode</c-></code> or <code class="highlight"><c- n>decode</c-></code> function with small wrappers around the passed-in ranges that take bytes and compose them into the internal <code class="highlight"><c- n>Encoding</c-><c- o>::</c-><c- n>code_unit</c-></code> type or, when writing out, take an <code class="highlight"><c- n>Encoding</c-><c- o>::</c-><c- n>code_unit</c-></code> value and write it out in its byte-based form. A few SG16 members have frequently advocated that the base inputs and outputs for all types matching the <code class="highlight"><c- n>Encoding</c-></code> concept should be byte-based.</p>
   <p>This paper disagrees with that supposition and instead goes the route of providing a wrapping encoding scheme. The benefit here is flexibility and independence from byte ordering at the <code class="highlight"><c- n>Encoding</c-></code> level: the <code class="highlight"><c- n>encoding_scheme</c-></code> becomes the layer at which such a concern is both concentrated and isolated. Now, <em>no</em> encoding needs to duplicate its interface at all, while still retaining strong and separately named types that one can perform additional optimization on. This has also already seen implementation experience in <a data-link-type="biblio" href="#biblio-libogonek">[libogonek]</a>'s <a data-link-type="biblio" href="#biblio-libogonek-encoding_scheme">[libogonek-encoding_scheme]</a> type, with no qualms from users.</p>
   <h4 class="heading settled" data-level="3.2.4" id="design-low-level-encodings-stateful"><span class="secno">3.2.4. </span><span class="content">Stateful Objects, or Stateful Parameters?</span><a class="self-link" href="#design-low-level-encodings-stateful"></a></h4>
   <p>Stateful objects are good for encapsulation, reuse and transportation. They have been proven in many APIs, both C and C++, to provide a good, reentrant API with all relevant details captured on the (sometimes opaque) object itself. After careful evaluation, a stateful parameter rather than a wholly stateful object for the function calls in encoding and decoding types is the better choice for this low-level interface. The main benefits of having the state be passed to the encoding / decoding function calls as a parameter are that it:</p>
   <ul>
    <li data-md>
     <p>maintains that encoding objects can be cheap to construct, copy and move;</p>
    <li data-md>
     <p>improves the general reusability of encoding objects by allowing state to be massaged into certain configurations by users;</p>
    <li data-md>
     <p>and, allows users to set the state in a public way without having to prescribe a specific API for all encoders to do that.</p>
   </ul>
   <p>The reason for keeping encoding types cheap is that they will be constructed, copied, and moved a lot, especially in the face of the ranges that SG16 is going to be putting a lot of work into (<code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text_view</c-><c- o>&lt;</c-><c- n>View</c-><c- p>,</c-> <c- n>Encoding</c-><c- p>,</c-> <c- p>...</c-><c- o>></c-></code>). Ranges require that they can be constructed in (amortized) constant time; this change allows us to shift the construction for what may be potentially expensive state to other places.</p>
   <p>As a poignant example: consider the case of execution encoding character sets today, which often defer to the current locale. Locale is inherently expensive to construct and use: if the standard has to have an encoding that grabs or creates a <code class="highlight"><c- n>codecvt</c-></code> or <code class="highlight"><c- n>locale</c-></code> member, we will immediately lose a large portion of users over the performance drag during construction of higher-level abstractions that rely on the encoding. It is also notable that this is the same <a data-link-type="biblio" href="#biblio-sol2-wstring_convert">mistake std::wstring_convert shipped with</a> and is one of the largest contributing reasons to its lack of use and subsequent deprecation (on top of its poor implementation in several libraries, from the VC++ standard library to libc++).</p>
   <p>In contrast, consider having an explicit parameter. At the cost of making a low-level interface take one more parameter, the state can be paid for once and reused in many separate places, allowing a user to front-load the state’s expenses. It also allows the users to set or get the locale ahead of time and reuse it consistently. It also allows for encoding or decoding operations to be reused or restarted in the cases of interruptible or incomplete streams, such as network reading or I/O buffering. These are potent use cases wherein such a design decision becomes very helpful.</p>
   <h5 class="heading settled" data-level="3.2.4.1" id="design-low-level-encodings-state-synchronizing"><span class="secno">3.2.4.1. </span><span class="content">Self-Synchronizing State</span><a class="self-link" href="#design-low-level-encodings-state-synchronizing"></a></h5>
   <p>A self-synchronizing code is a uniquely decodable source symbol stream whose output provides a direct and unambiguous mapping with the source symbol stream. These require no state to parse given a sequence, because a sequence must be either valid or invalid with no intermediate states of "potentially valid". For example, failing to fully decode any of the Unicode Transformation Formats’ code units into a single code point -- unfinished surrogates or half-delivered byte sequences -- is an error because no sub-sequence can identify another code point. This is the primary usage of stateful encoding and decoding operations: tracking what was last seen -- among other parameters -- for the purposes of disambiguating incoming input.</p>
   <p>If an encoding is self-synchronizing, then at no point is there a need to refer to a "potentially correct but need to see more" state: the input is either wholly correct, or it is not. Therefore, an encoding is considered self-synchronizing <em>by default</em> if its state parameter is empty (i.e. <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>is_empty_v</c-><c- o>&lt;</c-><c- n>state</c-><c- o>></c-></code> is true). Note that the inverse cannot be assumed to be true: if a state object is not empty, it can still be self-synchronizing. The implementation just cannot assume so, and thusly must treat the state parameter by-default as non-self-synchronizing.</p>
   <p>Thusly, the trait <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>is_self_synchronizing</c-><c- o>&lt;</c-><c- n>T</c-><c- o>></c-></code> will give users a way to avoid having to inspect the <code class="highlight"><c- n>state</c-></code> at all. This trait eliminates the need to worry about shift states or other hidden shenanigans in the encoding and decoding operations, simplifying error handling. In the case of a stateful but self-synchronizing state, one must override the trait <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>is_self_synchronizing_v</c-><c- o>&lt;</c-><c- n>T</c-><c- o>></c-></code> to declare their state by-default self-synchronizing.</p>
   <h5 class="heading settled" data-level="3.2.4.2" id="design-low-level-encodings-state-empty"><span class="secno">3.2.4.2. </span><span class="content">Empty State and learning from Minimal Allocators</span><a class="self-link" href="#design-low-level-encodings-state-empty"></a></h5>
   <p>If <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>is_empty_v</c-><c- o>&lt;</c-><c- n>State</c-><c- o>></c-></code> is true, then there is no reason to require that the state is passed to the encoding functions. This is more or less an API "kindness", but so long as the state is an empty object it does not have to be passed to the <code class="highlight"><c- n>encode</c-></code> or <code class="highlight"><c- n>decode</c-></code> functions. This is not going to be proposed at this time, but for API usability it should be looked into later in the life of this proposal (e.g., revision 2).</p>
   <h3 class="heading settled" data-level="3.3" id="design-speed"><span class="secno">3.3. </span><span class="content">The Need for Speed</span><a class="self-link" href="#design-speed"></a></h3>
   <p>Correctness is correctness. So is performance. If these methods and the resulting interface are not fast enough to meet the needs of the programmers, there will be little to no adoption. Thanks to work by Bob Steagall and Zach Laine, we know for a fact that it is incredibly hard -- perhaps even impossible -- to make a range-based or iterator-based interface which will achieve the text processing speeds that will satisfy users. There shall be no room for a lower level abstraction or language here, and the first steps to doing that are recognizing the benefits of eager encoding, decoding and transcoding interfaces.</p>
   <h4 class="heading settled" data-level="3.3.1" id="design-speed-interop"><span class="secno">3.3.1. </span><span class="content">Transcoding Compatibility</span><a class="self-link" href="#design-speed-interop"></a></h4>
   <p>A set of program-overridable traits will be provided to clue implementations in on the ability to trivially relocate/trivially copy data from source to destination with respect to encodings. This is done primarily because of cases where one encoding is a strict superset or subset of another encoding. For example, ASCII encodings are a subset of UTF8 encodings, and in general allow someone to strictly <code class="highlight"><c- n>memcpy</c-></code> the bits from one storage to the other without loss of information. Therefore, there will be a trait that specifies transcoding compatibility named <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>is_bitwise_compatible_encoding_v</c-><c- o>&lt;</c-><c- n>From</c-><c- p>,</c-> <c- n>To</c-><c- o>></c-></code>. This will allow implementations to use <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>copy</c-></code> directly when going from one encoding to another, rather than round-tripping through the common <code class="highlight"><c- n>code_point</c-></code> type and some small intermediate storage.</p>
   <h4 class="heading settled" data-level="3.3.2" id="design-speed-eager"><span class="secno">3.3.2. </span><span class="content">Eager, Fast Functions with Customizability</span><a class="self-link" href="#design-speed-eager"></a></h4>
   <p>Research and implementation experience with <a data-link-type="biblio" href="#biblio-boosttext">[boost.text]</a>, <a data-link-type="biblio" href="#biblio-text_view">[text_view]</a> and others has made it plainly clear that while iterators and ranges can produce an extremely efficient binary, it is still not the fastest code that can be written to compete with hand-written or vectorized text processing routines made specifically for each encoding. Therefore, it is imperative that lazy ranges cannot be the only solution if we want the standard to steadily and nicely supplant the codebase-specific or ad-hoc solutions individuals keep rolling for encoding and decoding operations.</p>
   <p>Considering this is going to be one of the most fundamental text layers that sits between typical text and a lot of the new I/O routines, it is imperative that these conversions and transcodes are not only as fast as possible, but customizable. The user can already customize the encoding by creating their own conforming encoding object, but encodings still do their transformations on a code point-by-code point basis. Therefore, a means of extensibility needs to be chosen for the <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>encode</c-></code>, <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>decode</c-></code> and <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>text</c-><c- o>::</c-><c- n>transcode</c-></code> functions. As this paper is targeting C++23, we hope that Matt Calabrese’s <a data-link-type="biblio" href="#biblio-p1292">[p1292]</a> receives favor in the Evolution Design Groups so that our extension mechanisms are nice. Failing that, a design similar to <code class="highlight"><c- n>std</c-><c- o>::</c-><c- n>ranges</c-></code>'s customization points -- as laid out in <a data-link-type="biblio" href="#biblio-n4381">[n4381]</a> -- might be useful here, albeit this is worrisome considering the amount of templated arguments which we do not want to apply overly-restrictive concepts or constraints to. We can also provide a struct that users can specialize using partial template specialization matching and concepts. These are all non-ideal ways of specializing this interface, so we will wait to pick the method of extension.</p>
   <p>What is not negotiable is that it must be extensible. Users should be able to write fast transcoding functions that the standard picks up for their own encoding types. From GB18030 to other ISO and WHATWG encodings, there will always be a need to extend the fast bulk processing of the standard.</p>
   <h2 class="heading settled" data-level="4" id="implementation"><span class="secno">4. </span><span class="content">Implementation</span><a class="self-link" href="#implementation"></a></h2>
   <p>While the ideas presented in this paper have been explored in various different forms, the ideas have never been succinctly composed into a single distributable library. Therefore, the author of this paper is working on an implementation that synthesizes all of the learning from <a data-link-type="biblio" href="#biblio-icu">[icu]</a>, <a data-link-type="biblio" href="#biblio-boosttext">[boost.text]</a>, <a data-link-type="biblio" href="#biblio-text_view">[text_view]</a> and <a data-link-type="biblio" href="#biblio-libogonek">[libogonek]</a>.</p>
   <p>This paper’s r1 hopes to contain benchmarks, initial implementation and usage experience. This paper’s r2 hopes to contain more benchmarks, refined implementation and additional field and usage experience after a more valuable and viable minimum product is established. The current implementation is being incubated in <a data-link-type="biblio" href="#biblio-phdtext">[phd.text]</a>, but will likely be moved to its own repository soon after the initial implementations of <code class="highlight"><c- n>phd</c-><c- o>::</c-><c- n>text_view</c-></code> and <code class="highlight"><c- n>phd</c-><c- o>::</c-><c- n>text</c-></code> are finished.</p>
   <h2 class="heading settled" data-level="5" id="acknowledgements"><span class="secno">5. </span><span class="content">Acknowledgements</span><a class="self-link" href="#acknowledgements"></a></h2>
   <p>Thanks to R. Martinho Fernandes, whose insightful Unicode quips got me hooked on the problem space many, many years ago and helped me develop my first in-house solution for an encoding container adaptor several years ago. Thanks to Mark Boyall, Xeo, and Eric Tremblay for bouncing around ideas, fixes, and other thoughts many years ago when struggling to compile libogonek on a disastrous Microsoft Visual Studio November 2012 CTP compiler.</p>
   <p>Thanks to Tom Honermann, who had me present at my second SG16 meeting before it was SG16 and help represent and carry his papers, which gave me the drive to help fix the C++ standard for text. Many thanks to Zach Laine, whose tireless implementation efforts have given me much insight and understanding into the complexities of Unicode and whose implementation in Boost.Text made clear the tradeoffs and performance issues. Thanks to Mark Zeren, who helped keep me in SG16 and working on these problems.</p>
   <p>And thank you to those of you who grew tired of an ASCII-only world and supported this effort.</p>
  </main>
<script>
(function() {
  "use strict";
  /* Label markup for the ToC controls: each pairs an arrow glyph marked
     aria-hidden with a visible caption. */
  var collapseSidebarText = '<span aria-hidden="true">←</span> ' +
                            '<span>Collapse Sidebar</span>';
  var expandSidebarText = '<span aria-hidden="true">→</span> ' +
                          '<span>Pop Out Sidebar</span>';
  var tocJumpText = '<span aria-hidden="true">↑</span> ' +
                    '<span>Jump to Table of Contents</span>';

  /* Media query whose match state drives the automatic sidebar toggle. */
  var sidebarMedia = window.matchMedia('screen and (min-width: 78em)');

  /* Change handler: forward the query's current match state to toggleSidebar. */
  function autoToggle(e) {
    toggleSidebar(e.matches);
  }

  /* Subscribe only where this MediaQueryList exposes addListener. */
  if (sidebarMedia.addListener) {
    sidebarMedia.addListener(autoToggle);
  }

  /**
   * Switch the Table of Contents between sidebar and inline presentation.
   *
   * Swaps the 'toc-sidebar' / 'toc-inline' classes on <body>, updates the
   * label of the #toc-toggle control, and scrolls the window by the height
   * of #toc unless the viewport is still above the bottom of the '.head'
   * element.
   *
   * @param {boolean} [on] true for sidebar mode, false for inline mode.
   *   When omitted (undefined or null) the current state is inverted,
   *   based on whether <body> carries the 'toc-sidebar' class.
   */
  function toggleSidebar(on) {
    if (on == undefined) {
      on = !document.body.classList.contains('toc-sidebar');
    }

    /* Don’t scroll to compensate for the ToC if we’re above it already. */
    var headY = 0;
    var head = document.querySelector('.head');
    if (head) {
      // terrible approx of "top of ToC"
      headY += head.offsetTop + head.offsetHeight;
    }
    var skipScroll = window.scrollY < headY;

    var toggle = document.getElementById('toc-toggle');
    var tocNav = document.getElementById('toc');
    if (on) {
      /* Height is read before the classes are switched. */
      var tocHeight = tocNav.offsetHeight;
      document.body.classList.add('toc-sidebar');
      document.body.classList.remove('toc-inline');
      toggle.innerHTML = collapseSidebarText;
      if (!skipScroll) {
        window.scrollBy(0, 0 - tocHeight);
      }
      tocNav.focus();
      /* Feature-detect before subscribing, matching the initial
         registration, so a missing addListener cannot throw after the
         classes have already been switched. */
      if (sidebarMedia.addListener) {
        sidebarMedia.addListener(autoToggle); // auto-collapse when out of room
      }
    }
    else {
      document.body.classList.add('toc-inline');
      document.body.classList.remove('toc-sidebar');
      toggle.innerHTML = expandSidebarText;
      if (!skipScroll) {
        window.scrollBy(0, tocNav.offsetHeight);
      }
      if (toggle.matches(':hover')) {
        /* Unfocus button when not using keyboard navigation,
           because I don’t know where else to send the focus. */
        toggle.blur();
      }
    }
  }

  /**
   * Build the ToC navigation controls and insert them into the page.
   *
   * Creates the #toc-toggle link (click toggles the sidebar), makes sure a
   * #toc-nav container exists (created as the first child of <body> when
   * missing), makes sure a #toc-jump link exists inside it, and appends the
   * toggle to that container.
   */
  function createSidebarToggle() {
    /* Create the sidebar toggle in JS; it shouldn’t exist when JS is off. */
    var toggle = document.createElement('a');
      /* This should probably be a button, but appearance isn’t standards-track.*/
    toggle.id = 'toc-toggle';
    /* The class attribute is reflected by className; assigning to a
       property named "class" would not set the attribute at all. */
    toggle.className = 'toc-toggle';
    toggle.href = '#toc';
    toggle.innerHTML = collapseSidebarText;

    /* Feature-detect the listener API, matching the initial registration. */
    if (sidebarMedia.addListener) {
      sidebarMedia.addListener(autoToggle);
    }
    var toggler = function(e) {
      e.preventDefault();
      if (sidebarMedia.removeListener) {
        sidebarMedia.removeListener(autoToggle); // persist explicit off states
      }
      toggleSidebar();
      return false;
    }
    toggle.addEventListener('click', toggler, false);


    /* Get the #toc-nav container, or make one (a <p>) if we don’t have it. */
    var tocNav = document.getElementById('toc-nav');
    if (!tocNav) {
      tocNav = document.createElement('p');
      tocNav.id = 'toc-nav';
      /* Prepend for better keyboard navigation */
      document.body.insertBefore(tocNav, document.body.firstChild);
    }
    /* While we’re at it, make sure we have a Jump to Toc link. */
    var tocJump = document.getElementById('toc-jump');
    if (!tocJump) {
      tocJump = document.createElement('a');
      tocJump.id = 'toc-jump';
      tocJump.href = '#toc';
      tocJump.innerHTML = tocJumpText;
      tocNav.appendChild(tocJump);
    }

    tocNav.appendChild(toggle);
  }

  /* Set up the sidebar controls only when the page has a #toc element. */
  var toc = document.getElementById('toc');
  if (toc) {
    createSidebarToggle();
    toggleSidebar(sidebarMedia.matches);

    /* If the sidebar has been manually opened and is currently overlaying the text
       (window too small for the MQ to add the margin to body),
       then auto-close the sidebar once you click on something in there. */
    toc.addEventListener('click', function(e) {
      if (!document.body.classList.contains('toc-sidebar') || sidebarMedia.matches) {
        return;
      }
      /* The click target may be an element nested inside a link rather than
         the link itself, so walk up from the target towards #toc looking
         for an enclosing <a>. */
      var el = e.target;
      while (el && el !== toc) {
        if (el.tagName.toLowerCase() == "a") {
          toggleSidebar(false);
          return;
        }
        el = el.parentElement;
      }
    }, false);
  }
  else {
    console.warn("Can’t find Table of Contents. Please use <nav id='toc'> around the ToC.");
  }

  /* Give every data/index table whose parent is not already an .overlarge
     element its own <div class="overlarge"> wrapper, in case it overflows. */
  var tables = document.querySelectorAll(':not(.overlarge) > table.data, :not(.overlarge) > table.index');
  Array.prototype.forEach.call(tables, function(table) {
    var wrapper = document.createElement('div');
    wrapper.className = 'overlarge';
    /* Put the wrapper where the table sits, then move the table into it. */
    table.parentNode.insertBefore(wrapper, table);
    wrapper.appendChild(table);
  });

})();
</script>
  <h2 class="no-num no-ref heading settled" id="references"><span class="content">References</span><a class="self-link" href="#references"></a></h2>
  <h3 class="no-num no-ref heading settled" id="informative"><span class="content">Informative References</span><a class="self-link" href="#informative"></a></h3>
  <dl>
   <dt id="biblio-boosttext">[BOOST.TEXT]
   <dd>Zach Laine. <a href="https://github.com/tzlaine/text">Boost.Text</a>. October 20th, 2018. URL: <a href="https://github.com/tzlaine/text">https://github.com/tzlaine/text</a>
   <dt id="biblio-cesu8">[CESU8]
   <dd>Unicode Consortium. <a href="https://www.unicode.org/reports/tr26/">UTR #26, Compatibility Encoding Scheme for UTF-16: 8-Bit (CESU-8)</a>. March 13th, 2019. URL: <a href="https://www.unicode.org/reports/tr26/">https://www.unicode.org/reports/tr26/</a>
   <dt id="biblio-fast-utf8">[FAST-UTF8]
   <dd>Bob Steagall. <a href="https://www.youtube.com/watch?v=5FQ87-Ecb-A">Fast Conversion From UTF-8 with C++, DFAs, and SSE Intrinsics</a>. September 26th, 2019. URL: <a href="https://www.youtube.com/watch?v=5FQ87-Ecb-A">https://www.youtube.com/watch?v=5FQ87-Ecb-A</a>
   <dt id="biblio-icu">[ICU]
   <dd>Unicode Consortium. <a href="http://site.icu-project.org/">International Components for Unicode</a>. April 17th, 2019. URL: <a href="http://site.icu-project.org/">http://site.icu-project.org/</a>
   <dt id="biblio-libogonek">[LIBOGONEK]
   <dd>R. Martinho Fernandes. <a href="https://github.com/libogonek/ogonek">Ogonek</a>. December 9th, 2013. URL: <a href="https://github.com/libogonek/ogonek">https://github.com/libogonek/ogonek</a>
   <dt id="biblio-libogonek-encoding_scheme">[LIBOGONEK-ENCODING_SCHEME]
   <dd>R. Martinho Fernandes. <a href="https://github.com/libogonek/ogonek/blob/devel/include/ogonek/encoding/encoding_scheme.h%2B%2B#L80">encoding_scheme</a>. December 9th, 2013. URL: <a href="https://github.com/libogonek/ogonek/blob/devel/include/ogonek/encoding/encoding_scheme.h%2B%2B#L80">https://github.com/libogonek/ogonek/blob/devel/include/ogonek/encoding/encoding_scheme.h%2B%2B#L80</a>
   <dt id="biblio-n3574">[N3574]
   <dd>Mark Boyall. <a href="https://wg21.link/n3574">Binding stateful functions as function pointers</a>. 10 March 2013. URL: <a href="https://wg21.link/n3574">https://wg21.link/n3574</a>
   <dt id="biblio-n4381">[N4381]
   <dd>Eric Niebler. <a href="https://wg21.link/n4381">Suggested Design for Customization Points</a>. 11 March 2015. URL: <a href="https://wg21.link/n4381">https://wg21.link/n4381</a>
   <dt id="biblio-p0244r2">[P0244R2]
   <dd>Tom Honermann. <a href="https://wg21.link/p0244r2">Text_view: A C++ concepts and range based character encoding and code point enumeration library</a>. 13 June 2017. URL: <a href="https://wg21.link/p0244r2">https://wg21.link/p0244r2</a>
   <dt id="biblio-p1292">[P1292]
   <dd>Matt Calabrese. <a href="https://wg21.link/p1292">Customization Point Functions</a>. October 10th, 2018. URL: <a href="https://wg21.link/p1292">https://wg21.link/p1292</a>
   <dt id="biblio-phdtext">[PHD.TEXT]
   <dd>ThePhD. <a href="https://github.com/ThePhD/phd/tree/master/include/phd/text">phd::text -- encoding and unicode for C++23</a>. June 12th, 2019. URL: <a href="https://github.com/ThePhD/phd/tree/master/include/phd/text">https://github.com/ThePhD/phd/tree/master/include/phd/text</a>
   <dt id="biblio-range-v3">[RANGE-V3]
   <dd>Eric Niebler; Casey Carter. <a href="https://github.com/ericniebler/range-v3">range-v3</a>. June 11th, 2019. URL: <a href="https://github.com/ericniebler/range-v3">https://github.com/ericniebler/range-v3</a>
   <dt id="biblio-range-v3-sentinel-issue">[RANGE-V3-SENTINEL-ISSUE]
   <dd>ThePhD; Eric Niebler. <a href="https://github.com/ericniebler/range-v3/issues/1192">Ranges which take a sentinel should be constructible from {Iterator, Sentinel}</a>. June 11th, 2019. URL: <a href="https://github.com/ericniebler/range-v3/issues/1192">https://github.com/ericniebler/range-v3/issues/1192</a>
   <dt id="biblio-sol2-wstring_convert">[SOL2-WSTRING_CONVERT]
   <dd>ThePhD. <a href="https://github.com/ThePhD/sol2/issues/571">wstring_convert sucks</a>. January 27th, 2018. URL: <a href="https://github.com/ThePhD/sol2/issues/571">https://github.com/ThePhD/sol2/issues/571</a>
   <dt id="biblio-text_view">[TEXT_VIEW]
   <dd>Tom Honermann. <a href="https://github.com/tahonermann/text_view">text_view</a>. November 10th, 2017. URL: <a href="https://github.com/tahonermann/text_view">https://github.com/tahonermann/text_view</a>
   <dt id="biblio-wtf8">[WTF8]
   <dd>Simon Sapin. <a href="https://simonsapin.github.io/wtf-8/">The WTF-8 encoding</a>. September 26th, 2019. URL: <a href="https://simonsapin.github.io/wtf-8/">https://simonsapin.github.io/wtf-8/</a>
  </dl>