Noncharacters are code points that are permanently reserved in the Unicode Standard for internal use. They are not recommended for use in open interchange of Unicode text data. However, they are valid code points and will not cause a string to return as invalid. Still, HTML and XML both impose semantic rules on their use and it may be important for code to know whether they are present in a string. This patch introduces a new function, `wp_has_noncharacters()`, which answers this question. See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612 Developed in https://github.com/WordPress/wordpress-develop/pull/9827 Discussed in https://core.trac.wordpress.org/ticket/63863 See #63863. Built from https://develop.svn.wordpress.org/trunk@61000 git-svn-id: http://core.svn.wordpress.org/trunk@60336 1a063a9b-81f0-0310-95a4-ce76da25c4cd
178 lines
7.1 KiB
PHP
178 lines
7.1 KiB
PHP
<?php
|
||
|
||
if ( extension_loaded( 'mbstring' ) ) :
|
||
/**
 * Reports whether a byte string is a well-formed UTF-8 encoding.
 *
 * A passing result does not prove that the data was authored as UTF-8.
 * It is unlikely for non-UTF-8 data to validate, but it can happen: many
 * texts are at the same time valid UTF-8, valid US-ASCII, and valid
 * ISO-8859-1 (`latin1`).
 *
 * Example:
 *
 *     // Well-formed input.
 *     true === wp_is_valid_utf8( '' );
 *     true === wp_is_valid_utf8( 'just a test' );
 *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" ); // Pencil, U+270F.
 *     true === wp_is_valid_utf8( "\u{270F}" );     // Pencil, U+270F.
 *     true === wp_is_valid_utf8( '✏' );            // Pencil, U+270F.
 *
 *     // Ill-formed input.
 *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
 *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
 *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
 *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
 *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
 *                                                     // E.g. the "ü" in ISO-8859-1 is the single byte 0xFC,
 *                                                     // but in UTF-8 it is the two-byte sequence 0xC3 0xBC.
 *
 * "Valid" here means the string consists of "well-formed UTF-8 code unit
 * sequence[s]": the bytes conform to the UTF-8 encoding scheme, every
 * character uses the minimal byte sequence UTF-8 requires, and no sequence
 * encodes a UTF-16 surrogate code point or a character above the
 * representable range.
 *
 * This variant hands the check to `mb_check_encoding()`.
 *
 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G32860
 *
 * @since 6.9.0
 *
 * @param string $bytes String which might contain text encoded as UTF-8.
 * @return bool Whether the provided bytes can decode as valid UTF-8.
 */
function wp_is_valid_utf8( string $bytes ): bool {
	$is_well_formed = mb_check_encoding( $bytes, 'UTF-8' );

	return $is_well_formed;
}
|
||
else :
|
||
/**
 * Fallback for validating UTF-8 when the `mbstring` extension is not loaded.
 *
 * Hands the input to `_wp_is_valid_utf8_fallback()` and returns its answer
 * unchanged.
 *
 * NOTE(review): the `mbstring`-backed variant of this function names its
 * parameter `$bytes` while this one uses `$string`; callers relying on named
 * arguments would see a difference between the two. Confirm whether the names
 * should be aligned.
 *
 * @ignore
 * @private
 *
 * @since 6.9.0
 *
 * @param string $string String which might contain text encoded as UTF-8.
 * @return bool Result reported by the fallback validator.
 */
function wp_is_valid_utf8( string $string ): bool {
	$is_valid = _wp_is_valid_utf8_fallback( $string );

	return $is_valid;
}
|
||
endif;
|
||
|
||
if (
|
||
extension_loaded( 'mbstring' ) &&
|
||
// Maximal subpart substitution introduced by php/php-src@04e59c916f12b322ac55f22314e31bd0176d01cb.
|
||
version_compare( PHP_VERSION, '8.1.6', '>=' )
|
||
) :
|
||
/**
 * Replaces ill-formed UTF-8 byte sequences with the Unicode Replacement Character.
 *
 * Knowing what to do in the presence of text encoding issues can be complicated.
 * This function replaces invalid spans of bytes to neutralize any corruption that
 * may be there and prevent it from causing further problems downstream.
 *
 * However, it's not always ideal to replace those bytes. In some settings it may
 * be best to leave the invalid bytes in the string so that downstream code can handle
 * them in a specific way. Replacing the bytes too early, like escaping for HTML too
 * early, can introduce other forms of corruption and data loss.
 *
 * When in doubt, use this function to replace spans of invalid bytes.
 *
 * Replacement follows the "maximal subpart" algorithm for secure and interoperable
 * strings. This can lead to sequences of multiple replacement characters in a row.
 *
 * Example:
 *
 *     // Valid strings come through unchanged.
 *     'test' === wp_scrub_utf8( 'test' );
 *
 *     // Invalid sequences of bytes are replaced with U+FFFD.
 *     $invalid = "the byte \xC0 is never allowed in a UTF-8 string.";
 *     "the byte \u{FFFD} is never allowed in a UTF-8 string." === wp_scrub_utf8( $invalid );
 *
 *     // Maximal subparts are replaced individually.
 *     ".\u{FFFD}."                 === wp_scrub_utf8( ".\xC0." );              // C0 is never valid.
 *     ".\u{FFFD}."                 === wp_scrub_utf8( ".\xE2\x8C." );          // Missing A3 at end.
 *     ".\u{FFFD}\u{FFFD}."         === wp_scrub_utf8( ".\xE2\x8C\xE2\x8C." ); // Maximal subparts replaced separately.
 *     ".\u{FFFD}\u{FFFD}."         === wp_scrub_utf8( ".\xC1\xBF." );         // Overlong sequence.
 *     ".\u{FFFD}\u{FFFD}\u{FFFD}." === wp_scrub_utf8( ".\xED\xA0\x80." );    // Surrogate half.
 *
 * Note! The Unicode Replacement Character is itself a Unicode character (U+FFFD).
 * Once a span of invalid bytes has been replaced by one, it will not be possible
 * to know whether the replacement character was originally intended to be there
 * or if it is the result of scrubbing bytes. It is ideal to leave replacement for
 * display only, but some contexts (e.g. generating XML or passing data into a
 * large language model) require valid input strings.
 *
 * While scrubbing, the `mbstring` substitute character is temporarily set to
 * U+FFFD. The previous setting is put back before this function returns, and
 * also when `mb_scrub()` throws, so the change never leaks to later code.
 *
 * @since 6.9.0
 *
 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
 *
 * @param string $text String which is assumed to be UTF-8 but may contain invalid sequences of bytes.
 * @return string Input text with invalid sequences of bytes replaced with the Unicode replacement character.
 */
function wp_scrub_utf8( $text ) {
	$prev_replacement_character = mb_substitute_character();

	/*
	 * While it looks like setting the substitute character could fail,
	 * the internal PHP code will never fail when provided a valid
	 * code point as a number. In this case, there's no need to check
	 * its return value to see if it succeeded.
	 */
	mb_substitute_character( 0xFFFD );

	try {
		return mb_scrub( $text, 'UTF-8' );
	} finally {
		// Runs on both normal return and exception, so the caller's setting is always restored.
		mb_substitute_character( $prev_replacement_character );
	}
}
|
||
else :
|
||
/**
 * Fallback for scrubbing UTF-8 when the `mbstring`-backed variant is not defined.
 *
 * Hands the input to `_wp_scrub_utf8_fallback()` and returns its result
 * unchanged.
 *
 * @ignore
 * @private
 *
 * @since 6.9.0
 *
 * @param string $text String which is assumed to be UTF-8 but may contain invalid sequences of bytes.
 * @return string Result produced by the fallback scrubber.
 */
function wp_scrub_utf8( $text ) {
	$scrubbed = _wp_scrub_utf8_fallback( $text );

	return $scrubbed;
}
|
||
endif;
|
||
|
||
if ( _wp_can_use_pcre_u() ) :
|
||
/**
 * Reports whether a string contains any Unicode noncharacter.
 *
 * Unicode recommends that noncharacters not be used in open exchange of
 * data. XML recommends against using them, and HTML forbids them in
 * attribute names.
 *
 * The noncharacters are the code points in these ranges:
 *  - U+FDD0–U+FDEF
 *  - U+FFFE–U+FFFF
 *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
 *
 * The search is a single `preg_match()` call using the `u` modifier. Only
 * a result of exactly `1` counts as "found"; a non-match and a failed
 * `preg_match()` call are both reported as `false`.
 *
 * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
 * @see https://www.w3.org/TR/xml/#charsets
 * @see https://html.spec.whatwg.org/#attributes-2
 *
 * @since 6.9.0
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	// One character class listing U+FDD0–U+FDEF plus the last two code points of every plane.
	$noncharacter_pattern = '/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u';

	$match_result = preg_match( $noncharacter_pattern, $text );

	return 1 === $match_result;
}
|
||
else :
|
||
/**
 * Fallback for detecting noncharacters when `_wp_can_use_pcre_u()` reports false.
 *
 * Hands the input to `_wp_has_noncharacters_fallback()` and returns its
 * answer unchanged.
 *
 * @ignore
 * @private
 *
 * @since 6.9.0
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Result reported by the fallback detector.
 */
function wp_has_noncharacters( string $text ): bool {
	$found = _wp_has_noncharacters_fallback( $text );

	return $found;
}
|
||
endif;
|