Charset: wp_has_noncharacters() for more-specific Unicode handling.
Noncharacters are code points that are permanently reserved in the Unicode Standard for internal use. They are not recommended for use in open interchange of Unicode text data. However, they are valid code points, and their presence will not cause a string to be reported as invalid. Still, HTML and XML both impose semantic rules on their use, and it may be important for code to know whether they are present in a string.

This patch introduces a new function, `wp_has_noncharacters()`, which answers this question.

See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
Developed in https://github.com/WordPress/wordpress-develop/pull/9827
Discussed in https://core.trac.wordpress.org/ticket/63863

See #63863.
Built from https://develop.svn.wordpress.org/trunk@61000

git-svn-id: http://core.svn.wordpress.org/trunk@60336 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
@@ -35,19 +35,21 @@
|
||||
* @since 6.9.0
|
||||
* @access private
|
||||
*
|
||||
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
|
||||
* @param int $at Where to start scanning.
|
||||
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
|
||||
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
|
||||
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
|
||||
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
|
||||
* @param int $at Where to start scanning.
|
||||
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
|
||||
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
|
||||
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
|
||||
* @param bool $has_noncharacters Set to indicate if scanned string contained noncharacters.
|
||||
* @return int How many code points were successfully scanned.
|
||||
*/
|
||||
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
|
||||
$byte_length = strlen( $bytes );
|
||||
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
|
||||
$invalid_length = 0;
|
||||
$count = 0;
|
||||
$max_count = $max_code_points ?? PHP_INT_MAX;
|
||||
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
|
||||
$byte_length = strlen( $bytes );
|
||||
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
|
||||
$invalid_length = 0;
|
||||
$count = 0;
|
||||
$max_count = $max_code_points ?? PHP_INT_MAX;
|
||||
$has_noncharacters = false;
|
||||
|
||||
for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
|
||||
/*
|
||||
@@ -145,6 +147,15 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
|
||||
) {
|
||||
++$count;
|
||||
$i += 2;
|
||||
|
||||
// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
|
||||
if ( 0xEF === $b1 ) {
|
||||
$has_noncharacters |= (
|
||||
( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
|
||||
( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
|
||||
);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -162,6 +173,14 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
|
||||
) {
|
||||
++$count;
|
||||
$i += 3;
|
||||
|
||||
// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
|
||||
$has_noncharacters |= (
|
||||
( 0x0F === ( $b2 & 0x0F ) ) &&
|
||||
0xBF === $b3 &&
|
||||
( 0xBE === $b4 || 0xBF === $b4 )
|
||||
);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -380,6 +399,31 @@ function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_
|
||||
return $byte_offset - $was_at;
|
||||
}
|
||||
|
||||
/**
 * Fallback support for determining if a string contains Unicode noncharacters.
 *
 * Walks the string with `_wp_scan_utf8()`, stepping over every span of bytes
 * the scanner reports as invalid, and stops as soon as one scan reports that
 * it saw a noncharacter.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \wp_has_noncharacters()
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function _wp_has_noncharacters_fallback( string $text ): bool {
	$at             = 0;
	$invalid_length = 0;
	$end            = strlen( $text );

	while ( $at < $end ) {
		$found = false;

		// `$at` is passed by reference; the scanner is expected to leave it where scanning stopped.
		_wp_scan_utf8( $text, $at, $invalid_length, null, null, $found );

		/*
		 * The scanner accumulates its flag with `|=`, which produces an
		 * integer (0 or 1) rather than a boolean. Test it for truthiness
		 * here and return a real boolean instead of handing that integer
		 * to the `bool` return type and relying on implicit coercion.
		 */
		if ( $found ) {
			return true;
		}

		// Step over the invalid bytes, if any, so the next scan resumes after them.
		$at += $invalid_length;
	}

	return false;
}
|
||||
|
||||
/**
|
||||
* Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
|
||||
* with the deprecated function from the PHP standard library.
|
||||
|
||||
@@ -133,3 +133,45 @@ else :
|
||||
return _wp_scrub_utf8_fallback( $text );
|
||||
}
|
||||
endif;
|
||||
|
||||
if ( _wp_can_use_pcre_u() ) :
|
||||
/**
 * Returns whether the given string contains Unicode noncharacters.
 *
 * XML recommends against using noncharacters and HTML forbids their
 * use in attribute names. Unicode recommends that they not be used
 * in open exchange of data.
 *
 * Noncharacters are code points within the following ranges:
 *  - U+FDD0–U+FDEF
 *  - U+FFFE–U+FFFF
 *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
 *
 * When the pattern match itself fails, which with the `u` flag includes
 * any input that is not valid UTF-8, detection is handed to the
 * byte-level scanner so that the answer does not depend on which
 * implementation of this function was loaded.
 *
 * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
 * @see https://www.w3.org/TR/xml/#charsets
 * @see https://html.spec.whatwg.org/#attributes-2
 * @see \_wp_has_noncharacters_fallback()
 *
 * @since 6.9.0
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	$match_result = preg_match(
		'/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
		$text
	);

	/*
	 * `preg_match()` returns `false`, not `0`, when it cannot run the
	 * match. Treating that as "nothing found" would hide noncharacters
	 * sitting next to invalid bytes, so defer to the scanner that skips
	 * invalid spans and keeps looking.
	 */
	if ( false === $match_result ) {
		return _wp_has_noncharacters_fallback( $text );
	}

	return 1 === $match_result;
}
|
||||
else :
|
||||
/**
 * Fallback function for detecting noncharacters in a text.
 *
 * Defined in place of the PCRE-based implementation and delegates
 * the whole check to `_wp_has_noncharacters_fallback()`.
 *
 * @ignore
 * @private
 *
 * @since 6.9.0
 *
 * @see \_wp_has_noncharacters_fallback()
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	$contains_noncharacters = _wp_has_noncharacters_fallback( $text );

	return $contains_noncharacters;
}
|
||||
endif;
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
*
|
||||
* @global string $wp_version
|
||||
*/
|
||||
$wp_version = '6.9-alpha-60999';
|
||||
$wp_version = '6.9-alpha-61000';
|
||||
|
||||
/**
|
||||
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.
|
||||
|
||||
Reference in New Issue
Block a user