From 6096e1d2e5108e32a281973bde91cbcdfa216011 Mon Sep 17 00:00:00 2001 From: nerrad Date: Tue, 26 Aug 2025 18:22:32 +0000 Subject: [PATCH] HTML API: Reliably parse HTML in get_url_in_content() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of a larger effort in #63694, this utilizes `WP_HTML_Tag_Processor` instead of regex to parse the string passed into `get_url_in_content`. As a benefit this also decodes the URL whereas the previous code didn’t, so strings like `http&#x3A;//` will be properly decoded as `http://`. Developed in: https://github.com/WordPress/wordpress-develop/pull/9272 Discussed in: https://core.trac.wordpress.org/ticket/63694 Props dmsnell, jonsurrell, nerrad. Fixes #63694. Built from https://develop.svn.wordpress.org/trunk@60665 git-svn-id: http://core.svn.wordpress.org/trunk@60001 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- wp-includes/formatting.php | 12 ++++++++---- wp-includes/version.php | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/wp-includes/formatting.php b/wp-includes/formatting.php index 7f69321a71..72fcb4e554 100644 --- a/wp-includes/formatting.php +++ b/wp-includes/formatting.php @@ -5978,16 +5978,20 @@ function wp_unslash( $value ) { * * @since 3.6.0 * - * @param string $content A string which might contain a URL. - * @return string|false The found URL. + * @param string $content A string which might contain an `A` element with a non-empty `href` attribute. + * @return string|false Database-escaped URL via {@see esc_url()} if found, otherwise `false`. */ function get_url_in_content( $content ) { if ( empty( $content ) ) { return false; } - if ( preg_match( '/]*?href=([\'"])(.+?)\1/is', $content, $matches ) ) { - return sanitize_url( $matches[2] ); + $processor = new WP_HTML_Tag_Processor( $content ); + while ( $processor->next_tag( 'A' ) ) { + $href = $processor->get_attribute( 'href' ); + if ( is_string( $href ) && ! 
empty( $href ) ) { + return sanitize_url( $href ); + } } return false; diff --git a/wp-includes/version.php b/wp-includes/version.php index 7a0ae041ff..332a838908 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.9-alpha-60664'; +$wp_version = '6.9-alpha-60665'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.