HTML API: Reliably parse HTML in get_url_in_content()

As part of a larger effort in #63694, this utilizes `WP_HTML_Tag_Processor` instead of regex to parse the string passed into `get_url_in_content`. As a benefit this also decodes the URL whereas the previous code didn’t, so strings like `http&#x3A;//` will be properly decoded as `http://`. Developed in: https://github.com/WordPress/wordpress-develop/pull/9272 Discussed in: https://core.trac.wordpress.org/ticket/63694 Props dmsnell, jonsurrell, nerrad. Fixes #63694. Built from https://develop.svn.wordpress.org/trunk@60665 git-svn-id: http://core.svn.wordpress.org/trunk@60001 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2025-08-26 18:22:32 +00:00
parent 025eab7b5a
commit 6096e1d2e5
2 changed files with 9 additions and 5 deletions
--- a/wp-includes/formatting.php
+++ b/wp-includes/formatting.php
@@ -5978,16 +5978,20 @@ function wp_unslash( $value ) {
 *
 * @since 3.6.0
 *
- * @param string $content A string which might contain a URL.
- * @return string|false The found URL.
+ * @param string $content A string which might contain an `A` element with a non-empty `href` attribute.
+ * @return string|false Database-escaped URL via {@see esc_url()} if found, otherwise `false`.
 */
 function get_url_in_content( $content ) {
 	if ( empty( $content ) ) {
 		return false;
 	}

-	if ( preg_match( '/<a\s[^>]*?href=([\'"])(.+?)\1/is', $content, $matches ) ) {
-		return sanitize_url( $matches[2] );
+	$processor = new WP_HTML_Tag_Processor( $content );
+	while ( $processor->next_tag( 'A' ) ) {
+		$href = $processor->get_attribute( 'href' );
+		if ( is_string( $href ) && ! empty( $href ) ) {
+			return sanitize_url( $href );
+		}
 	}

 	return false;
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -16,7 +16,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '6.9-alpha-60664';
+$wp_version = '6.9-alpha-60665';

 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.