Commit 5d3497add5 for wordpress.org
commit 5d3497add5339f939ea04544e6f436f7fac0f3b8
Author: dmsnell <dmsnell@git.wordpress.org>
Date: Sat Jan 10 21:47:47 2026 +0000
HTML API: Refactor `wp_kses_hair()` for spec-compliance.
`wp_kses_hair()` is built around an impressive state machine for parsing the span of text between an HTML tag name and the tag’s closing `>` into a structured representation of the attributes. Unfortunately that parsing code doesn’t comply with the HTML Living Standard and is prone to mis-parsing attributes, particularly in the presence of malformed inputs.
This patch replaces the existing state machine with the spec-compliant parsing from the HTML API. With a comprehensive test suite covering attribute parsing, the same reliability the Tag Processor affords will be applied to `wp_kses_hair()`, giving new guarantees not previously available in Core:
- All attribute values are reported fully-normalized, where character references are decoded and then re-encoded in a predictable manner. Only the “big five” syntax characters (“&<>'"”) will remain encoded as character references, and in their named forms.
- All `whole` values are fully normalized and presented either as boolean attributes without a value, or with double-quoted attribute values.
- All attributes and their values will be properly parsed according to how a browser would parse them, bringing agreement between the server and user agents.
Developed in https://github.com/WordPress/wordpress-develop/pull/9248
Discussed in https://core.trac.wordpress.org/ticket/63724
Props adamziel, dmsnell, jonsurrell, jorbin, westonruter.
Fixes #63724.
Built from https://develop.svn.wordpress.org/trunk@61467
git-svn-id: http://core.svn.wordpress.org/trunk@60779 1a063a9b-81f0-0310-95a4-ce76da25c4cd
diff --git a/wp-includes/css/dist/index.php b/wp-includes/css/dist/index.php
index e2d1a3addc..1e931e3a7a 100644
--- a/wp-includes/css/dist/index.php
+++ b/wp-includes/css/dist/index.php
@@ -7,6 +7,11 @@
*/
return array(
+ array(
+ 'handle' => 'wp-list-reusable-blocks',
+ 'path' => 'list-reusable-blocks/style',
+ 'dependencies' => array('wp-components'),
+ ),
array(
'handle' => 'wp-nux',
'path' => 'nux/style',
@@ -18,8 +23,8 @@ return array(
'dependencies' => array('wp-components'),
),
array(
- 'handle' => 'wp-list-reusable-blocks',
- 'path' => 'list-reusable-blocks/style',
+ 'handle' => 'wp-commands',
+ 'path' => 'commands/style',
'dependencies' => array('wp-components'),
),
array(
@@ -28,25 +33,20 @@ return array(
'dependencies' => array('wp-block-editor', 'wp-components'),
),
array(
- 'handle' => 'wp-patterns',
- 'path' => 'patterns/style',
+ 'handle' => 'wp-widgets',
+ 'path' => 'widgets/style',
'dependencies' => array('wp-block-editor', 'wp-components'),
),
array(
- 'handle' => 'wp-commands',
- 'path' => 'commands/style',
- 'dependencies' => array('wp-components'),
+ 'handle' => 'wp-patterns',
+ 'path' => 'patterns/style',
+ 'dependencies' => array('wp-block-editor', 'wp-components'),
),
array(
'handle' => 'wp-components',
'path' => 'components/style',
'dependencies' => array(),
),
- array(
- 'handle' => 'wp-widgets',
- 'path' => 'widgets/style',
- 'dependencies' => array('wp-block-editor', 'wp-components'),
- ),
array(
'handle' => 'wp-format-library',
'path' => 'format-library/style',
@@ -57,26 +57,26 @@ return array(
'path' => 'block-directory/style',
'dependencies' => array('wp-block-editor', 'wp-components', 'wp-editor'),
),
- array(
- 'handle' => 'wp-media-utils',
- 'path' => 'media-utils/style',
- 'dependencies' => array('wp-components'),
- ),
array(
'handle' => 'wp-customize-widgets',
'path' => 'customize-widgets/style',
'dependencies' => array('wp-block-editor', 'wp-block-library', 'wp-components', 'wp-media-utils', 'wp-preferences', 'wp-widgets'),
),
array(
- 'handle' => 'wp-edit-widgets',
- 'path' => 'edit-widgets/style',
- 'dependencies' => array('wp-block-editor', 'wp-block-library', 'wp-components', 'wp-media-utils', 'wp-patterns', 'wp-preferences', 'wp-widgets'),
+ 'handle' => 'wp-media-utils',
+ 'path' => 'media-utils/style',
+ 'dependencies' => array('wp-components'),
),
array(
'handle' => 'wp-edit-post',
'path' => 'edit-post/style',
'dependencies' => array('wp-block-editor', 'wp-block-library', 'wp-commands', 'wp-components', 'wp-editor', 'wp-preferences', 'wp-widgets'),
),
+ array(
+ 'handle' => 'wp-edit-widgets',
+ 'path' => 'edit-widgets/style',
+ 'dependencies' => array('wp-block-editor', 'wp-block-library', 'wp-components', 'wp-media-utils', 'wp-patterns', 'wp-preferences', 'wp-widgets'),
+ ),
array(
'handle' => 'wp-block-library',
'path' => 'block-library/style',
diff --git a/wp-includes/js/dist/script-modules/index.php b/wp-includes/js/dist/script-modules/index.php
index 5147fa73bb..113805a3fc 100644
--- a/wp-includes/js/dist/script-modules/index.php
+++ b/wp-includes/js/dist/script-modules/index.php
@@ -7,26 +7,6 @@
*/
return array(
- array(
- 'id' => '@wordpress/interactivity',
- 'path' => 'interactivity/index',
- 'asset' => 'interactivity/index.min.asset.php',
- ),
- array(
- 'id' => '@wordpress/abilities',
- 'path' => 'abilities/index',
- 'asset' => 'abilities/index.min.asset.php',
- ),
- array(
- 'id' => '@wordpress/latex-to-mathml',
- 'path' => 'latex-to-mathml/index',
- 'asset' => 'latex-to-mathml/index.min.asset.php',
- ),
- array(
- 'id' => '@wordpress/latex-to-mathml/loader',
- 'path' => 'latex-to-mathml/loader',
- 'asset' => 'latex-to-mathml/loader.min.asset.php',
- ),
array(
'id' => '@wordpress/a11y',
'path' => 'a11y/index',
@@ -42,11 +22,31 @@ return array(
'path' => 'interactivity-router/full-page',
'asset' => 'interactivity-router/full-page.min.asset.php',
),
+ array(
+ 'id' => '@wordpress/interactivity',
+ 'path' => 'interactivity/index',
+ 'asset' => 'interactivity/index.min.asset.php',
+ ),
array(
'id' => '@wordpress/core-abilities',
'path' => 'core-abilities/index',
'asset' => 'core-abilities/index.min.asset.php',
),
+ array(
+ 'id' => '@wordpress/latex-to-mathml',
+ 'path' => 'latex-to-mathml/index',
+ 'asset' => 'latex-to-mathml/index.min.asset.php',
+ ),
+ array(
+ 'id' => '@wordpress/latex-to-mathml/loader',
+ 'path' => 'latex-to-mathml/loader',
+ 'asset' => 'latex-to-mathml/loader.min.asset.php',
+ ),
+ array(
+ 'id' => '@wordpress/abilities',
+ 'path' => 'abilities/index',
+ 'asset' => 'abilities/index.min.asset.php',
+ ),
array(
'id' => '@wordpress/route',
'path' => 'route/index',
diff --git a/wp-includes/kses.php b/wp-includes/kses.php
index 1d77491c29..5c3e3d4021 100644
--- a/wp-includes/kses.php
+++ b/wp-includes/kses.php
@@ -1585,160 +1585,72 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
}
/**
- * Builds an attribute list from string containing attributes.
- *
- * This function does a lot of work. It parses an attribute list into an array
- * with attribute data, and tries to do the right thing even if it gets weird
- * input. It will add quotes around attribute values that don't have any quotes
- * or apostrophes around them, to make it easier to produce HTML code that will
- * conform to W3C's HTML specification. It will also remove bad URL protocols
- * from attribute values. It also reduces duplicate attributes by using the
- * attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`).
+ * Given a string of HTML attributes and values, parse into a structured attribute list.
+ *
+ * This function performs a number of transformations while parsing attribute strings:
+ * - It normalizes attribute values and surrounds them with double quotes.
+ * - It normalizes HTML character references inside attribute values.
+ * - It removes “bad” URL protocols from attribute values.
+ *
+ * Otherwise this reads the attributes as if they were part of an HTML tag. It performs
+ * these transformations to lower the risk of mis-parsing down the line and to perform
+ * URL sanitization in line with the rest of the `kses` subsystem. Importantly, it does
+ * not decode the attribute values, meaning that special HTML syntax characters will
+ * be left with character references in the `value` property.
+ *
+ * Example:
+ *
+ * $attrs = wp_kses_hair( 'class="is-wide" inert data-lazy=\'<img>\' =/🐮=/' );
+ * $attrs === array(
+ * 'class' => array( 'name' => 'class', 'value' => 'is-wide', 'whole' => 'class="is-wide"', 'vless' => 'n' ),
+ * 'inert' => array( 'name' => 'inert', 'value' => '', 'whole' => 'inert', 'vless' => 'y' ),
+ * 'data-lazy' => array( 'name' => 'data-lazy', 'value' => '&lt;img&gt;', 'whole' => 'data-lazy="&lt;img&gt;"', 'vless' => 'n' ),
+ * '=' => array( 'name' => '=', 'value' => '', 'whole' => '=', 'vless' => 'y' ),
+ * '🐮' => array( 'name' => '🐮', 'value' => '/', 'whole' => '🐮="/"', 'vless' => 'n' ),
+ * );
*
* @since 1.0.0
+ * @since 7.0.0 Reliably parses HTML via the HTML API.
*
* @param string $attr Attribute list from HTML element to closing HTML element tag.
* @param string[] $allowed_protocols Array of allowed URL protocols.
- * @return array[] Array of attribute information after parsing.
+ * @return array<string, array{name: string, value: string, whole: string, vless: 'y'|'n'}> Array of attribute information after parsing.
*/
function wp_kses_hair( $attr, $allowed_protocols ) {
- $attrarr = array();
- $mode = 0;
- $attrname = '';
- $uris = wp_kses_uri_attributes();
-
- // Loop through the whole attribute list.
-
- while ( strlen( $attr ) !== 0 ) {
- $working = 0; // Was the last operation successful?
-
- switch ( $mode ) {
- case 0:
- if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) {
- $attrname = $match[1];
- $working = 1;
- $mode = 1;
- $attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr );
- }
-
- break;
-
- case 1:
- if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign.
- $working = 1;
- $mode = 2;
- $attr = preg_replace( '/^\s*=\s*/', '', $attr );
- break;
- }
-
- if ( preg_match( '/^\s+/', $attr ) ) { // Valueless.
- $working = 1;
- $mode = 0;
-
- if ( false === array_key_exists( $attrname, $attrarr ) ) {
- $attrarr[ $attrname ] = array(
- 'name' => $attrname,
- 'value' => '',
- 'whole' => $attrname,
- 'vless' => 'y',
- );
- }
-
- $attr = preg_replace( '/^\s+/', '', $attr );
- }
-
- break;
-
- case 2:
- if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) {
- // "value"
- $thisval = $match[1];
- if ( in_array( strtolower( $attrname ), $uris, true ) ) {
- $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
- }
-
- if ( false === array_key_exists( $attrname, $attrarr ) ) {
- $attrarr[ $attrname ] = array(
- 'name' => $attrname,
- 'value' => $thisval,
- 'whole' => "$attrname=\"$thisval\"",
- 'vless' => 'n',
- );
- }
-
- $working = 1;
- $mode = 0;
- $attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr );
- break;
- }
-
- if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) {
- // 'value'
- $thisval = $match[1];
- if ( in_array( strtolower( $attrname ), $uris, true ) ) {
- $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
- }
-
- if ( false === array_key_exists( $attrname, $attrarr ) ) {
- $attrarr[ $attrname ] = array(
- 'name' => $attrname,
- 'value' => $thisval,
- 'whole' => "$attrname='$thisval'",
- 'vless' => 'n',
- );
- }
-
- $working = 1;
- $mode = 0;
- $attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr );
- break;
- }
-
- if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) {
- // value
- $thisval = $match[1];
- if ( in_array( strtolower( $attrname ), $uris, true ) ) {
- $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
- }
-
- if ( false === array_key_exists( $attrname, $attrarr ) ) {
- $attrarr[ $attrname ] = array(
- 'name' => $attrname,
- 'value' => $thisval,
- 'whole' => "$attrname=\"$thisval\"",
- 'vless' => 'n',
- );
- }
-
- // We add quotes to conform to W3C's HTML spec.
- $working = 1;
- $mode = 0;
- $attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr );
- }
-
- break;
- } // End switch.
+ $attributes = array();
+ $uris = wp_kses_uri_attributes();
+
+ $processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
+ $processor->next_token();
+
+ $syntax_characters = array(
+ '&' => '&amp;',
+ '<' => '&lt;',
+ '>' => '&gt;',
+ "'" => '&apos;',
+ '"' => '&quot;',
+ );
- if ( 0 === $working ) { // Not well-formed, remove and try again.
- $attr = wp_kses_html_error( $attr );
- $mode = 0;
+ foreach ( $processor->get_attribute_names_with_prefix( '' ) as $name ) {
+ $value = $processor->get_attribute( $name );
+ $is_bool = true === $value;
+ if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
+ $value = wp_kses_bad_protocol( $value, $allowed_protocols );
}
- } // End while.
- if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) {
- /*
- * Special case, for when the attribute list ends with a valueless
- * attribute like "selected".
- */
- $attrarr[ $attrname ] = array(
- 'name' => $attrname,
- 'value' => '',
- 'whole' => $attrname,
- 'vless' => 'y',
+ // Reconstruct and normalize the attribute value.
+ $recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
+ $whole = $is_bool ? $name : "{$name}=\"{$recoded}\"";
+
+ $attributes[ $name ] = array(
+ 'name' => $name,
+ 'value' => $recoded,
+ 'whole' => $whole,
+ 'vless' => $is_bool ? 'y' : 'n',
);
}
- return $attrarr;
+ return $attributes;
}
/**
diff --git a/wp-includes/version.php b/wp-includes/version.php
index f91c0d7a74..1b8d964a95 100644
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -16,7 +16,7 @@
*
* @global string $wp_version
*/
-$wp_version = '7.0-alpha-61466';
+$wp_version = '7.0-alpha-61467';
/**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.