Description: CVE-2015-5622 improve reliability of shortcodes
 There are no shortcode input escaping functions available in core even 
 though the Shortcode API is increasingly strict about not allowing 
 special characters inside shortcode attributes.
Author: pento@wordpress.org
Origin: upstream, https://core.trac.wordpress.org/changeset/33359
Bug: https://core.trac.wordpress.org/ticket/15694
Applied-Upstream: 4.2.3
Reviewed-by: Craig Small <csmall@debian.org>
Last-Update: 2015-10-26
---
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
--- a/wp-includes/class-wp-embed.php
+++ b/wp-includes/class-wp-embed.php
@@ -57,7 +57,7 @@
 		add_shortcode( 'embed', array( $this, 'shortcode' ) );
 
 		// Do the shortcode (only the [embed] one is registered)
-		$content = do_shortcode( $content );
+		$content = do_shortcode( $content, true );
 
 		// Put the original shortcodes back
 		$shortcode_tags = $orig_shortcode_tags;
@@ -243,6 +243,10 @@
 	 * @return string Potentially modified $content.
 	 */
 	function autoembed( $content ) {
+		// Strip newlines from all elements.
+		$content = wp_replace_in_html_tags( $content, array( "\n" => " " ) );
+
+		// Find URLs that are on their own line.
 		return preg_replace_callback( '|^\s*(https?://[^\s"]+)\s*$|im', array( $this, 'autoembed_callback' ), $content );
 	}
 
--- a/wp-includes/formatting.php
+++ b/wp-includes/formatting.php
@@ -233,6 +233,9 @@
 	$pee = preg_replace('!(<' . $allblocks . '[^>]*>)!', "\n$1", $pee);
 	$pee = preg_replace('!(</' . $allblocks . '>)!', "$1\n\n", $pee);
 	$pee = str_replace(array("\r\n", "\r"), "\n", $pee); // cross-platform newlines
+	// Strip newlines from all elements.
+	$pee = wp_replace_in_html_tags( $pee, array( "\n" => " " ) );
+
 	if ( strpos($pee, '<object') !== false ) {
 		$pee = preg_replace('|\s*<param([^>]*)>\s*|', "<param$1>", $pee); // no pee inside object/embed
 		$pee = preg_replace('|\s*</embed>\s*|', '</embed>', $pee);
@@ -267,6 +270,74 @@
 }
 
 /**
+ * Replace characters or phrases within HTML elements only.
+ *
+ * @since 4.2.3
+ *
+ * @param string $haystack The text which has to be formatted.
+ * @param array $replace_pairs In the form array('from' => 'to', ...).
+ * @return string The formatted text.
+ */
+function wp_replace_in_html_tags( $haystack, $replace_pairs ) {
+	// Find all elements.
+	$comments =
+		  '!'           // Start of comment, after the <.
+		. '(?:'         // Unroll the loop: Consume everything until --> is found.
+		.     '-(?!->)' // Dash not followed by end of comment.
+		.     '[^\-]*+' // Consume non-dashes.
+		. ')*+'         // Loop possessively.
+		. '(?:-->)?';   // End of comment. If not found, match all input.
+
+	$regex =
+		  '/('              // Capture the entire match.
+		.     '<'           // Find start of element.
+		.     '(?(?=!--)'   // Is this a comment?
+		.         $comments // Find end of comment.
+		.     '|'
+		.         '[^>]*>?' // Find end of element. If not found, match all input.
+		.     ')'
+		. ')/s';
+
+	$textarr = preg_split( $regex, $haystack, -1, PREG_SPLIT_DELIM_CAPTURE );
+	$changed = false;
+
+	// Optimize when searching for one item.
+	if ( 1 === count( $replace_pairs ) ) {
+		// Extract $needle and $replace.
+		foreach ( $replace_pairs as $needle => $replace );
+
+		// Loop through delimeters (elements) only.
+		for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) {
+			if ( false !== strpos( $textarr[$i], $needle ) ) {
+				$textarr[$i] = str_replace( $needle, $replace, $textarr[$i] );
+				$changed = true;
+			}
+		}
+	} else {
+		// Extract all $needles.
+		$needles = array_keys( $replace_pairs );
+
+		// Loop through delimeters (elements) only.
+		for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) {
+			foreach ( $needles as $needle ) {
+				if ( false !== strpos( $textarr[$i], $needle ) ) {
+					$textarr[$i] = strtr( $textarr[$i], $replace_pairs );
+					$changed = true;
+					// After one strtr() break out of the foreach loop and look at next element.
+					break;
+				}
+			}
+		}
+	}
+
+	if ( $changed ) {
+		$haystack = implode( $textarr );
+	}
+
+	return $haystack;
+}
+
+/**
  * Newline preservation help function for wpautop
  *
  * @since 3.1.0
--- a/wp-includes/kses.php
+++ b/wp-includes/kses.php
@@ -484,6 +484,81 @@
 }
 
 /**
+ * Filters one attribute only and ensures its value is allowed.
+ *
+ * This function has the advantage of being more secure than esc_attr() and can
+ * escape data in some situations where wp_kses() must strip the whole attribute.
+ *
+ * @since 4.2.3
+ *
+ * @param string $string The 'whole' attribute, including name and value.
+ * @param string $element The element name to which the attribute belongs.
+ * @return string Filtered attribute.
+ */
+function wp_kses_one_attr( $string, $element ) {
+	$uris = array('xmlns', 'profile', 'href', 'src', 'cite', 'classid', 'codebase', 'data', 'usemap', 'longdesc', 'action');
+	$allowed_html = wp_kses_allowed_html( 'post' );
+	$allowed_protocols = wp_allowed_protocols();
+	$string = wp_kses_no_null( $string, array( 'slash_zero' => 'keep' ) );
+	$string = wp_kses_js_entities( $string );
+
+	// Preserve leading and trailing whitespace.
+	$matches = array();
+	preg_match('/^\s*/', $string, $matches);
+	$lead = $matches[0];
+	preg_match('/\s*$/', $string, $matches);
+	$trail = $matches[0];
+	if ( empty( $trail ) ) {
+		$string = substr( $string, strlen( $lead ) );
+	} else {
+		$string = substr( $string, strlen( $lead ), -strlen( $trail ) );
+	}
+
+	// Parse attribute name and value from input.
+	$split = preg_split( '/\s*=\s*/', $string, 2 );
+	$name = $split[0];
+	if ( count( $split ) == 2 ) {
+		$value = $split[1];
+
+		// Remove quotes surrounding $value.
+		// Also guarantee correct quoting in $string for this one attribute.
+		if ( '' == $value ) {
+			$quote = '';
+		} else {
+			$quote = $value[0];
+		}
+		if ( '"' == $quote || "'" == $quote ) {
+			if ( substr( $value, -1 ) != $quote ) {
+				return '';
+			}
+			$value = substr( $value, 1, -1 );
+		} else {
+			$quote = '"';
+		}
+
+		// Sanitize quotes, angle braces, and entities.
+		$value = esc_attr( $value );
+
+		// Sanitize URI values.
+		if ( in_array( strtolower( $name ), $uris ) ) {
+			$value = wp_kses_bad_protocol( $value, $allowed_protocols );
+		}
+
+		$string = "$name=$quote$value$quote";
+		$vless = 'n';
+	} else {
+		$value = '';
+		$vless = 'y';
+	}
+
+	// Sanitize attribute by name.
+	wp_kses_attr_check( $name, $value, $string, $vless, $element, $allowed_html );
+
+	// Restore whitespace.
+	return $lead . $string . $trail;
+}
+
+/**
  * Return a list of allowed tags and attributes for a given context.
  *
  * @since 3.5.0
@@ -661,11 +736,10 @@
  * @return string Sanitized HTML element
  */
 function wp_kses_attr($element, $attr, $allowed_html, $allowed_protocols) {
-	# Is there a closing XHTML slash at the end of the attributes?
-
 	if ( ! is_array( $allowed_html ) )
 		$allowed_html = wp_kses_allowed_html( $allowed_html );
 
+	// Is there a closing XHTML slash at the end of the attributes?
 	$xhtml_slash = '';
 	if (preg_match('%\s*/\s*$%', $attr))
 		$xhtml_slash = ' /';
@@ -680,50 +754,63 @@
 	# Go through $attrarr, and save the allowed attributes for this element
 	# in $attr2
 	$attr2 = '';
+	foreach ( $attrarr as $arreach ) {
+		if ( wp_kses_attr_check( $arreach['name'], $arreach['value'], $arreach['whole'], $arreach['vless'], $element, $allowed_html ) ) {
+			$attr2 .= ' '.$arreach['whole'];
+		}
+	}
 
-	$allowed_attr = $allowed_html[strtolower($element)];
-	foreach ($attrarr as $arreach) {
-		if ( ! isset( $allowed_attr[strtolower($arreach['name'])] ) )
-			continue; # the attribute is not allowed
-
-		$current = $allowed_attr[strtolower($arreach['name'])];
-		if ( $current == '' )
-			continue; # the attribute is not allowed
-
-		if ( strtolower( $arreach['name'] ) == 'style' ) {
-			$orig_value = $arreach['value'];
-			$value = safecss_filter_attr( $orig_value );
+	# Remove any "<" or ">" characters
+	$attr2 = preg_replace('/[<>]/', '', $attr2);
 
-			if ( empty( $value ) )
-				continue;
+	return "<$element$attr2$xhtml_slash>";
+}
 
-			$arreach['value'] = $value;
-			$arreach['whole'] = str_replace( $orig_value, $value, $arreach['whole'] );
-		}
+/**
+ * Determine whether an attribute is allowed.
+ *
+ * @since 4.2.3
+ *
+ * @param string $name The attribute name. Returns empty string when not allowed.
+ * @param string $value The attribute value. Returns a filtered value.
+ * @param string $whole The name=value input. Returns filtered input.
+ * @param string $vless 'y' when attribute like "enabled", otherwise 'n'.
+ * @param string $element The name of the element to which this attribute belongs.
+ * @param array $allowed_html The full list of allowed elements and attributes.
+ * @return bool Is the attribute allowed?
+ */
+function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowed_html ) {
+	$allowed_attr = $allowed_html[strtolower( $element )];
 
-		if ( ! is_array($current) ) {
-			$attr2 .= ' '.$arreach['whole'];
-		# there are no checks
+	$name_low = strtolower( $name );
+	if ( ! isset( $allowed_attr[$name_low] ) || '' == $allowed_attr[$name_low] ) {
+		$name = $value = $whole = '';
+		return false;
+	}
 
-		} else {
-			# there are some checks
-			$ok = true;
-			foreach ($current as $currkey => $currval) {
-				if ( ! wp_kses_check_attr_val($arreach['value'], $arreach['vless'], $currkey, $currval) ) {
-					$ok = false;
-					break;
-				}
-			}
+	if ( 'style' == $name_low ) {
+		$new_value = safecss_filter_attr( $value );
 
-			if ( $ok )
-				$attr2 .= ' '.$arreach['whole']; # it passed them
-		} # if !is_array($current)
-	} # foreach
+		if ( empty( $new_value ) ) {
+			$name = $value = $whole = '';
+			return false;
+		}
 
-	# Remove any "<" or ">" characters
-	$attr2 = preg_replace('/[<>]/', '', $attr2);
+		$whole = str_replace( $value, $new_value, $whole );
+		$value = $new_value;
+	}
 
-	return "<$element$attr2$xhtml_slash>";
+	if ( is_array( $allowed_attr[$name_low] ) ) {
+		// there are some checks
+		foreach ( $allowed_attr[$name_low] as $currkey => $currval ) {
+			if ( ! wp_kses_check_attr_val( $value, $vless, $currkey, $currval ) ) {
+				$name = $value = $whole = '';
+				return false;
+			}
+		}
+	}
+
+	return true;
 }
 
 /**
@@ -856,6 +943,109 @@
 }
 
 /**
+ * Finds all attributes of an HTML element.
+ *
+ * Does not modify input.  May return "evil" output.
+ *
+ * Based on wp_kses_split2() and wp_kses_attr()
+ *
+ * @since 4.2.3
+ *
+ * @param string $element HTML element/tag
+ * @return array|bool List of attributes found in $element. Returns false on failure.
+ */
+function wp_kses_attr_parse( $element ) {
+	$valid = preg_match('%^(<\s*)(/\s*)?([a-zA-Z0-9]+\s*)([^>]*)(>?)$%', $element, $matches);
+	if ( 1 !== $valid ) {
+		return false;
+	}
+
+	$begin =  $matches[1];
+	$slash =  $matches[2];
+	$elname = $matches[3];
+	$attr =   $matches[4];
+	$end =    $matches[5];
+
+	if ( '' !== $slash ) {
+		// Closing elements do not get parsed.
+		return false;
+	}
+
+	// Is there a closing XHTML slash at the end of the attributes?
+	if ( 1 === preg_match( '%\s*/\s*$%', $attr, $matches ) ) {
+		$xhtml_slash = $matches[0];
+		$attr = substr( $attr, 0, -strlen( $xhtml_slash ) );
+	} else {
+		$xhtml_slash = '';
+	}
+
+	// Split it
+	$attrarr = wp_kses_hair_parse( $attr );
+	if ( false === $attrarr ) {
+		return false;
+	}
+
+	// Make sure all input is returned by adding front and back matter.
+	array_unshift( $attrarr, $begin . $slash . $elname );
+	array_push( $attrarr, $xhtml_slash . $end );
+
+	return $attrarr;
+}
+
+/**
+ * Builds an attribute list from string containing attributes.
+ *
+ * Does not modify input.  May return "evil" output.
+ * In case of unexpected input, returns false instead of stripping things.
+ *
+ * Based on wp_kses_hair() but does not return a multi-dimensional array.
+ *
+ * @since 4.2.3
+ *
+ * @param string $attr Attribute list from HTML element to closing HTML element tag
+ * @return array|bool List of attributes found in $attr. Returns false on failure.
+ */
+function wp_kses_hair_parse( $attr ) {
+	if ( '' === $attr ) {
+		return array();
+	}
+
+	$regex =
+	  '(?:'
+	.     '[-a-zA-Z:]+'   // Attribute name.
+	. '|'
+	.     '\[\[?[^\[\]]+\]\]?' // Shortcode in the name position implies unfiltered_html.
+	. ')'
+	. '(?:'               // Attribute value.
+	.     '\s*=\s*'       // All values begin with '='
+	.     '(?:'
+	.         '"[^"]*"'   // Double-quoted
+	.     '|'
+	.         "'[^']*'"   // Single-quoted
+	.     '|'
+	.         '[^\s"\']+' // Non-quoted
+	.         '(?:\s|$)'  // Must have a space
+	.     ')'
+	. '|'
+	.     '(?:\s|$)'      // If attribute has no value, space is required.
+	. ')'
+	. '\s*';              // Trailing space is optional except as mentioned above.
+
+	// Although it is possible to reduce this procedure to a single regexp,
+	// we must run that regexp twice to get exactly the expected result.
+
+	$validation = "%^($regex)+$%";
+	$extraction = "%$regex%";
+
+	if ( 1 === preg_match( $validation, $attr ) ) {
+		preg_match_all( $extraction, $attr, $attrarr );
+		return $attrarr[0];
+	} else {
+		return false;
+	}
+}
+
+/**
  * Performs different checks for attribute values.
  *
  * The currently implemented checks are "maxlen", "minlen", "maxval", "minval"
--- a/wp-includes/shortcodes.php
+++ b/wp-includes/shortcodes.php
@@ -176,16 +176,33 @@
  * @uses get_shortcode_regex() Gets the search pattern for searching shortcodes.
  *
  * @param string $content Content to search for shortcodes
+ * @param bool $ignore_html When true, shortcodes inside HTML elements will be skipped.
  * @return string Content with shortcodes filtered out.
  */
-function do_shortcode($content) {
+function do_shortcode( $content, $ignore_html = false ) {
 	global $shortcode_tags;
 
 	if (empty($shortcode_tags) || !is_array($shortcode_tags))
 		return $content;
 
+	$tagnames = array_keys($shortcode_tags);
+	$tagregexp = join( '|', array_map('preg_quote', $tagnames) );
+	$pattern = "/\\[($tagregexp)/s";
+
+	if ( 1 !== preg_match( $pattern, $content ) ) {
+		// Avoids parsing HTML when there are no shortcodes or embeds anyway.
+		return $content;
+	}
+
+	$content = do_shortcodes_in_html_tags( $content, $ignore_html );
+
 	$pattern = get_shortcode_regex();
-	return preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content );
+	$content = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content );
+
+	// Always restore square braces so we don't break things like <!--[if IE ]>
+	$content = unescape_invalid_shortcodes( $content );
+
+	return $content;
 }
 
 /**
@@ -278,6 +295,141 @@
 }
 
 /**
+ * Search only inside HTML elements for shortcodes and process them.
+ *
+ * Any [ or ] characters remaining inside elements will be HTML encoded
+ * to prevent interference with shortcodes that are outside the elements.
+ * Assumes $content processed by KSES already.  Users with unfiltered_html
+ * capability may get unexpected output if angle braces are nested in tags.
+ *
+ * @since 4.2.3
+ *
+ * @param string $content Content to search for shortcodes
+ * @param bool $ignore_html When true, all square braces inside elements will be encoded.
+ * @return string Content with shortcodes filtered out.
+ */
+function do_shortcodes_in_html_tags( $content, $ignore_html ) {
+	// Normalize entities in unfiltered HTML before adding placeholders.
+	$trans = array( '&#91;' => '&#091;', '&#93;' => '&#093;' );
+	$content = strtr( $content, $trans );
+	$trans = array( '[' => '&#91;', ']' => '&#93;' );
+
+	$pattern = get_shortcode_regex();
+
+	$comment_regex =
+		  '!'           // Start of comment, after the <.
+		. '(?:'         // Unroll the loop: Consume everything until --> is found.
+		.     '-(?!->)' // Dash not followed by end of comment.
+		.     '[^\-]*+' // Consume non-dashes.
+		. ')*+'         // Loop possessively.
+		. '(?:-->)?';   // End of comment. If not found, match all input.
+
+	$regex =
+		  '/('                   // Capture the entire match.
+		.     '<'                // Find start of element.
+		.     '(?(?=!--)'        // Is this a comment?
+		.         $comment_regex // Find end of comment.
+		.     '|'
+		.         '[^>]*>?'      // Find end of element. If not found, match all input.
+		.     ')'
+		. ')/s';
+
+	$textarr = preg_split( $regex, $content, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
+
+	foreach ( $textarr as &$element ) {
+		if ( '<' !== $element[0] ) {
+			continue;
+		}
+
+		$noopen = false === strpos( $element, '[' );
+		$noclose = false === strpos( $element, ']' );
+		if ( $noopen || $noclose ) {
+			// This element does not contain shortcodes.
+			if ( $noopen xor $noclose ) {
+				// Need to encode stray [ or ] chars.
+				$element = strtr( $element, $trans );
+			}
+			continue;
+		}
+
+		if ( $ignore_html || '<!--' === substr( $element, 0, 4 ) ) {
+			// Encode all [ and ] chars.
+			$element = strtr( $element, $trans );
+			continue;
+		}
+
+		$attributes = wp_kses_attr_parse( $element );
+		if ( false === $attributes ) {
+			// Looks like we found some crazy unfiltered HTML.  Skipping it for sanity.
+			$element = strtr( $element, $trans );
+			continue;
+		}
+
+		// Get element name
+		$front = array_shift( $attributes );
+		$back = array_pop( $attributes );
+		$matches = array();
+		preg_match('%[a-zA-Z0-9]+%', $front, $matches);
+		$elname = $matches[0];
+
+		// Look for shortcodes in each attribute separately.
+		foreach ( $attributes as &$attr ) {
+			$open = strpos( $attr, '[' );
+			$close = strpos( $attr, ']' );
+			if ( false === $open || false === $close ) {
+				continue; // Go to next attribute.  Square braces will be escaped at end of loop.
+			}
+			$double = strpos( $attr, '"' );
+			$single = strpos( $attr, "'" );
+			if ( ( false === $single || $open < $single ) && ( false === $double || $open < $double ) ) {
+				// $attr like '[shortcode]' or 'name = [shortcode]' implies unfiltered_html.
+				// In this specific situation we assume KSES did not run because the input
+				// was written by an administrator, so we should avoid changing the output
+				// and we do not need to run KSES here.
+				$attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr );
+			} else {
+				// $attr like 'name = "[shortcode]"' or "name = '[shortcode]'"
+				// We do not know if $content was unfiltered. Assume KSES ran before shortcodes.
+				$count = 0;
+				$new_attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr, -1, $count );
+				if ( $count > 0 ) {
+					// Sanitize the shortcode output using KSES.
+					$new_attr = wp_kses_one_attr( $new_attr, $elname );
+					if ( '' !== $new_attr ) {
+						// The shortcode is safe to use now.
+						$attr = $new_attr;
+					}
+				}
+			}
+		}
+		$element = $front . implode( '', $attributes ) . $back;
+
+		// Now encode any remaining [ or ] chars.
+		$element = strtr( $element, $trans );
+	}
+
+	$content = implode( '', $textarr );
+
+	return $content;
+}
+
+/**
+ * Remove placeholders added by do_shortcodes_in_html_tags().
+ *
+ * @since 4.2.3
+ *
+ * @param string $content Content to search for placeholders.
+ * @return string Content with placeholders removed.
+ */
+function unescape_invalid_shortcodes( $content ) {
+        // Clean up entire string, avoids re-parsing HTML.
+        $trans = array( '&#91;' => '[', '&#93;' => ']' );
+        $content = strtr( $content, $trans );
+
+        return $content;
+}
+
+/**
  * Retrieve all attributes from the shortcodes tag.
  *
  * The attributes list has the attribute name as the key and the value of the
@@ -360,9 +512,15 @@
 	if (empty($shortcode_tags) || !is_array($shortcode_tags))
 		return $content;
 
+	$content = do_shortcodes_in_html_tags( $content, true );
+
 	$pattern = get_shortcode_regex();
+	$content = preg_replace_callback( "/$pattern/s", 'strip_shortcode_tag', $content );
+
+	// Always restore square braces so we don't break things like <!--[if IE ]>
+	$content = unescape_invalid_shortcodes( $content );
 
-	return preg_replace_callback( "/$pattern/s", 'strip_shortcode_tag', $content );
+	return $content;
 }
 
 function strip_shortcode_tag( $m ) {
