PHPXRef 0.7.1 : Unnamed Project : /se3master/var/www/se3/includes/library/HTMLPurifier/Lexer/DirectLex.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  /**
   4   * Our in-house implementation of a parser.
   5   *
   6   * A pure PHP parser, DirectLex has absolutely no dependencies, making
   7   * it a reasonably good default for PHP4.  Written with efficiency in mind,
   8   * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
   9   * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  10   *
  11   * @todo Reread XML spec and document differences.
  12   */
  13  class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  14  {
  15      /**
  16       * @type bool
  17       */
  18      public $tracksLineNumbers = true;
  19  
  20      /**
  21       * Whitespace characters for str(c)spn.
  22       * @type string
  23       */
  24      protected $_whitespace = "\x20\x09\x0D\x0A";
  25  
  26      /**
  27       * Callback function for script CDATA fudge
  28       * @param array $matches, in form of array(opening tag, contents, closing tag)
  29       * @return string
  30       */
  31      protected function scriptCallback($matches)
  32      {
  33          return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  34      }
  35  
  36      /**
  37       * @param String $html
  38       * @param HTMLPurifier_Config $config
  39       * @param HTMLPurifier_Context $context
  40       * @return array|HTMLPurifier_Token[]
  41       */
  42      public function tokenizeHTML($html, $config, $context)
  43      {
  44          // special normalization for script tags without any armor
  45          // our "armor" heurstic is a < sign any number of whitespaces after
  46          // the first script tag
  47          if ($config->get('HTML.Trusted')) {
  48              $html = preg_replace_callback(
  49                  '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  50                  array($this, 'scriptCallback'),
  51                  $html
  52              );
  53          }
  54  
  55          $html = $this->normalize($html, $config, $context);
  56  
  57          $cursor = 0; // our location in the text
  58          $inside_tag = false; // whether or not we're parsing the inside of a tag
  59          $array = array(); // result array
  60  
  61          // This is also treated to mean maintain *column* numbers too
  62          $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
  63  
  64          if ($maintain_line_numbers === null) {
  65              // automatically determine line numbering by checking
  66              // if error collection is on
  67              $maintain_line_numbers = $config->get('Core.CollectErrors');
  68          }
  69  
  70          if ($maintain_line_numbers) {
  71              $current_line = 1;
  72              $current_col = 0;
  73              $length = strlen($html);
  74          } else {
  75              $current_line = false;
  76              $current_col = false;
  77              $length = false;
  78          }
  79          $context->register('CurrentLine', $current_line);
  80          $context->register('CurrentCol', $current_col);
  81          $nl = "\n";
  82          // how often to manually recalculate. This will ALWAYS be right,
  83          // but it's pretty wasteful. Set to 0 to turn off
  84          $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
  85  
  86          $e = false;
  87          if ($config->get('Core.CollectErrors')) {
  88              $e =& $context->get('ErrorCollector');
  89          }
  90  
  91          // for testing synchronization
  92          $loops = 0;
  93  
  94          while (++$loops) {
  95              // $cursor is either at the start of a token, or inside of
  96              // a tag (i.e. there was a < immediately before it), as indicated
  97              // by $inside_tag
  98  
  99              if ($maintain_line_numbers) {
 100                  // $rcursor, however, is always at the start of a token.
 101                  $rcursor = $cursor - (int)$inside_tag;
 102  
 103                  // Column number is cheap, so we calculate it every round.
 104                  // We're interested at the *end* of the newline string, so
 105                  // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
 106                  // from our "rcursor" position.
 107                  $nl_pos = strrpos($html, $nl, $rcursor - $length);
 108                  $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
 109  
 110                  // recalculate lines
 111                  if ($synchronize_interval && // synchronization is on
 112                      $cursor > 0 && // cursor is further than zero
 113                      $loops % $synchronize_interval === 0) { // time to synchronize!
 114                      $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
 115                  }
 116              }
 117  
 118              $position_next_lt = strpos($html, '<', $cursor);
 119              $position_next_gt = strpos($html, '>', $cursor);
 120  
 121              // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 122              // special case to set up context
 123              if ($position_next_lt === $cursor) {
 124                  $inside_tag = true;
 125                  $cursor++;
 126              }
 127  
 128              if (!$inside_tag && $position_next_lt !== false) {
 129                  // We are not inside tag and there still is another tag to parse
 130                  $token = new
 131                  HTMLPurifier_Token_Text(
 132                      $this->parseData(
 133                          substr(
 134                              $html,
 135                              $cursor,
 136                              $position_next_lt - $cursor
 137                          )
 138                      )
 139                  );
 140                  if ($maintain_line_numbers) {
 141                      $token->rawPosition($current_line, $current_col);
 142                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
 143                  }
 144                  $array[] = $token;
 145                  $cursor = $position_next_lt + 1;
 146                  $inside_tag = true;
 147                  continue;
 148              } elseif (!$inside_tag) {
 149                  // We are not inside tag but there are no more tags
 150                  // If we're already at the end, break
 151                  if ($cursor === strlen($html)) {
 152                      break;
 153                  }
 154                  // Create Text of rest of string
 155                  $token = new
 156                  HTMLPurifier_Token_Text(
 157                      $this->parseData(
 158                          substr(
 159                              $html,
 160                              $cursor
 161                          )
 162                      )
 163                  );
 164                  if ($maintain_line_numbers) {
 165                      $token->rawPosition($current_line, $current_col);
 166                  }
 167                  $array[] = $token;
 168                  break;
 169              } elseif ($inside_tag && $position_next_gt !== false) {
 170                  // We are in tag and it is well formed
 171                  // Grab the internals of the tag
 172                  $strlen_segment = $position_next_gt - $cursor;
 173  
 174                  if ($strlen_segment < 1) {
 175                      // there's nothing to process!
 176                      $token = new HTMLPurifier_Token_Text('<');
 177                      $cursor++;
 178                      continue;
 179                  }
 180  
 181                  $segment = substr($html, $cursor, $strlen_segment);
 182  
 183                  if ($segment === false) {
 184                      // somehow, we attempted to access beyond the end of
 185                      // the string, defense-in-depth, reported by Nate Abele
 186                      break;
 187                  }
 188  
 189                  // Check if it's a comment
 190                  if (substr($segment, 0, 3) === '!--') {
 191                      // re-determine segment length, looking for -->
 192                      $position_comment_end = strpos($html, '-->', $cursor);
 193                      if ($position_comment_end === false) {
 194                          // uh oh, we have a comment that extends to
 195                          // infinity. Can't be helped: set comment
 196                          // end position to end of string
 197                          if ($e) {
 198                              $e->send(E_WARNING, 'Lexer: Unclosed comment');
 199                          }
 200                          $position_comment_end = strlen($html);
 201                          $end = true;
 202                      } else {
 203                          $end = false;
 204                      }
 205                      $strlen_segment = $position_comment_end - $cursor;
 206                      $segment = substr($html, $cursor, $strlen_segment);
 207                      $token = new
 208                      HTMLPurifier_Token_Comment(
 209                          substr(
 210                              $segment,
 211                              3,
 212                              $strlen_segment - 3
 213                          )
 214                      );
 215                      if ($maintain_line_numbers) {
 216                          $token->rawPosition($current_line, $current_col);
 217                          $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
 218                      }
 219                      $array[] = $token;
 220                      $cursor = $end ? $position_comment_end : $position_comment_end + 3;
 221                      $inside_tag = false;
 222                      continue;
 223                  }
 224  
 225                  // Check if it's an end tag
 226                  $is_end_tag = (strpos($segment, '/') === 0);
 227                  if ($is_end_tag) {
 228                      $type = substr($segment, 1);
 229                      $token = new HTMLPurifier_Token_End($type);
 230                      if ($maintain_line_numbers) {
 231                          $token->rawPosition($current_line, $current_col);
 232                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 233                      }
 234                      $array[] = $token;
 235                      $inside_tag = false;
 236                      $cursor = $position_next_gt + 1;
 237                      continue;
 238                  }
 239  
 240                  // Check leading character is alnum, if not, we may
 241                  // have accidently grabbed an emoticon. Translate into
 242                  // text and go our merry way
 243                  if (!ctype_alpha($segment[0])) {
 244                      // XML:  $segment[0] !== '_' && $segment[0] !== ':'
 245                      if ($e) {
 246                          $e->send(E_NOTICE, 'Lexer: Unescaped lt');
 247                      }
 248                      $token = new HTMLPurifier_Token_Text('<');
 249                      if ($maintain_line_numbers) {
 250                          $token->rawPosition($current_line, $current_col);
 251                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 252                      }
 253                      $array[] = $token;
 254                      $inside_tag = false;
 255                      continue;
 256                  }
 257  
 258                  // Check if it is explicitly self closing, if so, remove
 259                  // trailing slash. Remember, we could have a tag like <br>, so
 260                  // any later token processing scripts must convert improperly
 261                  // classified EmptyTags from StartTags.
 262                  $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
 263                  if ($is_self_closing) {
 264                      $strlen_segment--;
 265                      $segment = substr($segment, 0, $strlen_segment);
 266                  }
 267  
 268                  // Check if there are any attributes
 269                  $position_first_space = strcspn($segment, $this->_whitespace);
 270  
 271                  if ($position_first_space >= $strlen_segment) {
 272                      if ($is_self_closing) {
 273                          $token = new HTMLPurifier_Token_Empty($segment);
 274                      } else {
 275                          $token = new HTMLPurifier_Token_Start($segment);
 276                      }
 277                      if ($maintain_line_numbers) {
 278                          $token->rawPosition($current_line, $current_col);
 279                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 280                      }
 281                      $array[] = $token;
 282                      $inside_tag = false;
 283                      $cursor = $position_next_gt + 1;
 284                      continue;
 285                  }
 286  
 287                  // Grab out all the data
 288                  $type = substr($segment, 0, $position_first_space);
 289                  $attribute_string =
 290                      trim(
 291                          substr(
 292                              $segment,
 293                              $position_first_space
 294                          )
 295                      );
 296                  if ($attribute_string) {
 297                      $attr = $this->parseAttributeString(
 298                          $attribute_string,
 299                          $config,
 300                          $context
 301                      );
 302                  } else {
 303                      $attr = array();
 304                  }
 305  
 306                  if ($is_self_closing) {
 307                      $token = new HTMLPurifier_Token_Empty($type, $attr);
 308                  } else {
 309                      $token = new HTMLPurifier_Token_Start($type, $attr);
 310                  }
 311                  if ($maintain_line_numbers) {
 312                      $token->rawPosition($current_line, $current_col);
 313                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 314                  }
 315                  $array[] = $token;
 316                  $cursor = $position_next_gt + 1;
 317                  $inside_tag = false;
 318                  continue;
 319              } else {
 320                  // inside tag, but there's no ending > sign
 321                  if ($e) {
 322                      $e->send(E_WARNING, 'Lexer: Missing gt');
 323                  }
 324                  $token = new
 325                  HTMLPurifier_Token_Text(
 326                      '<' .
 327                      $this->parseData(
 328                          substr($html, $cursor)
 329                      )
 330                  );
 331                  if ($maintain_line_numbers) {
 332                      $token->rawPosition($current_line, $current_col);
 333                  }
 334                  // no cursor scroll? Hmm...
 335                  $array[] = $token;
 336                  break;
 337              }
 338              break;
 339          }
 340  
 341          $context->destroy('CurrentLine');
 342          $context->destroy('CurrentCol');
 343          return $array;
 344      }
 345  
 346      /**
 347       * PHP 5.0.x compatible substr_count that implements offset and length
 348       * @param string $haystack
 349       * @param string $needle
 350       * @param int $offset
 351       * @param int $length
 352       * @return int
 353       */
 354      protected function substrCount($haystack, $needle, $offset, $length)
 355      {
 356          static $oldVersion;
 357          if ($oldVersion === null) {
 358              $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
 359          }
 360          if ($oldVersion) {
 361              $haystack = substr($haystack, $offset, $length);
 362              return substr_count($haystack, $needle);
 363          } else {
 364              return substr_count($haystack, $needle, $offset, $length);
 365          }
 366      }
 367  
 368      /**
 369       * Takes the inside of an HTML tag and makes an assoc array of attributes.
 370       *
 371       * @param string $string Inside of tag excluding name.
 372       * @param HTMLPurifier_Config $config
 373       * @param HTMLPurifier_Context $context
 374       * @return array Assoc array of attributes.
 375       */
 376      public function parseAttributeString($string, $config, $context)
 377      {
 378          $string = (string)$string; // quick typecast
 379  
 380          if ($string == '') {
 381              return array();
 382          } // no attributes
 383  
 384          $e = false;
 385          if ($config->get('Core.CollectErrors')) {
 386              $e =& $context->get('ErrorCollector');
 387          }
 388  
 389          // let's see if we can abort as quickly as possible
 390          // one equal sign, no spaces => one attribute
 391          $num_equal = substr_count($string, '=');
 392          $has_space = strpos($string, ' ');
 393          if ($num_equal === 0 && !$has_space) {
 394              // bool attribute
 395              return array($string => $string);
 396          } elseif ($num_equal === 1 && !$has_space) {
 397              // only one attribute
 398              list($key, $quoted_value) = explode('=', $string);
 399              $quoted_value = trim($quoted_value);
 400              if (!$key) {
 401                  if ($e) {
 402                      $e->send(E_ERROR, 'Lexer: Missing attribute key');
 403                  }
 404                  return array();
 405              }
 406              if (!$quoted_value) {
 407                  return array($key => '');
 408              }
 409              $first_char = @$quoted_value[0];
 410              $last_char = @$quoted_value[strlen($quoted_value) - 1];
 411  
 412              $same_quote = ($first_char == $last_char);
 413              $open_quote = ($first_char == '"' || $first_char == "'");
 414  
 415              if ($same_quote && $open_quote) {
 416                  // well behaved
 417                  $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 418              } else {
 419                  // not well behaved
 420                  if ($open_quote) {
 421                      if ($e) {
 422                          $e->send(E_ERROR, 'Lexer: Missing end quote');
 423                      }
 424                      $value = substr($quoted_value, 1);
 425                  } else {
 426                      $value = $quoted_value;
 427                  }
 428              }
 429              if ($value === false) {
 430                  $value = '';
 431              }
 432              return array($key => $this->parseData($value));
 433          }
 434  
 435          // setup loop environment
 436          $array = array(); // return assoc array of attributes
 437          $cursor = 0; // current position in string (moves forward)
 438          $size = strlen($string); // size of the string (stays the same)
 439  
 440          // if we have unquoted attributes, the parser expects a terminating
 441          // space, so let's guarantee that there's always a terminating space.
 442          $string .= ' ';
 443  
 444          $old_cursor = -1;
 445          while ($cursor < $size) {
 446              if ($old_cursor >= $cursor) {
 447                  throw new Exception("Infinite loop detected");
 448              }
 449              $old_cursor = $cursor;
 450  
 451              $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 452              // grab the key
 453  
 454              $key_begin = $cursor; //we're currently at the start of the key
 455  
 456              // scroll past all characters that are the key (not whitespace or =)
 457              $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 458  
 459              $key_end = $cursor; // now at the end of the key
 460  
 461              $key = substr($string, $key_begin, $key_end - $key_begin);
 462  
 463              if (!$key) {
 464                  if ($e) {
 465                      $e->send(E_ERROR, 'Lexer: Missing attribute key');
 466                  }
 467                  $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
 468                  continue; // empty key
 469              }
 470  
 471              // scroll past all whitespace
 472              $cursor += strspn($string, $this->_whitespace, $cursor);
 473  
 474              if ($cursor >= $size) {
 475                  $array[$key] = $key;
 476                  break;
 477              }
 478  
 479              // if the next character is an equal sign, we've got a regular
 480              // pair, otherwise, it's a bool attribute
 481              $first_char = @$string[$cursor];
 482  
 483              if ($first_char == '=') {
 484                  // key="value"
 485  
 486                  $cursor++;
 487                  $cursor += strspn($string, $this->_whitespace, $cursor);
 488  
 489                  if ($cursor === false) {
 490                      $array[$key] = '';
 491                      break;
 492                  }
 493  
 494                  // we might be in front of a quote right now
 495  
 496                  $char = @$string[$cursor];
 497  
 498                  if ($char == '"' || $char == "'") {
 499                      // it's quoted, end bound is $char
 500                      $cursor++;
 501                      $value_begin = $cursor;
 502                      $cursor = strpos($string, $char, $cursor);
 503                      $value_end = $cursor;
 504                  } else {
 505                      // it's not quoted, end bound is whitespace
 506                      $value_begin = $cursor;
 507                      $cursor += strcspn($string, $this->_whitespace, $cursor);
 508                      $value_end = $cursor;
 509                  }
 510  
 511                  // we reached a premature end
 512                  if ($cursor === false) {
 513                      $cursor = $size;
 514                      $value_end = $cursor;
 515                  }
 516  
 517                  $value = substr($string, $value_begin, $value_end - $value_begin);
 518                  if ($value === false) {
 519                      $value = '';
 520                  }
 521                  $array[$key] = $this->parseData($value);
 522                  $cursor++;
 523              } else {
 524                  // boolattr
 525                  if ($key !== '') {
 526                      $array[$key] = $key;
 527                  } else {
 528                      // purely theoretical
 529                      if ($e) {
 530                          $e->send(E_ERROR, 'Lexer: Missing attribute key');
 531                      }
 532                  }
 533              }
 534          }
 535          return $array;
 536      }
 537  }
 538  
 539  // vim: et sw=4 sts=4
PHP Cross Reference of Unnamed Project

/se3master/var/www/se3/includes/library/HTMLPurifier/Lexer/ -> DirectLex.php (source)