package.xml0000644000076500000240000000400111425670544012345 0ustar demritstaff HTML_Safe pear.php.net This parser strips down all potentially dangerous content within HTML This parser strips down all potentially dangerous content within HTML Miguel Vazquez Gocobachi demrit demrit@php.net yes 2010-08-02 0.10.1 0.10.1 beta beta BSD (3 Clause) * UTF-7 XSS vulnerability fixed 5.2 1.6 XML_HTMLSax3 pear.php.net 3.0.0RC1 0.10.1 0.10.1 beta beta 2010-08-04 BSD (3 Clause) HTML_Safe-0.10.1/docs/README0000644000076500000240000000714311425670544014267 0ustar demritstaffHTML_Safe -------- Version 1.3.5. http://pixel-apes.com/safehtml/ -------- This parser strips down all potentially dangerous content within HTML: * opening tag without its closing tag * closing tag without its opening tag * any of these tags: "base", "basefont", "head", "html", "body", "applet", "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed", "bgsound", "link", "meta", "style", "title", "blink", "xml" etc. * any of these attributes: on*, data*, dynsrc * javascript:/vbscript:/about: etc. protocols * expression/behavior etc. in styles * any other active content It also tries to convert code to XHTML valid, but htmltidy is far better solution for this task. If you found any bugs in this parser, please inform me -- ICQ:551593 or mailto:thingol@mail.ru Please, subscribe to http://pixel-apes.com/safehtml/feed/rss feed in order to receive notices when SAFEHTML will be updated. -- Roman Ivanov. -- Pixel-Apes ( http://pixel-apes.com ). -- JetStyle ( http://jetstyle.ru/ ). -------- Version history: -------- 1.3.5. * Two serious security flaws fixed: UTF-7 XSS and CSS comments handling. 1.3.2. * Security flaw (improper quotes handling in attributes' values) fixed. Big thanks to Nick Cleaton. 1.3.1. * Dumb bug fixed (some closing tags were ignored). 1.3.0. * Two holes (with decimal HTML entities and with \x00 symbol) fixed. * Class rewritten under PEAR coding standarts. * Class now uses unmodified HTMLSax3 from PEAR. * To the list of table tags added: "caption", "col", "colgroup". 1.2.1. * It was possible to create XSS with hexadecimal HTML entities. Fixed. Big thanks to Christian Stocker. 1.2.0. * "id" and "name" attributes added to dangerous attributes list, because malefactor can broke legal javascript by spoofing ID or NAME of some element. * New method parse() allows to do all parsing process in two lines of code. Examples also updated. * New array, closeParagraph, contains list of block-level elements. When we open such elemet, we should close paragraph before. . It allows SafeHTML to produce more XHTML compliant code. * Added "webcal" to white list of protocols for those who uses calendar programs (Mozilla/iCal/etc). * Now SafeHTML strips down table elements when we are not inside table. * Now SafeHTML correctly closes unclosed "li" tags: before opening "li" of the same nesting level. 1.1.0. * New "dangerous" protocols: hcp, ms-help, help, disk, vnd.ms.radio, opera, res, resource, chrome, mocha, livescript. * tag was moved from "tags for deletion" to "tags for deletion with content". * New "dangerous" CSS instruction "include-source" (NN4 specific). * New array, Attributes, contains list of attributes for removal. If you need to remove "id" or "name" attribute, just add it to this array. * Now it is possible to choose between white-list and black-list filtering of protocols. Defaults are "white-list". This list is: "http", "https", "ftp", "telnet", "news", "nntp", "gopher", "mailto", "file". * For speed purposes, we now filter protocols only from these attributes: src, href, action, lowsrc, dynsrc, background, codebase. * Opera6 XSS bug ([\xC0][\xBC]script>alert(1)[\xC0][\xBC]/script> [UTF-8] workarounded. 1.0.4. New "dangerous" tag: plaintext. 1.0.3. Added array of elements that can have no closing tag. 1.0.2. Bug fix: attack. Thanks to shmel. 1.0.1. Bug fix: safehtml hangs on code. Thanks to lj user=electrocat. 1.0.0. First public release HTML_Safe-0.10.1/HTML/Safe.php0000644000076500000240000004503711425670544014616 0ustar demritstaff * @author Miguel Vazquez Gocobachi * @copyright 2004-2009 Roman Ivanov, Miguel Vazquez Gocobachi * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause) * @version SVN: $Id$ * @link http://pear.php.net/package/HTML_Safe */ /** * This package requires HTMLSax3 package */ require_once 'XML/HTMLSax3.php'; /** * HTML_Safe Parser * * This parser strips down all potentially dangerous content within HTML: * * It also tries to convert code to XHTML valid, but htmltidy is far better * solution for this task. * * Example: *
 * $parser = new HTML_Safe;
 * $result = $parser->parse($doc);
 * 
* * @category HTML * @package HTML_Safe * @author Roman Ivanov * @author Miguel Vazquez Gocobachi * @copyright 2004-2009 Roman Ivanov, Miguel Vazquez Gocobachi * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause) * @version Release: @package_version@ * @link http://pear.php.net/package/HTML_Safe */ class HTML_Safe { /** * Storage for resulting HTML output * * @var string */ protected $xhtml = ''; /** * Array of counters for each tag * * @var array */ protected $counter = array(); /** * Stack of unclosed tags * * @var array */ protected $stack = array(); /** * Array of counters for tags that must be deleted with all content * * @var array */ protected $dcCounter = array(); /** * Stack of unclosed tags that must be deleted with all content * * @var array */ protected $dcStack = array(); /** * Stores level of list (ol/ul) nesting * * @var int */ protected $listScope = 0; /** * Stack of unclosed list tags * * @var array */ protected $liStack = array(); /** * Array of prepared regular expressions for protocols (schemas) matching * * @var array */ protected $protoRegexps = array(); /** * Array of prepared regular expressions for CSS matching * * @var array */ protected $cssRegexps = array(); /** * Allowed tags * * @var array */ protected $allowTags = array(); /** * List of single tags ("") * * @var array */ public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', ); /** * List of dangerous tags (such tags will be deleted) * * @var array */ public $deleteTags = array( 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body', 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer', 'iframe', 'layer', 'link', 'meta', 'object', 'style', 'title', 'script', ); /** * List of dangerous tags (such tags will be deleted, and all content * inside this tags will be also removed) * * @var array */ public $deleteTagsContent = array('script', 'style', 'title', 'xml', ); /** * Type of protocols filtering ('white' or 'black') * * @var string */ public $protocolFiltering = 'white'; /** * List of "dangerous" protocols (used for blacklist-filtering) * * @var array */ public $blackProtocols = array( 'about', 'chrome', 'data', 'disk', 'hcp', 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec', 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera', 'res', 'resource', 'shell', 'vbscript', 'view-source', 'vnd.ms.radio', 'wysiwyg', ); /** * List of "safe" protocols (used for whitelist-filtering) * * @var array */ public $whiteProtocols = array( 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https', 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal', 'xmpp', 'callto', ); /** * List of attributes that can contain protocols * * @var array */ public $protocolAttributes = array( 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', ); /** * List of dangerous CSS keywords * * Whole style="" attribute will be removed, if parser will find one of * these keywords * * @var array */ public $cssKeywords = array( 'absolute', 'behavior', 'behaviour', 'content', 'expression', 'fixed', 'include-source', 'moz-binding', ); /** * List of tags that can have no "closing tag" * * @var array * @deprecated XHTML does not allow such tags */ public $noClose = array(); /** * List of block-level tags that terminates paragraph * * Paragraph will be closed when this tags opened * * @var array */ public $closeParagraph = array( 'address', 'blockquote', 'center', 'dd', 'dir', 'div', 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee', 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre', 'table', 'ul', 'xmp', ); /** * List of table tags, all table tags outside a table will be removed * * @var array */ public $tableTags = array( 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', ); /** * List of list tags * * @var array */ public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', ); /** * List of dangerous attributes * * @var array */ public $attributes = array('dynsrc', 'id', 'name', ); /** * List of allowed "namespaced" attributes * * @var array */ public $attributesNS = array('xml:lang', ); /** * Constructs class * * @access public */ public function __construct() { //making regular expressions based on Proto & CSS arrays foreach ($this->blackProtocols as $proto) { $preg = "/[\s\x01-\x1F]*"; for ($i=0; $iprotoRegexps[] = $preg; } foreach ($this->cssKeywords as $css) { $this->cssRegexps[] = '/' . $css . '/i'; } return true; } /** * Handles the writing of attributes - called from $this->openHandler() * * @param array $attrs array of attributes $name => $value * * @return boolean */ protected function writeAttrs($attrs) { if (is_array($attrs)) { foreach ($attrs as $name => $value) { $name = strtolower($name); if (strpos($name, 'on') === 0) { continue; } if (strpos($name, 'data') === 0) { continue; } if (in_array($name, $this->attributes)) { continue; } if (!preg_match('/^[a-z0-9]+$/i', $name)) { if (!in_array($name, $this->attributesNS)) { continue; } } if (($value === true) || (is_null($value))) { $value = $name; } if ($name == 'style') { // removes insignificant backslahes $value = str_replace("\\", '', $value); // removes CSS comments while (1) { $_value = preg_replace('!/\*.*?\*/!s', '', $value); if ($_value == $value) { break; } $value = $_value; } // replace all & to & $value = str_replace('&', '&', $value); $value = str_replace('&', '&', $value); foreach ($this->cssRegexps as $css) { if (preg_match($css, $value)) { continue 2; } } foreach ($this->protoRegexps as $proto) { if (preg_match($proto, $value)) { continue 2; } } } $tempval = preg_replace('/&#(\d+);?/me', "chr('\\1')", $value); //"' $tempval = preg_replace( '/&#x([0-9a-f]+);?/mei', "chr(hexdec('\\1'))", $tempval ); if ((in_array($name, $this->protocolAttributes)) && (strpos($tempval, ':') !== false) ) { if ($this->protocolFiltering == 'black') { foreach ($this->protoRegexps as $proto) { if (preg_match($proto, $tempval)) { continue 2; } } } else { $_tempval = explode(':', $tempval); $proto = $_tempval[0]; if (!in_array($proto, $this->whiteProtocols)) { continue; } } } $value = str_replace("\"", '"', $value); $this->xhtml .= ' ' . $name . '="' . $value . '"'; } } return true; } /** * Opening tag handler - called from HTMLSax * * @param object &$parser HTML Parser * @param string $name tag name * @param array $attrs tag attributes * * @return boolean */ public function openHandler(&$parser, $name, $attrs) { $name = strtolower($name); if (in_array($name, $this->deleteTagsContent)) { array_push($this->dcStack, $name); $this->dcCounter[$name] = isset($this->dcCounter[$name]) ? $this->dcCounter[$name]+1 : 1; } if (count($this->dcStack) != 0) { return true; } if (in_array($name, $this->deleteTags) && !in_array($name, $this->allowTags) ) { return true; } if (!preg_match("/^[a-z0-9]+$/i", $name)) { if (preg_match("!(?:\@|://)!i", $name)) { $this->xhtml .= '<' . $name . '>'; } return true; } if (in_array($name, $this->singleTags)) { $this->xhtml .= '<' . $name; $this->writeAttrs($attrs); $this->xhtml .= ' />'; return true; } // TABLES: cannot open table elements when we are not inside table if ((isset($this->counter['table'])) && ($this->counter['table'] <= 0) && (in_array($name, $this->tableTags)) ) { return true; } // PARAGRAPHS: close paragraph when closeParagraph tags opening if ((in_array($name, $this->closeParagraph)) && (in_array('p', $this->stack)) ) { $this->closeHandler($parser, 'p'); } // LISTS: we should close
  • if
  • of the same level opening if (($name == 'li') && count($this->liStack) && ($this->listScope == $this->liStack[count($this->liStack) - 1]) ) { $this->closeHandler($parser, 'li'); } // LISTS: we want to know on what nesting level of lists we are if (in_array($name, $this->listTags)) { ++$this->listScope; } if ($name == 'li') { array_push($this->liStack, $this->listScope); } $this->xhtml .= '<' . $name; $this->writeAttrs($attrs); $this->xhtml .= '>'; array_push($this->stack, $name); $this->counter[$name] = isset($this->counter[$name]) ? ($this->counter[$name] + 1) : 1; return true; } /** * Closing tag handler - called from HTMLSax * * @param object &$parser HTML parser * @param string $name tag name * * @return boolean */ public function closeHandler(&$parser, $name) { $name = strtolower($name); if (isset($this->dcCounter[$name]) && ($this->dcCounter[$name] > 0) && (in_array($name, $this->deleteTagsContent)) ) { while ($name != ($tag = array_pop($this->dcStack))) { --$this->dcCounter[$tag]; } --$this->dcCounter[$name]; } if (count($this->dcStack) != 0) { return true; } if ((isset($this->counter[$name])) && ($this->counter[$name] > 0)) { while ($name != ($tag = array_pop($this->stack))) { $this->closeTag($tag); } $this->closeTag($name); } return true; } /** * Closes tag * * @param string $tag tag name * * @return boolean */ protected function closeTag($tag) { if (!in_array($tag, $this->noClose)) { $this->xhtml .= ''; } --$this->counter[$tag]; if (in_array($tag, $this->listTags)) { --$this->listScope; } if ($tag == 'li') { array_pop($this->liStack); } return true; } /** * Character data handler - called from HTMLSax * * @param object &$parser HTML parser * @param string $data textual data * * @return boolean */ public function dataHandler(&$parser, $data) { if (count($this->dcStack) == 0) { $this->xhtml .= $data; } return true; } /** * Escape handler - called from HTMLSax * * @param object &$parser HTML parser * @param string $data comments or other type of data * * @return boolean */ public function escapeHandler(&$parser, $data) { return true; } /** * Allow tags * * Example: *
         * $safe = new HTML_Safe;
         * $safe->setAllowTags(array('body'));
         * 
    * * @param array $tags Tags to allow * * @return void */ public function setAllowTags($tags = array()) { if (is_array($tags)) { $this->allowTags = $tags; } } /** * Returns the allowed tags * * @return array */ public function getAllowTags() { return $this->allowTags; } /** * Reset the allowed tags * * @return void */ public function resetAllowTags() { $this->allowTags = array(); } /** * Returns the XHTML document * * @return string Processed (X)HTML document */ public function getXHTML() { while ($tag = array_pop($this->stack)) { $this->closeTag($tag); } return $this->xhtml; } /** * Clears current document data * * @return boolean */ public function clear() { $this->xhtml = ''; return true; } /** * Main parsing fuction * * @param string $doc HTML document for processing * * @return string Processed (X)HTML document */ public function parse($doc) { $result = ''; // Save all '<' symbols $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc); // UTF7 pack $doc = $this->repackUTF7($doc); // Instantiate the parser $parser = new XML_HTMLSax3; // Set up the parser $parser->set_object($this); $parser->set_element_handler('openHandler', 'closeHandler'); $parser->set_data_handler('dataHandler'); $parser->set_escape_handler('escapeHandler'); $parser->parse($doc); $result = $this->getXHTML(); $this->clear(); return $result; } /** * UTF-7 decoding fuction * * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII * @return string Decoded document * @access private */ function repackUTF7($str) { return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str); } /** * Additional UTF-7 decoding fuction * * @param string $str String for recode ASCII part of UTF-7 back to ASCII * @return string Recoded string * @access private */ function repackUTF7Callback($str) { $str = base64_decode($str[1]); $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $str); return preg_replace('/\x00(.)/', '$1', $str); } /** * Additional UTF-7 encoding fuction * * @param string $str String for recode ASCII part of UTF-7 back to ASCII * @return string Recoded string * @access private */ function repackUTF7Back($str) { return $str[1].'+'.rtrim(base64_encode($str[2]), '=').'-'; } } HTML_Safe-0.10.1/tests/testHTML_Safe.php0000644000076500000240000000150211425670544016725 0ustar demritstaff

    my text

    '; $expected = '

    my text

    '; $safe = new HTML_Safe; $safe->setAllowTags(array('body')); $this->assertSame($expected, $safe->parse($input)); } public function testSpecialChars() { $inputOne = 'a+b-c'; $expectedOne = 'a+b-c'; $inputTwo = '+49-52
    '; $expectedTwo = '+49-52
    '; $safe = new HTML_Safe; $this->assertSame($expectedOne, $safe->parse($inputOne)); $this->assertSame($expectedTwo, $safe->parse($inputTwo)); } }