Source for file safehtml.php
Documentation is available at safehtml.php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
* @author Roman Ivanov <thingol@mail.ru>
* @copyright 2004-2005 Roman Ivanov
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
* @version 1.3.7 SVN: $Id: safehtml.php 23054 2007-10-25 15:04:24Z landseer $
* @link http://pixel-apes.com/safehtml/
* This package requires HTMLSax3 package
* This parser strips down all potentially dangerous content within HTML:
* <li>opening tag without its closing tag</li>
* <li>closing tag without its opening tag</li>
* <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
* "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
* "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
* <li>any of these attributes: on*, data*, dynsrc</li>
* <li>javascript:/vbscript:/about: etc. protocols</li>
* <li>expression/behavior etc. in styles</li>
* <li>any other active content</li>
* It also tries to convert the code to valid XHTML, but htmltidy is a far better
* solution for this task.
* $parser =& new SafeHTML();
* $result = $parser->parse($doc);
* @author Roman Ivanov <thingol@mail.ru>
* @copyright 1997-2005 Roman Ivanov
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
* @version Release: @package_version@
* @link http://pear.php.net/package/SafeHTML
* Storage for resulting HTML output
* Array of counters for each tag
* Array of counters for tags that must be deleted with all content
var $_dcCounter = array();
* Stack of unclosed tags that must be deleted with all content
* Stores level of list (ol/ul) nesting
* Stack of unclosed list tags
* Array of prepared regular expressions for protocols (schemas) matching
var $_protoRegexps = array();
* Array of prepared regular expressions for CSS matching
var $_cssRegexps = array();
* List of single tags ("<tag />")
var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
* List of dangerous tags (such tags will be deleted)
'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
'iframe', 'layer', 'link', 'meta', 'object', 'style',
* List of dangerous tags (such tags will be deleted, and all content
* inside these tags will also be removed)
* Type of protocols filtering ('white' or 'black')
* List of "dangerous" protocols (used for blacklist-filtering)
'about', 'chrome', 'data', 'disk', 'hcp',
'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
'res', 'resource', 'shell', 'vbscript', 'view-source',
'vnd.ms.radio', 'wysiwyg',
* List of "safe" protocols (used for whitelist-filtering)
'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
* List of attributes that can contain protocols
'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
* List of dangerous CSS keywords
* The whole style="" attribute will be removed if the parser finds one of these keywords
'absolute', 'behavior', 'behaviour', 'content', 'expression',
'fixed', 'include-source', 'moz-binding',
* List of tags that can have no "closing tag"
* @deprecated XHTML does not allow such tags
* List of block-level tags that terminate a paragraph
* The paragraph will be closed when one of these tags is opened
'address', 'blockquote', 'center', 'dd', 'dir', 'div',
'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
* List of table tags, all table tags outside a table will be removed
'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
var $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
* List of dangerous attributes
var $attributes = array('dynsrc'); // , 'id', 'name', ); // commented out as bugfix for #5188
// TODO: make this configurable
* List of allowed "namespaced" attributes
//making regular expressions based on Proto & CSS arrays
$preg = "/[\s\x01-\x1F]*";
for ($i= 0; $i< strlen($proto); $i++ ) {
$preg .= $proto{$i} . "[\s\x01-\x1F]*";
$this->_protoRegexps[] = $preg;
$this->_cssRegexps[] = '/' . $css . '/i';
* Handles the writing of attributes - called from $this->_openHandler()
* @param array $attrs array of attributes $name => $value
function _writeAttrs ($attrs)
foreach ($attrs as $name => $value) {
if (strpos($name, 'on') === 0) {
if (strpos($name, 'data') === 0) {
if (($value === TRUE) || (is_null($value))) {
// removes insignificant backslashes
if ($_value == $value) break;
// replace all & to &amp;
foreach ($this->_cssRegexps as $css) {
foreach ($this->_protoRegexps as $proto) {
$tempval = preg_replace('/&#(\d+);?/me', "chr('\\1')", $value); //"'
$tempval = preg_replace('/&#x([0-9a-f]+);?/mei', "chr(hexdec('\\1'))", $tempval);
(strpos($tempval, ':') !== false))
foreach ($this->_protoRegexps as $proto) {
$_tempval = explode(':', $tempval);
$this->_xhtml .= ' ' . $name . '="' . $value . '"';
* Opening tag handler - called from HTMLSax
* @param object $parser HTML Parser
* @param string $name tag name
* @param array $attrs tag attributes
function _openHandler(&$parser, $name, $attrs)
$this->_dcCounter[$name] = isset ($this->_dcCounter[$name]) ? $this->_dcCounter[$name]+ 1 : 1;
if (count($this->_dcStack) != 0) {
$this->_xhtml .= '<' . $name . '>';
$this->_xhtml .= '<' . $name;
$this->_writeAttrs($attrs);
// TABLES: cannot open table elements when we are not inside a table
if ((isset ($this->_counter['table'])) && ($this->_counter['table'] <= 0)
// PARAGRAPHS: close the paragraph when one of the closeParagraph tags is opening
$this->_closeHandler($parser, 'p');
// LISTS: we should close <li> if an <li> of the same level is opening
if ($name == 'li' && count($this->_liStack) &&
$this->_listScope == $this->_liStack[count($this->_liStack)- 1])
$this->_closeHandler($parser, 'li');
// LISTS: we want to know on what nesting level of lists we are
$this->_xhtml .= '<' . $name;
$this->_writeAttrs($attrs);
$this->_counter[$name] = isset ($this->_counter[$name]) ? $this->_counter[$name]+ 1 : 1;
* Closing tag handler - called from HTMLSax
* @param object $parser HTML parser
* @param string $name tag name
function _closeHandler(&$parser, $name)
if (isset ($this->_dcCounter[$name]) && ($this->_dcCounter[$name] > 0) &&
while ($name != ($tag = array_pop($this->_dcStack))) {
$this->_dcCounter[$tag]-- ;
$this->_dcCounter[$name]-- ;
if (count($this->_dcStack) != 0) {
if ((isset ($this->_counter[$name])) && ($this->_counter[$name] > 0)) {
while ($name != ($tag = array_pop($this->_stack))) {
* @param string $tag tag name
$this->_xhtml .= '</' . $tag . '>';
* Character data handler - called from HTMLSax
* @param object $parser HTML parser
* @param string $data textual data
function _dataHandler(&$parser, $data)
if (count($this->_dcStack) == 0) {
* Escape handler - called from HTMLSax
* @param object $parser HTML parser
* @param string $data comments or other type of data
function _escapeHandler(&$parser, $data)
* Returns the XHTML document
* @return string Processed (X)HTML document
* Clears current document data
* @param string $doc HTML document for processing
* @return string Processed (X)HTML document
$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
// Web documents shouldn't contain the \x00 symbol
// UTF-7 encoding ASCII decode
$doc = $this->repackUTF7($doc);
// Instantiate the parser
$parser->set_object($this);
$parser->set_element_handler('_openHandler','_closeHandler');
$parser->set_data_handler('_dataHandler');
$parser->set_escape_handler('_escapeHandler');
* @param string $str HTML document in which to recode the ASCII part of UTF-7 back to ASCII
* @return string Decoded document
function repackUTF7($str)
* Additional UTF-7 decoding function
* @param string $str String in which to recode the ASCII part of UTF-7 back to ASCII
* @return string Recoded string
function repackUTF7Callback($str)
* Additional UTF-7 encoding function
* @param string $str String in which to recode the ASCII part of UTF-7 back to ASCII
* @return string Recoded string
function repackUTF7Back($str)
* c-hanging-comment-ender-p: nil
|