// libxml error codes that the DOM helpers in this class hand to
// XDOMDocument::setLibXMLErrorIgnoreCodes(); presumably errors carrying these
// codes are ignored while parsing — TODO confirm against XDOMDocument.
// (The array entries themselves are outside this excerpt.)
15 static $libXMLErrorIgnoreCodes = array(
/**
 * Regex-based scrubber that removes or defuses common script-injection vectors
 * from an HTML string.
 *
 * The visible steps are: entity normalisation and decoding, removal of quoted
 * on* / xmlns* attributes, defusing of javascript: / vbscript: / -moz-binding:
 * values, truncation of tags with suspicious style attributes, removal of
 * namespaced tags, and repeated removal of applet/script tags.
 *
 * NOTE(review): this excerpt omits several original lines (the opening brace,
 * the str_replace() arguments, the loop head and the return statement), so the
 * comments below describe only what the visible statements do.
 *
 * @param string $data markup to clean (every visible step treats it as a string)
 */
28 public static function xssClean(
$data)
// First pass is a literal str_replace(); its search/replace arrays are not
// visible in this excerpt.
31 $data = str_replace(array(
// Drop control characters / spaces (bytes 0x00-0x20) sitting between an
// entity-like prefix ("&", optional "#"s, word characters) and its ";".
40 $data = preg_replace(
'/(&#*\w+)[\x00-\x20]+;/u',
'$1;',
$data);
// Make numeric-looking entities ("&#", optional "x"s, hex digits) end in exactly
// one ";": a missing terminator is added and repeated ones are collapsed.
41 $data = preg_replace(
'/(&#x*[0-9A-F]+);*/iu',
'$1;',
$data);
// Decode entities so the patterns below see literal characters. ENT_COMPAT
// decodes double-quote entities and leaves single-quote entities alone.
42 $data = html_entity_decode(
$data, ENT_COMPAT,
'UTF-8');
// Inside a tag, remove attributes whose name starts with "on" or "xmlns"
// (followed only by letters) when the value is double-quoted.
// NOTE(review): neither this pattern nor the next one matches unquoted
// attribute values — confirm those are handled elsewhere.
44 $data = preg_replace(
'#(<[^>]+?[\x00-\x20"\'])(?:on|xmlns)[a-z]*\s*=\s*["][^"]*["]+#iu',
'$1',
$data);
// Same removal for single-quoted attribute values.
45 $data = preg_replace(
'#(<[^>]+?[\x00-\x20"\'])(?:on|xmlns)[a-z]*\s*=\s*[\'][^\']*[\']+#iu',
'$1',
$data);
// Defuse "javascript:" after "=" (optionally quoted). Each letter may be
// separated by bytes 0x00-0x20; the protocol text is replaced with a marker.
47 $data = preg_replace(
'#([a-z]*)[\x00-\x20]*=[\x00-\x20]*([`\'"]*)[\x00-\x20]*j[\x00-\x20]*a[\x00-\x20]*v[\x00-\x20]*a[\x00-\x20]*s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:#iu',
'$1=$2nojavascript...',
$data);
// Same treatment for "vbscript:". Unlike the javascript pattern, this one does
// not allow 0x00-0x20 bytes between "=" and the quote, nor a backtick quote.
48 $data = preg_replace(
'#([a-z]*)[\x00-\x20]*=([\'"]*)[\x00-\x20]*v[\x00-\x20]*b[\x00-\x20]*s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:#iu',
'$1=$2novbscript...',
$data);
// Same treatment for "-moz-binding:".
// NOTE(review): this pattern has no "i" flag, so only the all-lowercase
// spelling is matched — confirm that is intended.
49 $data = preg_replace(
'#([a-z]*)[\x00-\x20]*=([\'"]*)[\x00-\x20]*-moz-binding[\x00-\x20]*:#u',
'$1=$2nomozbinding...',
$data);
// Tags whose quoted style attribute contains "expression(" are cut off at the
// style attribute: everything from "style" up to the closing ">" is dropped,
// including any attributes that followed it.
51 $data = preg_replace(
'#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*([`\'"])(?:.(?!\2))*?expression[\x00-\x20]*\((?:.(?!\2))*[^>]*+>#i',
'$1>',
$data);
// Same truncation when the style value contains "behaviour(".
// NOTE(review): the pattern uses the spelling "behaviour"; the CSS property
// this is presumably aimed at is spelled "behavior" and is written with ":"
// rather than "(" — verify that this pattern can ever match real input.
52 $data = preg_replace(
'#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*([`\'"])(?:.(?!\2))*?behaviour[\x00-\x20]*\((?:.(?!\2))*[^>]*+>#i',
'$1>',
$data);
// Same truncation when the style value contains "script" (letters may be
// separated by 0x00-0x20 bytes; the trailing colon is optional). This pattern
// carries the "u" flag while the two above do not.
53 $data = preg_replace(
'#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*([`\'"])(?:.(?!\2))*?s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:*(?:.(?!\2))*[^>]*>#iu',
'$1>',
$data);
// Delete namespaced tags (prefix:name), both opening and closing forms.
55 $data = preg_replace(
'#</*\w+:\w[^>]*+>#i',
'',
$data);
// Delete <applet> and <script> opening/closing tags; text between the tags is
// not matched by this pattern. The enclosing loop repeats until a pass leaves
// $data unchanged, so tags re-formed by an earlier removal are removed too.
// The loop head and the $old_data assignment are outside this excerpt.
60 $data = preg_replace(
'#</*(?:applet|script)[^>]*+>#i',
'',
$data);
61 }
while ($old_data !==
$data);
/**
 * Strips presentational markup from an HTML string: span/font tags, quoted
 * style and class attributes, and simple <style> elements.
 *
 * NOTE(review): the opening brace and the return statement are not visible in
 * this excerpt.
 *
 * @param string $data markup to clean (every visible step treats it as a string)
 */
66 public static function cleanStyle(
$data)
// Remove opening and closing <span> tags; the text they wrapped is kept.
69 $data = preg_replace(
'/<\/?span[^>]*>/is',
"",
$data);
// Remove opening and closing <font> tags; the text they wrapped is kept.
71 $data = preg_replace(
'/<\/?font[^>]*>/is',
"",
$data);
// Remove a double-quoted style attribute from a tag. The attribute name must be
// preceded by a literal space, and unquoted values are not matched.
73 $data = preg_replace(
'/<([^>]*) style\s*=\s*"[^"]*"/is',
"<\\1",
$data);
// Same removal for a single-quoted style attribute.
74 $data = preg_replace(
'/<([^>]*) style\s*=\s*\'[^\']*\'/is',
"<\\1",
$data);
// Remove a double-quoted class attribute (same constraints as for style).
76 $data = preg_replace(
'/<([^>]*) class\s*=\s*"[^"]*"/is',
"<\\1",
$data);
// Same removal for a single-quoted class attribute.
77 $data = preg_replace(
'/<([^>]*) class\s*=\s*\'[^\']*\'/is',
"<\\1",
$data);
// Remove whole <style>...</style> elements. The body is matched with [^<]*, so
// a style element whose content contains "<" is left in place.
79 $data = preg_replace(
'/<\s*style[^>]*>[^<]*<\/style>/iu',
"",
$data);
/**
 * Parses an HTML fragment with XDOMDocument and re-serialises the children of
 * the resulting <body> element with saveHTML().
 *
 * NOTE(review): several original lines are missing from this excerpt (the
 * opening brace, the try head, the branch around the second $error assignment,
 * the reset of $html before the output loop, the final str_replace() arguments
 * and the return statements). The comments below cover the visible statements
 * only.
 *
 * @param string $html  fragment to normalise
 * @param string $error by-reference output; receives a message when parsing
 *                      fails or the body wrapper cannot be located
 */
96 public static function normalizeHTMLFragment($html, &$error =
'')
98 $dom =
new XDOMDocument();
// Hand the class-level list of libxml error codes to the document.
99 $dom->setLibXMLErrorIgnoreCodes(self::$libXMLErrorIgnoreCodes);
100 $dom->preserveWhiteSpace =
false;
101 $dom->formatOutput =
false;
// Prefix the cleaned fragment with a html/head/body skeleton whose meta tag
// declares UTF-8. No closing </body></html> is appended on this line.
106 $html =
'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' . self::cleanXMLUTF8($html);
110 $libXMLErrors = array();
// LIBXML_NONET is always set; the two HTML parser flags are added only when the
// running PHP/libxml build defines them.
111 $libXMLOpts = LIBXML_NONET;
112 if (defined(
'LIBXML_HTML_NOIMPLIED') && defined(
'LIBXML_HTML_NODEFDTD')) {
117 $libXMLOpts|= LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;
// XDOMDocument::loadHTML() takes a third argument here; presumably it is filled
// by reference with the libxml errors that were collected — TODO confirm.
120 $dom->loadHTML($html, $libXMLOpts, $libXMLErrors);
122 catch(XDOMDocumentException $e) {
123 $error = $e->getMessage();
// The condition guarding this assignment is outside this excerpt.
126 $error = self::getFirstErrorMessage($libXMLErrors);
128 $dom->normalizeDocument();
// Locate the first <body> element below the document element.
129 $wrapper = $dom->documentElement->getElementsByTagName(
'body')->item(0);
130 if ($wrapper === null || $wrapper ===
false) {
131 $error =
"body wrapper not found";
// Append the HTML serialisation of each child of <body> to $html, in order.
136 for ($i = 0; $i < $wrapper->childNodes->length; $i++) {
137 $html.= $dom->saveHTML($wrapper->childNodes->item($i));
// Final literal replacement on the result; its arguments are not visible here.
140 $html = str_replace(array(
/**
 * Parses an HTML fragment with XDOMDocument and builds an XHTML document string
 * from it: XML declaration, <html> in the XHTML namespace, and a <body> holding
 * the saveXML() serialisation of each parsed body child.
 *
 * NOTE(review): several original lines are missing from this excerpt (the
 * opening brace, the try head, the branch around the second $error assignment
 * and the return statements). The comments below cover the visible statements
 * only.
 *
 * @param string $html  fragment to convert
 * @param string $error by-reference output; receives a message when parsing
 *                      fails or the body node cannot be located
 */
156 public static function convertHTMLFragmentToXHTMLDocument($html, &$error =
'')
158 $dom =
new XDOMDocument();
// Hand the class-level list of libxml error codes to the document.
159 $dom->setLibXMLErrorIgnoreCodes(self::$libXMLErrorIgnoreCodes);
160 $dom->preserveWhiteSpace =
false;
161 $dom->formatOutput =
false;
// Prefix the cleaned fragment with a html/head/body skeleton whose meta tag
// declares UTF-8. No closing </body></html> is appended on this line.
166 $html =
'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' . self::cleanXMLUTF8($html);
170 $libXMLErrors = array();
// LIBXML_NONET is always set; the two HTML parser flags are added only when the
// running PHP/libxml build defines them.
171 $libXMLOpts = LIBXML_NONET;
172 if (defined(
'LIBXML_HTML_NOIMPLIED') && defined(
'LIBXML_HTML_NODEFDTD')) {
177 $libXMLOpts|= LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;
// XDOMDocument::loadHTML() takes a third argument here; presumably it is filled
// by reference with the libxml errors that were collected — TODO confirm.
180 $dom->loadHTML($html, $libXMLOpts, $libXMLErrors);
182 catch(XDOMDocumentException $e) {
183 $error = $e->getMessage();
// The condition guarding this assignment is outside this excerpt.
186 $error = self::getFirstErrorMessage($libXMLErrors);
188 $dom->normalizeDocument();
// Locate the first <body> element below the document element.
189 $wrapper = $dom->documentElement->getElementsByTagName(
'body')->item(0);
190 if ($wrapper === null || $wrapper ===
false) {
191 $error =
'body top node not found';
// Assemble the output document: XML declaration, namespaced <html>, <body>.
195 $xhtml =
'<?xml version="1.0" encoding="UTF-8"?>' .
"\n";
196 $xhtml.=
'<html xmlns="http://www.w3.org/1999/xhtml"><body>';
// Serialise each child of the parsed <body> as XML (not HTML) and append it.
197 for ($i = 0; $i < $wrapper->childNodes->length; $i++) {
198 $xhtml.= $dom->saveXML($wrapper->childNodes->item($i));
200 $xhtml.=
'</body></html>';
/**
 * Returns the `message` property of the first entry in a list of error objects.
 *
 * NOTE(review): what is returned for an empty list is not visible in this
 * excerpt. The entries are presumably LibXMLError objects — TODO confirm.
 *
 * @param array $libXMLErrors error list, taken by reference; the visible code
 *                            only reads it
 */
207 public static function getFirstErrorMessage(&$libXMLErrors = array())
209 if (count($libXMLErrors) > 0) {
210 return $libXMLErrors[0]->message;
222 public static function cleanXMLUTF8($str)
230 $str2 = preg_replace_callback(
'/(?P<char>[\x{00}-\x{08}\x{0B}\x{0C}\x{0E}-\x{1F}])/u',
function ($m)
232 return "\xe2\x90" . chr(0x80 + ord($m[
'char']));
235 if ($str2 === null) {
244 $str2 = preg_replace(
'/[^\x{09}\x{0A}\x{0D}\x{20}-\x{d7ff}\x{e000}-\x{fffd}\x{10000}-\x{10ffff}]/u',
"\xef\xbf\xbd", $str2);
245 if ($str2 === null) {