Core  3.2
PHP API documentation
 All Data Structures Namespaces Files Functions Variables Pages
htmlclean.php
Go to the documentation of this file.
1 <?php
2 /*
3  * @author Anakeen
4  * @package FDL
5 */
6 
7 namespace Dcp\Utils;
8 
/**
 * Static helpers to sanitize and normalize user-supplied HTML fragments.
 *
 * - xssClean() / cleanStyle() are regex-based filters working on the raw string.
 * - normalizeHTMLFragment() / convertHTMLFragmentToXHTMLDocument() round-trip the
 *   fragment through libxml (via XDOMDocument) to get well-formed markup.
 * - cleanXMLUTF8() replaces characters that are not allowed in XML documents.
 */
class htmlclean
{
    /**
     * Libxml's errors to ignore.
     * Error codes are taken from `include/libxml/xmlerror.h` from libxml's source code.
     *
     * @var int[]
     */
    public static $libXMLErrorIgnoreCodes = array(
        /* Ignore "htmlParseEntityRef: expecting ';'" (XML_ERR_ENTITYREF_SEMICOL_MISSING) errors */
        23,
        /* Ignore "Tag video invalid in Entity" (XML_HTML_UNKNOWN_TAG) errors */
        801,
        /* Ignore "ID %s already defined" (XML_DTD_ID_REDEFINED) errors */
        513
    );

    /**
     * Delete dangerous attributes and elements to prevent xss attacks
     *
     * The filter is a fixed sequence of regular-expression rewrites; the order of
     * the steps matters (entities are decoded first so that later steps see the
     * real characters).
     *
     * @param string $data html fragment
     * @return string
     */
    public static function xssClean($data)
    {
        // Protect already-escaped &amp; &lt; &gt; so that the entity decoding
        // below gives them back unchanged instead of producing raw & < >
        $data = str_replace(
            array('&amp;', '&lt;', '&gt;'),
            array('&amp;amp;', '&amp;lt;', '&amp;gt;'),
            $data
        );
        // Fix &entity\n; (whitespace/control chars between the entity name and ';')
        $data = preg_replace('/(&#*\w+)[\x00-\x20]+;/u', '$1;', $data);
        // Make sure hexadecimal-looking numeric entities are terminated by exactly one ';'
        $data = preg_replace('/(&#x*[0-9A-F]+);*/iu', '$1;', $data);
        $data = html_entity_decode($data, ENT_COMPAT, 'UTF-8');

        // Remove any attribute starting with "on" or xmlns (double-quoted, then single-quoted values)
        $data = preg_replace('#(<[^>]+?[\x00-\x20"\'])(?:on|xmlns)[a-z]*\s*=\s*["][^"]*["]+#iu', '$1', $data);
        $data = preg_replace('#(<[^>]+?[\x00-\x20"\'])(?:on|xmlns)[a-z]*\s*=\s*[\'][^\']*[\']+#iu', '$1', $data);

        // Remove javascript: and vbscript: protocols
        $data = preg_replace('#([a-z]*)[\x00-\x20]*=[\x00-\x20]*([`\'"]*)[\x00-\x20]*j[\x00-\x20]*a[\x00-\x20]*v[\x00-\x20]*a[\x00-\x20]*s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:#iu', '$1=$2nojavascript...', $data);
        $data = preg_replace('#([a-z]*)[\x00-\x20]*=([\'"]*)[\x00-\x20]*v[\x00-\x20]*b[\x00-\x20]*s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:#iu', '$1=$2novbscript...', $data);
        // Case-insensitive like the two protocol filters above, so that
        // "-MOZ-BINDING:" is neutralized as well
        $data = preg_replace('#([a-z]*)[\x00-\x20]*=([\'"]*)[\x00-\x20]*-moz-binding[\x00-\x20]*:#iu', '$1=$2nomozbinding...', $data);

        // Only works in IE: <span style="width: expression(alert('Ping!'));"></span>
        // When a style attribute matches, everything from "style" up to the end of the tag is dropped
        $data = preg_replace('#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*([`\'"])(?:.(?!\2))*?expression[\x00-\x20]*\((?:.(?!\2))*[^>]*+>#i', '$1>', $data);
        // NOTE(review): this only matches the literal "behaviour(" — confirm whether
        // the "behavior:" spelling should be covered too
        $data = preg_replace('#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*([`\'"])(?:.(?!\2))*?behaviour[\x00-\x20]*\((?:.(?!\2))*[^>]*+>#i', '$1>', $data);
        $data = preg_replace('#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*([`\'"])(?:.(?!\2))*?s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:*(?:.(?!\2))*[^>]*>#iu', '$1>', $data);

        // Remove namespaced elements (we do not need them)
        $data = preg_replace('#</*\w+:\w[^>]*+>#i', '', $data);

        // Remove really unwanted tags; repeat until stable because removing one
        // tag can make a new one appear (e.g. "<scr<script>ipt>")
        do {
            $old_data = $data;
            $data = preg_replace('#</*(?:applet|script)[^>]*+>#i', '', $data);
        } while ($old_data !== $data);

        return $data;
    }

    /**
     * Strip presentational markup from an HTML fragment.
     *
     * Removes <span> and <font> tags (their content is kept), quoted `style` and
     * `class` attributes, and <style>...</style> elements.
     *
     * @param string $data html fragment
     * @return string
     */
    public static function cleanStyle($data)
    {
        // Remove span tags and keep content
        $data = preg_replace('/<\/?span[^>]*>/is', "", $data);
        // Remove font tags and keep content
        $data = preg_replace('/<\/?font[^>]*>/is', "", $data);
        // Remove style attributes (double-quoted, then single-quoted values)
        $data = preg_replace('/<([^>]*) style\s*=\s*"[^"]*"/is', "<\\1", $data);
        $data = preg_replace('/<([^>]*) style\s*=\s*\'[^\']*\'/is', "<\\1", $data);
        // Delete class attributes (double-quoted, then single-quoted values)
        $data = preg_replace('/<([^>]*) class\s*=\s*"[^"]*"/is', "<\\1", $data);
        $data = preg_replace('/<([^>]*) class\s*=\s*\'[^\']*\'/is', "<\\1", $data);
        // Delete style tags together with their content
        $data = preg_replace('/<\s*style[^>]*>[^<]*<\/style>/iu', "", $data);

        return $data;
    }

    /**
     * Normalize/correct an HTML fragment by loading and serializing it back through libxml
     *
     * @param string $html The HTML fragment to cleanup/correct (HTML must be encoded in UTF-8)
     * @param string $error Empty string if no error or non-empty string containing the error message
     * @return bool(false)|string The resulting HTML on success or bool(false) on failure (the error message is returned in the $error argument)
     */
    public static function normalizeHTMLFragment($html, &$error = '')
    {
        $loaded = self::loadFragmentBody($html, $error, "body wrapper not found");
        if ($loaded === false) {
            return false;
        }
        list($dom, $wrapper) = $loaded;

        /* Extract and serialize back all the childs to HTML */
        $html = '';
        foreach ($wrapper->childNodes as $child) {
            $html.= $dom->saveHTML($child);
        }
        /* Remove carriage-returns inserted by libxml's HTML serialization */
        $html = str_replace(array("\n<", ">\n"), array("<", ">"), $html);

        return $html;
    }

    /**
     * Convert an HTML fragment to a XHTML document
     *
     * @param string $html The HTML fragment to cleanup/correct (HTML must be encoded in UTF-8)
     * @param string $error Empty string if no error or non-empty string containing the error message
     * @return bool(false)|string The resulting XHTML on success or bool(false) on failure (the error message is returned in the $error argument)
     */
    public static function convertHTMLFragmentToXHTMLDocument($html, &$error = '')
    {
        $loaded = self::loadFragmentBody($html, $error, 'body top node not found');
        if ($loaded === false) {
            return false;
        }
        list($dom, $wrapper) = $loaded;

        /* Extract and serialize back to XML all the childs */
        $xhtml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
        $xhtml.= '<html xmlns="http://www.w3.org/1999/xhtml"><body>';
        foreach ($wrapper->childNodes as $child) {
            $xhtml.= $dom->saveXML($child);
        }
        $xhtml.= '</body></html>';

        return $xhtml;
    }

    /**
     * Load an HTML fragment into a DOM document and locate its <body> wrapper.
     *
     * Shared implementation of normalizeHTMLFragment() and
     * convertHTMLFragmentToXHTMLDocument().
     *
     * @param string $html The HTML fragment (HTML must be encoded in UTF-8)
     * @param string $error Receives the loader's exception message, the first non-ignored libxml error message, or $notFoundMessage
     * @param string $notFoundMessage Error message to report when no <body> node can be found
     * @return bool(false)|array array($dom, $bodyNode) on success or bool(false) on failure
     */
    private static function loadFragmentBody($html, &$error, $notFoundMessage)
    {
        $dom = new XDOMDocument();
        $dom->setLibXMLErrorIgnoreCodes(self::$libXMLErrorIgnoreCodes);
        $dom->preserveWhiteSpace = false;
        $dom->formatOutput = false;
        /*
         * Add a HTML meta header to setup DOMDocument to UTF-8 encoding and no trailing </body></html>
         * to not interfere with the given $html fragment.
        */
        $html = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' . self::cleanXMLUTF8($html);
        /**
         * @var \libXMLError[] $libXMLErrors
         */
        $libXMLErrors = array();
        $libXMLOpts = LIBXML_NONET;
        if (defined('LIBXML_HTML_NOIMPLIED') && defined('LIBXML_HTML_NODEFDTD')) {
            /*
             * LIBXML_HTML_NOIMPLIED is available in libxml >= 2.7.7
             * LIBXML_HTML_NODEFDTD is available in libxml >= 2.7.8
            */
            $libXMLOpts|= LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;
        }
        try {
            $dom->loadHTML($html, $libXMLOpts, $libXMLErrors);
        }
        catch(XDOMDocumentException $e) {
            $error = $e->getMessage();
            return false;
        }
        $error = self::getFirstErrorMessage($libXMLErrors);

        /* Get back the top <body> wrapper node */
        $dom->normalizeDocument();
        $root = $dom->documentElement;
        /* Guard against a document without a root element instead of calling a method on null */
        $wrapper = ($root === null) ? null : $root->getElementsByTagName('body')->item(0);
        if ($wrapper === null || $wrapper === false) {
            $error = $notFoundMessage;
            return false;
        }

        return array($dom, $wrapper);
    }

    /**
     * Get the message of the first error of a list of libxml errors.
     *
     * The first element is used whatever its key is, so lists that are not
     * indexed from 0 are supported.
     *
     * @param \libXMLError[] $libXMLErrors List of libXMLError objects
     * @return string The first error's message, or an empty string if there is no error
     */
    public static function getFirstErrorMessage(&$libXMLErrors = array())
    {
        if (!is_array($libXMLErrors)) {
            return '';
        }
        foreach ($libXMLErrors as $libXMLError) {
            return $libXMLError->message;
        }
        return '';
    }

    /**
     * Replace unsupported XML chars:
     * - Replace control chars with their corresponding Unicode pictogram from the Control Pictures Block.
     * - Replace unsupported XML chars with the Unicode replacement symbol.
     *
     * @param string $str UTF-8 encoded string
     * @return string The cleaned string, or $str unchanged if it is not a valid UTF-8 string
     */
    public static function cleanXMLUTF8($str)
    {
        /*
         * Pass #1
         *
         * Map invalid control chars to theirs corresponding pictogram from the Control Pictures block:
         * - https://codepoints.net/control_pictures
         *
         * U+2400 is encoded as E2 90 80 in UTF-8, so U+2400 + code is E2 90 (80 + code)
         * for the codes below 0x20 matched here.
        */
        $str2 = preg_replace_callback('/(?P<char>[\x{00}-\x{08}\x{0B}\x{0C}\x{0E}-\x{1F}])/u', function ($m)
        {
            return "\xe2\x90" . chr(0x80 + ord($m['char']));
        }
        , $str);
        if ($str2 === null) {
            /* str is not a valid UTF8 string, so we return the original string */
            return $str;
        }
        /*
         * Pass #2
         *
         * Replace unsupported XML chars with U+FFFD (EF BF BD in UTF-8)
        */
        $str2 = preg_replace('/[^\x{09}\x{0A}\x{0D}\x{20}-\x{d7ff}\x{e000}-\x{fffd}\x{10000}-\x{10ffff}]/u', "\xef\xbf\xbd", $str2);
        if ($str2 === null) {
            /* str is not a valid UTF8 string, so we return the original string */
            return $str;
        }
        return $str2;
    }
}
$data
← centre documentaire © anakeen