
474 lines
13 KiB
Raw Normal View History

2013-02-18 03:48:21 +01:00
namespace PicoFeed;
class Filter
// Filtered output; appended to by startTag(), endTag() and dataTag(), returned by execute()
private $data = '';
// Site URL passed to the constructor; used as base by startTag() to resolve relative paths
private $url = '';
// XML serialization of the document body, produced by the constructor and parsed by execute()
private $input = '';
2013-04-07 03:14:52 +02:00
// Stack of booleans: startTag() pushes whether the tag was dropped, endTag() pops it
private $empty_tags = array();
2013-02-18 03:48:21 +01:00
// True while the current tag is blacklisted; dataTag() then discards the text
private $strip_content = false;
// True while inside a "pre" tag; dataTag() then keeps whitespace untouched
private $is_code = false;
2013-02-18 03:48:21 +01:00
// Allow only these tags and attributes
// Key: tag name kept in the output (see isAllowedTag())
// Value: list of attribute names kept on that tag (see isAllowedAttribute())
public static $whitelist_tags = array(
'audio' => array('controls', 'src'),
'video' => array('poster', 'controls', 'height', 'width', 'src'),
'source' => array('src', 'type'),
2013-02-24 20:03:14 +01:00
'dt' => array(),
'dd' => array(),
'dl' => array(),
'table' => array(),
'caption' => array(),
'tr' => array(),
'th' => array(),
'td' => array(),
'tbody' => array(),
'thead' => array(),
2013-02-18 03:48:21 +01:00
'h2' => array(),
'h3' => array(),
'h4' => array(),
'h5' => array(),
'h6' => array(),
'strong' => array(),
'em' => array(),
'code' => array(),
'pre' => array(),
'blockquote' => array(),
'p' => array(),
'ul' => array(),
'li' => array(),
'ol' => array(),
'br' => array(),
'del' => array(),
'a' => array('href'),
2013-03-17 23:16:25 +01:00
'img' => array('src'),
'figure' => array(),
'figcaption' => array(),
'cite' => array(),
'time' => array('datetime'),
2013-04-05 05:34:07 +02:00
'abbr' => array('title'),
2013-04-25 02:00:30 +02:00
'iframe' => array('width', 'height', 'frameborder', 'src'),
'q' => array('cite')
2013-02-18 03:48:21 +01:00
// Strip content of these tags
public static $blacklist_tags = array(
2013-02-18 03:48:21 +01:00
// Allowed URI scheme
// For a complete list go to
public static $scheme_whitelist = array(
2013-04-12 21:57:54 +02:00
2013-02-18 03:48:21 +01:00
// Attributes used for external resources
public static $media_attributes = array(
2013-02-18 03:48:21 +01:00
2013-02-18 03:48:21 +01:00
// Blacklisted resources
public static $media_blacklist = array(
2013-02-18 03:48:21 +01:00
2013-05-21 13:06:57 +02:00
2013-04-07 03:14:52 +02:00
2013-04-07 03:14:52 +02:00
2013-02-18 03:48:21 +01:00
2013-04-07 03:14:52 +02:00
2013-04-07 16:58:46 +02:00
2013-02-18 03:48:21 +01:00
// Mandatory attributes for specified tags
// startTag() drops the whole tag when one of the listed attributes
// did not pass the attribute filtering
public static $required_attributes = array(
2013-02-18 03:48:21 +01:00
'a' => array('href'),
2013-04-07 03:14:52 +02:00
'img' => array('src'),
'iframe' => array('src'),
'audio' => array('src'),
'source' => array('src'),
2013-02-18 03:48:21 +01:00
// Add attributes to specified tags
// The value is a raw attribute string that startTag() appends as-is
// to the opening tag of the given tag name
public static $add_attributes = array(
2013-02-24 20:03:14 +01:00
'a' => 'rel="noreferrer" target="_blank"'
// Attributes that must be integer
public static $integer_attributes = array(
// Iframe source whitelist, everything else is ignored
public static $iframe_whitelist = array(
2013-07-19 01:24:04 +02:00
2013-04-05 05:34:07 +02:00
2013-04-07 03:14:52 +02:00
2013-07-19 01:24:04 +02:00
2013-04-05 05:34:07 +02:00
2013-02-18 03:48:21 +01:00
2013-04-07 03:14:52 +02:00
/**
 * Prepare a HTML fragment for filtering.
 *
 * The markup is loaded with the (tolerant) libxml HTML parser and the
 * resulting <body> element is serialized back to XML, this string is
 * what execute() feeds to the XML parser.
 *
 * @param  string  $data      HTML content to filter
 * @param  string  $site_url  Website URL, used to resolve relative paths
 */
public function __construct($data, $site_url)
{
    $this->url = $site_url;

    // Feeds are full of broken markup: collect libxml errors internally
    // instead of raising one PHP warning per markup error
    $previous_setting = libxml_use_internal_errors(true);

    // Convert bad formatted documents to XML
    $dom = new \DOMDocument;
    $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$data);
    $body = $dom->getElementsByTagName('body')->item(0);

    // Without a body there is nothing to filter, saveXML(null) would
    // serialize the whole document (XML declaration and doctype included)
    $this->input = $body === null ? '' : $dom->saveXML($body);

    // Leave the global libxml state as we found it
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);
}
/**
 * Run the filter on the content given to the constructor.
 *
 * The input is walked with the XML parser (startTag(), endTag() and
 * dataTag() build the output), then empty tags and repeated line breaks
 * are removed.
 *
 * @return string  Filtered markup
 */
public function execute()
{
    // Start from a clean state, otherwise a second call would append
    // a copy of the output to the result of the first call
    $this->data = '';
    $this->empty_tags = array();
    $this->strip_content = false;
    $this->is_code = false;

    $parser = xml_parser_create();

    // Callables bound to this object replace xml_set_object() + method
    // names as strings (deprecated since PHP 8.4)
    xml_set_element_handler($parser, array($this, 'startTag'), array($this, 'endTag'));
    xml_set_character_data_handler($parser, array($this, 'dataTag'));
    xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
    xml_parse($parser, $this->input, true); // We ignore parsing error (for old libxml)

    $this->data = $this->removeEmptyTags($this->data);
    $this->data = $this->removeMultipleTags($this->data);

    return $this->data;
}
/**
 * XML parser callback, called for each opening tag.
 *
 * Writes the tag to the output when it is whitelisted, keeping only the
 * allowed attributes. The tag is dropped when it is a pixel tracker or
 * when a required attribute is missing. The decision is pushed on the
 * $empty_tags stack for endTag().
 *
 * @param  mixed   $parser      XML parser
 * @param  string  $name        Tag name
 * @param  array   $attributes  Attribute name => attribute value
 */
public function startTag($parser, $name, $attributes)
{
    $empty_tag = false;
    $this->strip_content = false;

    if ($this->is_code === false && $name === 'pre') $this->is_code = true;

    if ($this->isPixelTracker($name, $attributes)) {
        $empty_tag = true;
    }
    else if ($this->isAllowedTag($name)) {

        $attr_data = '';
        $used_attributes = array();

        foreach ($attributes as $attribute => $value) {

            if ($value === '' || ! $this->isAllowedAttribute($name, $attribute)) {
                continue;
            }

            $accepted = false;

            if ($this->isResource($attribute)) {

                if ($name === 'iframe') {
                    $accepted = $this->isAllowedIframeResource($value);
                }
                else if ($this->isRelativePath($value)) {
                    $value = $this->getAbsoluteUrl($value, $this->url);
                    $accepted = true;
                }
                else if ($this->isAllowedProtocol($value) && ! $this->isBlacklistedMedia($value)) {

                    // Prefer the "data-src" attribute when it holds an acceptable resource
                    if ($attribute === 'src' &&
                        isset($attributes['data-src']) &&
                        $this->isAllowedProtocol($attributes['data-src']) &&
                        ! $this->isBlacklistedMedia($attributes['data-src'])) {

                        $value = $attributes['data-src'];
                    }

                    $accepted = true;
                }
            }
            else {
                $accepted = $this->validateAttributeValue($attribute, $value);
            }

            if ($accepted) {
                // The XML parser hands over decoded values: escape them again,
                // otherwise a double quote closes the attribute and the rest
                // of the value is injected as markup
                $attr_data .= ' '.$attribute.'="'.htmlspecialchars($value, ENT_QUOTES, 'UTF-8').'"';
                $used_attributes[] = $attribute;
            }
        }

        // Check for required attributes
        if (isset(self::$required_attributes[$name])) {

            foreach (self::$required_attributes[$name] as $required_attribute) {

                if (! in_array($required_attribute, $used_attributes, true)) {
                    $empty_tag = true;
                    break;
                }
            }
        }

        if (! $empty_tag) {

            $this->data .= '<'.$name.$attr_data;

            // Add custom attributes
            if (isset(self::$add_attributes[$name])) {
                $this->data .= ' '.self::$add_attributes[$name].' ';
            }

            // If img or br, we don't close it here (endTag() writes "/>")
            if ($name !== 'img' && $name !== 'br') $this->data .= '>';
        }
    }

    if (in_array($name, self::$blacklist_tags)) {
        $this->strip_content = true;
    }

    $this->empty_tags[] = $empty_tag;
}
2013-02-18 03:48:21 +01:00
/**
 * XML parser callback, called for each closing tag.
 *
 * Closes the tag in the output unless startTag() dropped it, and leaves
 * the "pre" and "blacklisted tag" modes when the matching tag ends.
 *
 * @param  mixed   $parser  XML parser
 * @param  string  $name    Tag name
 */
public function endTag($parser, $name)
{
    if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) {
        // img and br were left open by startTag(), close them as empty elements
        $this->data .= $name !== 'img' && $name !== 'br' ? '</'.$name.'>' : '/>';
    }

    if ($this->is_code && $name === 'pre') $this->is_code = false;

    // The blacklisted tag is closed: the text that follows it belongs to
    // the parent tag and must not be discarded by dataTag()
    if ($this->strip_content && in_array($name, self::$blacklist_tags, true)) {
        $this->strip_content = false;
    }
}
2013-02-18 03:48:21 +01:00
/**
 * XML parser callback, called for each chunk of text.
 *
 * The text is escaped and appended to the output, unless the current
 * tag is blacklisted or the text is only made of whitespace.
 *
 * @param  mixed   $parser   XML parser
 * @param  string  $content  Text found by the parser
 */
public function dataTag($parser, $content)
{
    // Content of blacklisted tags never reaches the output
    if ($this->strip_content) {
        return;
    }

    // Non-breaking spaces (UTF-8 encoded) become normal spaces
    $text = str_replace("\xc2\xa0", ' ', $content);

    // Outside of "pre" tags, consecutive whitespace is reduced to one space
    if (! $this->is_code) {
        $text = preg_replace('!\s+!', ' ', $text);
    }

    if (trim($text) === '') {
        return;
    }

    $this->data .= htmlspecialchars($text, ENT_QUOTES, 'UTF-8', false);
}
2013-02-18 03:48:21 +01:00
/**
 * Build an absolute URL from a path found in the content.
 *
 * @param  string  $path  Absolute ("/img.png") or relative ("img.png", "./img.png") path
 * @param  string  $url   Website URL used as base, "http" is assumed when there is no scheme
 * @return string         Absolute URL, or an empty string when there is no base URL
 */
public function getAbsoluteUrl($path, $url)
{
    $components = parse_url($url);

    // parse_url() returns false on seriously malformed URLs
    if (! is_array($components)) $components = array();

    if (! isset($components['scheme'])) $components['scheme'] = 'http';

    if (! isset($components['host'])) {

        if ($url) {
            // No host found (by example "example.org"), use the whole value as host
            $components['host'] = $url;
            $components['path'] = '/';
        }
        else {
            return '';
        }
    }

    $base = $components['scheme'].'://'.$components['host'];

    // Keep a non-default port, otherwise the link goes to another service
    if (isset($components['port'])) $base .= ':'.$components['port'];

    // strncmp() is safe with an empty path ($path{0} is a fatal error since PHP 8)
    if (strncmp($path, '/', 1) === 0) {
        // Absolute path
        return $base.$path;
    }

    // Relative path
    $url_path = ! empty($components['path']) ? $components['path'] : '/';
    $length = strlen($url_path);

    if ($length > 1 && $url_path[$length - 1] !== '/') {
        // Keep only the directory part; dirname() returns "/" (or "\" on Windows)
        // for a top level file, trim it to avoid a double slash
        $url_path = rtrim(str_replace('\\', '/', dirname($url_path)), '/').'/';
    }

    if (substr($path, 0, 2) === './') {
        $path = substr($path, 2);
    }

    return $base.$url_path.$path;
}
/**
 * Tell if a resource is a path relative to the website.
 *
 * @param  string  $value  Attribute value
 * @return boolean         False for data URIs, values with a "://" and protocol-relative URLs
 */
public function isRelativePath($value)
{
    $is_data_uri = strpos($value, 'data:') === 0;
    $has_scheme = strpos($value, '://') !== false;
    $is_protocol_relative = strpos($value, '//') === 0;

    return ! ($is_data_uri || $has_scheme || $is_protocol_relative);
}
/**
 * Tell if a tag is kept in the output.
 *
 * @param  string  $name  Tag name
 * @return boolean        True when the tag is a key of the tag whitelist
 */
public function isAllowedTag($name)
{
    $allowed_tags = self::$whitelist_tags;

    return isset($allowed_tags[$name]);
}
2013-02-18 03:48:21 +01:00
/**
 * Tell if an attribute is kept for the given tag.
 *
 * @param  string  $tag        Tag name
 * @param  string  $attribute  Attribute name
 * @return boolean             False when the tag is unknown or the attribute is not listed for it
 */
public function isAllowedAttribute($tag, $attribute)
{
    // An unknown tag has no allowed attribute (in_array() on a missing
    // entry is a TypeError on PHP 8)
    if (! isset(self::$whitelist_tags[$tag])) {
        return false;
    }

    return in_array($attribute, self::$whitelist_tags[$tag], true);
}
2013-02-18 03:48:21 +01:00
/**
 * Tell if an attribute refers to an external resource.
 *
 * @param  string  $attribute  Attribute name
 * @return boolean             True when the attribute is one of the media attributes
 */
public function isResource($attribute)
{
    foreach (self::$media_attributes as $media_attribute) {

        // Loose comparison, as in_array() without the strict flag
        if ($attribute == $media_attribute) {
            return true;
        }
    }

    return false;
}
2013-02-18 03:48:21 +01:00
2013-04-05 05:34:07 +02:00
/**
 * Tell if an iframe source is acceptable.
 *
 * @param  string  $value  Attribute value
 * @return boolean         True when the value starts with an entry of the iframe whitelist
 */
public function isAllowedIframeResource($value)
{
    $allowed = false;

    foreach (self::$iframe_whitelist as $prefix) {

        if (strpos($value, $prefix) === 0) {
            $allowed = true;
            break;
        }
    }

    return $allowed;
}
2013-02-18 03:48:21 +01:00
/**
 * Tell if a resource uses one of the whitelisted URI schemes.
 *
 * @param  string  $value  Attribute value
 * @return boolean         True when the value starts with an entry of the scheme whitelist
 */
public function isAllowedProtocol($value)
{
    foreach (self::$scheme_whitelist as $protocol) {

        // URI schemes are case-insensitive: "HTTP://" is as valid as "http://"
        if (stripos($value, $protocol) === 0) {
            return true;
        }
    }

    return false;
}
/**
 * Tell if a resource is blacklisted.
 *
 * @param  string  $resource  Attribute value
 * @return boolean            True when the value contains an entry of the media blacklist
 */
public function isBlacklistedMedia($resource)
{
    foreach (self::$media_blacklist as $name) {

        // Host names are case-insensitive, a different letter case must
        // not be enough to get around the blacklist
        if (stripos($resource, $name) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Tell if a tag is a tracking pixel: an image of 1 by 1.
 *
 * @param  string  $tag         Tag name
 * @param  array   $attributes  Attribute name => attribute value
 * @return boolean
 */
public function isPixelTracker($tag, array $attributes)
{
    if ($tag !== 'img') {
        return false;
    }

    if (! isset($attributes['height'], $attributes['width'])) {
        return false;
    }

    // Loose comparison on purpose: the values are strings
    return $attributes['height'] == 1 && $attributes['width'] == 1;
}
/**
 * Check the value of an attribute that is not a resource.
 *
 * @param  string  $attribute  Attribute name
 * @param  string  $value      Attribute value
 * @return boolean             For integer attributes, true only when the value is made of digits; true otherwise
 */
public function validateAttributeValue($attribute, $value)
{
    $must_be_integer = in_array($attribute, self::$integer_attributes);

    if (! $must_be_integer) {
        return true;
    }

    return ctype_digit($value);
}
/**
 * Reduce a sequence of line breaks (and the whitespace between them)
 * to a single "<br/>".
 *
 * @param  string  $data  Markup
 * @return string
 */
public function removeMultipleTags($data)
{
    $line_breaks = "/(<br\s*\/?>\s*)+/";
    $single_break = "<br/>";

    return preg_replace($line_breaks, $single_break, $data);
}
/**
 * Remove the tags that contain nothing but whitespace (or other empty tags).
 *
 * @param  string  $data  Markup
 * @return string
 */
public function removeEmptyTags($data)
{
    // (?R) applies the whole pattern again to match nested empty tags
    $empty_tag = '/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU';

    return preg_replace($empty_tag, '', $data);
}
/**
 * Remove the doctype and the html, head and body tags (opening and
 * closing), their content is kept.
 *
 * @param  string  $data  Markup
 * @return string
 */
public function removeHTMLTags($data)
{
    // The word boundary prevents the removal of tags that only start
    // with one of those names, like <header>
    return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))\b[^>]*>\s*~i', '', $data);
}
/**
 * Remove the XML declaration (and everything before it) from a document.
 *
 * When several "<?xml ... ?>" instructions are present, the document is
 * cut after the last one. A "<?xml" without its "?>" is left untouched.
 *
 * @param  string  $data  Document
 * @return string
 */
public static function stripXmlTag($data)
{
    $cut = null;
    $offset = 0;

    while (($start = strpos($data, '<?xml', $offset)) !== false) {

        // Only a "?>" that closes a "<?xml" counts, not one that shows up
        // later in the content (by example inside a code sample)
        $end = strpos($data, '?>', $start);

        if ($end === false) {
            break;
        }

        $cut = $end + 2;
        $offset = $cut;
    }

    if ($cut === null) {
        return $data;
    }

    // substr() returns false before PHP 8 when nothing is left
    return (string) substr($data, $cut);
}
2013-09-01 00:37:26 +02:00
/**
 * Remove the meta tags from a document.
 *
 * @param  string  $data  Document
 * @return string
 */
public static function stripMetaTags($data)
{
    // Each match stops at the end of the tag itself: with a lazy ".*?"
    // up to "/>", a meta tag that is not self-closed swallows the content
    // that follows it, up to the next self-closed tag
    return preg_replace('/<meta(?:\s[^>]*)?\/?>/i', '', $data);
}
2013-02-18 03:48:21 +01:00