2013-02-18 03:48:21 +01:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed;
|
|
|
|
|
2014-05-20 20:20:27 +02:00
|
|
|
use DOMDocument;
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
|
|
|
|
* Filter class
|
|
|
|
*
|
|
|
|
* @author Frederic Guillot
|
2014-05-20 20:20:27 +02:00
|
|
|
* @package picofeed
|
2014-03-30 00:48:29 +01:00
|
|
|
*/
|
2013-02-18 03:48:21 +01:00
|
|
|
class Filter
|
|
|
|
{
|
2014-05-20 20:20:27 +02:00
|
|
|
/**
 * Config object, assigned by setConfig()
 *
 * @access private
 * @var \PicoFeed\Config
 */
private $config = null;

/**
 * Filtered XML data (output buffer the parser callbacks append to)
 *
 * @access private
 * @var string
 */
private $data = '';

/**
 * Site URL (used to build absolute URL)
 *
 * @access private
 * @var string
 */
private $url = '';

/**
 * Unfiltered XML data (serialized body produced by the constructor)
 *
 * @access private
 * @var string
 */
private $input = '';

/**
 * List of empty tags
 *
 * Used as a stack: startTag() pushes one flag per opening tag and
 * endTag() pops it to know if the closing tag must be written.
 *
 * @access private
 * @var array
 */
private $empty_tags = array();

/**
 * Flag to remove the content of a tag
 *
 * @access private
 * @var boolean
 */
private $strip_content = false;

/**
 * Flag to remember if the current payload is a source code <pre/>
 *
 * @access private
 * @var boolean
 */
private $is_code = false;
|
2013-02-18 03:48:21 +01:00
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Tags and attribute whitelist
 *
 * Keys are the tag names kept in the output, values are the attribute
 * names allowed for that tag (an empty list means "no attribute kept").
 *
 * @access private
 * @var array
 */
private $whitelist_tags = array(
    'audio' => array('controls', 'src'),
    'video' => array('poster', 'controls', 'height', 'width', 'src'),
    'source' => array('src', 'type'),
    'dt' => array(),
    'dd' => array(),
    'dl' => array(),
    'table' => array(),
    'caption' => array(),
    'tr' => array(),
    'th' => array(),
    'td' => array(),
    'tbody' => array(),
    'thead' => array(),
    'h2' => array(),
    'h3' => array(),
    'h4' => array(),
    'h5' => array(),
    'h6' => array(),
    'strong' => array(),
    'em' => array(),
    'code' => array(),
    'pre' => array(),
    'blockquote' => array(),
    'p' => array(),
    'ul' => array(),
    'li' => array(),
    'ol' => array(),
    'br' => array(),
    'del' => array(),
    'a' => array('href'),
    'img' => array('src', 'title', 'alt'),
    'figure' => array(),
    'figcaption' => array(),
    'cite' => array(),
    'time' => array('datetime'),
    'abbr' => array('title'),
    'iframe' => array('width', 'height', 'frameborder', 'src'),
    'q' => array('cite')
);

/**
 * Tags blacklist, strip the content of those tags
 *
 * @access private
 * @var array
 */
private $blacklisted_tags = array(
    'script'
);
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Scheme whitelist
 * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
 *
 * Each entry is a prefix: a resource value is accepted by
 * isAllowedProtocol() when it starts with one of them.
 *
 * @access private
 * @var array
 */
private $scheme_whitelist = array(
    '//',
    'data:image/png;base64,',
    'data:image/gif;base64,',
    'data:image/jpg;base64,',
    // "image/jpeg" is the registered media type, "image/jpg" above is only
    // a common misspelling kept for compatibility
    'data:image/jpeg;base64,',
    'bitcoin:',
    'callto:',
    'ed2k://',
    'facetime://',
    'feed:',
    'ftp://',
    'geo:',
    'git://',
    'http://',
    'https://',
    'irc://',
    'irc6://',
    'ircs://',
    'jabber:',
    'magnet:',
    'mailto:',
    'nntp://',
    'rtmp://',
    'sftp://',
    'sip:',
    'sips:',
    'skype:',
    'smb://',
    'sms:',
    'spotify:',
    'ssh:',
    'steam:',
    'svn://',
    'tel:',
);
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Attributes used for external resources
 *
 * @access private
 * @var array
 */
private $media_attributes = array(
    'src',
    'href',
    'poster',
);

/**
 * Blacklisted resources
 *
 * A resource is rejected by isBlacklistedMedia() when its value
 * contains one of these fragments.
 *
 * @access private
 * @var array
 */
private $media_blacklist = array(
    'feeds.feedburner.com',
    'share.feedsportal.com',
    'da.feedsportal.com',
    'rss.feedsportal.com',
    'res.feedsportal.com',
    'res1.feedsportal.com',
    'res2.feedsportal.com',
    'res3.feedsportal.com',
    'pi.feedsportal.com',
    'rss.nytimes.com',
    'feeds.wordpress.com',
    'stats.wordpress.com',
    'rss.cnn.com',
    'twitter.com/home?status=',
    'twitter.com/share',
    'twitter_icon_large.png',
    'www.facebook.com/sharer.php',
    'facebook_icon_large.png',
    'plus.google.com/share',
    'www.gstatic.com/images/icons/gplus-16.png',
    'www.gstatic.com/images/icons/gplus-32.png',
    'www.gstatic.com/images/icons/gplus-64.png',
);
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Mandatory attributes for specified tags
 *
 * A tag listed here is dropped by startTag() when one of its
 * mandatory attributes did not survive the filtering.
 *
 * @access private
 * @var array
 */
private $required_attributes = array(
    'a' => array('href'),
    'img' => array('src'),
    'iframe' => array('src'),
    'audio' => array('src'),
    'source' => array('src'),
);

/**
 * Add attributes to specified tags
 *
 * The value is a raw attribute string appended as-is to the opening tag.
 *
 * @access private
 * @var array
 */
private $add_attributes = array(
    'a' => 'rel="noreferrer" target="_blank"'
);

/**
 * Attributes that must be integer
 *
 * @access private
 * @var array
 */
private $integer_attributes = array(
    'width',
    'height',
    'frameborder',
);

/**
 * Iframe source whitelist, everything else is ignored
 *
 * @access private
 * @var array
 */
private $iframe_whitelist = array(
    '//www.youtube.com',
    'http://www.youtube.com',
    'https://www.youtube.com',
    'http://player.vimeo.com',
    'https://player.vimeo.com',
    'http://www.dailymotion.com',
    'https://www.dailymotion.com',
);
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Initialize the filter, all inputs data must be encoded in UTF-8 before
 *
 * The markup is loaded with the HTML parser and the resulting <body>
 * element is serialized back as XML, so that execute() can walk it with
 * the XML parser even when the original document is badly formatted.
 *
 * @access public
 * @param  string  $data      XML content
 * @param  string  $site_url  Site URL (used to build absolute URL)
 */
public function __construct($data, $site_url)
{
    $this->url = $site_url;

    // Parsing errors are collected internally instead of being raised as warnings
    libxml_use_internal_errors(true);

    // Convert bad formatted documents to XML
    // NOTE(review): the pseudo XML declaration presumably makes libxml read the input as UTF-8
    $dom = new DOMDocument;
    $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$data);

    // The collected errors are never read, release them to avoid an ever growing buffer
    libxml_clear_errors();

    // Without a <body> node, saveXML(null) would dump the whole document
    // (XML declaration and doctype included), so fall back to an empty input
    $body = $dom->getElementsByTagName('body')->item(0);
    $xml = $body === null ? '' : $dom->saveXML($body);

    $this->input = $xml === false ? '' : $xml;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Run tags/attributes filtering
 *
 * The input is walked with the event based XML parser: startTag(),
 * endTag() and dataTag() rebuild the filtered markup, then empty tags
 * and repeated line breaks are removed.
 *
 * @access public
 * @return string
 */
public function execute()
{
    // Start from a clean state, otherwise a second call would append
    // its output to the result of the previous one
    $this->data = '';
    $this->empty_tags = array();
    $this->strip_content = false;
    $this->is_code = false;

    $parser = xml_parser_create();

    // Handlers are given as [object, method] callables: this works on every
    // PHP version and avoids xml_set_object(), deprecated since PHP 8.4
    xml_set_element_handler($parser, array($this, 'startTag'), array($this, 'endTag'));
    xml_set_character_data_handler($parser, array($this, 'dataTag'));
    xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
    xml_parse($parser, $this->input, true); // We ignore parsing error (for old libxml)

    // Since PHP 8 the parser is an object released automatically
    if (PHP_VERSION_ID < 80000) {
        xml_parser_free($parser);
    }

    unset($parser);

    $this->data = $this->removeEmptyTags($this->data);
    $this->data = $this->removeMultipleTags($this->data);

    return trim($this->data);
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Parse opening tag
 *
 * Writes the opening tag to the output when the tag is whitelisted, keeping
 * only the allowed attributes. A flag is always pushed on the empty tags
 * stack so that endTag() knows if the closing tag must be written.
 *
 * @access public
 * @param  resource  $parser      XML parser
 * @param  string    $name        Tag name
 * @param  array     $attributes  Tag attributes
 */
public function startTag($parser, $name, $attributes)
{
    $this->strip_content = false;

    if ($name === 'pre' && $this->is_code === false) {
        $this->is_code = true;
    }

    // Pixel trackers are dropped without looking at the whitelist
    $skip_tag = $this->isPixelTracker($name, $attributes);

    if (! $skip_tag && $this->isAllowedTag($name)) {

        $html_attributes = '';
        $kept = array();

        foreach ($attributes as $attribute => $value) {

            if ($value == '' || ! $this->isAllowedAttribute($name, $attribute)) {
                continue;
            }

            if (! $this->isResource($attribute)) {

                if (! $this->validateAttributeValue($attribute, $value)) {
                    continue;
                }
            }
            else if ($name === 'iframe') {

                if (! $this->isAllowedIframeResource($value)) {
                    continue;
                }
            }
            else if ($this->isRelativePath($value)) {
                $value = $this->getAbsoluteUrl($value, $this->url);
            }
            else if ($this->isAllowedProtocol($value) && ! $this->isBlacklistedMedia($value)) {

                // An acceptable "data-src" attribute takes precedence over "src"
                if ($attribute == 'src' &&
                    isset($attributes['data-src']) &&
                    $this->isAllowedProtocol($attributes['data-src']) &&
                    ! $this->isBlacklistedMedia($attributes['data-src'])) {

                    $value = $attributes['data-src'];
                }

                // Replace protocol-relative url // by http://
                if (substr($value, 0, 2) === '//') {
                    $value = 'http:'.$value;
                }
            }
            else {
                continue;
            }

            $html_attributes .= ' '.$attribute.'="'.$this->escape($value).'"';
            $kept[] = $attribute;
        }

        // Check for required attributes
        if (isset($this->required_attributes[$name])) {

            foreach ($this->required_attributes[$name] as $mandatory) {

                if (! in_array($mandatory, $kept)) {
                    $skip_tag = true;
                    break;
                }
            }
        }

        if (! $skip_tag) {

            $opening = '<'.$name.$html_attributes;

            // Add custom attributes
            if (isset($this->add_attributes[$name])) {
                $opening .= ' '.$this->add_attributes[$name].' ';
            }

            // If img or br, we don't close it here
            if ($name !== 'img' && $name !== 'br') {
                $opening .= '>';
            }

            $this->data .= $opening;
        }
    }

    if (in_array($name, $this->blacklisted_tags)) {
        $this->strip_content = true;
    }

    $this->empty_tags[] = $skip_tag;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Parse closing tag
 *
 * Writes the closing tag when the matching opening tag has been written,
 * and ends the states opened by startTag() for this tag.
 *
 * @access public
 * @param  resource  $parser  XML parser
 * @param  string    $name    Tag name
 */
public function endTag($parser, $name)
{
    // The flag pushed by startTag() tells if the opening tag was dropped
    if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) {
        // img and br are left open by startTag(), they are closed as empty elements
        $this->data .= $name !== 'img' && $name !== 'br' ? '</'.$name.'>' : '/>';
    }

    if ($this->is_code && $name === 'pre') {
        $this->is_code = false;
    }

    // Stop stripping once the blacklisted tag is closed, otherwise the text that
    // follows it would be discarded until the next opening tag.
    // Same (non strict) lookup as the one used by startTag() to raise the flag.
    if (in_array($name, $this->blacklisted_tags)) {
        $this->strip_content = false;
    }
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Parse tag content
 *
 * Appends the escaped text to the output, unless the content of the
 * current tag must be stripped.
 *
 * @access public
 * @param  resource  $parser   XML parser
 * @param  string    $content  Tag content
 */
public function dataTag($parser, $content)
{
    if ($this->strip_content) {
        return;
    }

    // Replace the UTF-8 non-breaking space by a normal space.
    // Multiple spaces are not collapsed here: the original code notes
    // an issue with Cyrillic characters next to the disabled collapse.
    $text = str_replace("\xc2\xa0", ' ', $content);

    $this->data .= $this->escape($text);
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Escape HTML content
 *
 * Special characters and both kinds of quotes are converted to entities,
 * entities already present in the content are not encoded a second time.
 *
 * @static
 * @access public
 * @param  string  $content  Raw content (UTF-8)
 * @return string
 */
public static function escape($content)
{
    $flags = ENT_QUOTES;
    $charset = 'UTF-8';
    $double_encode = false;

    return htmlspecialchars($content, $flags, $charset, $double_encode);
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Get the absolute url for a relative link
 *
 * - a missing scheme defaults to "http"
 * - when the site url has no host, the whole site url is used as host
 * - an empty path returns the site url unchanged
 * - a path starting with "/" is resolved from the root of the site
 * - any other path is resolved from the directory of the site url
 *
 * @static
 * @access public
 * @param  string  $path  Relative path
 * @param  string  $url   Site base url
 * @return string         Absolute url, or an empty string when no site url is available
 */
public static function getAbsoluteUrl($path, $url)
{
    $components = parse_url($url);

    // parse_url() returns false for seriously malformed urls
    if (! is_array($components)) {
        $components = array();
    }

    if (! isset($components['scheme'])) {
        $components['scheme'] = 'http';
    }

    if (! isset($components['host'])) {

        if (! $url) {
            return '';
        }

        $components['host'] = $url;
        $components['path'] = '/';
    }

    if (! strlen($path)) {
        return $url;
    }

    // Keep the port of the site url, otherwise links of sites served
    // on a non default port would point to the wrong server
    $root = $components['scheme'].'://'.$components['host'];

    if (isset($components['port'])) {
        $root .= ':'.$components['port'];
    }

    // Absolute path (square brackets offset: the curly braces form is removed in PHP 8)
    if ($path[0] === '/') {
        return $root.$path;
    }

    // Relative path
    $url_path = empty($components['path']) ? '/' : $components['path'];
    $length = strlen($url_path);

    if ($length > 1 && $url_path[$length - 1] !== '/') {
        // dirname() returns "/" (or "\" on Windows) for a document stored at the
        // root, trim it to avoid a double slash in the result
        $url_path = rtrim(dirname($url_path), '/\\').'/';
    }

    if (substr($path, 0, 2) === './') {
        $path = substr($path, 2);
    }

    return $root.$url_path.$path;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Check if an url is relative
 *
 * A value is relative when it is neither a protocol-relative url ("//host/...")
 * nor prefixed by a scheme ("http:", "data:", "mailto:", "javascript:"...).
 * Values with a scheme are therefore left to the scheme whitelist instead of
 * being rewritten as a path of the site, and a "://" found later in the value
 * (in a query string for example) does not make a path absolute.
 *
 * @static
 * @access public
 * @param  string  $value  Attribute value
 * @return boolean
 */
public static function isRelativePath($value)
{
    if (strpos($value, '//') === 0) {
        return false;
    }

    // Scheme syntax of RFC 3986: a letter followed by letters, digits, "+", "." or "-"
    return preg_match('/^[a-z][a-z0-9+.\-]*:/i', $value) !== 1;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Check if a tag is on the whitelist
 *
 * @access public
 * @param  string  $name  Tag name
 * @return boolean        True when the tag is a key of the tags whitelist
 */
public function isAllowedTag($name)
{
    $whitelist = $this->whitelist_tags;

    return isset($whitelist[$name]);
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Check if an attribute is allowed for a given tag
 *
 * @access public
 * @param  string  $tag        Tag name
 * @param  string  $attribute  Attribute name
 * @return boolean             False when the tag is not whitelisted or does not accept this attribute
 */
public function isAllowedAttribute($tag, $attribute)
{
    // This method is public: do not assume that the tag went through isAllowedTag()
    if (! isset($this->whitelist_tags[$tag]) || ! is_array($this->whitelist_tags[$tag])) {
        return false;
    }

    return in_array($attribute, $this->whitelist_tags[$tag], true);
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Check if an attribute name is an external resource
 *
 * @access public
 * @param  string  $attribute  Attribute name
 * @return boolean             True when the attribute is one of the media attributes
 */
public function isResource($attribute)
{
    $position = array_search($attribute, $this->media_attributes);

    return $position !== false;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Check if an iframe url is allowed
 *
 * The url must start with an entry of the iframe whitelist AND the entry must
 * stop on a boundary of the url (end of the value, "/", "?" or "#"). Without
 * this, "http://www.youtube.com.example.org" or "http://www.youtube.com@example.org"
 * would be accepted by a whitelist containing "http://www.youtube.com".
 *
 * @access public
 * @param  string  $value  Attribute value
 * @return boolean
 */
public function isAllowedIframeResource($value)
{
    foreach ($this->iframe_whitelist as $url) {

        if (strpos($value, $url) !== 0) {
            continue;
        }

        // An entry ending with a slash already stops on a boundary
        if (substr($url, -1) === '/') {
            return true;
        }

        // substr() returns false before PHP 8 and '' since PHP 8 at the end of the value
        $next = substr($value, strlen($url), 1);

        if ($next === false || $next === '' || $next === '/' || $next === '?' || $next === '#') {
            return true;
        }
    }

    return false;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Detect if the protocol is allowed or not
 *
 * @access public
 * @param  string  $value  Attribute value
 * @return boolean         True when the value starts with an entry of the scheme whitelist
 */
public function isAllowedProtocol($value)
{
    $allowed = false;

    foreach ($this->scheme_whitelist as $protocol) {

        if (strpos($value, $protocol) === 0) {
            $allowed = true;
            break;
        }
    }

    return $allowed;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Detect if an url is blacklisted
 *
 * @access public
 * @param  string  $resource  Attribute value (URL)
 * @return boolean            True when the value contains an entry of the media blacklist
 */
public function isBlacklistedMedia($resource)
{
    $blacklisted = false;

    foreach ($this->media_blacklist as $name) {

        if (strpos($resource, $name) !== false) {
            $blacklisted = true;
            break;
        }
    }

    return $blacklisted;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Detect if an image tag is a pixel tracker
 *
 * An image is considered as a tracker when both its height
 * and its width attributes are equal to 1.
 *
 * @access public
 * @param  string  $tag         Tag name
 * @param  array   $attributes  Tag attributes
 * @return boolean
 */
public function isPixelTracker($tag, array $attributes)
{
    if ($tag !== 'img') {
        return false;
    }

    if (! isset($attributes['height']) || ! isset($attributes['width'])) {
        return false;
    }

    if ($attributes['height'] != 1) {
        return false;
    }

    return $attributes['width'] == 1;
}
|
2013-07-28 23:53:17 +02:00
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Check if an attribute value is integer
 *
 * Only the attributes declared as integer attributes are checked,
 * the value of any other attribute is always accepted.
 *
 * @access public
 * @param  string  $attribute  Attribute name
 * @param  string  $value      Attribute value
 * @return boolean
 */
public function validateAttributeValue($attribute, $value)
{
    $must_be_integer = in_array($attribute, $this->integer_attributes);

    return ! $must_be_integer || ctype_digit($value);
}
|
2013-08-04 03:08:44 +02:00
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Replace <br/><br/> by only one
 *
 * Any run of line breaks (with or without the closing slash) is replaced
 * by a single <br/>, the whitespaces following a line break are removed too.
 *
 * @access public
 * @param  string  $data  Input data
 * @return string
 */
public function removeMultipleTags($data)
{
    $line_breaks = "/(<br\s*\/?>\s*)+/";
    $single_break = "<br/>";

    return preg_replace($line_breaks, $single_break, $data);
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Remove empty tags
 *
 * A tag is removed when it has no attribute and contains nothing else
 * than whitespaces or another removable tag (recursive pattern).
 *
 * @access public
 * @param  string  $data  Input data
 * @return string         Cleaned data, or the input unchanged when the pattern cannot be applied
 */
public function removeEmptyTags($data)
{
    $cleaned = preg_replace('/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', '', $data);

    // preg_replace() returns null on error (backtrack or recursion limit reached
    // on large documents for example): keeping the empty tags is better than
    // losing the whole content
    return $cleaned === null ? $data : $cleaned;
}
|
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Remove HTML tags
 *
 * Strips the doctype and the opening/closing <html>, <head> and <body> tags
 * (with the whitespaces that follow them), their content is preserved.
 * Tags that only start with one of those names (<header> for example) are kept.
 *
 * @access public
 * @param  string  $data  Input data
 * @return string         Cleaned data, or the input unchanged when the pattern cannot be applied
 */
public function removeHTMLTags($data)
{
    // The lookahead requires the tag name to be complete: followed by a space, "/" or ">"
    $pattern = '~<(?:!DOCTYPE\b[^>]*|/?(?:html|head|body)(?=[\s/>])[^>]*)>\s*~i';
    $cleaned = preg_replace($pattern, '', $data);

    // preg_replace() returns null on error, do not lose the content in that case
    return $cleaned === null ? $data : $cleaned;
}
|
2013-08-31 17:05:45 +02:00
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Remove the XML tag from a document
 *
 * Strips the XML declaration, then every xml-stylesheet processing
 * instruction found within the first 200 bytes of the remaining data.
 * Data without a "?>" terminator is returned untouched.
 *
 * @static
 * @access public
 * @param  string  $data  Input data
 * @return string
 */
public static function stripXmlTag($data)
{
    if (strpos($data, '<?xml') !== false) {

        $end = strpos($data, '?>');

        // Without terminator there is nothing to strip: computing "false + 2"
        // would silently remove the first two bytes of the data
        if ($end !== false) {
            $data = ltrim(substr($data, $end + 2));
        }
    }

    while (true) {

        $pos = strpos($data, '<?xml-stylesheet ');

        // Only the instructions located at the beginning of the document are removed
        if ($pos === false || $pos >= 200) {
            break;
        }

        // Look for the terminator of this instruction, not for the first one of the data
        $end = strpos($data, '?>', $pos);

        if ($end === false) {
            break;
        }

        $data = ltrim(substr($data, $end + 2));
    }

    return $data;
}
|
2013-09-01 00:37:26 +02:00
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Strip head tag from the HTML content
 *
 * Removes everything from the opening <head> tag to the first closing
 * </head> tag that follows it. The tag name is matched case-insensitively
 * and the opening tag may have attributes. The data is returned unchanged
 * when there is no complete head section.
 *
 * @static
 * @access public
 * @param  string  $data  Input data
 * @return string
 */
public static function stripHeadTags($data)
{
    // "(?:\s[^>]*)?" accepts attributes but not longer tag names such as <header>
    if (preg_match('/<head(?:\s[^>]*)?>/i', $data, $matches, PREG_OFFSET_CAPTURE) !== 1) {
        return $data;
    }

    $start = $matches[0][1];

    // The closing tag is searched after the opening one, a stray </head>
    // located before it would otherwise duplicate a part of the document
    $end = stripos($data, '</head>', $start);

    if ($end === false) {
        return $data;
    }

    $before = substr($data, 0, $start);
    $after = substr($data, $end + 7); // 7 = strlen('</head>')

    return $before.$after;
}
|
2013-10-04 05:14:39 +02:00
|
|
|
|
2014-05-20 20:20:27 +02:00
|
|
|
/**
 * Set whitelisted tags and attributes for each tag
 *
 * An empty list is ignored and keeps the current whitelist.
 *
 * @access public
 * @param  array   $values   List of tags: ['video' => ['src', 'cover'], 'img' => ['src']]
 * @return \PicoFeed\Filter
 */
public function setWhitelistedTags(array $values)
{
    if ($values) {
        $this->whitelist_tags = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set blacklisted tags
 *
 * An empty list is ignored and keeps the current blacklist.
 *
 * @access public
 * @param  array   $values   List of tags: ['video', 'img']
 * @return \PicoFeed\Filter
 */
public function setBlacklistedTags(array $values)
{
    if ($values) {
        $this->blacklisted_tags = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set scheme whitelist
 *
 * An empty list is ignored and keeps the current whitelist.
 *
 * @access public
 * @param  array   $values   List of scheme: ['http://', 'ftp://']
 * @return \PicoFeed\Filter
 */
public function setSchemeWhitelist(array $values)
{
    if ($values) {
        $this->scheme_whitelist = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set media attributes (used to load external resources)
 *
 * An empty list is ignored and keeps the current attributes.
 *
 * @access public
 * @param  array   $values   List of values: ['src', 'href']
 * @return \PicoFeed\Filter
 */
public function setMediaAttributes(array $values)
{
    if ($values) {
        $this->media_attributes = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set blacklisted external resources
 *
 * An empty list is ignored and keeps the current blacklist.
 *
 * @access public
 * @param  array   $values   List of url fragments: ['http://google.com/', '...']
 * @return \PicoFeed\Filter
 */
public function setMediaBlacklist(array $values)
{
    if ($values) {
        $this->media_blacklist = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set mandatory attributes for whitelisted tags
 *
 * An empty list is ignored and keeps the current attributes.
 * NOTE(review): startTag() iterates over each entry, so the values are
 * presumably expected as lists (['img' => ['src']]) like the defaults.
 *
 * @access public
 * @param  array   $values   List of tags: ['img' => 'src']
 * @return \PicoFeed\Filter
 */
public function setRequiredAttributes(array $values)
{
    if ($values) {
        $this->required_attributes = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set attributes to add automatically to specific tags
 *
 * An empty list is ignored and keeps the current attributes.
 *
 * @access public
 * @param  array   $values   List of tags: ['a' => 'target="_blank"']
 * @return \PicoFeed\Filter
 */
public function setAttributeOverrides(array $values)
{
    if ($values) {
        $this->add_attributes = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set attributes that must be an integer
 *
 * An empty list is ignored and keeps the current attributes.
 *
 * @access public
 * @param  array   $values   List of attributes: ['width', 'height']
 * @return \PicoFeed\Filter
 */
public function setIntegerAttributes(array $values)
{
    if ($values) {
        $this->integer_attributes = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set allowed iframe resources
 *
 * An empty list is ignored and keeps the current whitelist.
 *
 * @access public
 * @param  array   $values   List of urls: ['http://www.youtube.com']
 * @return \PicoFeed\Filter
 */
public function setIframeWhitelist(array $values)
{
    if ($values) {
        $this->iframe_whitelist = $values;
    }

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Set config object
 *
 * When a config object is given, every filter list is reloaded from it.
 * Each getter is called with an empty array as argument, and the setters
 * ignore empty lists, so a list the config returns as empty keeps its
 * current value.
 *
 * @access public
 * @param  \PicoFeed\Config  $config   Config instance
 * @return \PicoFeed\Filter
 */
public function setConfig($config)
{
    $this->config = $config;

    if ($this->config === null) {
        return $this;
    }

    $settings = $this->config;

    $this->setIframeWhitelist($settings->getFilterIframeWhitelist(array()));
    $this->setIntegerAttributes($settings->getFilterIntegerAttributes(array()));
    $this->setAttributeOverrides($settings->getFilterAttributeOverrides(array()));
    $this->setRequiredAttributes($settings->getFilterRequiredAttributes(array()));
    $this->setMediaBlacklist($settings->getFilterMediaBlacklist(array()));
    $this->setMediaAttributes($settings->getFilterMediaAttributes(array()));
    $this->setSchemeWhitelist($settings->getFilterSchemeWhitelist(array()));
    $this->setBlacklistedTags($settings->getFilterBlacklistedTags(array()));
    $this->setWhitelistedTags($settings->getFilterWhitelistedTags(array()));

    return $this;
}
|
2013-02-18 03:48:21 +01:00
|
|
|
}
|