2014-10-19 20:42:31 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed\Filter;
|
|
|
|
|
2015-04-28 18:08:42 +02:00
|
|
|
use PicoFeed\Config\Config;
|
2015-03-01 19:56:11 +01:00
|
|
|
use PicoFeed\Client\Url;
|
2015-04-28 18:08:42 +02:00
|
|
|
use PicoFeed\Scraper\RuleLoader;
|
2015-03-01 19:56:11 +01:00
|
|
|
use PicoFeed\Parser\XmlParser;
|
2014-10-19 20:42:31 +02:00
|
|
|
|
|
|
|
/**
|
|
|
|
* HTML Filter class
|
|
|
|
*
|
|
|
|
* @author Frederic Guillot
|
2014-12-24 03:28:26 +01:00
|
|
|
* @package Filter
|
2014-10-19 20:42:31 +02:00
|
|
|
*/
|
|
|
|
class Html
{
    /**
     * Config object, null until setConfig() is called
     *
     * @access private
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Unfiltered XML data (the constructor's HTML converted by XmlParser)
     *
     * @access private
     * @var string
     */
    private $input = '';

    /**
     * Filtered XML data, built incrementally by the parser callbacks
     *
     * @access private
     * @var string
     */
    private $output = '';

    /**
     * Stack of "empty" flags, one entry per currently open element
     *
     * startTag() pushes true when the opening tag was NOT written to the
     * output; endTag() pops the flag to decide whether a closing tag
     * has to be written.
     *
     * @access private
     * @var array
     */
    private $empty_tags = array();

    /**
     * Empty flag of the element currently being opened
     *
     * @access private
     * @var boolean
     */
    private $empty = true;

    /**
     * Tag instance
     *
     * @access public
     * @var \PicoFeed\Filter\Tag
     */
    public $tag = '';

    /**
     * Attribute instance
     *
     * @access public
     * @var \PicoFeed\Filter\Attribute
     */
    public $attribute = '';

    /**
     * The website to filter
     *
     * @access private
     * @var string
     */
    private $website;

    /**
     * Initialize the filter, all inputs data must be encoded in UTF-8 before
     *
     * @access public
     * @param  string  $html      HTML content
     * @param  string  $website   Site URL (used to build absolute URL)
     */
    public function __construct($html, $website)
    {
        $this->input = XmlParser::HtmlToXml($html);
        $this->output = '';
        $this->tag = new Tag;
        $this->website = $website;
        $this->attribute = new Attribute(new Url($website));
    }

    /**
     * Set config object
     *
     * When a config is given, its filter settings are copied to the
     * attribute and tag helpers. Passing null only stores the value and
     * leaves the helpers untouched.
     *
     * @access public
     * @param  \PicoFeed\Config\Config  $config   Config instance
     * @return \PicoFeed\Filter\Html
     */
    public function setConfig($config)
    {
        $this->config = $config;

        if ($this->config !== null) {
            $this->attribute->setImageProxyCallback($this->config->getFilterImageProxyCallback());
            $this->attribute->setImageProxyUrl($this->config->getFilterImageProxyUrl());
            $this->attribute->setImageProxyProtocol($this->config->getFilterImageProxyProtocol());
            $this->attribute->setIframeWhitelist($this->config->getFilterIframeWhitelist(array()));
            $this->attribute->setIntegerAttributes($this->config->getFilterIntegerAttributes(array()));
            $this->attribute->setAttributeOverrides($this->config->getFilterAttributeOverrides(array()));
            $this->attribute->setRequiredAttributes($this->config->getFilterRequiredAttributes(array()));
            $this->attribute->setMediaBlacklist($this->config->getFilterMediaBlacklist(array()));
            $this->attribute->setMediaAttributes($this->config->getFilterMediaAttributes(array()));
            $this->attribute->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array()));

            // The same config value feeds both helpers: the whole array goes to the
            // attribute filter, its keys are the whitelisted tag names.
            // NOTE(review): presumably a map of tag name => allowed attributes - confirm in Config.
            $this->attribute->setWhitelistedAttributes($this->config->getFilterWhitelistedTags(array()));
            $this->tag->setWhitelistedTags(array_keys($this->config->getFilterWhitelistedTags(array())));
        }

        return $this;
    }

    /**
     * Run tags/attributes filtering
     *
     * Runs preFilter(), parses the input with the XML parser (startTag(),
     * endTag() and dataTag() build the output), then runs postFilter().
     *
     * @access public
     * @return string   Filtered content
     */
    public function execute()
    {
        // Start from a clean state so that calling execute() more than once on the
        // same instance does not append to the output of the previous run.
        $this->output = '';
        $this->empty_tags = array();
        $this->empty = true;

        $this->preFilter();

        $parser = xml_parser_create();

        // Handlers are passed as array callables: xml_set_object() combined with
        // method-name strings is deprecated as of PHP 8.4.
        xml_set_element_handler($parser, array($this, 'startTag'), array($this, 'endTag'));
        xml_set_character_data_handler($parser, array($this, 'dataTag'));
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
        xml_parse($parser, $this->input, true);

        // The parser is only a resource that must be released before PHP 8.0;
        // on later versions xml_parser_free() has no effect.
        if (PHP_VERSION_ID < 80000) {
            xml_parser_free($parser);
        }

        $this->postFilter();

        return $this->output;
    }

    /**
     * Called before XML parsing
     *
     * Removes the blacklisted tags from the input.
     *
     * @access public
     */
    public function preFilter()
    {
        $this->input = $this->tag->removeBlacklistedTags($this->input);
    }

    /**
     * Called after XML parsing
     *
     * Removes empty tags, applies the website specific filter rules,
     * collapses multiple break tags and trims the output.
     *
     * @access public
     */
    public function postFilter()
    {
        $this->output = $this->tag->removeEmptyTags($this->output);
        $this->output = $this->filterRules($this->output);
        $this->output = $this->tag->removeMultipleBreakTags($this->output);
        $this->output = trim($this->output);
    }

    /**
     * Apply the "filter" rules defined for the current website
     *
     * Each entry of the "filter" rules maps a regex pattern, matched against
     * the full path of the website URL, to a list of search => replace
     * regex pairs applied to the content.
     *
     * @access public
     * @param  string  $content   The content that should be filtered
     * @return string             The filtered content
     */
    public function filterRules($content)
    {
        // the constructor should require a config, then this fallback can be removed
        if ($this->config === null) {
            $config = new Config;
        } else {
            $config = $this->config;
        }

        $loader = new RuleLoader($config);
        $rules = $loader->getRules($this->website);

        $url = new Url($this->website);
        $sub_url = $url->getFullPath();

        if (isset($rules['filter'])) {
            foreach ($rules['filter'] as $pattern => $rule) {
                if (preg_match($pattern, $sub_url)) {
                    foreach ($rule as $search => $replace) {
                        $filtered = preg_replace($search, $replace, $content);

                        // preg_replace() returns null on failure (invalid pattern,
                        // backtrack limit, ...): skip that rule instead of
                        // discarding the whole content.
                        if ($filtered !== null) {
                            $content = $filtered;
                        }
                    }
                }
            }
        }

        return $content;
    }

    /**
     * Parse opening tag
     *
     * The tag is written to the output only when it is allowed and still
     * has its required attributes after filtering. The result is recorded
     * on the stack for the matching endTag() call.
     *
     * @access public
     * @param  resource  $parser       XML parser
     * @param  string    $tag          Tag name
     * @param  array     $attributes   Tag attributes
     */
    public function startTag($parser, $tag, array $attributes)
    {
        $this->empty = true;

        if ($this->tag->isAllowed($tag, $attributes)) {
            $attributes = $this->attribute->filter($tag, $attributes);

            if ($this->attribute->hasRequiredAttributes($tag, $attributes)) {
                $attributes = $this->attribute->addAttributes($tag, $attributes);

                $this->output .= $this->tag->openHtmlTag($tag, $this->attribute->toHtml($attributes));
                $this->empty = false;
            }
        }

        $this->empty_tags[] = $this->empty;
    }

    /**
     * Parse closing tag
     *
     * Writes the closing tag only when the matching opening tag was written
     * and the tag name is allowed.
     *
     * @access public
     * @param  resource  $parser    XML parser
     * @param  string    $tag       Tag name
     */
    public function endTag($parser, $tag)
    {
        if (! array_pop($this->empty_tags) && $this->tag->isAllowedTag($tag)) {
            $this->output .= $this->tag->closeHtmlTag($tag);
        }
    }

    /**
     * Parse tag content
     *
     * @access public
     * @param  resource  $parser    XML parser
     * @param  string    $content   Tag content
     */
    public function dataTag($parser, $content)
    {
        // Replace the UTF-8 non-breaking space with a normal space
        $content = str_replace("\xc2\xa0", ' ', $content);
        $this->output .= Filter::escape($content);
    }
}
|