2013-02-18 03:48:21 +01:00
|
|
|
<?php
|
|
|
|
|
2014-12-24 03:28:26 +01:00
|
|
|
namespace PicoFeed\Filter;
|
2014-05-20 20:20:27 +02:00
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
|
|
|
|
* Filter class
|
|
|
|
*
|
|
|
|
* @author Frederic Guillot
|
2014-12-24 03:28:26 +01:00
|
|
|
* @package Filter
|
2014-03-30 00:48:29 +01:00
|
|
|
*/
|
2013-02-18 03:48:21 +01:00
|
|
|
class Filter
{
    /**
     * Get the Html filter instance
     *
     * @static
     * @access public
     * @param  string  $html      HTML content
     * @param  string  $website   Site URL (used to build absolute URL)
     * @return Html
     */
    public static function html($html, $website)
    {
        return new Html($html, $website);
    }

    /**
     * Escape HTML content
     *
     * Special characters and both quote styles are converted to entities,
     * the input is treated as UTF-8 and entities that are already present
     * are not encoded a second time.
     *
     * @static
     * @access public
     * @param  string  $content   Raw content
     * @return string
     */
    public static function escape($content)
    {
        // Errors raised by htmlspecialchars() are deliberately silenced here
        return @htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
    }

    /**
     * Remove the doctype and the html, head and body tags (opening and closing)
     *
     * Only the tags themselves and the whitespace that follows them are
     * removed, their content is kept. The tag name must be complete, so
     * elements such as <header> are left untouched.
     *
     * @access public
     * @param  string  $data      Input data
     * @return string
     */
    public function removeHTMLTags($data)
    {
        // The lookahead makes sure "head" does not match the start of "header"
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body)(?=[\s/>]))[^>]*>\s*~i', '', $data);
    }

    /**
     * Remove the XML tag from a document
     *
     * Everything up to the end of the XML declaration is dropped, then the
     * xml-stylesheet processing instructions are removed the same way.
     * An instruction without closing delimiter is left as it is.
     *
     * @static
     * @access public
     * @param  string  $data      Input data
     * @return string
     */
    public static function stripXmlTag($data)
    {
        $start = strpos($data, '<?xml');

        if ($start !== false) {
            // Look for the closing delimiter after the opening one only
            $end = strpos($data, '?>', $start);

            if ($end !== false) {
                $data = ltrim(substr($data, $end + 2));
            }
        }

        // The instruction is removed wherever it is found, but the scan is
        // repeated only while it was found within the first 200 bytes
        do {
            $pos = strpos($data, '<?xml-stylesheet ');
            $end = $pos === false ? false : strpos($data, '?>', $pos);

            if ($end !== false) {
                $data = ltrim(substr($data, $end + 2));
            }
        } while ($end !== false && $pos < 200);

        return $data;
    }

    /**
     * Strip head tag from the HTML content
     *
     * The input is returned unchanged when the regex engine fails
     * (by example when the data is not valid UTF-8).
     *
     * @static
     * @access public
     * @param  string  $data      Input data
     * @return string
     */
    public static function stripHeadTags($data)
    {
        // The lookahead avoids to match <header> as an opening head tag
        $result = preg_replace('@<head(?=[\s/>])[^>]*>.*?</head>@siu', '', $data);

        // preg_replace() returns null on error, keep the original data in this case
        return $result === null ? $data : $result;
    }

    /**
     * Replace carriage returns, tabs and new lines by a space and trim the string
     * at the beginning and at the end, without breaking utf-8 strings
     *
     * Consecutive whitespace characters inside the string are not merged.
     *
     * @static
     * @access public
     * @param  string  $value     Raw data
     * @return string             Normalized data
     */
    public static function stripWhiteSpace($value)
    {
        // A regex like /\s+/ is not used here because it breaks utf-8 strings
        $value = str_replace(array("\r", "\t", "\n"), ' ', $value);

        return trim($value);
    }

    /**
     * Fixes before XML parsing
     *
     * Numeric entities that reference a character not allowed by XML 1.0 are
     * removed, then the raw characters outside of the allowed ranges are removed.
     * An empty string is returned when the last replacement fails.
     *
     * @static
     * @access public
     * @param  string  $data      Raw data
     * @return string             Normalized data
     */
    public static function normalizeData($data)
    {
        $entities = array(
            '/(&#)(\d+);/m',             // decimal encoded
            '/(&#x)([a-f0-9]+);/mi',     // hex encoded
        );

        // strip invalid XML 1.0 characters which are encoded as entities
        $data = preg_replace_callback($entities, function ($matches) {
            // The value is compared as a number without integer cast, a huge
            // entity cannot overflow and wrap around to a valid code point
            if (strtolower($matches[1]) === '&#x') {
                $code_point = hexdec($matches[2]);
            } else {
                $code_point = (float) $matches[2];
            }

            // replace invalid characters
            if ($code_point < 0x9
                || ($code_point > 0xA && $code_point < 0xD)
                || ($code_point > 0xD && $code_point < 0x20)
                || ($code_point > 0xD7FF && $code_point < 0xE000)
                || ($code_point > 0xFFFD && $code_point < 0x10000)
                || $code_point > 0x10FFFF
            ) {
                return '';
            }

            return $matches[0];
        }, $data);

        // strip every utf-8 character that isn't in the range of valid XML 1.0 characters
        return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);
    }
}
|