miniflux-legacy/vendor/miniflux/picofeed/lib/PicoFeed/Filter/Filter.php

<?php

namespace PicoFeed\Filter;

/**
 * Filter class.
 *
 * @author  Frederic Guillot
 */
class Filter
{
    /**
     * Get the Html filter instance.
     *
     * @static
     *
     * @param string $html    HTML content
     * @param string $website Site URL (used to build absolute URL)
     *
     * @return Html
     */
    public static function html($html, $website)
    {
        return new Html($html, $website);
    }

    /**
     * Escape HTML content.
     *
     * Special characters and both quote styles are converted to entities.
     * Entities already present in the input are left alone (no double encoding).
     *
     * @static
     *
     * @param string $content Raw content
     *
     * @return string
     */
    public static function escape($content)
    {
        return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
    }

    /**
     * Remove the document wrapper tags (doctype, html, head, body) and the
     * whitespace that follows them. Only the tags are removed, not the content
     * between them.
     *
     * The word boundary after the tag name keeps look-alike elements such as
     * <header> untouched.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function removeHTMLTags($data)
    {
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body)\b)[^>]*>\s*~i', '', $data);
    }

    /**
     * Remove the XML tag from a document.
     *
     * Everything up to and including the end of the XML declaration is dropped,
     * then "xml-stylesheet" processing instructions are dropped the same way.
     * An instruction that is never closed is left in place, the data is not
     * modified in that case.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function stripXmlTag($data)
    {
        $start = strpos($data, '<?xml');

        if ($start !== false) {
            $stripped = self::dropThroughInstructionEnd($data, $start);

            if ($stripped !== null) {
                $data = $stripped;
            }
        }

        do {
            $pos = strpos($data, '<?xml-stylesheet ');

            if ($pos === false) {
                break;
            }

            $stripped = self::dropThroughInstructionEnd($data, $pos);

            if ($stripped === null) {
                // Unterminated instruction: nothing can be removed safely and
                // another pass would never make progress.
                break;
            }

            $data = $stripped;

            // Look for another stylesheet only when this one was near the start.
        } while ($pos < 200);

        return $data;
    }

    /**
     * Drop everything up to and including the first "?>" found at or after
     * the given offset, then trim leading whitespace from the remainder.
     *
     * @static
     *
     * @param string $data   Input data
     * @param int    $offset Position where the processing instruction starts
     *
     * @return string|null Remaining data, or null when no "?>" follows the offset
     */
    private static function dropThroughInstructionEnd($data, $offset)
    {
        $end = strpos($data, '?>', $offset);

        if ($end === false) {
            return null;
        }

        // Older PHP versions return false from substr() at the end of the string.
        return ltrim((string) substr($data, $end + 2));
    }

    /**
     * Strip head tag from the HTML content.
     *
     * The pattern contains only ASCII, so it is matched byte-wise: input that
     * is not valid UTF-8 is still processed. If the regex engine fails, the
     * input is returned unchanged.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function stripHeadTags($data)
    {
        $result = preg_replace('@<head\b[^>]*?>.*?</head>@si', '', $data);

        return $result === null ? $data : $result;
    }

    /**
     * Replace carriage returns, tabs and line feeds by spaces, then trim both
     * ends of the string.
     *
     * Runs of whitespace are not collapsed: the regex based version
     * (preg_replace('/\s+/', ' ', $value)) was dropped because it broke utf-8
     * strings.
     *
     * @static
     *
     * @param string $value Raw data
     *
     * @return string Normalized data
     */
    public static function stripWhiteSpace($value)
    {
        return trim(str_replace(array("\r", "\t", "\n"), ' ', $value));
    }

    /**
     * Fixes before XML parsing.
     *
     * Removes numeric character references that point to characters not allowed
     * in XML 1.0, then removes the raw characters that are not allowed either.
     *
     * If the final regex fails (for example when the data is not valid UTF-8),
     * preg_replace() returns null and an empty string is returned.
     *
     * @static
     *
     * @param string $data Raw data
     *
     * @return string Normalized data
     */
    public static function normalizeData($data)
    {
        $entities = array(
            '/(&#)(\d+);/m', // decimal encoded
            '/(&#x)([a-f0-9]+);/mi', // hex encoded
        );

        // strip invalid XML 1.0 characters which are encoded as entities
        $data = preg_replace_callback($entities, function ($matches) {
            $is_hex = strtolower($matches[1]) === '&#x';
            $digits = ltrim($matches[2], '0');

            // The highest code point is 10FFFF (6 hex digits) or 1114111 (7 decimal digits).
            // Longer values are rejected before conversion, a huge number would not
            // fit in an integer and could wrap around to a valid looking code point.
            if (strlen($digits) > ($is_hex ? 6 : 7)) {
                return '';
            }

            $code_point = $is_hex ? (int) hexdec($digits) : (int) $digits;

            // replace invalid characters
            if ($code_point < 9
                || ($code_point > 10 && $code_point < 13)
                || ($code_point > 13 && $code_point < 32)
                || ($code_point > 55295 && $code_point < 57344)
                || ($code_point > 65533 && $code_point < 65536)
                || $code_point > 1114111
            ) {
                return '';
            }

            return $matches[0];
        }, $data);

        // strip every utf-8 character that isn't in the range of valid XML 1.0 characters
        return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);
    }
}
first commit 2013-02-18 03:48:21 +01:00			`<?php`

Move to Composer and update to the last version of PicoFeed 2014-12-24 03:28:26 +01:00			`namespace PicoFeed\Filter;`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Filter class.`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`*`
			`* @author Frederic Guillot`
			`*/`
first commit 2013-02-18 03:48:21 +01:00			`class Filter`
			`{`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Get the Html filter instance.`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`*`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`* @static`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`*`
			`* @param string $html HTML content`
			`* @param string $website Site URL (used to build absolute URL)`
			`*`
Move to Composer and update to the last version of PicoFeed 2014-12-24 03:28:26 +01:00			`* @return Html`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`*/`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`public static function html($html, $website)`
first commit 2013-02-18 03:48:21 +01:00			`{`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`$filter = new Html($html, $website);`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`return $filter;`
first commit 2013-02-18 03:48:21 +01:00			`}`

Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Escape HTML content.`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`*`
			`* @static`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`*`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`* @return string`
			`*/`
Add title and alt attributes for img tags (useful for xkcd fans) 2014-03-01 14:54:33 +01:00			`public static function escape($content)`
			`{`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);`
Improve filtering (remove empty tags…) 2013-08-04 03:08:44 +02:00			`}`

Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Remove HTML tags.`
			`*`
			`* @param string $data Input data`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`*`
			`* @return string`
			`*/`
Improve filtering (remove empty tags…) 2013-08-04 03:08:44 +02:00			`public function removeHTMLTags($data)`
			`{`
			`return preg_replace('~<(?:!DOCTYPE\|/?(?:html\|head\|body))[^>]*>\s*~i', '', $data);`
			`}`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Remove the XML tag from a document.`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`*`
			`* @static`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`*`
			`* @param string $data Input data`
			`*`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`* @return string`
			`*/`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`public static function stripXmlTag($data)`
			`{`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`if (strpos($data, '<?xml') !== false) {`
Bug fix for RSS2 detection 2013-12-16 04:38:06 +01:00			`$data = ltrim(substr($data, strpos($data, '?>') + 2));`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`

Bug fix for RSS2 detection 2013-12-16 04:38:06 +01:00			`do {`
			`$pos = strpos($data, '<?xml-stylesheet ');`

			`if ($pos !== false) {`
			`$data = ltrim(substr($data, strpos($data, '?>') + 2));`
			`}`
			`} while ($pos !== false && $pos < 200);`

Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`return $data;`
			`}`
Improve content grabber 2013-09-01 00:37:26 +02:00
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Strip head tag from the HTML content.`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`*`
			`* @static`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`*`
			`* @param string $data Input data`
			`*`
Bug fix feed parser and add a grabber rule for distrowatch.com 2014-03-30 00:48:29 +01:00			`* @return string`
			`*/`
Improve content grabber 2014-04-16 00:15:31 +02:00			`public static function stripHeadTags($data)`
Improve content grabber 2013-09-01 00:37:26 +02:00			`{`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`return preg_replace('@<head[^>]*?>.*?</head>@siu', '', $data);`
Improve content grabber 2013-09-01 00:37:26 +02:00			`}`
Add support for CP1251 encoding 2013-10-04 05:14:39 +02:00
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Trim whitespace from the begining, the end and inside a string and don't break utf-8 string.`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`*`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`* @static`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`*`
			`* @param string $value Raw data`
			`*`
			`* @return string Normalized data`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`*/`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`public static function stripWhiteSpace($value)`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`{`
Add new grabber rules: degroupnews.com and sitepoint.com 2014-10-28 23:40:13 +01:00			`$value = str_replace("\r", ' ', $value);`
			`$value = str_replace("\t", ' ', $value);`
			`$value = str_replace("\n", ' ', $value);`
Move to Composer and update to the last version of PicoFeed 2014-12-24 03:28:26 +01:00			`// $value = preg_replace('/\s+/', ' ', $value); <= break utf-8`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`return trim($value);`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`}`

			`/**`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`* Fixes before XML parsing.`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`*`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`* @static`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`*`
			`* @param string $data Raw data`
			`*`
			`* @return string Normalized data`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`*/`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`public static function normalizeData($data)`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`{`
Update vendor 2015-06-21 15:56:36 +02:00			`$entities = array(`
			`'/(&#)(\d+);/m', // decimal encoded`
			`'/(&#x)([a-f0-9]+);/mi', // hex encoded`
Update PicoFeed and PicoDb 2014-10-19 20:42:31 +02:00			`);`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00
Update vendor 2015-06-21 15:56:36 +02:00			`// strip invalid XML 1.0 characters which are encoded as entities`
Update picofeed due to bug in rule file 2015-10-20 04:49:30 +02:00			`$data = preg_replace_callback($entities, function ($matches) {`
Update vendor 2015-06-21 15:56:36 +02:00			`$code_point = $matches[2];`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00
Update vendor 2015-06-21 15:56:36 +02:00			`// convert hex entity to decimal`
			`if (strtolower($matches[1]) === '&#x') {`
			`$code_point = hexdec($code_point);`
			`}`

			`$code_point = (int) $code_point;`

			`// replace invalid characters`
			`if ($code_point < 9`
			`\|\| ($code_point > 10 && $code_point < 13)`
			`\|\| ($code_point > 13 && $code_point < 32)`
			`\|\| ($code_point > 55295 && $code_point < 57344)`
			`\|\| ($code_point > 65533 && $code_point < 65536)`
			`\|\| $code_point > 1114111`
			`) {`
			`return '';`
			`};`

			`return $matches[0];`
			`}, $data);`

			`// strip every utf-8 character than isn't in the range of valid XML 1.0 characters`
			`return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);`
Update to the last version of PicoFeed 2014-05-20 20:20:27 +02:00			`}`
first commit 2013-02-18 03:48:21 +01:00			`}`