miniflux-legacy/vendor/PicoFeed/Grabber.php

<?php

namespace PicoFeed;

require_once __DIR__.'/Client.php';
require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Filter.php';

/**
 * Extracts the main content of a web page.
 *
 * The page is either supplied to the constructor or fetched with download().
 * parse() converts it to UTF-8 and then extracts the content, either with a
 * per-site rule file (see getRules()) or with a generic heuristic based on
 * well-known class/id names (see parseContentWithCandidates()).
 */
class Grabber
{
    /** Extracted content, serialized as XML; empty string when nothing was found. */
    public $content = '';

    /** Raw HTML of the page to analyse. */
    public $html = '';

    /** Encoding name passed to the constructor (logged as "HTTP Encoding" by parse()). */
    public $encoding = '';

    /** URL of the page; used by download() and to locate a rule file. */
    public $url = '';

    // Class/id values tried, in this order, to locate the content container.
    // Order is important, generic terms at the end
    public $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    // Any element whose class or id contains one of these fragments is removed
    // by stripGarbage().
    public $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
    );

    // Elements with these tag names are removed by stripGarbage().
    public $stripTags = array(
        'script',
        'style',
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );


    /**
     * @param string $url      URL of the page
     * @param string $html     Page content, when already available
     * @param string $encoding Encoding name of the page
     */
    public function __construct($url, $html = '', $encoding = 'utf-8')
    {
        $this->url = $url;
        $this->html = $html;
        $this->encoding = $encoding;
    }


    /**
     * Convert the HTML to UTF-8 and extract the content into $this->content.
     *
     * A rule file is used when getRules() returns an array, otherwise the
     * candidates heuristic is applied.
     *
     * @return bool True when $this->content is not an empty string
     */
    public function parse()
    {
        if ($this->html) {

            Logging::log(\get_called_class().' Fix encoding');
            Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');

            $this->html = Filter::stripMetaTags($this->html);

            // Charset names are case-insensitive ("Windows-1251" is as valid as "windows-1251").
            if (strtolower((string) $this->encoding) === 'windows-1251') {
                $this->html = Encoding::cp1251ToUtf8($this->html);
            }
            else {
                $this->html = Encoding::toUTF8($this->html);
            }

            Logging::log(\get_called_class().' Content length: '.strlen($this->html).' bytes');
            $rules = $this->getRules();

            if (is_array($rules)) {
                Logging::log(\get_called_class().' Parse content with rules');
                $this->parseContentWithRules($rules);
            }
            else {
                Logging::log(\get_called_class().' Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        }
        else {
            Logging::log(\get_called_class().' No content fetched');
        }

        Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes');
        Logging::log(\get_called_class().' Grabber done');

        return $this->content !== '';
    }


    /**
     * Fetch $this->url with the HTTP client and store the response in $this->html.
     *
     * @param int    $timeout    Value assigned to the client's timeout property
     * @param string $user_agent User agent assigned to the client
     * @return mixed Whatever the client returns from getContent()
     */
    public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')
    {
        $client = Client::create();
        $client->url = $this->url;
        $client->timeout = $timeout;
        $client->user_agent = $user_agent;
        $client->execute();
        $this->html = $client->getContent();

        return $this->html;
    }


    /**
     * Load the rule file matching the hostname of $this->url.
     *
     * Files are looked up in the Rules/ directory next to this file, in this
     * order: full hostname, hostname without a leading "www.", everything from
     * the first dot onwards (leading dot included), the label before the first dot.
     *
     * @return mixed Value returned by the included rule file, or false when
     *               the URL has no usable hostname or no file matches
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() gives null/false when there is no host: nothing to look up.
        if (! is_string($hostname) || $hostname === '') {
            return false;
        }

        // The hostname becomes part of an include path: never let it leave the Rules directory.
        if (strpbrk($hostname, "/\\\0") !== false) {
            return false;
        }

        $files = array($hostname);

        if (substr($hostname, 0, 4) == 'www.') {
            $files[] = substr($hostname, 4);
        }

        if (($pos = strpos($hostname, '.')) !== false) {
            $files[] = substr($hostname, $pos);
            $files[] = substr($hostname, 0, $pos);
        }

        foreach ($files as $file) {

            $filename = __DIR__.'/Rules/'.$file.'.php';

            if (file_exists($filename)) {
                Logging::log(\get_called_class().' Load rule: '.$file);
                return include $filename;
            }
        }

        return false;
    }


    /**
     * Extract the content with site-specific XPath rules.
     *
     * Nodes matching the patterns of $rules['strip'] are removed first, then
     * every node matching a pattern of $rules['body'] is appended to
     * $this->content.
     *
     * @param array $rules Rules with optional 'strip' and 'body' lists of XPath patterns
     */
    public function parseContentWithRules(array $rules)
    {
        $dom = $this->loadHtmlDocument();
        $xpath = new \DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {
                $this->removeNodes($xpath->query($pattern));
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }


    /**
     * Extract the content with the generic heuristic.
     *
     * The first element matching one of $candidatesAttributes is used, then the
     * first <article> element; when fewer than 50 bytes were found the whole
     * body is taken instead. The result is cleaned by stripGarbage().
     */
    public function parseContentWithCandidates()
    {
        $dom = $this->loadHtmlDocument();
        $xpath = new \DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {

            Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"');

            $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
                break;
            }
        }

        // Try to fetch <article/>
        if (! $this->content) {

            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logging::log(\get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
            }
        }

        if (strlen($this->content) < 50) {
            Logging::log(\get_called_class().' No enought content fetched, get the full body');

            // The first child of the document is usually the XML prolog or the doctype,
            // not the page itself, so pick <body> explicitly (root element as a fallback).
            $bodies = $xpath->query('//body');
            $root = ($bodies !== false && $bodies->length > 0) ? $bodies->item(0) : $dom->documentElement;

            if ($root !== null) {
                $this->content = $dom->saveXML($root);
            }
        }

        Logging::log(\get_called_class().' Strip garbage');
        $this->stripGarbage();
    }


    /**
     * Remove unwanted elements from $this->content.
     *
     * Elements named in $stripTags and elements whose class or id contains a
     * fragment of $stripAttributes are dropped. The content becomes an empty
     * string when it cannot be parsed as XML or when its root element itself
     * is stripped.
     */
    public function stripGarbage()
    {
        // Nothing to clean (and loadXML() refuses an empty string).
        if ((string) $this->content === '') {
            return;
        }

        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
        $loaded = $dom->loadXML($this->content);
        \libxml_clear_errors();

        if ($loaded === false || $dom->documentElement === null) {
            // Serializing an empty document would yield a bare XML declaration.
            Logging::log(\get_called_class().' Unable to parse the content, nothing kept');
            $this->content = '';
            return;
        }

        $xpath = new \DOMXPath($dom);

        foreach ($this->stripTags as $tag) {

            $nodes = $xpath->query('//'.$tag);

            if ($nodes !== false && $nodes->length > 0) {
                Logging::log(\get_called_class().' Strip tag: "'.$tag.'"');
                $this->removeNodes($nodes);
            }
        }

        foreach ($this->stripAttributes as $attribute) {

            $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

            if ($nodes !== false && $nodes->length > 0) {
                Logging::log(\get_called_class().' Strip attribute: "'.$attribute.'"');
                $this->removeNodes($nodes);
            }
        }

        // The root element may have been removed by one of the filters above.
        $root = $dom->documentElement;
        $this->content = $root === null ? '' : $dom->saveXML($root);
    }


    /**
     * Build a DOM document from $this->html, forcing UTF-8 with an XML prolog.
     *
     * @return \DOMDocument
     */
    private function loadHtmlDocument()
    {
        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
        $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);

        // Parse errors are collected internally; drop them so they do not pile up.
        \libxml_clear_errors();

        return $dom;
    }


    /**
     * Detach every node of an XPath result from its parent.
     *
     * @param \DOMNodeList|false $nodes Result of DOMXPath::query()
     */
    private function removeNodes($nodes)
    {
        if ($nodes === false) {
            return;
        }

        foreach ($nodes as $node) {
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }
}
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`<?php`

			`namespace PicoFeed;`

			`require_once __DIR__.'/Client.php';`
			`require_once __DIR__.'/Encoding.php';`
			`require_once __DIR__.'/Logging.php';`
Improve content grabber 2013-09-01 00:37:26 +02:00			`require_once __DIR__.'/Filter.php';`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00
			`class Grabber`
			`{`
			`public $content = '';`
			`public $html = '';`
Add support for CP1251 encoding 2013-10-04 05:14:39 +02:00			`public $encoding = '';`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`// Order is important, generic terms at the end`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`public $candidatesAttributes = array(`
			`'articleBody',`
			`'articlebody',`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`'article-body',`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`'articleContent',`
			`'articlecontent',`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`'article-content',`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`'articlePage',`
			`'post-content',`
Add bookmarklet link (add a subscription directly from any browsers) 2013-09-05 02:45:06 +02:00			`'post_content',`
Improve content grabber 2013-09-01 00:37:26 +02:00			`'entry-content',`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`'main-content',`
Add new content grabber rules 2013-10-01 04:15:18 +02:00			`'story_content',`
			`'storycontent',`
Add bookmarklet link (add a subscription directly from any browsers) 2013-09-05 02:45:06 +02:00			`'entryBox',`
			`'entrytext',`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`'comic',`
Add bookmarklet link (add a subscription directly from any browsers) 2013-09-05 02:45:06 +02:00			`'post',`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`'article',`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`'content',`
			`'main',`
			`);`

			`public $stripAttributes = array(`
			`'comment',`
			`'share',`
			`'links',`
			`'toolbar',`
			`'fb',`
			`'footer',`
			`'credit',`
			`'bottom',`
			`'nav',`
			`'header',`
			`'social',`
Add new content grabber rules 2013-10-01 04:15:18 +02:00			`'tag',`
			`'metadata',`
Improve content grabber 2013-09-01 00:37:26 +02:00			`'entry-utility',`
Add new content grabber rules 2013-10-01 04:15:18 +02:00			`'related-posts',`
			`'tweet',`
			`'categories',`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`);`

			`public $stripTags = array(`
			`'script',`
			`'style',`
			`'nav',`
			`'header',`
			`'footer',`
			`'aside',`
Add new content grabber rules 2013-10-01 04:15:18 +02:00			`'form',`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`);`


Add support for CP1251 encoding 2013-10-04 05:14:39 +02:00			`public function __construct($url, $html = '', $encoding = 'utf-8')`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`{`
			`$this->url = $url;`
Add support for CP1251 encoding 2013-10-04 05:14:39 +02:00			`$this->html = $html;`
			`$this->encoding = $encoding;`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`


			`public function parse()`
			`{`
			`if ($this->html) {`

Improve content grabber 2013-09-01 00:37:26 +02:00			`Logging::log(\get_called_class().' Fix encoding');`
Add support for CP1251 encoding 2013-10-04 05:14:39 +02:00			`Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');`

Improve content grabber 2013-09-01 00:37:26 +02:00			`$this->html = Filter::stripMetaTags($this->html);`
Add support for CP1251 encoding 2013-10-04 05:14:39 +02:00
			`if ($this->encoding == 'windows-1251') {`
			`$this->html = Encoding::cp1251ToUtf8($this->html);`
			`}`
			`else {`
			`$this->html = Encoding::toUTF8($this->html);`
			`}`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00
Add new content grabber rules 2014-02-16 01:31:22 +01:00			`Logging::log(\get_called_class().' Content length: '.strlen($this->html).' bytes');`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`$rules = $this->getRules();`

			`if (is_array($rules)) {`
			`Logging::log(\get_called_class().' Parse content with rules');`
Improve content grabber 2013-09-01 00:37:26 +02:00			`$this->parseContentWithRules($rules);`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`
			`else {`
			`Logging::log(\get_called_class().' Parse content with candidates');`
Improve content grabber 2013-09-01 00:37:26 +02:00			`$this->parseContentWithCandidates();`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`
			`}`
			`else {`
			`Logging::log(\get_called_class().' No content fetched');`
			`}`

Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes');`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`Logging::log(\get_called_class().' Grabber done');`

			`return $this->content !== '';`
			`}`


			`public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')`
			`{`
			`$client = Client::create();`
			`$client->url = $this->url;`
			`$client->timeout = $timeout;`
			`$client->user_agent = $user_agent;`
			`$client->execute();`
			`$this->html = $client->getContent();`

			`return $this->html;`
			`}`


			`public function getRules()`
			`{`
			`$hostname = parse_url($this->url, PHP_URL_HOST);`
			`$files = array($hostname);`

Add new content grabber rules 2013-10-01 04:15:18 +02:00			`if (substr($hostname, 0, 4) == 'www.') {`
			`$files[] = substr($hostname, 4);`
			`}`

			`if (($pos = strpos($hostname, '.')) !== false) {`
			`$files[] = substr($hostname, $pos);`
			`$files[] = substr($hostname, 0, $pos);`
			`}`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00
			`foreach ($files as $file) {`

			`$filename = __DIR__.'/Rules/'.$file.'.php';`

			`if (file_exists($filename)) {`
Add new content grabber rules 2014-02-16 01:31:22 +01:00			`Logging::log(\get_called_class().' Load rule: '.$file);`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`return include $filename;`
			`}`
			`}`

			`return false;`
			`}`


Improve content grabber 2013-09-01 00:37:26 +02:00			`public function parseContentWithRules(array $rules)`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`{`
Improve content grabber 2013-09-01 00:37:26 +02:00			`\libxml_use_internal_errors(true);`
			`$dom = new \DOMDocument;`
			`$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`$xpath = new \DOMXPath($dom);`

			`if (isset($rules['strip']) && is_array($rules['strip'])) {`

			`foreach ($rules['strip'] as $pattern) {`

			`$nodes = $xpath->query($pattern);`

			`if ($nodes !== false && $nodes->length > 0) {`
			`foreach ($nodes as $node) {`
			`$node->parentNode->removeChild($node);`
			`}`
			`}`
			`}`
			`}`

			`if (isset($rules['body']) && is_array($rules['body'])) {`

			`foreach ($rules['body'] as $pattern) {`

			`$nodes = $xpath->query($pattern);`

			`if ($nodes !== false && $nodes->length > 0) {`
			`foreach ($nodes as $node) {`
			`$this->content .= $dom->saveXML($node);`
			`}`
			`}`
			`}`
			`}`
			`}`


Improve content grabber 2013-09-01 00:37:26 +02:00			`public function parseContentWithCandidates()`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`{`
Improve content grabber 2013-09-01 00:37:26 +02:00			`\libxml_use_internal_errors(true);`
			`$dom = new \DOMDocument;`
			`$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`$xpath = new \DOMXPath($dom);`

Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`// Try to lookup in each tag`
			`foreach ($this->candidatesAttributes as $candidate) {`

			`Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"');`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');`

			`if ($nodes !== false && $nodes->length > 0) {`
			`$this->content = $dom->saveXML($nodes->item(0));`
			`Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');`
			`break;`
			`}`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`

Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`// Try to fetch <article/>`
Improve content grabber 2013-09-01 00:37:26 +02:00			`if (! $this->content) {`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`$nodes = $xpath->query('//article');`
Improve content grabber 2013-09-01 00:37:26 +02:00
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`if ($nodes !== false && $nodes->length > 0) {`
			`$this->content = $dom->saveXML($nodes->item(0));`
			`Logging::log(\get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`
			`}`
Improve content grabber 2013-09-01 00:37:26 +02:00
			`if (strlen($this->content) < 50) {`
			`Logging::log(\get_called_class().' No enought content fetched, get the full body');`
			`$this->content = $dom->saveXML($dom->firstChild);`
			`}`

			`Logging::log(\get_called_class().' Strip garbage');`
			`$this->stripGarbage();`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`


			`public function stripGarbage()`
			`{`
			`\libxml_use_internal_errors(true);`
			`$dom = new \DOMDocument;`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`$dom->loadXML($this->content);`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`$xpath = new \DOMXPath($dom);`

			`foreach ($this->stripTags as $tag) {`

			`$nodes = $xpath->query('//'.$tag);`

			`if ($nodes !== false && $nodes->length > 0) {`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`Logging::log(\get_called_class().' Strip tag: "'.$tag.'"');`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`foreach ($nodes as $node) {`
			`$node->parentNode->removeChild($node);`
			`}`
			`}`
			`}`

			`foreach ($this->stripAttributes as $attribute) {`

			`$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');`

			`if ($nodes !== false && $nodes->length > 0) {`
Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`Logging::log(\get_called_class().' Strip attribute: "'.$tag.'"');`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`foreach ($nodes as $node) {`
			`$node->parentNode->removeChild($node);`
			`}`
			`}`
			`}`

Improve content dowloader and add Github rule 2013-09-02 20:04:10 +02:00			`$this->content = $dom->saveXML($dom->documentElement);`
Improve content grabber: add rules for specific websites and add automatic download for feeds 2013-08-31 17:05:45 +02:00			`}`
			`}`