miniflux-legacy/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php

<?php

namespace PicoFeed\Client;

use DOMXPath;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Logging\Logger;
use PicoFeed\Filter\Filter;
use PicoFeed\Parser\XmlParser;

/**
 * Grabber class
 *
 * @author  Frederic Guillot
 * @package Client
 */
class Grabber
{
    /**
     * URL of the page to grab
     *
     * @access private
     * @var string
     */
    private $url = '';

    /**
     * Relevant content extracted from the page
     *
     * @access private
     * @var string
     */
    private $content = '';

    /**
     * Raw HTML content of the page
     *
     * @access private
     * @var string
     */
    private $html = '';

    /**
     * HTML content encoding (charset reported for the HTML document)
     *
     * @access private
     * @var string
     */
    private $encoding = '';

    /**
     * Flag to skip download and parsing
     *
     * Set by handleFiles() and handleStreamingVideos() according to the current URL.
     *
     * @access private
     * @var boolean
     */
    private $skip_processing = false;

    /**
     * List of attributes to try to get the content, order is important, generic terms at the end
     *
     * @access private
     * @var array
     */
    private $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'entry-body',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * List of attributes to strip (matched as substrings of @class and @id)
     *
     * @access private
     * @var array
     */
    private $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
        'post_title',
        'by_line',
        'byline',
        'sponsors',
    );

    /**
     * Tags to remove
     *
     * @access private
     * @var array
     */
    private $stripTags = array(
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Config object
     *
     * @access private
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Constructor
     *
     * @access public
     * @param  string   $url       Url
     * @param  string   $html      HTML content
     * @param  string   $encoding  Charset
     */
    public function __construct($url, $html = '', $encoding = 'utf-8')
    {
        $this->url = $url;
        $this->html = $html;
        $this->encoding = $encoding;

        $this->handleFiles();
        $this->handleStreamingVideos();
    }

    /**
     * Set config object
     *
     * @access public
     * @param  \PicoFeed\Config\Config   $config    Config instance
     * @return Grabber
     */
    public function setConfig($config)
    {
        $this->config = $config;
        return $this;
    }

    /**
     * Get URL to download.
     *
     * @access  public
     * @return  string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL to download and reset object to use for another grab.
     *
     * All state derived from the previous URL is cleared, including the
     * skip flag, before the URL handlers are evaluated again.
     *
     * @access  public
     * @param   string  $url    URL
     * @return  void
     */
    public function setUrl($url)
    {
        $this->url = $url;
        $this->html = "";
        $this->content = "";
        $this->encoding = "";

        // Without this reset a previous PDF/video URL would disable
        // download() and parse() for every following URL.
        $this->skip_processing = false;

        $this->handleFiles();
        $this->handleStreamingVideos();
    }

    /**
     * Get relevant content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered)
     *
     * @access public
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Get filtered relevant content
     *
     * The relevant content is passed through the HTML filter, configured
     * with the current config object.
     *
     * @access public
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, $this->url);
        $filter->setConfig($this->config);
        return $filter->execute();
    }

    /**
     * Use the Youtube embed player as content and skip processing
     *
     * When an 11 characters video id is found in the URL, the content is
     * replaced by an iframe pointing to the embed player.
     *
     * @access public
     * @return void
     */
    public function handleStreamingVideos()
    {
        // The dot of "youtu.be" is escaped to match the literal hostname only
        if (preg_match('#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
            $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
            $this->skip_processing = true;
        }
    }

    /**
     * Skip processing for PDF documents
     *
     * A URL is considered to be a PDF when its path has a "pdf" extension
     * (case-insensitive, query string ignored) or when the URL itself ends with "pdf".
     *
     * @access public
     * @return void
     */
    public function handleFiles()
    {
        $path = parse_url($this->url, PHP_URL_PATH);
        $extension = is_string($path) ? strtolower(pathinfo($path, PATHINFO_EXTENSION)) : '';

        // The suffix check is kept so every URL skipped before is still skipped
        $suffix = strtolower((string) substr($this->url, -3));

        if ($extension === 'pdf' || $suffix === 'pdf') {
            $this->skip_processing = true;
            Logger::setMessage(get_called_class().': PDF document => processing skipped');
        }
    }

    /**
     * Parse the HTML content
     *
     * The HTML is converted to UTF-8, head tags are stripped and the relevant
     * content is extracted with predefined rules when available, otherwise
     * with the list of candidates.
     *
     * @access public
     * @return bool     True when processing is skipped or when some content has been found
     */
    public function parse()
    {
        if ($this->skip_processing) {
            return true;
        }

        if ($this->html) {
            $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

            // Encode everything in UTF-8, the encoding found in the meta tag has priority
            Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
            $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
            $this->html = Filter::stripHeadTags($this->html);

            Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
            $rules = $this->getRules();

            if (! empty($rules)) {
                Logger::setMessage(get_called_class().': Parse content with rules');
                $this->parseContentWithRules($rules);
            }
            else {
                Logger::setMessage(get_called_class().': Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        }
        else {
            Logger::setMessage(get_called_class().': No content fetched');
        }

        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
        Logger::setMessage(get_called_class().': Grabber done');

        return $this->content !== '';
    }

    /**
     * Download the HTML content
     *
     * Nothing is downloaded when processing is skipped or when the URL is empty.
     * After the request, the URL, the HTML and the encoding are replaced by the
     * values reported by the HTTP client. Client errors are logged, not thrown.
     *
     * @access public
     * @return string   HTML content
     */
    public function download()
    {
        if (! $this->skip_processing && $this->url != '') {

            try {

                $client = Client::getInstance();

                if ($this->config !== null) {
                    $client->setConfig($this->config);
                    $client->setTimeout($this->config->getGrabberTimeout());
                    $client->setUserAgent($this->config->getGrabberUserAgent());
                }

                $client->execute($this->url);

                $this->url = $client->getUrl();
                $this->html = $client->getContent();
                $this->encoding = $client->getEncoding();
            }
            catch (ClientException $e) {
                Logger::setMessage(get_called_class().': '.$e->getMessage());
            }
        }

        return $this->html;
    }

    /**
     * Try to find a predefined rule
     *
     * @access public
     * @return array    Rule found for the hostname of the URL, empty array otherwise
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() returns null when there is no host and false for a malformed URL
        if (is_string($hostname) && $hostname !== '') {

            $files = $this->getRulesFileList($hostname);

            foreach ($this->getRulesFolders() as $folder) {
                $rule = $this->loadRuleFile($folder, $files);

                if (! empty($rule)) {
                    return $rule;
                }
            }
        }

        return array();
    }

    /**
     * Get the list of possible rules file names for a given hostname
     *
     * @access public
     * @param  string  $hostname  Hostname
     * @return array              File names (without extension), most specific first
     */
    public function getRulesFileList($hostname)
    {
        $files = array($hostname);                 // subdomain.domain.tld
        $parts = explode('.', $hostname);
        $len = count($parts);

        if ($len > 2) {
            $subdomain = array_shift($parts);
            $files[] = implode('.', $parts);       // domain.tld
            $files[] = '.'.implode('.', $parts);   // .domain.tld
            $files[] = $subdomain;                 // subdomain
        }
        else if ($len === 2) {
            $files[] = '.'.implode('.', $parts);    // .domain.tld
            $files[] = $parts[0];                   // domain
        }

        return $files;
    }

    /**
     * Load a rule file from the defined folder
     *
     * The first existing file that returns an array wins.
     *
     * @access public
     * @param  string   $folder     Rule directory
     * @param  array    $files      List of possible file names
     * @return array                Rule, or an empty array when nothing usable is found
     */
    public function loadRuleFile($folder, array $files)
    {
        foreach ($files as $file) {

            // File names are derived from the URL hostname: never let them escape the rules folder
            if (strpbrk((string) $file, "/\\\0") !== false) {
                continue;
            }

            $filename = $folder.'/'.$file.'.php';

            if (file_exists($filename)) {
                Logger::setMessage(get_called_class().' Load rule: '.$file);
                $rule = include $filename;

                // A file without "return array(...)" makes include return 1, this is not a rule
                if (is_array($rule)) {
                    return $rule;
                }
            }
        }

        return array();
    }

    /**
     * Get the list of folders that contains rules
     *
     * The bundled rules folder comes first, followed by the folder defined
     * in the config object if any.
     *
     * @access public
     * @return array
     */
    public function getRulesFolders()
    {
        $folders = array(__DIR__.'/../Rules');

        if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
            $folders[] = $this->config->getGrabberRulesFolder();
        }

        return $folders;
    }

    /**
     * Get the relevant content with predefined rules
     *
     * Nodes matching the XPath expressions of $rules['strip'] are removed first,
     * then nodes matching the expressions of $rules['body'] are appended to the content.
     *
     * @access public
     * @param  array   $rules   Rules
     */
    public function parseContentWithRules(array $rules)
    {
        // Logger::setMessage($this->html);
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Get the relevant content with the list of potential attributes
     *
     * The first node matching a candidate is used. When the result is too short,
     * the first <article/> tag is tried, then the whole <body/>. Garbage is
     * stripped from the result at the end.
     *
     * @access public
     */
    public function parseContentWithCandidates()
    {
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {

            Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');

            $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
                break;
            }
        }

        // Try to fetch <article/>
        if (strlen($this->content) < 200) {

            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
            }
        }

        // Get everything
        if (strlen($this->content) < 50) {

            $nodes = $xpath->query('//body');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().' No enought content fetched, get //body');
                $this->content = $dom->saveXML($nodes->item(0));
            }
        }

        Logger::setMessage(get_called_class().': Strip garbage');
        $this->stripGarbage();
    }

    /**
     * Strip useless tags
     *
     * Removes the tags listed in $stripTags, then the nodes whose class or id
     * contains one of the $stripAttributes, unless shouldRemove() refuses it.
     * The content is left untouched when it cannot be loaded as a document.
     *
     * @access public
     */
    public function stripGarbage()
    {
        $dom = XmlParser::getDomDocument($this->content);

        if ($dom !== false) {

            $xpath = new DOMXPath($dom);

            foreach ($this->stripTags as $tag) {

                $nodes = $xpath->query('//'.$tag);

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
                    foreach ($nodes as $node) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }

            foreach ($this->stripAttributes as $attribute) {

                $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
                    foreach ($nodes as $node) {
                        if ($this->shouldRemove($dom, $node)) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }

            $this->content = $dom->saveXML($dom->documentElement);
        }
    }

    /**
     * Return false if the node should not be removed
     *
     * A node is kept when its text represents at least 90% of the text of the
     * document: removing it would drop almost all the content.
     *
     * @access public
     * @param  DomDocument  $dom
     * @param  DomNode      $node
     * @return boolean
     */
    public function shouldRemove($dom, $node)
    {
        $document_length = strlen($dom->textContent);
        $node_length = strlen($node->textContent);

        // Nothing to preserve in an empty document (also avoids a division by zero)
        if ($document_length === 0) {
            return true;
        }

        $ratio = $node_length * 100 / $document_length;

        if ($ratio >= 90) {
            Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
            return false;
        }

        return true;
    }
}