Improve content grabber

2013-08-31 18:37:26 -04:00 · 2013-08-31 18:37:26 -04:00 · 242234c0a0
commit 242234c0a0
parent 14d67d85e8
10 changed files with 107 additions and 53 deletions
--- a/model.php
+++ b/model.php
@ -367,11 +367,9 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag)
 function parse_content_with_readability($content, $url)
 {
    require_once 'vendor/Readability/Readability.php';
    require_once 'vendor/PicoFeed/Encoding.php';
    if (! empty($content)) {
        $content = \PicoFeed\Encoding::toUTF8($content);
        $readability = new \Readability($content, $url);
        if ($readability->init()) {
@ -400,13 +398,14 @@ function download_content($url)
        // Try first with PicoFeed grabber and with Readability after
        $grabber = new \PicoFeed\Grabber($url);
        $grabber->html = $html;
        $content = '';
        if ($grabber->parse()) {
            $content = $grabber->content;
        }
        if (empty($content)) {
-            $content = parse_content_with_readability($html, $url);
+            $content = parse_content_with_readability($grabber->html, $url);
        }
        // Filter content
--- a/vendor/PicoFeed/Filter.php
+++ b/vendor/PicoFeed/Filter.php
@ -464,4 +464,10 @@ class Filter
        return $data;
    }
    /**
     * Remove every <meta> element from an HTML string.
     *
     * Both the plain form (<meta ...>) and the self-closed form (<meta ... />)
     * are matched, case-insensitively. Quoted attribute values are consumed as
     * a unit, so a ">" inside a value does not end the match early. The match
     * always stops at the tag's own closing ">", so markup that follows a
     * non-self-closed <meta> is left intact.
     *
     * @param  string  $data  HTML markup
     * @return string  Markup without <meta> tags; the input is returned
     *                 unchanged if the regex engine reports an error
     */
    public static function stripMetaTags($data)
    {
        // (?=[\s\/>]) keeps look-alike element names such as <metadata> out of the match.
        // Possessive quantifiers (++ and *+) avoid backtracking on malformed tags.
        $pattern = '/<meta(?=[\s\/>])(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>/i';
        $stripped = preg_replace($pattern, '', $data);

        // preg_replace() returns null on a PCRE error; keep the original markup in that case
        return $stripped === null ? $data : $stripped;
    }
 }
--- a/vendor/PicoFeed/Grabber.php
+++ b/vendor/PicoFeed/Grabber.php
@ -5,6 +5,7 @@ namespace PicoFeed;
 require_once __DIR__.'/Client.php';
 require_once __DIR__.'/Encoding.php';
 require_once __DIR__.'/Logging.php';
 require_once __DIR__.'/Filter.php';
 class Grabber
 {
@ -20,6 +21,7 @@ class Grabber
        'articlecontent',
        'articlePage',
        'post-content',
        'entry-content',
        'content',
        'main',
    );
@ -36,6 +38,7 @@ class Grabber
        'nav',
        'header',
        'social',
        'entry-utility',
    );
    public $stripTags = array(
@ -58,34 +61,23 @@ class Grabber
    {
        if ($this->html) {
-            Logging::log(\get_called_class().' HTML fetched');
+            Logging::log(\get_called_class().' Fix encoding');
            $this->html = Filter::stripMetaTags($this->html);
            $this->html = Encoding::toUtf8($this->html);
            Logging::log(\get_called_class().' Try to find rules');
            $rules = $this->getRules();
            \libxml_use_internal_errors(true);
            $dom = new \DOMDocument;
            $dom->loadHTML($this->html);
            if (is_array($rules)) {
                Logging::log(\get_called_class().' Parse content with rules');
-                $this->parseContentWithRules($dom, $rules);
+                $this->parseContentWithRules($rules);
            }
            else {
                Logging::log(\get_called_class().' Parse content with candidates');
-                $this->parseContentWithCandidates($dom);
+                $this->parseContentWithCandidates();
                if (strlen($this->content) < 50) {
                    Logging::log(\get_called_class().' No enought content fetched, get the full body');
                    $this->content = $dom->saveXML($dom->firstChild);
                }
                Logging::log(\get_called_class().' Strip garbage');
                $this->stripGarbage();
            }
        }
        else {
            Logging::log(\get_called_class().' No content fetched');
        }
@ -129,8 +121,11 @@ class Grabber
    }
-    public function parseContentWithRules($dom, array $rules)
+    public function parseContentWithRules(array $rules)
    {
        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
        $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new \DOMXPath($dom);
        if (isset($rules['strip']) && is_array($rules['strip'])) {
@ -147,21 +142,6 @@ class Grabber
            }
        }
        if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) {
            foreach ($rules['strip_id_or_class'] as $pattern) {
                $pattern = strtr($pattern, array("'" => '', '"' => ''));
                $nodes = $xpath->query("//*[contains(@class, '$pattern') or contains(@id, '$pattern')]");
                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }
        if (isset($rules['body']) && is_array($rules['body'])) {
            foreach ($rules['body'] as $pattern) {
@ -178,8 +158,11 @@ class Grabber
    }
-    public function parseContentWithCandidates($dom)
+    public function parseContentWithCandidates()
    {
        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
        $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new \DOMXPath($dom);
        // Try to fetch <article/>
@ -187,19 +170,28 @@ class Grabber
        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
            return;
        }
        // Try to lookup in each <div/>
-        foreach ($this->candidatesAttributes as $candidate) {
+        if (! $this->content) {
-            $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
+            foreach ($this->candidatesAttributes as $candidate) {
-            if ($nodes !== false && $nodes->length > 0) {
+                $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
-                $this->content = $dom->saveXML($nodes->item(0));
+
-                return;
+                if ($nodes !== false && $nodes->length > 0) {
                    $this->content = $dom->saveXML($nodes->item(0));
                }
            }
        }
        if (strlen($this->content) < 50) {
            Logging::log(\get_called_class().' No enought content fetched, get the full body');
            $this->content = $dom->saveXML($dom->firstChild);
        }
        Logging::log(\get_called_class().' Strip garbage');
        $this->stripGarbage();
    }
@ -207,7 +199,7 @@ class Grabber
    {
        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
-        $dom->loadXML($this->content);
+        $dom->loadXML('<?xml version="1.0" encoding="UTF-8">'.$this->content);
        $xpath = new \DOMXPath($dom);
        foreach ($this->stripTags as $tag) {
--- a/vendor/PicoFeed/Parser.php
+++ b/vendor/PicoFeed/Parser.php
@ -18,8 +18,8 @@ abstract class Parser
    public $items = array();
    public $grabber = false;
    public $grabber_ignore_urls = array();
-    public $grabber_timeout = 5;
+    public $grabber_timeout = null;
-    public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
+    public $grabber_user_agent = null;
    abstract public function execute();
@ -45,8 +45,7 @@ abstract class Parser
        if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
            $grabber = new Grabber($item_url);
            $grabber->download($this->grabber_timeout, $this->grabber_user_agent);
-            $grabber->parse();
+            if ($grabber->parse()) $item_content = $grabber->content;
            if ($grabber->content) $item_content = $grabber->content;
        }
        if ($item_content) {
--- a/vendor/PicoFeed/Rules/.blog.lemonde.fr.php
+++ b/vendor/PicoFeed/Rules/.blog.lemonde.fr.php
@ -6,5 +6,5 @@ return array(
    ),
    'strip' => array(
        '//*[contains(@class, "fb-like") or contains(@class, "social")]'
-    )
+    ),
 );
--- a/vendor/PicoFeed/Rules/.blogs.nytimes.com.php
+++ b/vendor/PicoFeed/Rules/.blogs.nytimes.com.php
@ -1,8 +1,6 @@
 <?php
 return array(
    'title' => '//header/h1',
    'test_url' => 'http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/',
    'test_url' => 'http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/',
    'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
    'body' => array(
         '//div[@class="postContent"]',
--- a/vendor/PicoFeed/Rules/.wikipedia.org.php
+++ b/vendor/PicoFeed/Rules/.wikipedia.org.php
@ -0,0 +1,25 @@
 <?php
 // Content-extraction rules for Wikipedia article pages.
 // NOTE(review): presumably selected by Grabber::getRules() from the file name — confirm.
 return array(
    // Sample article URL; not read by any code visible in this changeset
    // (presumably used to exercise the rules — confirm).
    'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
    // XPath expressions read as $rules['body'] in Grabber::parseContentWithRules().
    'body' => array(
        '//div[@id="bodyContent"]',
    ),
    // XPath expressions read as $rules['strip'] in Grabber::parseContentWithRules();
    // presumably the matching nodes are removed from the document — confirm.
    'strip' => array(
        "//div[@id='toc']",
        "//div[@id='catlinks']",
        "//div[@id='jump-to-nav']",
        "//div[@class='thumbcaption']//div[@class='magnify']",
        "//table[@class='navbox']",
        "//table[contains(@class, 'infobox')]",
        "//div[@class='dablink']",
        "//div[@id='contentSub']",
        "//div[@id='siteSub']",
        "//table[@id='persondata']",
        "//table[contains(@class, 'metadata')]",
        "//*[contains(@class, 'noprint')]",
        "//*[contains(@class, 'printfooter')]",
        "//*[contains(@class, 'editsection')]",
        "//*[contains(@class, 'error')]",
        "//span[@title='pronunciation:']",
    ),
 );
--- a/vendor/PicoFeed/Rules/techcrunch.com.php
+++ b/vendor/PicoFeed/Rules/techcrunch.com.php
@ -0,0 +1,12 @@
 <?php
 // Content-extraction rules for techcrunch.com articles.
 // NOTE(review): presumably selected by Grabber::getRules() from the file name — confirm.
 return array(
    // Sample article URL; not read by any code visible in this changeset
    // (presumably used to exercise the rules — confirm).
    'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/',
    // XPath expressions read as $rules['body'] in Grabber::parseContentWithRules().
    'body' => array(
        '//div[contains(@class, "media-container")]',
        '//div[@class="body-copy"]',
    ),
    // XPath expressions read as $rules['strip'] in Grabber::parseContentWithRules();
    // presumably the matching nodes are removed from the document — confirm.
    'strip' => array(
        '//script',
        '//style',
    )
 );
--- a/vendor/PicoFeed/Rules/www.cnn.com.php
+++ b/vendor/PicoFeed/Rules/www.cnn.com.php
@ -2,7 +2,21 @@
 return array(
    'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
    'body' => array(
-        '//*[contains(@class, "cnn_storypgraphtxt")]]',
+        '//div[@class="cnn_strycntntlft"]',
        '//*[contains(@class, "cnnvideo_wrapper")]]',
    ),
    'strip' => array(
        '//script',
        '//style',
        '//div[@class="cnn_stryshrwdgtbtm"]',
        '//div[@class="cnn_strybtmcntnt"]',
        '//div[@class="cnn_strylftcntnt"]',
        '//div[contains(@class, "cnnGalleryContainer")]',
        '//div[contains(@class, "cnn_strylftcexpbx")]',
        '//div[contains(@class, "articleGalleryNavContainer")]',
        '//div[contains(@class, "cnnArticleGalleryCaptionControl")]',
        '//div[contains(@class, "cnnArticleGalleryNavPrevNextDisabled")]',
        '//div[contains(@class, "cnnArticleGalleryNavPrevNext")]',
        '//div[contains(@class, "cnn_html_media_title_new")]',
        '//div[contains(@id, "disqus")]',
    )
 );
--- a/vendor/PicoFeed/Rules/www.theguardian.com.php
+++ b/vendor/PicoFeed/Rules/www.theguardian.com.php
@ -0,0 +1,9 @@
 <?php
 // Content-extraction rules for www.theguardian.com articles.
 // NOTE(review): presumably selected by Grabber::getRules() from the file name — confirm.
 return array(
    // Sample article URL; not read by any code visible in this changeset
    // (presumably used to exercise the rules — confirm).
    'test_url' => 'http://www.theguardian.com/law/2013/aug/31/microsoft-google-sue-us-fisa',
    // XPath expressions read as $rules['body'] in Grabber::parseContentWithRules().
    'body' => array(
        '//div[@id="article-wrapper"]',
    ),
    // Read as $rules['strip'] in Grabber::parseContentWithRules(); intentionally
    // left empty here, so no strip expression is supplied for this site.
    'strip' => array(
    ),
 );