Improve content grabber

2013-08-31 18:37:26 -04:00 · 2013-08-31 18:37:26 -04:00 · 242234c0a0
commit 242234c0a0
parent 14d67d85e8
10 changed files with 107 additions and 53 deletions
--- a/model.php
+++ b/model.php
@ -367,11 +367,9 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag)
 function parse_content_with_readability($content, $url)
 {
    require_once 'vendor/Readability/Readability.php';
-    require_once 'vendor/PicoFeed/Encoding.php';

    if (! empty($content)) {

-        $content = \PicoFeed\Encoding::toUTF8($content);
        $readability = new \Readability($content, $url);

        if ($readability->init()) {
@ -400,13 +398,14 @@ function download_content($url)
        // Try first with PicoFeed grabber and with Readability after
        $grabber = new \PicoFeed\Grabber($url);
        $grabber->html = $html;
+        $content = '';

        if ($grabber->parse()) {
            $content = $grabber->content;
        }

        if (empty($content)) {
-            $content = parse_content_with_readability($html, $url);
+            $content = parse_content_with_readability($grabber->html, $url);
        }

        // Filter content
--- a/vendor/PicoFeed/Filter.php
+++ b/vendor/PicoFeed/Filter.php
@ -464,4 +464,10 @@ class Filter

        return $data;
    }
+
+
+    /**
+     * Remove every <meta ...> tag from an HTML string.
+     *
+     * Grabber::parse() calls this right before Encoding::toUtf8().
+     *
+     * The tag body is matched with [^>]* so a match always ends at the tag's
+     * own closing ">". A lazy ".*?" followed by "/>" only stops at a "/>", so
+     * a tag written without the self-closing slash (<meta charset="utf-8">)
+     * would swallow all markup up to the next "/>" found later in the page.
+     * Trade-off: an attribute value containing a literal ">" ends the match
+     * early.
+     *
+     * @param  string $data  HTML markup
+     * @return string|null   Markup without <meta> tags (null when preg_replace() fails)
+     */
+    public static function stripMetaTags($data)
+    {
+        return preg_replace('/<meta\s[^>]*>/i', '', $data);
+    }
 }
--- a/vendor/PicoFeed/Grabber.php
+++ b/vendor/PicoFeed/Grabber.php
@ -5,6 +5,7 @@ namespace PicoFeed;
 require_once __DIR__.'/Client.php';
 require_once __DIR__.'/Encoding.php';
 require_once __DIR__.'/Logging.php';
+require_once __DIR__.'/Filter.php';

 class Grabber
 {
@ -20,6 +21,7 @@ class Grabber
        'articlecontent',
        'articlePage',
        'post-content',
+        'entry-content',
        'content',
        'main',
    );
@ -36,6 +38,7 @@ class Grabber
        'nav',
        'header',
        'social',
+        'entry-utility',
    );

    public $stripTags = array(
@ -58,34 +61,23 @@ class Grabber
    {
        if ($this->html) {

-            Logging::log(\get_called_class().' HTML fetched');
+            Logging::log(\get_called_class().' Fix encoding');
+            $this->html = Filter::stripMetaTags($this->html);
+            $this->html = Encoding::toUtf8($this->html);

+            Logging::log(\get_called_class().' Try to find rules');
            $rules = $this->getRules();

-            \libxml_use_internal_errors(true);
-            $dom = new \DOMDocument;
-            $dom->loadHTML($this->html);
-
            if (is_array($rules)) {
                Logging::log(\get_called_class().' Parse content with rules');
-                $this->parseContentWithRules($dom, $rules);
+                $this->parseContentWithRules($rules);
            }
            else {
-
                Logging::log(\get_called_class().' Parse content with candidates');
-                $this->parseContentWithCandidates($dom);
-
-                if (strlen($this->content) < 50) {
-                    Logging::log(\get_called_class().' No enought content fetched, get the full body');
-                    $this->content = $dom->saveXML($dom->firstChild);
-                }
-
-                Logging::log(\get_called_class().' Strip garbage');
-                $this->stripGarbage();
+                $this->parseContentWithCandidates();
            }
        }
        else {
-
            Logging::log(\get_called_class().' No content fetched');
        }

@ -129,8 +121,11 @@ class Grabber
    }


-    public function parseContentWithRules($dom, array $rules)
+    public function parseContentWithRules(array $rules)
    {
+        \libxml_use_internal_errors(true);
+        $dom = new \DOMDocument;
+        $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new \DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {
@ -147,21 +142,6 @@ class Grabber
            }
        }

-        if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) {
-
-            foreach ($rules['strip_id_or_class'] as $pattern) {
-
-                $pattern = strtr($pattern, array("'" => '', '"' => ''));
-                $nodes = $xpath->query("//*[contains(@class, '$pattern') or contains(@id, '$pattern')]");
-
-                if ($nodes !== false && $nodes->length > 0) {
-                    foreach ($nodes as $node) {
-                        $node->parentNode->removeChild($node);
-                    }
-                }
-            }
-        }
-
        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {
@ -178,8 +158,11 @@ class Grabber
    }


-    public function parseContentWithCandidates($dom)
+    public function parseContentWithCandidates()
    {
+        \libxml_use_internal_errors(true);
+        $dom = new \DOMDocument;
+        $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new \DOMXPath($dom);

        // Try to fetch <article/>
@ -187,27 +170,36 @@ class Grabber

        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
-            return;
        }

        // Try to lookup in each <div/>
+        if (! $this->content) {
+
            foreach ($this->candidatesAttributes as $candidate) {

                $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

                if ($nodes !== false && $nodes->length > 0) {
                    $this->content = $dom->saveXML($nodes->item(0));
-                return;
                }
            }
        }

+        if (strlen($this->content) < 50) {
+            Logging::log(\get_called_class().' No enought content fetched, get the full body');
+            $this->content = $dom->saveXML($dom->firstChild);
+        }
+
+        Logging::log(\get_called_class().' Strip garbage');
+        $this->stripGarbage();
+    }
+

    public function stripGarbage()
    {
        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
-        $dom->loadXML($this->content);
+        $dom->loadXML('<?xml version="1.0" encoding="UTF-8">'.$this->content);
        $xpath = new \DOMXPath($dom);

        foreach ($this->stripTags as $tag) {
--- a/vendor/PicoFeed/Parser.php
+++ b/vendor/PicoFeed/Parser.php
@ -18,8 +18,8 @@ abstract class Parser
    public $items = array();
    public $grabber = false;
    public $grabber_ignore_urls = array();
-    public $grabber_timeout = 5;
-    public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
+    public $grabber_timeout = null;
+    public $grabber_user_agent = null;


    abstract public function execute();
@ -45,8 +45,7 @@ abstract class Parser
        if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
            $grabber = new Grabber($item_url);
            $grabber->download($this->grabber_timeout, $this->grabber_user_agent);
-            $grabber->parse();
-            if ($grabber->content) $item_content = $grabber->content;
+            if ($grabber->parse()) $item_content = $grabber->content;
        }

        if ($item_content) {
--- a/vendor/PicoFeed/Rules/.blog.lemonde.fr.php
+++ b/vendor/PicoFeed/Rules/.blog.lemonde.fr.php
@ -6,5 +6,5 @@ return array(
    ),
    'strip' => array(
        '//*[contains(@class, "fb-like") or contains(@class, "social")]'
-    )
+    ),
 );
--- a/vendor/PicoFeed/Rules/.blogs.nytimes.com.php
+++ b/vendor/PicoFeed/Rules/.blogs.nytimes.com.php
@ -1,8 +1,6 @@
 <?php
 return array(
    'title' => '//header/h1',
-    'test_url' => 'http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/',
-    'test_url' => 'http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/',
    'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
    'body' => array(
         '//div[@class="postContent"]',
--- a/vendor/PicoFeed/Rules/.wikipedia.org.php
+++ b/vendor/PicoFeed/Rules/.wikipedia.org.php
@ -0,0 +1,25 @@
+<?php // Extraction rules for Wikipedia pages; presumably picked up through Grabber::getRules() — confirm against that method.
+return array(
+    'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper', // NOTE(review): looks like a sample page for trying the rules — confirm
+    'body' => array( // XPath queries; Grabber::parseContentWithRules() loops over every entry of this list
+        '//div[@id="bodyContent"]',
+    ),
+    'strip' => array( // XPath queries read by the 'strip' branch of Grabber::parseContentWithRules(); presumably matched nodes are removed
+        "//div[@id='toc']",
+        "//div[@id='catlinks']",
+        "//div[@id='jump-to-nav']",
+        "//div[@class='thumbcaption']//div[@class='magnify']",
+        "//table[@class='navbox']",
+        "//table[contains(@class, 'infobox')]",
+        "//div[@class='dablink']",
+        "//div[@id='contentSub']",
+        "//div[@id='siteSub']",
+        "//table[@id='persondata']",
+        "//table[contains(@class, 'metadata')]",
+        "//*[contains(@class, 'noprint')]",
+        "//*[contains(@class, 'printfooter')]",
+        "//*[contains(@class, 'editsection')]",
+        "//*[contains(@class, 'error')]",
+        "//span[@title='pronunciation:']",
+    ),
+);
--- a/vendor/PicoFeed/Rules/techcrunch.com.php
+++ b/vendor/PicoFeed/Rules/techcrunch.com.php
@ -0,0 +1,12 @@
+<?php // Extraction rules for techcrunch.com pages; presumably picked up through Grabber::getRules() — confirm against that method.
+return array(
+    'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/', // NOTE(review): looks like a sample page for trying the rules — confirm
+    'body' => array( // XPath queries; Grabber::parseContentWithRules() loops over every entry of this list
+        '//div[contains(@class, "media-container")]',
+        '//div[@class="body-copy"]',
+    ),
+    'strip' => array( // XPath queries read by the 'strip' branch of Grabber::parseContentWithRules(); presumably matched nodes are removed
+        '//script',
+        '//style',
+    ),
+);
--- a/vendor/PicoFeed/Rules/www.cnn.com.php
+++ b/vendor/PicoFeed/Rules/www.cnn.com.php
@ -2,7 +2,21 @@
 return array(
    'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
    'body' => array(
-        '//*[contains(@class, "cnn_storypgraphtxt")]]',
-        '//*[contains(@class, "cnnvideo_wrapper")]]',
+        '//div[@class="cnn_strycntntlft"]',
    ),
+    'strip' => array(
+        '//script',
+        '//style',
+        '//div[@class="cnn_stryshrwdgtbtm"]',
+        '//div[@class="cnn_strybtmcntnt"]',
+        '//div[@class="cnn_strylftcntnt"]',
+        '//div[contains(@class, "cnnGalleryContainer")]',
+        '//div[contains(@class, "cnn_strylftcexpbx")]',
+        '//div[contains(@class, "articleGalleryNavContainer")]',
+        '//div[contains(@class, "cnnArticleGalleryCaptionControl")]',
+        '//div[contains(@class, "cnnArticleGalleryNavPrevNextDisabled")]',
+        '//div[contains(@class, "cnnArticleGalleryNavPrevNext")]',
+        '//div[contains(@class, "cnn_html_media_title_new")]',
+        '//div[contains(@id, "disqus")]',
+    )
 );
--- a/vendor/PicoFeed/Rules/www.theguardian.com.php
+++ b/vendor/PicoFeed/Rules/www.theguardian.com.php
@ -0,0 +1,9 @@
+<?php // Extraction rules for www.theguardian.com pages; presumably picked up through Grabber::getRules() — confirm against that method.
+return array(
+    'test_url' => 'http://www.theguardian.com/law/2013/aug/31/microsoft-google-sue-us-fisa', // NOTE(review): looks like a sample page for trying the rules — confirm
+    'body' => array( // XPath queries; Grabber::parseContentWithRules() loops over every entry of this list
+        '//div[@id="article-wrapper"]',
+    ),
+    'strip' => array( // read by the 'strip' branch of Grabber::parseContentWithRules(); intentionally left without entries here
+    ),
+);