diff --git a/model.php b/model.php index a53c636..d9f8046 100644 --- a/model.php +++ b/model.php @@ -367,11 +367,9 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag) function parse_content_with_readability($content, $url) { require_once 'vendor/Readability/Readability.php'; - require_once 'vendor/PicoFeed/Encoding.php'; if (! empty($content)) { - $content = \PicoFeed\Encoding::toUTF8($content); $readability = new \Readability($content, $url); if ($readability->init()) { @@ -400,13 +398,14 @@ function download_content($url) // Try first with PicoFeed grabber and with Readability after $grabber = new \PicoFeed\Grabber($url); $grabber->html = $html; + $content = ''; if ($grabber->parse()) { $content = $grabber->content; } if (empty($content)) { - $content = parse_content_with_readability($html, $url); + $content = parse_content_with_readability($grabber->html, $url); } // Filter content diff --git a/vendor/PicoFeed/Filter.php b/vendor/PicoFeed/Filter.php index 74e9147..971e0af 100644 --- a/vendor/PicoFeed/Filter.php +++ b/vendor/PicoFeed/Filter.php @@ -464,4 +464,10 @@ class Filter return $data; } + + + public static function stripMetaTags($data) + { + return preg_replace('//is', '', $data); + } } diff --git a/vendor/PicoFeed/Grabber.php b/vendor/PicoFeed/Grabber.php index ec1e00f..cbfc264 100644 --- a/vendor/PicoFeed/Grabber.php +++ b/vendor/PicoFeed/Grabber.php @@ -5,6 +5,7 @@ namespace PicoFeed; require_once __DIR__.'/Client.php'; require_once __DIR__.'/Encoding.php'; require_once __DIR__.'/Logging.php'; +require_once __DIR__.'/Filter.php'; class Grabber { @@ -20,6 +21,7 @@ class Grabber 'articlecontent', 'articlePage', 'post-content', + 'entry-content', 'content', 'main', ); @@ -36,6 +38,7 @@ class Grabber 'nav', 'header', 'social', + 'entry-utility', ); public $stripTags = array( @@ -58,34 +61,23 @@ class Grabber { if ($this->html) { - Logging::log(\get_called_class().' HTML fetched'); + Logging::log(\get_called_class().' Fix encoding'); + $this->html = Filter::stripMetaTags($this->html); + $this->html = Encoding::toUtf8($this->html); + Logging::log(\get_called_class().' Try to find rules'); $rules = $this->getRules(); - \libxml_use_internal_errors(true); - $dom = new \DOMDocument; - $dom->loadHTML($this->html); - if (is_array($rules)) { Logging::log(\get_called_class().' Parse content with rules'); - $this->parseContentWithRules($dom, $rules); + $this->parseContentWithRules($rules); } else { - Logging::log(\get_called_class().' Parse content with candidates'); - $this->parseContentWithCandidates($dom); - - if (strlen($this->content) < 50) { - Logging::log(\get_called_class().' No enought content fetched, get the full body'); - $this->content = $dom->saveXML($dom->firstChild); - } - - Logging::log(\get_called_class().' Strip garbage'); - $this->stripGarbage(); + $this->parseContentWithCandidates(); } } else { - Logging::log(\get_called_class().' No content fetched'); } @@ -129,8 +121,11 @@ class Grabber } - public function parseContentWithRules($dom, array $rules) + public function parseContentWithRules(array $rules) { + \libxml_use_internal_errors(true); + $dom = new \DOMDocument; + $dom->loadHTML(''.$this->html); $xpath = new \DOMXPath($dom); if (isset($rules['strip']) && is_array($rules['strip'])) { @@ -147,21 +142,6 @@ class Grabber } } - if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) { - - foreach ($rules['strip_id_or_class'] as $pattern) { - - $pattern = strtr($pattern, array("'" => '', '"' => '')); - $nodes = $xpath->query("//*[contains(@class, '$pattern') or contains(@id, '$pattern')]"); - - if ($nodes !== false && $nodes->length > 0) { - foreach ($nodes as $node) { - $node->parentNode->removeChild($node); - } - } - } - } - if (isset($rules['body']) && is_array($rules['body'])) { foreach ($rules['body'] as $pattern) { @@ -178,8 +158,11 @@ class Grabber } - public function parseContentWithCandidates($dom) + public function parseContentWithCandidates() { + \libxml_use_internal_errors(true); + $dom = new \DOMDocument; + $dom->loadHTML(''.$this->html); $xpath = new \DOMXPath($dom); // Try to fetch @@ -187,19 +170,28 @@ class Grabber if ($nodes !== false && $nodes->length > 0) { $this->content = $dom->saveXML($nodes->item(0)); - return; } // Try to lookup in each
- foreach ($this->candidatesAttributes as $candidate) { + if (! $this->content) { - $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); + foreach ($this->candidatesAttributes as $candidate) { - if ($nodes !== false && $nodes->length > 0) { - $this->content = $dom->saveXML($nodes->item(0)); - return; + $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); + + if ($nodes !== false && $nodes->length > 0) { + $this->content = $dom->saveXML($nodes->item(0)); + } } } + + if (strlen($this->content) < 50) { + Logging::log(\get_called_class().' No enought content fetched, get the full body'); + $this->content = $dom->saveXML($dom->firstChild); + } + + Logging::log(\get_called_class().' Strip garbage'); + $this->stripGarbage(); } @@ -207,7 +199,7 @@ class Grabber { \libxml_use_internal_errors(true); $dom = new \DOMDocument; - $dom->loadXML($this->content); + $dom->loadXML(''.$this->content); $xpath = new \DOMXPath($dom); foreach ($this->stripTags as $tag) { diff --git a/vendor/PicoFeed/Parser.php b/vendor/PicoFeed/Parser.php index f33e745..84d1b4d 100644 --- a/vendor/PicoFeed/Parser.php +++ b/vendor/PicoFeed/Parser.php @@ -18,8 +18,8 @@ abstract class Parser public $items = array(); public $grabber = false; public $grabber_ignore_urls = array(); - public $grabber_timeout = 5; - public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)'; + public $grabber_timeout = null; + public $grabber_user_agent = null; abstract public function execute(); @@ -45,8 +45,7 @@ abstract class Parser if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) { $grabber = new Grabber($item_url); $grabber->download($this->grabber_timeout, $this->grabber_user_agent); - $grabber->parse(); - if ($grabber->content) $item_content = $grabber->content; + if ($grabber->parse()) $item_content = $grabber->content; } if ($item_content) { diff --git a/vendor/PicoFeed/Rules/.blog.lemonde.fr.php b/vendor/PicoFeed/Rules/.blog.lemonde.fr.php index a6f35f5..226169b 100644 --- a/vendor/PicoFeed/Rules/.blog.lemonde.fr.php +++ b/vendor/PicoFeed/Rules/.blog.lemonde.fr.php @@ -6,5 +6,5 @@ return array( ), 'strip' => array( '//*[contains(@class, "fb-like") or contains(@class, "social")]' - ) + ), ); \ No newline at end of file diff --git a/vendor/PicoFeed/Rules/.blogs.nytimes.com.php b/vendor/PicoFeed/Rules/.blogs.nytimes.com.php index 58673ee..aa17033 100644 --- a/vendor/PicoFeed/Rules/.blogs.nytimes.com.php +++ b/vendor/PicoFeed/Rules/.blogs.nytimes.com.php @@ -1,8 +1,6 @@ '//header/h1', - 'test_url' => 'http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/', - 'test_url' => 'http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/', 'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/', 'body' => array( '//div[@class="postContent"]', diff --git a/vendor/PicoFeed/Rules/.wikipedia.org.php b/vendor/PicoFeed/Rules/.wikipedia.org.php new file mode 100644 index 0000000..ea99ab6 --- /dev/null +++ b/vendor/PicoFeed/Rules/.wikipedia.org.php @@ -0,0 +1,25 @@ + 'https://en.wikipedia.org/wiki/Grace_Hopper', + 'body' => array( + '//div[@id="bodyContent"]', + ), + 'strip' => array( + "//div[@id='toc']", + "//div[@id='catlinks']", + "//div[@id='jump-to-nav']", + "//div[@class='thumbcaption']//div[@class='magnify']", + "//table[@class='navbox']", + "//table[contains(@class, 'infobox')]", + "//div[@class='dablink']", + "//div[@id='contentSub']", + "//div[@id='siteSub']", + "//table[@id='persondata']", + "//table[contains(@class, 'metadata')]", + "//*[contains(@class, 'noprint')]", + "//*[contains(@class, 'printfooter')]", + "//*[contains(@class, 'editsection')]", + "//*[contains(@class, 'error')]", + "//span[@title='pronunciation:']", + ), +); diff --git a/vendor/PicoFeed/Rules/techcrunch.com.php b/vendor/PicoFeed/Rules/techcrunch.com.php new file mode 100644 index 0000000..5ad42ad --- /dev/null +++ b/vendor/PicoFeed/Rules/techcrunch.com.php @@ -0,0 +1,12 @@ + 'http://techcrunch.com/2013/08/31/indias-visa-maze/', + 'body' => array( + '//div[contains(@class, "media-container")]', + '//div[@class="body-copy"]', + ), + 'strip' => array( + '//script', + '//style', + ) +); diff --git a/vendor/PicoFeed/Rules/www.cnn.com.php b/vendor/PicoFeed/Rules/www.cnn.com.php index 4ac468e..472832f 100644 --- a/vendor/PicoFeed/Rules/www.cnn.com.php +++ b/vendor/PicoFeed/Rules/www.cnn.com.php @@ -2,7 +2,21 @@ return array( 'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1', 'body' => array( - '//*[contains(@class, "cnn_storypgraphtxt")]]', - '//*[contains(@class, "cnnvideo_wrapper")]]', + '//div[@class="cnn_strycntntlft"]', ), + 'strip' => array( + '//script', + '//style', + '//div[@class="cnn_stryshrwdgtbtm"]', + '//div[@class="cnn_strybtmcntnt"]', + '//div[@class="cnn_strylftcntnt"]', + '//div[contains(@class, "cnnGalleryContainer")]', + '//div[contains(@class, "cnn_strylftcexpbx")]', + '//div[contains(@class, "articleGalleryNavContainer")]', + '//div[contains(@class, "cnnArticleGalleryCaptionControl")]', + '//div[contains(@class, "cnnArticleGalleryNavPrevNextDisabled")]', + '//div[contains(@class, "cnnArticleGalleryNavPrevNext")]', + '//div[contains(@class, "cnn_html_media_title_new")]', + '//div[contains(@id, "disqus")]', + ) ); diff --git a/vendor/PicoFeed/Rules/www.theguardian.com.php b/vendor/PicoFeed/Rules/www.theguardian.com.php new file mode 100644 index 0000000..ddb0b0a --- /dev/null +++ b/vendor/PicoFeed/Rules/www.theguardian.com.php @@ -0,0 +1,9 @@ + 'http://www.theguardian.com/law/2013/aug/31/microsoft-google-sue-us-fisa', + 'body' => array( + '//div[@id="article-wrapper"]', + ), + 'strip' => array( + ), +);