From b7ca07b9c0ad209622ea2e29c9a9889e86c92d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Fri, 2 Jan 2015 15:25:11 -0500 Subject: [PATCH] Update PicoFeed (improve grabber and bug fixes) --- vendor/autoload.php | 2 +- vendor/composer/autoload_real.php | 10 +- vendor/composer/installed.json | 78 +++++----- .../picofeed/lib/PicoFeed/Client/Curl.php | 2 + .../picofeed/lib/PicoFeed/Client/Grabber.php | 142 ++++++++++++++---- .../picofeed/lib/PicoFeed/Client/Url.php | 18 ++- .../picofeed/lib/PicoFeed/Parser/Parser.php | 5 +- .../picofeed/tests/Client/UrlTest.php | 35 +++++ 8 files changed, 216 insertions(+), 76 deletions(-) diff --git a/vendor/autoload.php b/vendor/autoload.php index 265e82b..228365d 100644 --- a/vendor/autoload.php +++ b/vendor/autoload.php @@ -4,4 +4,4 @@ require_once __DIR__ . '/composer' . '/autoload_real.php'; -return ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97::getLoader(); +return ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae::getLoader(); diff --git a/vendor/composer/autoload_real.php b/vendor/composer/autoload_real.php index 7ed76d2..aeb7b76 100644 --- a/vendor/composer/autoload_real.php +++ b/vendor/composer/autoload_real.php @@ -2,7 +2,7 @@ // autoload_real.php @generated by Composer -class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97 +class ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae { private static $loader; @@ -19,9 +19,9 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97 return self::$loader; } - spl_autoload_register(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'), true, true); + spl_autoload_register(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'), true, true); self::$loader = $loader = new \Composer\Autoload\ClassLoader(); - spl_autoload_unregister(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader')); + spl_autoload_unregister(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader')); $map = require __DIR__ . '/autoload_namespaces.php'; foreach ($map as $namespace => $path) { @@ -42,14 +42,14 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97 $includeFiles = require __DIR__ . '/autoload_files.php'; foreach ($includeFiles as $file) { - composerRequirec13b90f3b2e13ad59b988101eac1fc97($file); + composerRequire738d0ffba01de68eecc7cdccd108bcae($file); } return $loader; } } -function composerRequirec13b90f3b2e13ad59b988101eac1fc97($file) +function composerRequire738d0ffba01de68eecc7cdccd108bcae($file) { require $file; } diff --git a/vendor/composer/installed.json b/vendor/composer/installed.json index 79754f9..941ad5c 100644 --- a/vendor/composer/installed.json +++ b/vendor/composer/installed.json @@ -116,45 +116,6 @@ "description": "Minimalist micro-framework", "homepage": "https://github.com/fguillot/picoFarad" }, - { - "name": "fguillot/picofeed", - "version": "dev-master", - "version_normalized": "9999999-dev", - "source": { - "type": "git", - "url": "https://github.com/fguillot/picoFeed.git", - "reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/8722d562f1f5446c5595d8854f2a3260fdf1a5b2", - "reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2", - "shasum": "" - }, - "require": { - "php": ">=5.3.0" - }, - "time": "2015-01-02 16:39:51", - "type": "library", - "installation-source": "dist", - "autoload": { - "psr-0": { - "PicoFeed": "lib/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "Unlicense" - ], - "authors": [ - { - "name": "Frédéric Guillot", - "homepage": "http://fredericguillot.com" - } - ], - "description": "Modern library to write or read feeds (RSS/Atom)", - "homepage": "http://fguillot.github.io/picoFeed" - }, { "name": "fguillot/picodb", "version": "dev-master", @@ -193,5 +154,44 @@ ], "description": "Minimalist database query builder", "homepage": "https://github.com/fguillot/picoDb" + }, + { + "name": "fguillot/picofeed", + "version": "dev-master", + "version_normalized": "9999999-dev", + "source": { + "type": "git", + "url": "https://github.com/fguillot/picoFeed.git", + "reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0", + "reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "time": "2015-01-02 20:21:50", + "type": "library", + "installation-source": "dist", + "autoload": { + "psr-0": { + "PicoFeed": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "Unlicense" + ], + "authors": [ + { + "name": "Frédéric Guillot", + "homepage": "http://fredericguillot.com" + } + ], + "description": "Modern library to write or read feeds (RSS/Atom)", + "homepage": "http://fguillot.github.io/picoFeed" } ] diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php index a284f69..7b3cd65 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php @@ -320,6 +320,8 @@ class Curl extends Client throw new InvalidUrlException('Unable to resolve hostname'); case 7: // CURLE_COULDNT_CONNECT throw new InvalidUrlException('Unable to connect to the remote host'); + case 23: // CURLE_WRITE_ERROR + throw new MaxSizeException('Maximum response size exceeded'); case 28: // CURLE_OPERATION_TIMEDOUT throw new TimeoutException('Operation timeout'); case 35: // CURLE_SSL_CONNECT_ERROR diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php index e119797..fc7634c 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php @@ -48,6 +48,14 @@ class Grabber */ private $encoding = ''; + /** + * Flag to skip download and parsing + * + * @access private + * @var boolean + */ + private $skip_processing = false; + /** * List of attributes to try to get the content, order is important, generic terms at the end * @@ -65,6 +73,7 @@ class Grabber 'post-content', 'post_content', 'entry-content', + 'entry-body', 'main-content', 'story_content', 'storycontent', @@ -101,6 +110,10 @@ class Grabber 'related-posts', 'tweet', 'categories', + 'post_title', + 'by_line', + 'byline', + 'sponsors', ); /** @@ -140,6 +153,9 @@ class Grabber $this->url = $url; $this->html = $html; $this->encoding = $encoding; + + $this->handleFiles(); + $this->handleStreamingVideos(); } /** @@ -185,11 +201,39 @@ class Grabber */ public function getFilteredContent() { - $filter = Filter::html($this->content, Url::base($this->url)); + $filter = Filter::html($this->content, $this->url); $filter->setConfig($this->config); return $filter->execute(); } + /** + * Return the Youtube embed player and skip processing + * + * @access public + * @return string + */ + public function handleStreamingVideos() + { + if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) { + $this->content = ''; + $this->skip_processing = true; + } + } + + /** + * Skip processing for PDF documents + * + * @access public + * @return string + */ + public function handleFiles() + { + if (substr($this->url, -3) === 'pdf') { + $this->skip_processing = true; + Logger::setMessage(get_called_class().': PDF document => processing skipped'); + } + } + /** * Parse the HTML content * @@ -198,32 +242,36 @@ class Grabber */ public function parse() { + if ($this->skip_processing) { + return true; + } + if ($this->html) { - Logger::setMessage(get_called_class().' Fix encoding'); + Logger::setMessage(get_called_class().': Fix encoding'); Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"'); $this->html = Encoding::convert($this->html, $this->encoding); $this->html = Filter::stripHeadTags($this->html); - Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes'); + Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes'); $rules = $this->getRules(); if (is_array($rules)) { - Logger::setMessage(get_called_class().' Parse content with rules'); + Logger::setMessage(get_called_class().': Parse content with rules'); $this->parseContentWithRules($rules); } else { - Logger::setMessage(get_called_class().' Parse content with candidates'); + Logger::setMessage(get_called_class().': Parse content with candidates'); $this->parseContentWithCandidates(); } } else { - Logger::setMessage(get_called_class().' No content fetched'); + Logger::setMessage(get_called_class().': No content fetched'); } - Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes'); - Logger::setMessage(get_called_class().' Grabber done'); + Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes'); + Logger::setMessage(get_called_class().': Grabber done'); return $this->content !== ''; } @@ -236,18 +284,21 @@ class Grabber */ public function download() { - try { + if (! $this->skip_processing) { - $client = Client::getInstance(); - $client->setConfig($this->config); - $client->execute($this->url); + try { - $this->url = $client->getUrl(); - $this->html = $client->getContent(); - $this->encoding = $client->getEncoding(); - } - catch (ClientException $e) { - Logger::setMessage(get_called_class().': '.$e->getMessage()); + $client = Client::getInstance(); + $client->setConfig($this->config); + $client->execute($this->url); + + $this->url = $client->getUrl(); + $this->html = $client->getContent(); + $this->encoding = $client->getEncoding(); + } + catch (ClientException $e) { + Logger::setMessage(get_called_class().': '.$e->getMessage()); + } } return $this->html; @@ -346,34 +397,40 @@ class Grabber // Try to lookup in each tag foreach ($this->candidatesAttributes as $candidate) { - Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"'); + Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"'); $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); if ($nodes !== false && $nodes->length > 0) { $this->content = $dom->saveXML($nodes->item(0)); - Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)'); + Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)'); break; } } // Try to fetch
- if (! $this->content) { + if (strlen($this->content) < 200) { $nodes = $xpath->query('//article'); if ($nodes !== false && $nodes->length > 0) { $this->content = $dom->saveXML($nodes->item(0)); - Logger::setMessage(get_called_class().' Find
tag ('.strlen($this->content).' bytes)'); + Logger::setMessage(get_called_class().': Find
tag ('.strlen($this->content).' bytes)'); } } + // Get everything if (strlen($this->content) < 50) { - Logger::setMessage(get_called_class().' No enought content fetched, get the full body'); - $this->content = $dom->saveXML($dom->firstChild); + + $nodes = $xpath->query('//body'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().' No enought content fetched, get //body'); + $this->content = $dom->saveXML($nodes->item(0)); + } } - Logger::setMessage(get_called_class().' Strip garbage'); + Logger::setMessage(get_called_class().': Strip garbage'); $this->stripGarbage(); } @@ -395,7 +452,7 @@ class Grabber $nodes = $xpath->query('//'.$tag); if ($nodes !== false && $nodes->length > 0) { - Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"'); + Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"'); foreach ($nodes as $node) { $node->parentNode->removeChild($node); } @@ -407,9 +464,11 @@ class Grabber $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); if ($nodes !== false && $nodes->length > 0) { - Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"'); + Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"'); foreach ($nodes as $node) { - $node->parentNode->removeChild($node); + if ($this->shouldRemove($dom, $node)) { + $node->parentNode->removeChild($node); + } } } } @@ -417,4 +476,31 @@ class Grabber $this->content = $dom->saveXML($dom->documentElement); } } + + /** + * Return false if the node should not be removed + * + * @access public + * @param DomDocument $dom + * @param DomNode $node + * @return boolean + */ + public function shouldRemove($dom, $node) + { + $document_length = strlen($dom->textContent); + $node_length = strlen($node->textContent); + + if ($document_length === 0) { + return true; + } + + $ratio = $node_length * 100 / $document_length; + + if ($ratio >= 90) { + Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%'); + return false; + } + + return true; + } } diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php index a74c235..fc1150c 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php @@ -67,7 +67,7 @@ class Url if ($link->isRelativeUrl()) { if ($link->isRelativePath()) { - return $link->getAbsoluteUrl($website->getAbsoluteUrl()); + return $link->getAbsoluteUrl($website->getBaseUrl($website->getBasePath())); } return $link->getAbsoluteUrl($website->getBaseUrl()); @@ -159,6 +159,22 @@ class Url return empty($this->components['path']) ? '' : $this->components['path']; } + /** + * Get the base path + * + * @access public + * @return string + */ + public function getBasePath() + { + $current_path = $this->getPath(); + + $path = $this->isRelativePath() ? '/' : ''; + $path .= substr($current_path, -1) === '/' ? $current_path : dirname($current_path); + + return str_replace('//', '/', $path.'/'); + } + /** * Get the full path (path + querystring + fragment) * diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php index 44f0c8e..a25ef2e 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php @@ -168,8 +168,9 @@ abstract class Parser $this->findItemEnclosure($entry, $item, $feed); $this->findItemLanguage($entry, $item, $feed); - $this->scrapWebsite($item); + // Order is important (avoid double filtering) $this->filterItemContent($feed, $item); + $this->scrapWebsite($item); $feed->items[] = $item; } @@ -238,7 +239,7 @@ abstract class Parser $grabber->download(); if ($grabber->parse()) { - $item->content = $grabber->getContent() ?: $item->content; + $item->content = $grabber->getFilteredContent(); } } } diff --git a/vendor/fguillot/picofeed/tests/Client/UrlTest.php b/vendor/fguillot/picofeed/tests/Client/UrlTest.php index f7d67ea..90055b3 100644 --- a/vendor/fguillot/picofeed/tests/Client/UrlTest.php +++ b/vendor/fguillot/picofeed/tests/Client/UrlTest.php @@ -67,6 +67,9 @@ class UrlTest extends PHPUnit_Framework_TestCase $url = new Url('//localhost/test?truc'); $this->assertEquals('http://localhost', $url->getBaseUrl()); + + $url = new Url('//localhost/test?truc'); + $this->assertEquals('http://localhost', $url->getBaseUrl()); } public function testIsRelativeUrl() @@ -112,6 +115,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO $url = new Url('anything'); $this->assertEquals('/anything', $url->getFullPath()); + $url = new Url('foo/bar'); + $this->assertEquals('/foo/bar', $url->getFullPath()); + $url = new Url('index.php?foo=bar&test=1'); $this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath()); } @@ -160,6 +166,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO $url = new Url('https://127.0.0.1:8000/here/test?v=3'); $this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl()); + $url = new Url('http://www.lofibucket.com/articles/oscilloscope_quake.html'); + $this->assertEquals('http://www.lofibucket.com/articles/oscilloscope_quake.html', $url->getAbsoluteUrl()); + $url = new Url('test?v=3'); $this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/')); } @@ -185,6 +194,27 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO $this->assertFalse($url->isRelativePath()); } + public function testGetBasePath() + { + $url = new Url('img/quakescope.jpg'); + $this->assertEquals('/img/', $url->getBasePath()); + + $url = new Url('http://foo/img/quakescope.jpg'); + $this->assertEquals('/img/', $url->getBasePath()); + + $url = new Url('http://foo/bar.html'); + $this->assertEquals('/', $url->getBasePath()); + + $url = new Url('http://foo/bar'); + $this->assertEquals('/', $url->getBasePath()); + + $url = new Url('http://foo/bar/'); + $this->assertEquals('/bar/', $url->getBasePath()); + + $url = new Url('http://website/subfolder/img/foo.png'); + $this->assertEquals('/subfolder/img/', $url->getBasePath()); + } + public function testResolve() { $this->assertEquals( @@ -216,5 +246,10 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO 'http://website/img/foo.png', Url::resolve('/img/foo.png', 'http://website/subfolder/') ); + + $this->assertEquals( + 'http://www.lofibucket.com/articles/img/quakescope.jpg', + Url::resolve('img/quakescope.jpg', 'http://www.lofibucket.com/articles/oscilloscope_quake.html') + ); } }