diff --git a/vendor/autoload.php b/vendor/autoload.php
index 265e82b..228365d 100644
--- a/vendor/autoload.php
+++ b/vendor/autoload.php
@@ -4,4 +4,4 @@
require_once __DIR__ . '/composer' . '/autoload_real.php';
-return ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97::getLoader();
+return ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae::getLoader();
diff --git a/vendor/composer/autoload_real.php b/vendor/composer/autoload_real.php
index 7ed76d2..aeb7b76 100644
--- a/vendor/composer/autoload_real.php
+++ b/vendor/composer/autoload_real.php
@@ -2,7 +2,7 @@
// autoload_real.php @generated by Composer
-class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
+class ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae
{
private static $loader;
@@ -19,9 +19,9 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
return self::$loader;
}
- spl_autoload_register(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'), true, true);
+ spl_autoload_register(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'), true, true);
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
- spl_autoload_unregister(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'));
+ spl_autoload_unregister(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'));
$map = require __DIR__ . '/autoload_namespaces.php';
foreach ($map as $namespace => $path) {
@@ -42,14 +42,14 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
$includeFiles = require __DIR__ . '/autoload_files.php';
foreach ($includeFiles as $file) {
- composerRequirec13b90f3b2e13ad59b988101eac1fc97($file);
+ composerRequire738d0ffba01de68eecc7cdccd108bcae($file);
}
return $loader;
}
}
-function composerRequirec13b90f3b2e13ad59b988101eac1fc97($file)
+function composerRequire738d0ffba01de68eecc7cdccd108bcae($file)
{
require $file;
}
diff --git a/vendor/composer/installed.json b/vendor/composer/installed.json
index 79754f9..941ad5c 100644
--- a/vendor/composer/installed.json
+++ b/vendor/composer/installed.json
@@ -116,45 +116,6 @@
"description": "Minimalist micro-framework",
"homepage": "https://github.com/fguillot/picoFarad"
},
- {
- "name": "fguillot/picofeed",
- "version": "dev-master",
- "version_normalized": "9999999-dev",
- "source": {
- "type": "git",
- "url": "https://github.com/fguillot/picoFeed.git",
- "reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
- "reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
- "shasum": ""
- },
- "require": {
- "php": ">=5.3.0"
- },
- "time": "2015-01-02 16:39:51",
- "type": "library",
- "installation-source": "dist",
- "autoload": {
- "psr-0": {
- "PicoFeed": "lib/"
- }
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "Unlicense"
- ],
- "authors": [
- {
- "name": "Frédéric Guillot",
- "homepage": "http://fredericguillot.com"
- }
- ],
- "description": "Modern library to write or read feeds (RSS/Atom)",
- "homepage": "http://fguillot.github.io/picoFeed"
- },
{
"name": "fguillot/picodb",
"version": "dev-master",
@@ -193,5 +154,44 @@
],
"description": "Minimalist database query builder",
"homepage": "https://github.com/fguillot/picoDb"
+ },
+ {
+ "name": "fguillot/picofeed",
+ "version": "dev-master",
+ "version_normalized": "9999999-dev",
+ "source": {
+ "type": "git",
+ "url": "https://github.com/fguillot/picoFeed.git",
+ "reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0"
+ },
+ "dist": {
+ "type": "zip",
+ "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
+ "reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
+ "shasum": ""
+ },
+ "require": {
+ "php": ">=5.3.0"
+ },
+ "time": "2015-01-02 20:21:50",
+ "type": "library",
+ "installation-source": "dist",
+ "autoload": {
+ "psr-0": {
+ "PicoFeed": "lib/"
+ }
+ },
+ "notification-url": "https://packagist.org/downloads/",
+ "license": [
+ "Unlicense"
+ ],
+ "authors": [
+ {
+ "name": "Frédéric Guillot",
+ "homepage": "http://fredericguillot.com"
+ }
+ ],
+ "description": "Modern library to write or read feeds (RSS/Atom)",
+ "homepage": "http://fguillot.github.io/picoFeed"
}
]
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php
index a284f69..7b3cd65 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php
@@ -320,6 +320,8 @@ class Curl extends Client
throw new InvalidUrlException('Unable to resolve hostname');
case 7: // CURLE_COULDNT_CONNECT
throw new InvalidUrlException('Unable to connect to the remote host');
+ case 23: // CURLE_WRITE_ERROR
+ throw new MaxSizeException('Maximum response size exceeded');
case 28: // CURLE_OPERATION_TIMEDOUT
throw new TimeoutException('Operation timeout');
case 35: // CURLE_SSL_CONNECT_ERROR
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
index e119797..fc7634c 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
@@ -48,6 +48,14 @@ class Grabber
*/
private $encoding = '';
+ /**
+ * Flag to skip download and parsing
+ *
+ * @access private
+ * @var boolean
+ */
+ private $skip_processing = false;
+
/**
* List of attributes to try to get the content, order is important, generic terms at the end
*
@@ -65,6 +73,7 @@ class Grabber
'post-content',
'post_content',
'entry-content',
+ 'entry-body',
'main-content',
'story_content',
'storycontent',
@@ -101,6 +110,10 @@ class Grabber
'related-posts',
'tweet',
'categories',
+ 'post_title',
+ 'by_line',
+ 'byline',
+ 'sponsors',
);
/**
@@ -140,6 +153,9 @@ class Grabber
$this->url = $url;
$this->html = $html;
$this->encoding = $encoding;
+
+ $this->handleFiles();
+ $this->handleStreamingVideos();
}
/**
@@ -185,11 +201,39 @@ class Grabber
*/
public function getFilteredContent()
{
- $filter = Filter::html($this->content, Url::base($this->url));
+ $filter = Filter::html($this->content, $this->url);
$filter->setConfig($this->config);
return $filter->execute();
}
+ /**
+ * Detect a Youtube-style video URL and flag the grabber to skip processing
+ *
+ * The grabber URL is searched for an 11-character identifier made of
+ * [a-zA-Z0-9_-] that directly follows "v=", "v/", "vi=", "vi/" or "youtu.be/".
+ * On a match the content is set to an empty string and the skip flag is raised;
+ * with that flag set, download() performs no HTTP request and parse() returns
+ * true immediately. This method itself returns nothing.
+ *
+ * NOTE(review): the former summary spoke of returning an embed player, but the
+ * code here only assigns an empty string and never reads $matches; confirm
+ * against upstream whether embed markup was intended for the content.
+ * NOTE(review): the "." of "youtu.be" is not escaped in the pattern, so it
+ * matches any single character.
+ *
+ * @access public
+ * @return void
+ */
+ public function handleStreamingVideos()
+ {
+ if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
+ $this->content = '';
+ $this->skip_processing = true;
+ }
+ }
+
+ /**
+ * Skip processing for PDF documents
+ *
+ * When the last three characters of the grabber URL are exactly "pdf", the
+ * skip flag is raised and a message is logged. With that flag set, download()
+ * performs no HTTP request and parse() returns true immediately. This method
+ * itself returns nothing.
+ *
+ * NOTE(review): the comparison is case-sensitive, does not require a leading
+ * dot, and looks at the raw URL tail, so "FILE.PDF" or "doc.pdf?download=1"
+ * are not detected while any URL merely ending in "pdf" is.
+ * NOTE(review): parse() reports success for skipped documents without setting
+ * any content here; presumably callers then see whatever the content property
+ * already held (possibly an empty string) - verify against the caller.
+ *
+ * @access public
+ * @return void
+ */
+ public function handleFiles()
+ {
+ if (substr($this->url, -3) === 'pdf') {
+ $this->skip_processing = true;
+ Logger::setMessage(get_called_class().': PDF document => processing skipped');
+ }
+ }
+
/**
* Parse the HTML content
*
@@ -198,32 +242,36 @@ class Grabber
*/
public function parse()
{
+ if ($this->skip_processing) {
+ return true;
+ }
+
if ($this->html) {
- Logger::setMessage(get_called_class().' Fix encoding');
+ Logger::setMessage(get_called_class().': Fix encoding');
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Encoding::convert($this->html, $this->encoding);
$this->html = Filter::stripHeadTags($this->html);
- Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');
+ Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
$rules = $this->getRules();
if (is_array($rules)) {
- Logger::setMessage(get_called_class().' Parse content with rules');
+ Logger::setMessage(get_called_class().': Parse content with rules');
$this->parseContentWithRules($rules);
}
else {
- Logger::setMessage(get_called_class().' Parse content with candidates');
+ Logger::setMessage(get_called_class().': Parse content with candidates');
$this->parseContentWithCandidates();
}
}
else {
- Logger::setMessage(get_called_class().' No content fetched');
+ Logger::setMessage(get_called_class().': No content fetched');
}
- Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');
- Logger::setMessage(get_called_class().' Grabber done');
+ Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
+ Logger::setMessage(get_called_class().': Grabber done');
return $this->content !== '';
}
@@ -236,18 +284,21 @@ class Grabber
*/
public function download()
{
- try {
+ if (! $this->skip_processing) {
- $client = Client::getInstance();
- $client->setConfig($this->config);
- $client->execute($this->url);
+ try {
- $this->url = $client->getUrl();
- $this->html = $client->getContent();
- $this->encoding = $client->getEncoding();
- }
- catch (ClientException $e) {
- Logger::setMessage(get_called_class().': '.$e->getMessage());
+ $client = Client::getInstance();
+ $client->setConfig($this->config);
+ $client->execute($this->url);
+
+ $this->url = $client->getUrl();
+ $this->html = $client->getContent();
+ $this->encoding = $client->getEncoding();
+ }
+ catch (ClientException $e) {
+ Logger::setMessage(get_called_class().': '.$e->getMessage());
+ }
}
return $this->html;
@@ -346,34 +397,40 @@ class Grabber
// Try to lookup in each tag
foreach ($this->candidatesAttributes as $candidate) {
- Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');
+ Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
- Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
+ Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
break;
}
}
// Try to fetch
- if (! $this->content) {
+ if (strlen($this->content) < 200) {
$nodes = $xpath->query('//article');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
- Logger::setMessage(get_called_class().' Find tag ('.strlen($this->content).' bytes)');
+ Logger::setMessage(get_called_class().': Find tag ('.strlen($this->content).' bytes)');
}
}
+ // Get everything
if (strlen($this->content) < 50) {
- Logger::setMessage(get_called_class().' No enought content fetched, get the full body');
- $this->content = $dom->saveXML($dom->firstChild);
+
+ $nodes = $xpath->query('//body');
+
+ if ($nodes !== false && $nodes->length > 0) {
+ Logger::setMessage(get_called_class().' No enought content fetched, get //body');
+ $this->content = $dom->saveXML($nodes->item(0));
+ }
}
- Logger::setMessage(get_called_class().' Strip garbage');
+ Logger::setMessage(get_called_class().': Strip garbage');
$this->stripGarbage();
}
@@ -395,7 +452,7 @@ class Grabber
$nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) {
- Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
+ Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
}
@@ -407,9 +464,11 @@ class Grabber
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) {
- Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
+ Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
foreach ($nodes as $node) {
- $node->parentNode->removeChild($node);
+ if ($this->shouldRemove($dom, $node)) {
+ $node->parentNode->removeChild($node);
+ }
}
}
}
@@ -417,4 +476,31 @@ class Grabber
$this->content = $dom->saveXML($dom->documentElement);
}
}
+
+ /**
+ * Decide whether a node may be removed without discarding most of the document
+ *
+ * Compares the byte length of the node's text content with the byte length of
+ * the whole document's text content. A node holding 90% or more of the
+ * document text is kept (false is returned and the decision is logged);
+ * any other node may be removed (true). An empty document always yields true.
+ *
+ * @access public
+ * @param DomDocument $dom Document the node belongs to
+ * @param DomNode $node Candidate node for removal
+ * @return boolean True if the node can be removed, false if it must be kept
+ */
+ public function shouldRemove($dom, $node)
+ {
+ $document_length = strlen($dom->textContent);
+ $node_length = strlen($node->textContent);
+
+ if ($document_length === 0) { // guard: the ratio below divides by this length
+ return true;
+ }
+
+ $ratio = $node_length * 100 / $document_length; // percentage of the document text held by the node
+
+ if ($ratio >= 90) {
+ Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
+ return false;
+ }
+
+ return true;
+ }
}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php
index a74c235..fc1150c 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Url.php
@@ -67,7 +67,7 @@ class Url
if ($link->isRelativeUrl()) {
if ($link->isRelativePath()) {
- return $link->getAbsoluteUrl($website->getAbsoluteUrl());
+ return $link->getAbsoluteUrl($website->getBaseUrl($website->getBasePath()));
}
return $link->getAbsoluteUrl($website->getBaseUrl());
@@ -159,6 +159,22 @@ class Url
return empty($this->components['path']) ? '' : $this->components['path'];
}
+ /**
+ * Get the base path (directory part of the path, with a trailing slash)
+ *
+ * A path ending with "/" is kept as is; otherwise its last segment is dropped
+ * with dirname(). A "/" is prepended when the URL has a relative path, a "/"
+ * is appended, and each "//" pair is then collapsed to "/".
+ *
+ * Examples taken from the accompanying tests: "img/quakescope.jpg" gives
+ * "/img/", "http://foo/bar.html" gives "/", "http://foo/bar/" gives "/bar/".
+ *
+ * NOTE(review): dirname() returns "." for a path without any slash, so a bare
+ * relative file name presumably yields "/./" - verify.
+ * NOTE(review): str_replace() makes a single pass, so three or more
+ * consecutive slashes are not fully collapsed.
+ *
+ * @access public
+ * @return string
+ */
+ public function getBasePath()
+ {
+ $current_path = $this->getPath();
+
+ $path = $this->isRelativePath() ? '/' : '';
+ $path .= substr($current_path, -1) === '/' ? $current_path : dirname($current_path);
+
+ return str_replace('//', '/', $path.'/');
+ }
+
/**
* Get the full path (path + querystring + fragment)
*
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
index 44f0c8e..a25ef2e 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
@@ -168,8 +168,9 @@ abstract class Parser
$this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed);
- $this->scrapWebsite($item);
+ // Order is important (avoid double filtering)
$this->filterItemContent($feed, $item);
+ $this->scrapWebsite($item);
$feed->items[] = $item;
}
@@ -238,7 +239,7 @@ abstract class Parser
$grabber->download();
if ($grabber->parse()) {
- $item->content = $grabber->getContent() ?: $item->content;
+ $item->content = $grabber->getFilteredContent();
}
}
}
diff --git a/vendor/fguillot/picofeed/tests/Client/UrlTest.php b/vendor/fguillot/picofeed/tests/Client/UrlTest.php
index f7d67ea..90055b3 100644
--- a/vendor/fguillot/picofeed/tests/Client/UrlTest.php
+++ b/vendor/fguillot/picofeed/tests/Client/UrlTest.php
@@ -67,6 +67,9 @@ class UrlTest extends PHPUnit_Framework_TestCase
$url = new Url('//localhost/test?truc');
$this->assertEquals('http://localhost', $url->getBaseUrl());
+
+ $url = new Url('//localhost/test?truc');
+ $this->assertEquals('http://localhost', $url->getBaseUrl());
}
public function testIsRelativeUrl()
@@ -112,6 +115,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$url = new Url('anything');
$this->assertEquals('/anything', $url->getFullPath());
+ $url = new Url('foo/bar');
+ $this->assertEquals('/foo/bar', $url->getFullPath());
+
$url = new Url('index.php?foo=bar&test=1');
$this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath());
}
@@ -160,6 +166,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$url = new Url('https://127.0.0.1:8000/here/test?v=3');
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl());
+ $url = new Url('http://www.lofibucket.com/articles/oscilloscope_quake.html');
+ $this->assertEquals('http://www.lofibucket.com/articles/oscilloscope_quake.html', $url->getAbsoluteUrl());
+
$url = new Url('test?v=3');
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/'));
}
@@ -185,6 +194,27 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$this->assertFalse($url->isRelativePath());
}
+ public function testGetBasePath() // pins the directory-part rules of Url::getBasePath()
+ {
+ $url = new Url('img/quakescope.jpg'); // relative path: a leading slash is expected
+ $this->assertEquals('/img/', $url->getBasePath());
+
+ $url = new Url('http://foo/img/quakescope.jpg'); // absolute URL: file name dropped, directory kept
+ $this->assertEquals('/img/', $url->getBasePath());
+
+ $url = new Url('http://foo/bar.html'); // file directly under the root
+ $this->assertEquals('/', $url->getBasePath());
+
+ $url = new Url('http://foo/bar'); // no trailing slash: last segment is treated like a file
+ $this->assertEquals('/', $url->getBasePath());
+
+ $url = new Url('http://foo/bar/'); // trailing slash: the path is already a directory
+ $this->assertEquals('/bar/', $url->getBasePath());
+
+ $url = new Url('http://website/subfolder/img/foo.png'); // nested directories are preserved
+ $this->assertEquals('/subfolder/img/', $url->getBasePath());
+ }
+
public function testResolve()
{
$this->assertEquals(
@@ -216,5 +246,10 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
'http://website/img/foo.png',
Url::resolve('/img/foo.png', 'http://website/subfolder/')
);
+
+ $this->assertEquals(
+ 'http://www.lofibucket.com/articles/img/quakescope.jpg',
+ Url::resolve('img/quakescope.jpg', 'http://www.lofibucket.com/articles/oscilloscope_quake.html')
+ );
}
}