Update PicoFeed (improve grabber and bug fixes)

This commit is contained in:
Frédéric Guillot 2015-01-02 15:25:11 -05:00
parent da174a10cc
commit b7ca07b9c0
8 changed files with 216 additions and 76 deletions

2
vendor/autoload.php vendored
View File

@ -4,4 +4,4 @@
require_once __DIR__ . '/composer' . '/autoload_real.php'; require_once __DIR__ . '/composer' . '/autoload_real.php';
return ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97::getLoader(); return ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae::getLoader();

View File

@ -2,7 +2,7 @@
// autoload_real.php @generated by Composer // autoload_real.php @generated by Composer
class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97 class ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae
{ {
private static $loader; private static $loader;
@ -19,9 +19,9 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
return self::$loader; return self::$loader;
} }
spl_autoload_register(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'), true, true); spl_autoload_register(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'), true, true);
self::$loader = $loader = new \Composer\Autoload\ClassLoader(); self::$loader = $loader = new \Composer\Autoload\ClassLoader();
spl_autoload_unregister(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader')); spl_autoload_unregister(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'));
$map = require __DIR__ . '/autoload_namespaces.php'; $map = require __DIR__ . '/autoload_namespaces.php';
foreach ($map as $namespace => $path) { foreach ($map as $namespace => $path) {
@ -42,14 +42,14 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
$includeFiles = require __DIR__ . '/autoload_files.php'; $includeFiles = require __DIR__ . '/autoload_files.php';
foreach ($includeFiles as $file) { foreach ($includeFiles as $file) {
composerRequirec13b90f3b2e13ad59b988101eac1fc97($file); composerRequire738d0ffba01de68eecc7cdccd108bcae($file);
} }
return $loader; return $loader;
} }
} }
function composerRequirec13b90f3b2e13ad59b988101eac1fc97($file) function composerRequire738d0ffba01de68eecc7cdccd108bcae($file)
{ {
require $file; require $file;
} }

View File

@ -116,45 +116,6 @@
"description": "Minimalist micro-framework", "description": "Minimalist micro-framework",
"homepage": "https://github.com/fguillot/picoFarad" "homepage": "https://github.com/fguillot/picoFarad"
}, },
{
"name": "fguillot/picofeed",
"version": "dev-master",
"version_normalized": "9999999-dev",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"time": "2015-01-02 16:39:51",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-0": {
"PicoFeed": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Unlicense"
],
"authors": [
{
"name": "Frédéric Guillot",
"homepage": "http://fredericguillot.com"
}
],
"description": "Modern library to write or read feeds (RSS/Atom)",
"homepage": "http://fguillot.github.io/picoFeed"
},
{ {
"name": "fguillot/picodb", "name": "fguillot/picodb",
"version": "dev-master", "version": "dev-master",
@ -193,5 +154,44 @@
], ],
"description": "Minimalist database query builder", "description": "Minimalist database query builder",
"homepage": "https://github.com/fguillot/picoDb" "homepage": "https://github.com/fguillot/picoDb"
},
{
"name": "fguillot/picofeed",
"version": "dev-master",
"version_normalized": "9999999-dev",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"time": "2015-01-02 20:21:50",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-0": {
"PicoFeed": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Unlicense"
],
"authors": [
{
"name": "Frédéric Guillot",
"homepage": "http://fredericguillot.com"
}
],
"description": "Modern library to write or read feeds (RSS/Atom)",
"homepage": "http://fguillot.github.io/picoFeed"
} }
] ]

View File

@ -320,6 +320,8 @@ class Curl extends Client
throw new InvalidUrlException('Unable to resolve hostname'); throw new InvalidUrlException('Unable to resolve hostname');
case 7: // CURLE_COULDNT_CONNECT case 7: // CURLE_COULDNT_CONNECT
throw new InvalidUrlException('Unable to connect to the remote host'); throw new InvalidUrlException('Unable to connect to the remote host');
case 23: // CURLE_WRITE_ERROR
throw new MaxSizeException('Maximum response size exceeded');
case 28: // CURLE_OPERATION_TIMEDOUT case 28: // CURLE_OPERATION_TIMEDOUT
throw new TimeoutException('Operation timeout'); throw new TimeoutException('Operation timeout');
case 35: // CURLE_SSL_CONNECT_ERROR case 35: // CURLE_SSL_CONNECT_ERROR

View File

@ -48,6 +48,14 @@ class Grabber
*/ */
private $encoding = ''; private $encoding = '';
/**
* Flag to skip download and parsing
*
* @access private
* @var boolean
*/
private $skip_processing = false;
/** /**
* List of attributes to try to get the content, order is important, generic terms at the end * List of attributes to try to get the content, order is important, generic terms at the end
* *
@ -65,6 +73,7 @@ class Grabber
'post-content', 'post-content',
'post_content', 'post_content',
'entry-content', 'entry-content',
'entry-body',
'main-content', 'main-content',
'story_content', 'story_content',
'storycontent', 'storycontent',
@ -101,6 +110,10 @@ class Grabber
'related-posts', 'related-posts',
'tweet', 'tweet',
'categories', 'categories',
'post_title',
'by_line',
'byline',
'sponsors',
); );
/** /**
@ -140,6 +153,9 @@ class Grabber
$this->url = $url; $this->url = $url;
$this->html = $html; $this->html = $html;
$this->encoding = $encoding; $this->encoding = $encoding;
$this->handleFiles();
$this->handleStreamingVideos();
} }
/** /**
@ -185,11 +201,39 @@ class Grabber
*/ */
public function getFilteredContent() public function getFilteredContent()
{ {
$filter = Filter::html($this->content, Url::base($this->url)); $filter = Filter::html($this->content, $this->url);
$filter->setConfig($this->config); $filter->setConfig($this->config);
return $filter->execute(); return $filter->execute();
} }
/**
 * Replace the content by the Youtube embed player and skip processing
 *
 * When the URL points to a Youtube video, the content becomes an iframe
 * and the skip flag is raised, so the page is neither downloaded nor parsed.
 *
 * @access public
 * @return void
 */
public function handleStreamingVideos()
{
    $host = parse_url($this->url, PHP_URL_HOST);

    // Only Youtube hosts are eligible: a "v=" parameter exists on many unrelated websites
    if (! is_string($host) || ! preg_match('#(?:^|\.)(?:youtube\.com|youtube-nocookie\.com|youtu\.be)$#i', $host)) {
        return;
    }

    // The video id is 11 characters long; parameter names are anchored on their
    // delimiter to avoid matching something like "rev=" and the dot of "youtu.be" is escaped
    if (preg_match('#(?<=[?&]v=|/v/|[?&]vi=|/vi/|youtu\.be/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
        $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
        $this->skip_processing = true;
    }
}
/**
 * Skip processing for PDF documents
 *
 * The check is done on the extension of the URL path (case-insensitive),
 * so "file.PDF" and "file.pdf?download=1" are detected while a URL that
 * merely ends with the letters "pdf" (by example "/getpdf") is not.
 *
 * @access public
 * @return void
 */
public function handleFiles()
{
    $path = parse_url($this->url, PHP_URL_PATH);

    // parse_url() returns false or null for a malformed URL or a missing path: use the raw URL
    if (! is_string($path)) {
        $path = $this->url;
    }

    if (strtolower(pathinfo($path, PATHINFO_EXTENSION)) === 'pdf') {
        $this->skip_processing = true;
        Logger::setMessage(get_called_class().': PDF document => processing skipped');
    }
}
/** /**
* Parse the HTML content * Parse the HTML content
* *
@ -198,32 +242,36 @@ class Grabber
*/ */
public function parse() public function parse()
{ {
if ($this->skip_processing) {
return true;
}
if ($this->html) { if ($this->html) {
Logger::setMessage(get_called_class().' Fix encoding'); Logger::setMessage(get_called_class().': Fix encoding');
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"'); Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Encoding::convert($this->html, $this->encoding); $this->html = Encoding::convert($this->html, $this->encoding);
$this->html = Filter::stripHeadTags($this->html); $this->html = Filter::stripHeadTags($this->html);
Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes'); Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
$rules = $this->getRules(); $rules = $this->getRules();
if (is_array($rules)) { if (is_array($rules)) {
Logger::setMessage(get_called_class().' Parse content with rules'); Logger::setMessage(get_called_class().': Parse content with rules');
$this->parseContentWithRules($rules); $this->parseContentWithRules($rules);
} }
else { else {
Logger::setMessage(get_called_class().' Parse content with candidates'); Logger::setMessage(get_called_class().': Parse content with candidates');
$this->parseContentWithCandidates(); $this->parseContentWithCandidates();
} }
} }
else { else {
Logger::setMessage(get_called_class().' No content fetched'); Logger::setMessage(get_called_class().': No content fetched');
} }
Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes'); Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
Logger::setMessage(get_called_class().' Grabber done'); Logger::setMessage(get_called_class().': Grabber done');
return $this->content !== ''; return $this->content !== '';
} }
@ -236,18 +284,21 @@ class Grabber
*/ */
public function download() public function download()
{ {
try { if (! $this->skip_processing) {
$client = Client::getInstance(); try {
$client->setConfig($this->config);
$client->execute($this->url);
$this->url = $client->getUrl(); $client = Client::getInstance();
$this->html = $client->getContent(); $client->setConfig($this->config);
$this->encoding = $client->getEncoding(); $client->execute($this->url);
}
catch (ClientException $e) { $this->url = $client->getUrl();
Logger::setMessage(get_called_class().': '.$e->getMessage()); $this->html = $client->getContent();
$this->encoding = $client->getEncoding();
}
catch (ClientException $e) {
Logger::setMessage(get_called_class().': '.$e->getMessage());
}
} }
return $this->html; return $this->html;
@ -346,34 +397,40 @@ class Grabber
// Try to lookup in each tag // Try to lookup in each tag
foreach ($this->candidatesAttributes as $candidate) { foreach ($this->candidatesAttributes as $candidate) {
Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"'); Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0)); $this->content = $dom->saveXML($nodes->item(0));
Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)'); Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
break; break;
} }
} }
// Try to fetch <article/> // Try to fetch <article/>
if (! $this->content) { if (strlen($this->content) < 200) {
$nodes = $xpath->query('//article'); $nodes = $xpath->query('//article');
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0)); $this->content = $dom->saveXML($nodes->item(0));
Logger::setMessage(get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)'); Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
} }
} }
// Get everything
if (strlen($this->content) < 50) { if (strlen($this->content) < 50) {
Logger::setMessage(get_called_class().' No enought content fetched, get the full body');
$this->content = $dom->saveXML($dom->firstChild); $nodes = $xpath->query('//body');
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().' No enought content fetched, get //body');
$this->content = $dom->saveXML($nodes->item(0));
}
} }
Logger::setMessage(get_called_class().' Strip garbage'); Logger::setMessage(get_called_class().': Strip garbage');
$this->stripGarbage(); $this->stripGarbage();
} }
@ -395,7 +452,7 @@ class Grabber
$nodes = $xpath->query('//'.$tag); $nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"'); Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
foreach ($nodes as $node) { foreach ($nodes as $node) {
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
@ -407,9 +464,11 @@ class Grabber
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"'); Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
foreach ($nodes as $node) { foreach ($nodes as $node) {
$node->parentNode->removeChild($node); if ($this->shouldRemove($dom, $node)) {
$node->parentNode->removeChild($node);
}
} }
} }
} }
@ -417,4 +476,31 @@ class Grabber
$this->content = $dom->saveXML($dom->documentElement); $this->content = $dom->saveXML($dom->documentElement);
} }
} }
/**
 * Tell if a node matched by a strip rule can be removed from the document
 *
 * The text length of the node is compared to the text length of the whole
 * document: a node that holds 90% or more of the text is kept.
 *
 * @access public
 * @param  DomDocument    $dom     Document that contains the node
 * @param  DomNode        $node    Node candidate for removal
 * @return boolean                 False if the node should not be removed
 */
public function shouldRemove($dom, $node)
{
    $total = strlen($dom->textContent);

    // Empty document: there is no text to protect
    if ($total === 0) {
        return true;
    }

    $ratio = strlen($node->textContent) * 100 / $total;
    $keep = $ratio >= 90;

    if ($keep) {
        Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
    }

    return ! $keep;
}
} }

View File

@ -67,7 +67,7 @@ class Url
if ($link->isRelativeUrl()) { if ($link->isRelativeUrl()) {
if ($link->isRelativePath()) { if ($link->isRelativePath()) {
return $link->getAbsoluteUrl($website->getAbsoluteUrl()); return $link->getAbsoluteUrl($website->getBaseUrl($website->getBasePath()));
} }
return $link->getAbsoluteUrl($website->getBaseUrl()); return $link->getAbsoluteUrl($website->getBaseUrl());
@ -159,6 +159,22 @@ class Url
return empty($this->components['path']) ? '' : $this->components['path']; return empty($this->components['path']) ? '' : $this->components['path'];
} }
/**
 * Get the base path (directory part of the path, always ends with a slash)
 *
 * Examples: "/img/quakescope.jpg" => "/img/", "/bar.html" => "/", "/bar/" => "/bar/"
 *
 * @access public
 * @return string
 */
public function getBasePath()
{
    $current_path = $this->getPath();

    // Keep everything up to the last slash instead of calling dirname():
    // dirname() returns "." when there is no directory and may return a backslash on Windows
    $last_slash = strrpos($current_path, '/');
    $directory = $last_slash === false ? '' : substr($current_path, 0, $last_slash + 1);

    $path = $this->isRelativePath() ? '/' : '';
    $path .= $directory.'/';

    // Collapse any run of slashes (a single str_replace() pass can leave "//" behind)
    return preg_replace('#/{2,}#', '/', $path);
}
/** /**
* Get the full path (path + querystring + fragment) * Get the full path (path + querystring + fragment)
* *

View File

@ -168,8 +168,9 @@ abstract class Parser
$this->findItemEnclosure($entry, $item, $feed); $this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed); $this->findItemLanguage($entry, $item, $feed);
$this->scrapWebsite($item); // Order is important (avoid double filtering)
$this->filterItemContent($feed, $item); $this->filterItemContent($feed, $item);
$this->scrapWebsite($item);
$feed->items[] = $item; $feed->items[] = $item;
} }
@ -238,7 +239,7 @@ abstract class Parser
$grabber->download(); $grabber->download();
if ($grabber->parse()) { if ($grabber->parse()) {
$item->content = $grabber->getContent() ?: $item->content; $item->content = $grabber->getFilteredContent();
} }
} }
} }

View File

@ -67,6 +67,9 @@ class UrlTest extends PHPUnit_Framework_TestCase
$url = new Url('//localhost/test?truc'); $url = new Url('//localhost/test?truc');
$this->assertEquals('http://localhost', $url->getBaseUrl()); $this->assertEquals('http://localhost', $url->getBaseUrl());
$url = new Url('//localhost/test?truc');
$this->assertEquals('http://localhost', $url->getBaseUrl());
} }
public function testIsRelativeUrl() public function testIsRelativeUrl()
@ -112,6 +115,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$url = new Url('anything'); $url = new Url('anything');
$this->assertEquals('/anything', $url->getFullPath()); $this->assertEquals('/anything', $url->getFullPath());
$url = new Url('foo/bar');
$this->assertEquals('/foo/bar', $url->getFullPath());
$url = new Url('index.php?foo=bar&test=1'); $url = new Url('index.php?foo=bar&test=1');
$this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath()); $this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath());
} }
@ -160,6 +166,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$url = new Url('https://127.0.0.1:8000/here/test?v=3'); $url = new Url('https://127.0.0.1:8000/here/test?v=3');
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl()); $this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl());
$url = new Url('http://www.lofibucket.com/articles/oscilloscope_quake.html');
$this->assertEquals('http://www.lofibucket.com/articles/oscilloscope_quake.html', $url->getAbsoluteUrl());
$url = new Url('test?v=3'); $url = new Url('test?v=3');
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/')); $this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/'));
} }
@ -185,6 +194,27 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$this->assertFalse($url->isRelativePath()); $this->assertFalse($url->isRelativePath());
} }
public function testGetBasePath()
{
    // URL => expected base path
    $expectations = array(
        'img/quakescope.jpg' => '/img/',
        'http://foo/img/quakescope.jpg' => '/img/',
        'http://foo/bar.html' => '/',
        'http://foo/bar' => '/',
        'http://foo/bar/' => '/bar/',
        'http://website/subfolder/img/foo.png' => '/subfolder/img/',
    );

    foreach ($expectations as $input => $base_path) {
        $url = new Url($input);
        $this->assertEquals($base_path, $url->getBasePath());
    }
}
public function testResolve() public function testResolve()
{ {
$this->assertEquals( $this->assertEquals(
@ -216,5 +246,10 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
'http://website/img/foo.png', 'http://website/img/foo.png',
Url::resolve('/img/foo.png', 'http://website/subfolder/') Url::resolve('/img/foo.png', 'http://website/subfolder/')
); );
$this->assertEquals(
'http://www.lofibucket.com/articles/img/quakescope.jpg',
Url::resolve('img/quakescope.jpg', 'http://www.lofibucket.com/articles/oscilloscope_quake.html')
);
} }
} }