Update PicoFeed (improve grabber and bug fixes)

This commit is contained in:
Frédéric Guillot 2015-01-02 15:25:11 -05:00
parent da174a10cc
commit b7ca07b9c0
8 changed files with 216 additions and 76 deletions

2
vendor/autoload.php vendored
View File

@ -4,4 +4,4 @@
require_once __DIR__ . '/composer' . '/autoload_real.php';
return ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97::getLoader();
return ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae::getLoader();

View File

@ -2,7 +2,7 @@
// autoload_real.php @generated by Composer
class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
class ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae
{
private static $loader;
@ -19,9 +19,9 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
return self::$loader;
}
spl_autoload_register(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'), true, true);
spl_autoload_register(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'), true, true);
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
spl_autoload_unregister(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'));
spl_autoload_unregister(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'));
$map = require __DIR__ . '/autoload_namespaces.php';
foreach ($map as $namespace => $path) {
@ -42,14 +42,14 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
$includeFiles = require __DIR__ . '/autoload_files.php';
foreach ($includeFiles as $file) {
composerRequirec13b90f3b2e13ad59b988101eac1fc97($file);
composerRequire738d0ffba01de68eecc7cdccd108bcae($file);
}
return $loader;
}
}
function composerRequirec13b90f3b2e13ad59b988101eac1fc97($file)
function composerRequire738d0ffba01de68eecc7cdccd108bcae($file)
{
require $file;
}

View File

@ -116,45 +116,6 @@
"description": "Minimalist micro-framework",
"homepage": "https://github.com/fguillot/picoFarad"
},
{
"name": "fguillot/picofeed",
"version": "dev-master",
"version_normalized": "9999999-dev",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"time": "2015-01-02 16:39:51",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-0": {
"PicoFeed": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Unlicense"
],
"authors": [
{
"name": "Frédéric Guillot",
"homepage": "http://fredericguillot.com"
}
],
"description": "Modern library to write or read feeds (RSS/Atom)",
"homepage": "http://fguillot.github.io/picoFeed"
},
{
"name": "fguillot/picodb",
"version": "dev-master",
@ -193,5 +154,44 @@
],
"description": "Minimalist database query builder",
"homepage": "https://github.com/fguillot/picoDb"
},
{
"name": "fguillot/picofeed",
"version": "dev-master",
"version_normalized": "9999999-dev",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"time": "2015-01-02 20:21:50",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-0": {
"PicoFeed": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Unlicense"
],
"authors": [
{
"name": "Frédéric Guillot",
"homepage": "http://fredericguillot.com"
}
],
"description": "Modern library to write or read feeds (RSS/Atom)",
"homepage": "http://fguillot.github.io/picoFeed"
}
]

View File

@ -320,6 +320,8 @@ class Curl extends Client
throw new InvalidUrlException('Unable to resolve hostname');
case 7: // CURLE_COULDNT_CONNECT
throw new InvalidUrlException('Unable to connect to the remote host');
case 23: // CURLE_WRITE_ERROR
throw new MaxSizeException('Maximum response size exceeded');
case 28: // CURLE_OPERATION_TIMEDOUT
throw new TimeoutException('Operation timeout');
case 35: // CURLE_SSL_CONNECT_ERROR

View File

@ -48,6 +48,14 @@ class Grabber
*/
private $encoding = '';
/**
* Flag to skip download and parsing
*
* @access private
* @var boolean
*/
private $skip_processing = false;
/**
* List of attributes to try to get the content, order is important, generic terms at the end
*
@ -65,6 +73,7 @@ class Grabber
'post-content',
'post_content',
'entry-content',
'entry-body',
'main-content',
'story_content',
'storycontent',
@ -101,6 +110,10 @@ class Grabber
'related-posts',
'tweet',
'categories',
'post_title',
'by_line',
'byline',
'sponsors',
);
/**
@ -140,6 +153,9 @@ class Grabber
$this->url = $url;
$this->html = $html;
$this->encoding = $encoding;
$this->handleFiles();
$this->handleStreamingVideos();
}
/**
@ -185,11 +201,39 @@ class Grabber
*/
public function getFilteredContent()
{
$filter = Filter::html($this->content, Url::base($this->url));
$filter = Filter::html($this->content, $this->url);
$filter->setConfig($this->config);
return $filter->execute();
}
/**
 * Replace the content by the Youtube embed player and skip processing
 *
 * When the URL points to a Youtube video, the content becomes an iframe
 * that embeds the player and the skip flag is raised.
 *
 * @access public
 * @return void
 */
public function handleStreamingVideos()
{
    // Only Youtube hosts qualify: the video id pattern alone also matches unrelated
    // URLs such as "http://example.org/dev/documentation" ("v/" followed by 11 characters)
    if (! preg_match('#^(?:https?:)?(?://)?(?:[a-z0-9-]+\.)*(?:youtube\.com|youtu\.be)(?::\d+)?/#i', $this->url)) {
        return;
    }

    // The video id is made of the 11 characters that follow "v=", "v/", "vi=", "vi/" or "youtu.be/"
    if (preg_match('#(?<=v=|v/|vi=|vi/|youtu\.be/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
        $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
        $this->skip_processing = true;
    }
}
/**
 * Skip processing for PDF documents
 *
 * The decision is based on the file extension found in the URL path,
 * compared case-insensitively ("report.PDF" is a PDF too).
 *
 * @access public
 * @return void
 */
public function handleFiles()
{
    // Look at the path only, a query string or a fragment must not hide the extension
    // (parse_url() returns null when there is no path and false for a malformed URL)
    $path = parse_url($this->url, PHP_URL_PATH);

    if (is_string($path) && strtolower(pathinfo($path, PATHINFO_EXTENSION)) === 'pdf') {
        $this->skip_processing = true;
        Logger::setMessage(get_called_class().': PDF document => processing skipped');
    }
}
/**
* Parse the HTML content
*
@ -198,32 +242,36 @@ class Grabber
*/
public function parse()
{
if ($this->skip_processing) {
return true;
}
if ($this->html) {
Logger::setMessage(get_called_class().' Fix encoding');
Logger::setMessage(get_called_class().': Fix encoding');
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Encoding::convert($this->html, $this->encoding);
$this->html = Filter::stripHeadTags($this->html);
Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');
Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
$rules = $this->getRules();
if (is_array($rules)) {
Logger::setMessage(get_called_class().' Parse content with rules');
Logger::setMessage(get_called_class().': Parse content with rules');
$this->parseContentWithRules($rules);
}
else {
Logger::setMessage(get_called_class().' Parse content with candidates');
Logger::setMessage(get_called_class().': Parse content with candidates');
$this->parseContentWithCandidates();
}
}
else {
Logger::setMessage(get_called_class().' No content fetched');
Logger::setMessage(get_called_class().': No content fetched');
}
Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');
Logger::setMessage(get_called_class().' Grabber done');
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
Logger::setMessage(get_called_class().': Grabber done');
return $this->content !== '';
}
@ -236,18 +284,21 @@ class Grabber
*/
public function download()
{
try {
if (! $this->skip_processing) {
$client = Client::getInstance();
$client->setConfig($this->config);
$client->execute($this->url);
try {
$this->url = $client->getUrl();
$this->html = $client->getContent();
$this->encoding = $client->getEncoding();
}
catch (ClientException $e) {
Logger::setMessage(get_called_class().': '.$e->getMessage());
$client = Client::getInstance();
$client->setConfig($this->config);
$client->execute($this->url);
$this->url = $client->getUrl();
$this->html = $client->getContent();
$this->encoding = $client->getEncoding();
}
catch (ClientException $e) {
Logger::setMessage(get_called_class().': '.$e->getMessage());
}
}
return $this->html;
@ -346,34 +397,40 @@ class Grabber
// Try to lookup in each tag
foreach ($this->candidatesAttributes as $candidate) {
Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');
Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
break;
}
}
// Try to fetch <article/>
if (! $this->content) {
if (strlen($this->content) < 200) {
$nodes = $xpath->query('//article');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
Logger::setMessage(get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
}
}
// Get everything
if (strlen($this->content) < 50) {
Logger::setMessage(get_called_class().' No enought content fetched, get the full body');
$this->content = $dom->saveXML($dom->firstChild);
$nodes = $xpath->query('//body');
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().' No enought content fetched, get //body');
$this->content = $dom->saveXML($nodes->item(0));
}
}
Logger::setMessage(get_called_class().' Strip garbage');
Logger::setMessage(get_called_class().': Strip garbage');
$this->stripGarbage();
}
@ -395,7 +452,7 @@ class Grabber
$nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
}
@ -407,9 +464,11 @@ class Grabber
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
if ($this->shouldRemove($dom, $node)) {
$node->parentNode->removeChild($node);
}
}
}
}
@ -417,4 +476,31 @@ class Grabber
$this->content = $dom->saveXML($dom->documentElement);
}
}
/**
 * Tell if a node can be safely removed from the document
 *
 * A node is kept (false is returned) when its text accounts for at least
 * 90% of the text of the whole document; an empty document never protects
 * its nodes.
 *
 * @access public
 * @param  DomDocument   $dom
 * @param  DomNode       $node
 * @return boolean       True when the node can be removed
 */
public function shouldRemove($dom, $node)
{
    $total_bytes = strlen($dom->textContent);
    $node_bytes = strlen($node->textContent);

    // Nothing to compare against (and no division by zero)
    if ($total_bytes === 0) {
        return true;
    }

    // Share of the document text held by this node, in percent
    $ratio = $node_bytes * 100 / $total_bytes;
    $removable = $ratio < 90;

    if (! $removable) {
        Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
    }

    return $removable;
}
}

View File

@ -67,7 +67,7 @@ class Url
if ($link->isRelativeUrl()) {
if ($link->isRelativePath()) {
return $link->getAbsoluteUrl($website->getAbsoluteUrl());
return $link->getAbsoluteUrl($website->getBaseUrl($website->getBasePath()));
}
return $link->getAbsoluteUrl($website->getBaseUrl());
@ -159,6 +159,22 @@ class Url
return empty($this->components['path']) ? '' : $this->components['path'];
}
/**
 * Get the base path
 *
 * The base path is the directory part of the path: everything up to the
 * last slash, always starting and ending with a slash
 * ("/img/foo.png" => "/img/", "/bar/" => "/bar/", "/bar" => "/").
 *
 * @access public
 * @return string
 */
public function getBasePath()
{
    $current_path = $this->getPath();

    // Cut at the last slash instead of calling dirname(): dirname() returns "."
    // for a path without directory and "\" for a root-level path on Windows
    $last_slash = strrpos($current_path, '/');
    $directory = $last_slash === false ? '' : substr($current_path, 0, $last_slash + 1);

    // A relative path has no leading slash, add one
    $prefix = $this->isRelativePath() ? '/' : '';

    // Collapse any run of slashes produced by the concatenation
    return preg_replace('#/{2,}#', '/', $prefix.$directory.'/');
}
/**
* Get the full path (path + querystring + fragment)
*

View File

@ -168,8 +168,9 @@ abstract class Parser
$this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed);
$this->scrapWebsite($item);
// Order is important (avoid double filtering)
$this->filterItemContent($feed, $item);
$this->scrapWebsite($item);
$feed->items[] = $item;
}
@ -238,7 +239,7 @@ abstract class Parser
$grabber->download();
if ($grabber->parse()) {
$item->content = $grabber->getContent() ?: $item->content;
$item->content = $grabber->getFilteredContent();
}
}
}

View File

@ -67,6 +67,9 @@ class UrlTest extends PHPUnit_Framework_TestCase
$url = new Url('//localhost/test?truc');
$this->assertEquals('http://localhost', $url->getBaseUrl());
$url = new Url('//localhost/test?truc');
$this->assertEquals('http://localhost', $url->getBaseUrl());
}
public function testIsRelativeUrl()
@ -112,6 +115,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$url = new Url('anything');
$this->assertEquals('/anything', $url->getFullPath());
$url = new Url('foo/bar');
$this->assertEquals('/foo/bar', $url->getFullPath());
$url = new Url('index.php?foo=bar&test=1');
$this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath());
}
@ -160,6 +166,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$url = new Url('https://127.0.0.1:8000/here/test?v=3');
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl());
$url = new Url('http://www.lofibucket.com/articles/oscilloscope_quake.html');
$this->assertEquals('http://www.lofibucket.com/articles/oscilloscope_quake.html', $url->getAbsoluteUrl());
$url = new Url('test?v=3');
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/'));
}
@ -185,6 +194,27 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
$this->assertFalse($url->isRelativePath());
}
/**
 * The base path is the directory part of the URL path, with a leading and a trailing slash
 */
public function testGetBasePath()
{
    // URL => expected base path
    $expectations = array(
        'img/quakescope.jpg' => '/img/',
        'http://foo/img/quakescope.jpg' => '/img/',
        'http://foo/bar.html' => '/',
        'http://foo/bar' => '/',
        'http://foo/bar/' => '/bar/',
        'http://website/subfolder/img/foo.png' => '/subfolder/img/',
    );

    foreach ($expectations as $location => $base_path) {
        $url = new Url($location);
        $this->assertEquals($base_path, $url->getBasePath());
    }
}
public function testResolve()
{
$this->assertEquals(
@ -216,5 +246,10 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
'http://website/img/foo.png',
Url::resolve('/img/foo.png', 'http://website/subfolder/')
);
$this->assertEquals(
'http://www.lofibucket.com/articles/img/quakescope.jpg',
Url::resolve('img/quakescope.jpg', 'http://www.lofibucket.com/articles/oscilloscope_quake.html')
);
}
}