Update PicoFeed (improve grabber and bug fixes)
This commit is contained in:
parent
da174a10cc
commit
b7ca07b9c0
2
vendor/autoload.php
vendored
2
vendor/autoload.php
vendored
@ -4,4 +4,4 @@
|
||||
|
||||
require_once __DIR__ . '/composer' . '/autoload_real.php';
|
||||
|
||||
return ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97::getLoader();
|
||||
return ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae::getLoader();
|
||||
|
10
vendor/composer/autoload_real.php
vendored
10
vendor/composer/autoload_real.php
vendored
@ -2,7 +2,7 @@
|
||||
|
||||
// autoload_real.php @generated by Composer
|
||||
|
||||
class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
|
||||
class ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae
|
||||
{
|
||||
private static $loader;
|
||||
|
||||
@ -19,9 +19,9 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
|
||||
return self::$loader;
|
||||
}
|
||||
|
||||
spl_autoload_register(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'), true, true);
|
||||
spl_autoload_register(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'), true, true);
|
||||
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
|
||||
spl_autoload_unregister(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'));
|
||||
spl_autoload_unregister(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'));
|
||||
|
||||
$map = require __DIR__ . '/autoload_namespaces.php';
|
||||
foreach ($map as $namespace => $path) {
|
||||
@ -42,14 +42,14 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
|
||||
|
||||
$includeFiles = require __DIR__ . '/autoload_files.php';
|
||||
foreach ($includeFiles as $file) {
|
||||
composerRequirec13b90f3b2e13ad59b988101eac1fc97($file);
|
||||
composerRequire738d0ffba01de68eecc7cdccd108bcae($file);
|
||||
}
|
||||
|
||||
return $loader;
|
||||
}
|
||||
}
|
||||
|
||||
function composerRequirec13b90f3b2e13ad59b988101eac1fc97($file)
|
||||
function composerRequire738d0ffba01de68eecc7cdccd108bcae($file)
|
||||
{
|
||||
require $file;
|
||||
}
|
||||
|
78
vendor/composer/installed.json
vendored
78
vendor/composer/installed.json
vendored
@ -116,45 +116,6 @@
|
||||
"description": "Minimalist micro-framework",
|
||||
"homepage": "https://github.com/fguillot/picoFarad"
|
||||
},
|
||||
{
|
||||
"name": "fguillot/picofeed",
|
||||
"version": "dev-master",
|
||||
"version_normalized": "9999999-dev",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/fguillot/picoFeed.git",
|
||||
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
|
||||
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.3.0"
|
||||
},
|
||||
"time": "2015-01-02 16:39:51",
|
||||
"type": "library",
|
||||
"installation-source": "dist",
|
||||
"autoload": {
|
||||
"psr-0": {
|
||||
"PicoFeed": "lib/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"Unlicense"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Frédéric Guillot",
|
||||
"homepage": "http://fredericguillot.com"
|
||||
}
|
||||
],
|
||||
"description": "Modern library to write or read feeds (RSS/Atom)",
|
||||
"homepage": "http://fguillot.github.io/picoFeed"
|
||||
},
|
||||
{
|
||||
"name": "fguillot/picodb",
|
||||
"version": "dev-master",
|
||||
@ -193,5 +154,44 @@
|
||||
],
|
||||
"description": "Minimalist database query builder",
|
||||
"homepage": "https://github.com/fguillot/picoDb"
|
||||
},
|
||||
{
|
||||
"name": "fguillot/picofeed",
|
||||
"version": "dev-master",
|
||||
"version_normalized": "9999999-dev",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/fguillot/picoFeed.git",
|
||||
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
|
||||
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.3.0"
|
||||
},
|
||||
"time": "2015-01-02 20:21:50",
|
||||
"type": "library",
|
||||
"installation-source": "dist",
|
||||
"autoload": {
|
||||
"psr-0": {
|
||||
"PicoFeed": "lib/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"Unlicense"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Frédéric Guillot",
|
||||
"homepage": "http://fredericguillot.com"
|
||||
}
|
||||
],
|
||||
"description": "Modern library to write or read feeds (RSS/Atom)",
|
||||
"homepage": "http://fguillot.github.io/picoFeed"
|
||||
}
|
||||
]
|
||||
|
@ -320,6 +320,8 @@ class Curl extends Client
|
||||
throw new InvalidUrlException('Unable to resolve hostname');
|
||||
case 7: // CURLE_COULDNT_CONNECT
|
||||
throw new InvalidUrlException('Unable to connect to the remote host');
|
||||
case 23: // CURLE_WRITE_ERROR
|
||||
throw new MaxSizeException('Maximum response size exceeded');
|
||||
case 28: // CURLE_OPERATION_TIMEDOUT
|
||||
throw new TimeoutException('Operation timeout');
|
||||
case 35: // CURLE_SSL_CONNECT_ERROR
|
||||
|
@ -48,6 +48,14 @@ class Grabber
|
||||
*/
|
||||
private $encoding = '';
|
||||
|
||||
/**
|
||||
* Flag to skip download and parsing
|
||||
*
|
||||
* @access private
|
||||
* @var boolean
|
||||
*/
|
||||
private $skip_processing = false;
|
||||
|
||||
/**
|
||||
* List of attributes to try to get the content, order is important, generic terms at the end
|
||||
*
|
||||
@ -65,6 +73,7 @@ class Grabber
|
||||
'post-content',
|
||||
'post_content',
|
||||
'entry-content',
|
||||
'entry-body',
|
||||
'main-content',
|
||||
'story_content',
|
||||
'storycontent',
|
||||
@ -101,6 +110,10 @@ class Grabber
|
||||
'related-posts',
|
||||
'tweet',
|
||||
'categories',
|
||||
'post_title',
|
||||
'by_line',
|
||||
'byline',
|
||||
'sponsors',
|
||||
);
|
||||
|
||||
/**
|
||||
@ -140,6 +153,9 @@ class Grabber
|
||||
$this->url = $url;
|
||||
$this->html = $html;
|
||||
$this->encoding = $encoding;
|
||||
|
||||
$this->handleFiles();
|
||||
$this->handleStreamingVideos();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -185,11 +201,39 @@ class Grabber
|
||||
*/
|
||||
public function getFilteredContent()
|
||||
{
|
||||
$filter = Filter::html($this->content, Url::base($this->url));
|
||||
$filter = Filter::html($this->content, $this->url);
|
||||
$filter->setConfig($this->config);
|
||||
return $filter->execute();
|
||||
}
|
||||
|
||||
/**
 * Replace the content by the Youtube embed player and skip processing
 *
 * The video id is only looked up when the URL host belongs to Youtube,
 * otherwise any website URL carrying a "v=xxxxxxxxxxx" parameter would be
 * turned into a Youtube player.
 *
 * On match, $this->content is set to the <iframe> markup and
 * $this->skip_processing is set to true. Nothing happens otherwise.
 *
 * @access public
 * @return void
 */
public function handleStreamingVideos()
{
    // Host must be youtube.com, youtube-nocookie.com, youtu.be or one of their subdomains
    $youtube_host = '~^(?:(?:https?:)?//)?(?:[a-z0-9-]+\.)*(?:youtube(?:-nocookie)?\.com|youtu\.be)(?:[/:?#]|$)~i';

    if (! preg_match($youtube_host, $this->url)) {
        return;
    }

    // The video id is 11 characters long, the dot of "youtu.be" is escaped to match a literal dot only
    $video_id = '#(?<=v=|v/|vi=|vi/|embed/|youtu\.be/)[a-zA-Z0-9_-]{11}#';

    if (preg_match($video_id, $this->url, $matches)) {
        $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
        $this->skip_processing = true;
    }
}
|
||||
|
||||
/**
 * Skip processing for PDF documents
 *
 * A URL is considered to be a PDF document when the extension of its path
 * is "pdf" (query string and fragment are ignored) or when the URL itself
 * ends with "pdf". Both checks are case-insensitive.
 *
 * On match, $this->skip_processing is set to true and a message is logged.
 *
 * @access public
 * @return void
 */
public function handleFiles()
{
    // Extension of the path only, so "document.pdf?download=1" is detected
    $path = parse_url($this->url, PHP_URL_PATH);
    $extension = is_string($path) ? strtolower(pathinfo($path, PATHINFO_EXTENSION)) : '';

    // Historical check on the end of the whole URL, kept so every URL skipped before is still skipped
    $suffix = strtolower(substr($this->url, -3));

    if ($extension === 'pdf' || $suffix === 'pdf') {
        $this->skip_processing = true;
        Logger::setMessage(get_called_class().': PDF document => processing skipped');
    }
}
|
||||
|
||||
/**
|
||||
* Parse the HTML content
|
||||
*
|
||||
@ -198,32 +242,36 @@ class Grabber
|
||||
*/
|
||||
public function parse()
|
||||
{
|
||||
if ($this->skip_processing) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ($this->html) {
|
||||
|
||||
Logger::setMessage(get_called_class().' Fix encoding');
|
||||
Logger::setMessage(get_called_class().': Fix encoding');
|
||||
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
|
||||
|
||||
$this->html = Encoding::convert($this->html, $this->encoding);
|
||||
$this->html = Filter::stripHeadTags($this->html);
|
||||
|
||||
Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');
|
||||
Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
|
||||
$rules = $this->getRules();
|
||||
|
||||
if (is_array($rules)) {
|
||||
Logger::setMessage(get_called_class().' Parse content with rules');
|
||||
Logger::setMessage(get_called_class().': Parse content with rules');
|
||||
$this->parseContentWithRules($rules);
|
||||
}
|
||||
else {
|
||||
Logger::setMessage(get_called_class().' Parse content with candidates');
|
||||
Logger::setMessage(get_called_class().': Parse content with candidates');
|
||||
$this->parseContentWithCandidates();
|
||||
}
|
||||
}
|
||||
else {
|
||||
Logger::setMessage(get_called_class().' No content fetched');
|
||||
Logger::setMessage(get_called_class().': No content fetched');
|
||||
}
|
||||
|
||||
Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');
|
||||
Logger::setMessage(get_called_class().' Grabber done');
|
||||
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
|
||||
Logger::setMessage(get_called_class().': Grabber done');
|
||||
|
||||
return $this->content !== '';
|
||||
}
|
||||
@ -236,6 +284,8 @@ class Grabber
|
||||
*/
|
||||
public function download()
|
||||
{
|
||||
if (! $this->skip_processing) {
|
||||
|
||||
try {
|
||||
|
||||
$client = Client::getInstance();
|
||||
@ -249,6 +299,7 @@ class Grabber
|
||||
catch (ClientException $e) {
|
||||
Logger::setMessage(get_called_class().': '.$e->getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
return $this->html;
|
||||
}
|
||||
@ -346,34 +397,40 @@ class Grabber
|
||||
// Try to lookup in each tag
|
||||
foreach ($this->candidatesAttributes as $candidate) {
|
||||
|
||||
Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');
|
||||
Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
|
||||
|
||||
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
||||
|
||||
if ($nodes !== false && $nodes->length > 0) {
|
||||
$this->content = $dom->saveXML($nodes->item(0));
|
||||
Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
|
||||
Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Try to fetch <article/>
|
||||
if (! $this->content) {
|
||||
if (strlen($this->content) < 200) {
|
||||
|
||||
$nodes = $xpath->query('//article');
|
||||
|
||||
if ($nodes !== false && $nodes->length > 0) {
|
||||
$this->content = $dom->saveXML($nodes->item(0));
|
||||
Logger::setMessage(get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
|
||||
Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
|
||||
}
|
||||
}
|
||||
|
||||
// Get everything
|
||||
if (strlen($this->content) < 50) {
|
||||
Logger::setMessage(get_called_class().' No enought content fetched, get the full body');
|
||||
$this->content = $dom->saveXML($dom->firstChild);
|
||||
|
||||
$nodes = $xpath->query('//body');
|
||||
|
||||
if ($nodes !== false && $nodes->length > 0) {
|
||||
Logger::setMessage(get_called_class().' No enought content fetched, get //body');
|
||||
$this->content = $dom->saveXML($nodes->item(0));
|
||||
}
|
||||
}
|
||||
|
||||
Logger::setMessage(get_called_class().' Strip garbage');
|
||||
Logger::setMessage(get_called_class().': Strip garbage');
|
||||
$this->stripGarbage();
|
||||
}
|
||||
|
||||
@ -395,7 +452,7 @@ class Grabber
|
||||
$nodes = $xpath->query('//'.$tag);
|
||||
|
||||
if ($nodes !== false && $nodes->length > 0) {
|
||||
Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
|
||||
Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
|
||||
foreach ($nodes as $node) {
|
||||
$node->parentNode->removeChild($node);
|
||||
}
|
||||
@ -407,14 +464,43 @@ class Grabber
|
||||
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
|
||||
|
||||
if ($nodes !== false && $nodes->length > 0) {
|
||||
Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
|
||||
Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
|
||||
foreach ($nodes as $node) {
|
||||
if ($this->shouldRemove($dom, $node)) {
|
||||
$node->parentNode->removeChild($node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$this->content = $dom->saveXML($dom->documentElement);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tell if a node can be removed from the document
 *
 * The decision is based on the share of the document text (in bytes) held
 * by the node: a node that carries 90% or more of the text is kept and a
 * message is logged, anything else can be removed. A document without any
 * text always allows the removal.
 *
 * @access public
 * @param  DomDocument  $dom   Document that contains the node
 * @param  DomNode      $node  Node candidate for removal
 * @return boolean             True to remove the node, false to keep it
 */
public function shouldRemove($dom, $node)
{
    $total = strlen($dom->textContent);

    // No text at all: nothing to protect and no ratio to compute
    if ($total === 0) {
        return true;
    }

    $ratio = strlen($node->textContent) * 100 / $total;

    if ($ratio < 90) {
        return true;
    }

    Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');

    return false;
}
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ class Url
|
||||
if ($link->isRelativeUrl()) {
|
||||
|
||||
if ($link->isRelativePath()) {
|
||||
return $link->getAbsoluteUrl($website->getAbsoluteUrl());
|
||||
return $link->getAbsoluteUrl($website->getBaseUrl($website->getBasePath()));
|
||||
}
|
||||
|
||||
return $link->getAbsoluteUrl($website->getBaseUrl());
|
||||
@ -159,6 +159,22 @@ class Url
|
||||
return empty($this->components['path']) ? '' : $this->components['path'];
|
||||
}
|
||||
|
||||
/**
 * Get the base path
 *
 * The base path is the directory part of the path, always starting and
 * ending with a slash: "/img/" for "/img/logo.png", "/bar/" for "/bar/"
 * and "/" when the path is empty or has no directory.
 *
 * The directory is extracted with string functions instead of dirname(),
 * because dirname() returns "." for a bare filename and a backslash on
 * Windows for a file located at the root.
 *
 * @access public
 * @return string
 */
public function getBasePath()
{
    $current_path = $this->getPath();

    // Keep everything up to the last slash, the filename (if any) is dropped
    $last_slash = strrpos($current_path, '/');
    $directory = $last_slash === false ? '' : substr($current_path, 0, $last_slash + 1);

    $path = ($this->isRelativePath() ? '/' : '').$directory.'/';

    // Collapse every run of slashes, not only the pairs
    return preg_replace('#/{2,}#', '/', $path);
}
|
||||
|
||||
/**
|
||||
* Get the full path (path + querystring + fragment)
|
||||
*
|
||||
|
@ -168,8 +168,9 @@ abstract class Parser
|
||||
$this->findItemEnclosure($entry, $item, $feed);
|
||||
$this->findItemLanguage($entry, $item, $feed);
|
||||
|
||||
$this->scrapWebsite($item);
|
||||
// Order is important (avoid double filtering)
|
||||
$this->filterItemContent($feed, $item);
|
||||
$this->scrapWebsite($item);
|
||||
|
||||
$feed->items[] = $item;
|
||||
}
|
||||
@ -238,7 +239,7 @@ abstract class Parser
|
||||
$grabber->download();
|
||||
|
||||
if ($grabber->parse()) {
|
||||
$item->content = $grabber->getContent() ?: $item->content;
|
||||
$item->content = $grabber->getFilteredContent();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -67,6 +67,9 @@ class UrlTest extends PHPUnit_Framework_TestCase
|
||||
|
||||
$url = new Url('//localhost/test?truc');
|
||||
$this->assertEquals('http://localhost', $url->getBaseUrl());
|
||||
|
||||
$url = new Url('//localhost/test?truc');
|
||||
$this->assertEquals('http://localhost', $url->getBaseUrl());
|
||||
}
|
||||
|
||||
public function testIsRelativeUrl()
|
||||
@ -112,6 +115,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
||||
$url = new Url('anything');
|
||||
$this->assertEquals('/anything', $url->getFullPath());
|
||||
|
||||
$url = new Url('foo/bar');
|
||||
$this->assertEquals('/foo/bar', $url->getFullPath());
|
||||
|
||||
$url = new Url('index.php?foo=bar&test=1');
|
||||
$this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath());
|
||||
}
|
||||
@ -160,6 +166,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
||||
$url = new Url('https://127.0.0.1:8000/here/test?v=3');
|
||||
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl());
|
||||
|
||||
$url = new Url('http://www.lofibucket.com/articles/oscilloscope_quake.html');
|
||||
$this->assertEquals('http://www.lofibucket.com/articles/oscilloscope_quake.html', $url->getAbsoluteUrl());
|
||||
|
||||
$url = new Url('test?v=3');
|
||||
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/'));
|
||||
}
|
||||
@ -185,6 +194,27 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
||||
$this->assertFalse($url->isRelativePath());
|
||||
}
|
||||
|
||||
/**
 * getBasePath() returns the directory of the path surrounded by slashes
 */
public function testGetBasePath()
{
    // Input URL => expected base path
    $expectations = array(
        'img/quakescope.jpg' => '/img/',
        'http://foo/img/quakescope.jpg' => '/img/',
        'http://foo/bar.html' => '/',
        'http://foo/bar' => '/',
        'http://foo/bar/' => '/bar/',
        'http://website/subfolder/img/foo.png' => '/subfolder/img/',
    );

    foreach ($expectations as $input => $base_path) {
        $url = new Url($input);
        $this->assertEquals($base_path, $url->getBasePath());
    }
}
|
||||
|
||||
public function testResolve()
|
||||
{
|
||||
$this->assertEquals(
|
||||
@ -216,5 +246,10 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
||||
'http://website/img/foo.png',
|
||||
Url::resolve('/img/foo.png', 'http://website/subfolder/')
|
||||
);
|
||||
|
||||
$this->assertEquals(
|
||||
'http://www.lofibucket.com/articles/img/quakescope.jpg',
|
||||
Url::resolve('img/quakescope.jpg', 'http://www.lofibucket.com/articles/oscilloscope_quake.html')
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user