Update PicoFeed (improve grabber and bug fixes)
This commit is contained in:
parent
da174a10cc
commit
b7ca07b9c0
2
vendor/autoload.php
vendored
2
vendor/autoload.php
vendored
@ -4,4 +4,4 @@
|
|||||||
|
|
||||||
require_once __DIR__ . '/composer' . '/autoload_real.php';
|
require_once __DIR__ . '/composer' . '/autoload_real.php';
|
||||||
|
|
||||||
return ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97::getLoader();
|
return ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae::getLoader();
|
||||||
|
10
vendor/composer/autoload_real.php
vendored
10
vendor/composer/autoload_real.php
vendored
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
// autoload_real.php @generated by Composer
|
// autoload_real.php @generated by Composer
|
||||||
|
|
||||||
class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
|
class ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae
|
||||||
{
|
{
|
||||||
private static $loader;
|
private static $loader;
|
||||||
|
|
||||||
@ -19,9 +19,9 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
|
|||||||
return self::$loader;
|
return self::$loader;
|
||||||
}
|
}
|
||||||
|
|
||||||
spl_autoload_register(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'), true, true);
|
spl_autoload_register(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'), true, true);
|
||||||
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
|
self::$loader = $loader = new \Composer\Autoload\ClassLoader();
|
||||||
spl_autoload_unregister(array('ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97', 'loadClassLoader'));
|
spl_autoload_unregister(array('ComposerAutoloaderInit738d0ffba01de68eecc7cdccd108bcae', 'loadClassLoader'));
|
||||||
|
|
||||||
$map = require __DIR__ . '/autoload_namespaces.php';
|
$map = require __DIR__ . '/autoload_namespaces.php';
|
||||||
foreach ($map as $namespace => $path) {
|
foreach ($map as $namespace => $path) {
|
||||||
@ -42,14 +42,14 @@ class ComposerAutoloaderInitc13b90f3b2e13ad59b988101eac1fc97
|
|||||||
|
|
||||||
$includeFiles = require __DIR__ . '/autoload_files.php';
|
$includeFiles = require __DIR__ . '/autoload_files.php';
|
||||||
foreach ($includeFiles as $file) {
|
foreach ($includeFiles as $file) {
|
||||||
composerRequirec13b90f3b2e13ad59b988101eac1fc97($file);
|
composerRequire738d0ffba01de68eecc7cdccd108bcae($file);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $loader;
|
return $loader;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function composerRequirec13b90f3b2e13ad59b988101eac1fc97($file)
|
function composerRequire738d0ffba01de68eecc7cdccd108bcae($file)
|
||||||
{
|
{
|
||||||
require $file;
|
require $file;
|
||||||
}
|
}
|
||||||
|
78
vendor/composer/installed.json
vendored
78
vendor/composer/installed.json
vendored
@ -116,45 +116,6 @@
|
|||||||
"description": "Minimalist micro-framework",
|
"description": "Minimalist micro-framework",
|
||||||
"homepage": "https://github.com/fguillot/picoFarad"
|
"homepage": "https://github.com/fguillot/picoFarad"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name": "fguillot/picofeed",
|
|
||||||
"version": "dev-master",
|
|
||||||
"version_normalized": "9999999-dev",
|
|
||||||
"source": {
|
|
||||||
"type": "git",
|
|
||||||
"url": "https://github.com/fguillot/picoFeed.git",
|
|
||||||
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2"
|
|
||||||
},
|
|
||||||
"dist": {
|
|
||||||
"type": "zip",
|
|
||||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
|
|
||||||
"reference": "8722d562f1f5446c5595d8854f2a3260fdf1a5b2",
|
|
||||||
"shasum": ""
|
|
||||||
},
|
|
||||||
"require": {
|
|
||||||
"php": ">=5.3.0"
|
|
||||||
},
|
|
||||||
"time": "2015-01-02 16:39:51",
|
|
||||||
"type": "library",
|
|
||||||
"installation-source": "dist",
|
|
||||||
"autoload": {
|
|
||||||
"psr-0": {
|
|
||||||
"PicoFeed": "lib/"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"notification-url": "https://packagist.org/downloads/",
|
|
||||||
"license": [
|
|
||||||
"Unlicense"
|
|
||||||
],
|
|
||||||
"authors": [
|
|
||||||
{
|
|
||||||
"name": "Frédéric Guillot",
|
|
||||||
"homepage": "http://fredericguillot.com"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "Modern library to write or read feeds (RSS/Atom)",
|
|
||||||
"homepage": "http://fguillot.github.io/picoFeed"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name": "fguillot/picodb",
|
"name": "fguillot/picodb",
|
||||||
"version": "dev-master",
|
"version": "dev-master",
|
||||||
@ -193,5 +154,44 @@
|
|||||||
],
|
],
|
||||||
"description": "Minimalist database query builder",
|
"description": "Minimalist database query builder",
|
||||||
"homepage": "https://github.com/fguillot/picoDb"
|
"homepage": "https://github.com/fguillot/picoDb"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "fguillot/picofeed",
|
||||||
|
"version": "dev-master",
|
||||||
|
"version_normalized": "9999999-dev",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/fguillot/picoFeed.git",
|
||||||
|
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
|
||||||
|
"reference": "3ef98d7b1ea35bd48e0a4e99ea518d0c3165a0c0",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"php": ">=5.3.0"
|
||||||
|
},
|
||||||
|
"time": "2015-01-02 20:21:50",
|
||||||
|
"type": "library",
|
||||||
|
"installation-source": "dist",
|
||||||
|
"autoload": {
|
||||||
|
"psr-0": {
|
||||||
|
"PicoFeed": "lib/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"Unlicense"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "Frédéric Guillot",
|
||||||
|
"homepage": "http://fredericguillot.com"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Modern library to write or read feeds (RSS/Atom)",
|
||||||
|
"homepage": "http://fguillot.github.io/picoFeed"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
@ -320,6 +320,8 @@ class Curl extends Client
|
|||||||
throw new InvalidUrlException('Unable to resolve hostname');
|
throw new InvalidUrlException('Unable to resolve hostname');
|
||||||
case 7: // CURLE_COULDNT_CONNECT
|
case 7: // CURLE_COULDNT_CONNECT
|
||||||
throw new InvalidUrlException('Unable to connect to the remote host');
|
throw new InvalidUrlException('Unable to connect to the remote host');
|
||||||
|
case 23: // CURLE_WRITE_ERROR
|
||||||
|
throw new MaxSizeException('Maximum response size exceeded');
|
||||||
case 28: // CURLE_OPERATION_TIMEDOUT
|
case 28: // CURLE_OPERATION_TIMEDOUT
|
||||||
throw new TimeoutException('Operation timeout');
|
throw new TimeoutException('Operation timeout');
|
||||||
case 35: // CURLE_SSL_CONNECT_ERROR
|
case 35: // CURLE_SSL_CONNECT_ERROR
|
||||||
|
@ -48,6 +48,14 @@ class Grabber
|
|||||||
*/
|
*/
|
||||||
private $encoding = '';
|
private $encoding = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Flag to skip download and parsing
|
||||||
|
*
|
||||||
|
* @access private
|
||||||
|
* @var boolean
|
||||||
|
*/
|
||||||
|
private $skip_processing = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* List of attributes to try to get the content, order is important, generic terms at the end
|
* List of attributes to try to get the content, order is important, generic terms at the end
|
||||||
*
|
*
|
||||||
@ -65,6 +73,7 @@ class Grabber
|
|||||||
'post-content',
|
'post-content',
|
||||||
'post_content',
|
'post_content',
|
||||||
'entry-content',
|
'entry-content',
|
||||||
|
'entry-body',
|
||||||
'main-content',
|
'main-content',
|
||||||
'story_content',
|
'story_content',
|
||||||
'storycontent',
|
'storycontent',
|
||||||
@ -101,6 +110,10 @@ class Grabber
|
|||||||
'related-posts',
|
'related-posts',
|
||||||
'tweet',
|
'tweet',
|
||||||
'categories',
|
'categories',
|
||||||
|
'post_title',
|
||||||
|
'by_line',
|
||||||
|
'byline',
|
||||||
|
'sponsors',
|
||||||
);
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -140,6 +153,9 @@ class Grabber
|
|||||||
$this->url = $url;
|
$this->url = $url;
|
||||||
$this->html = $html;
|
$this->html = $html;
|
||||||
$this->encoding = $encoding;
|
$this->encoding = $encoding;
|
||||||
|
|
||||||
|
$this->handleFiles();
|
||||||
|
$this->handleStreamingVideos();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -185,11 +201,39 @@ class Grabber
|
|||||||
*/
|
*/
|
||||||
public function getFilteredContent()
|
public function getFilteredContent()
|
||||||
{
|
{
|
||||||
$filter = Filter::html($this->content, Url::base($this->url));
|
$filter = Filter::html($this->content, $this->url);
|
||||||
$filter->setConfig($this->config);
|
$filter->setConfig($this->config);
|
||||||
return $filter->execute();
|
return $filter->execute();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Detect YouTube video URLs and use the embed player as content
 *
 * When the URL carries a YouTube video id, the content is replaced by an
 * <iframe> embed player and the skip flag is raised, so download() and
 * parse() leave that content untouched.
 *
 * @access public
 * @return void
 */
public function handleStreamingVideos()
{
    // The id pattern below is loose ("v/" followed by 11 id characters also
    // appears in ordinary article URLs), so when the host can be determined
    // it must be a YouTube one. URLs without a parsable host fall through
    // to the pattern alone, as before.
    $host = parse_url($this->url, PHP_URL_HOST);

    if (is_string($host) && $host !== '' && ! preg_match('#(?:^|\.)(?:youtube\.com|youtube-nocookie\.com|youtu\.be)$#i', $host)) {
        return;
    }

    // The dot of "youtu.be" is escaped so it only matches a literal dot
    if (preg_match('#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
        $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
        $this->skip_processing = true;
    }
}
|
||||||
|
|
||||||
|
/**
 * Skip processing for PDF documents
 *
 * The decision is based on the file extension of the URL path, compared
 * case-insensitively, so a query string or a fragment after the file name
 * ("doc.pdf?download=1") does not hide the extension and a path that merely
 * ends with the letters "pdf" is not mistaken for a document.
 *
 * @access public
 * @return void
 */
public function handleFiles()
{
    // parse_url() yields null/false when there is no usable path
    $path = parse_url($this->url, PHP_URL_PATH);

    if (is_string($path) && strtolower(pathinfo($path, PATHINFO_EXTENSION)) === 'pdf') {
        $this->skip_processing = true;
        Logger::setMessage(get_called_class().': PDF document => processing skipped');
    }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse the HTML content
|
* Parse the HTML content
|
||||||
*
|
*
|
||||||
@ -198,32 +242,36 @@ class Grabber
|
|||||||
*/
|
*/
|
||||||
public function parse()
|
public function parse()
|
||||||
{
|
{
|
||||||
|
if ($this->skip_processing) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if ($this->html) {
|
if ($this->html) {
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().' Fix encoding');
|
Logger::setMessage(get_called_class().': Fix encoding');
|
||||||
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
|
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
|
||||||
|
|
||||||
$this->html = Encoding::convert($this->html, $this->encoding);
|
$this->html = Encoding::convert($this->html, $this->encoding);
|
||||||
$this->html = Filter::stripHeadTags($this->html);
|
$this->html = Filter::stripHeadTags($this->html);
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');
|
Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
|
||||||
$rules = $this->getRules();
|
$rules = $this->getRules();
|
||||||
|
|
||||||
if (is_array($rules)) {
|
if (is_array($rules)) {
|
||||||
Logger::setMessage(get_called_class().' Parse content with rules');
|
Logger::setMessage(get_called_class().': Parse content with rules');
|
||||||
$this->parseContentWithRules($rules);
|
$this->parseContentWithRules($rules);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
Logger::setMessage(get_called_class().' Parse content with candidates');
|
Logger::setMessage(get_called_class().': Parse content with candidates');
|
||||||
$this->parseContentWithCandidates();
|
$this->parseContentWithCandidates();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
Logger::setMessage(get_called_class().' No content fetched');
|
Logger::setMessage(get_called_class().': No content fetched');
|
||||||
}
|
}
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');
|
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
|
||||||
Logger::setMessage(get_called_class().' Grabber done');
|
Logger::setMessage(get_called_class().': Grabber done');
|
||||||
|
|
||||||
return $this->content !== '';
|
return $this->content !== '';
|
||||||
}
|
}
|
||||||
@ -236,6 +284,8 @@ class Grabber
|
|||||||
*/
|
*/
|
||||||
public function download()
|
public function download()
|
||||||
{
|
{
|
||||||
|
if (! $this->skip_processing) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
||||||
$client = Client::getInstance();
|
$client = Client::getInstance();
|
||||||
@ -249,6 +299,7 @@ class Grabber
|
|||||||
catch (ClientException $e) {
|
catch (ClientException $e) {
|
||||||
Logger::setMessage(get_called_class().': '.$e->getMessage());
|
Logger::setMessage(get_called_class().': '.$e->getMessage());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return $this->html;
|
return $this->html;
|
||||||
}
|
}
|
||||||
@ -346,34 +397,40 @@ class Grabber
|
|||||||
// Try to lookup in each tag
|
// Try to lookup in each tag
|
||||||
foreach ($this->candidatesAttributes as $candidate) {
|
foreach ($this->candidatesAttributes as $candidate) {
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');
|
Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
|
||||||
|
|
||||||
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
$this->content = $dom->saveXML($nodes->item(0));
|
||||||
Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
|
Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to fetch <article/>
|
// Try to fetch <article/>
|
||||||
if (! $this->content) {
|
if (strlen($this->content) < 200) {
|
||||||
|
|
||||||
$nodes = $xpath->query('//article');
|
$nodes = $xpath->query('//article');
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
$this->content = $dom->saveXML($nodes->item(0));
|
||||||
Logger::setMessage(get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
|
Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get everything
|
||||||
if (strlen($this->content) < 50) {
|
if (strlen($this->content) < 50) {
|
||||||
Logger::setMessage(get_called_class().' No enought content fetched, get the full body');
|
|
||||||
$this->content = $dom->saveXML($dom->firstChild);
|
$nodes = $xpath->query('//body');
|
||||||
|
|
||||||
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
|
Logger::setMessage(get_called_class().' No enought content fetched, get //body');
|
||||||
|
$this->content = $dom->saveXML($nodes->item(0));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().' Strip garbage');
|
Logger::setMessage(get_called_class().': Strip garbage');
|
||||||
$this->stripGarbage();
|
$this->stripGarbage();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -395,7 +452,7 @@ class Grabber
|
|||||||
$nodes = $xpath->query('//'.$tag);
|
$nodes = $xpath->query('//'.$tag);
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
|
Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
|
||||||
foreach ($nodes as $node) {
|
foreach ($nodes as $node) {
|
||||||
$node->parentNode->removeChild($node);
|
$node->parentNode->removeChild($node);
|
||||||
}
|
}
|
||||||
@ -407,14 +464,43 @@ class Grabber
|
|||||||
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
|
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
|
Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
|
||||||
foreach ($nodes as $node) {
|
foreach ($nodes as $node) {
|
||||||
|
if ($this->shouldRemove($dom, $node)) {
|
||||||
$node->parentNode->removeChild($node);
|
$node->parentNode->removeChild($node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$this->content = $dom->saveXML($dom->documentElement);
|
$this->content = $dom->saveXML($dom->documentElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tell whether a node matched by the garbage filters can be removed
 *
 * A node holding 90% or more of the document text (measured in bytes of
 * textContent) is kept; the decision to keep it is logged. An empty
 * document always allows the removal.
 *
 * @access public
 * @param  DomDocument  $dom
 * @param  DomNode      $node
 * @return boolean      false if the node should not be removed
 */
public function shouldRemove($dom, $node)
{
    $total = strlen($dom->textContent);

    // Nothing to compare against, and avoids a division by zero
    if ($total === 0) {
        return true;
    }

    $ratio = strlen($node->textContent) * 100 / $total;
    $keep = $ratio >= 90;

    if ($keep) {
        Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
    }

    return ! $keep;
}
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ class Url
|
|||||||
if ($link->isRelativeUrl()) {
|
if ($link->isRelativeUrl()) {
|
||||||
|
|
||||||
if ($link->isRelativePath()) {
|
if ($link->isRelativePath()) {
|
||||||
return $link->getAbsoluteUrl($website->getAbsoluteUrl());
|
return $link->getAbsoluteUrl($website->getBaseUrl($website->getBasePath()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return $link->getAbsoluteUrl($website->getBaseUrl());
|
return $link->getAbsoluteUrl($website->getBaseUrl());
|
||||||
@ -159,6 +159,22 @@ class Url
|
|||||||
return empty($this->components['path']) ? '' : $this->components['path'];
|
return empty($this->components['path']) ? '' : $this->components['path'];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Get the base path
 *
 * The base path is the directory part of the path: everything up to and
 * including the last "/". The result always starts and ends with "/"
 * ("/img/foo.png" => "/img/", "/bar/" => "/bar/", "/bar.html" => "/").
 *
 * @access public
 * @return string
 */
public function getBasePath()
{
    $current_path = $this->getPath();

    // Cut at the last slash instead of calling dirname(): dirname() uses
    // backslashes on Windows and returns "." for a bare file name, which
    // would produce "/./" instead of "/".
    $last_slash = strrpos($current_path, '/');
    $directory = $last_slash === false ? '' : substr($current_path, 0, $last_slash + 1);

    $path = ($this->isRelativePath() ? '/' : '').$directory.'/';

    // Collapse any run of slashes, not only pairs
    return preg_replace('#/{2,}#', '/', $path);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the full path (path + querystring + fragment)
|
* Get the full path (path + querystring + fragment)
|
||||||
*
|
*
|
||||||
|
@ -168,8 +168,9 @@ abstract class Parser
|
|||||||
$this->findItemEnclosure($entry, $item, $feed);
|
$this->findItemEnclosure($entry, $item, $feed);
|
||||||
$this->findItemLanguage($entry, $item, $feed);
|
$this->findItemLanguage($entry, $item, $feed);
|
||||||
|
|
||||||
$this->scrapWebsite($item);
|
// Order is important (avoid double filtering)
|
||||||
$this->filterItemContent($feed, $item);
|
$this->filterItemContent($feed, $item);
|
||||||
|
$this->scrapWebsite($item);
|
||||||
|
|
||||||
$feed->items[] = $item;
|
$feed->items[] = $item;
|
||||||
}
|
}
|
||||||
@ -238,7 +239,7 @@ abstract class Parser
|
|||||||
$grabber->download();
|
$grabber->download();
|
||||||
|
|
||||||
if ($grabber->parse()) {
|
if ($grabber->parse()) {
|
||||||
$item->content = $grabber->getContent() ?: $item->content;
|
$item->content = $grabber->getFilteredContent();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -67,6 +67,9 @@ class UrlTest extends PHPUnit_Framework_TestCase
|
|||||||
|
|
||||||
$url = new Url('//localhost/test?truc');
|
$url = new Url('//localhost/test?truc');
|
||||||
$this->assertEquals('http://localhost', $url->getBaseUrl());
|
$this->assertEquals('http://localhost', $url->getBaseUrl());
|
||||||
|
|
||||||
|
$url = new Url('//localhost/test?truc');
|
||||||
|
$this->assertEquals('http://localhost', $url->getBaseUrl());
|
||||||
}
|
}
|
||||||
|
|
||||||
public function testIsRelativeUrl()
|
public function testIsRelativeUrl()
|
||||||
@ -112,6 +115,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
|||||||
$url = new Url('anything');
|
$url = new Url('anything');
|
||||||
$this->assertEquals('/anything', $url->getFullPath());
|
$this->assertEquals('/anything', $url->getFullPath());
|
||||||
|
|
||||||
|
$url = new Url('foo/bar');
|
||||||
|
$this->assertEquals('/foo/bar', $url->getFullPath());
|
||||||
|
|
||||||
$url = new Url('index.php?foo=bar&test=1');
|
$url = new Url('index.php?foo=bar&test=1');
|
||||||
$this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath());
|
$this->assertEquals('/index.php?foo=bar&test=1', $url->getFullPath());
|
||||||
}
|
}
|
||||||
@ -160,6 +166,9 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
|||||||
$url = new Url('https://127.0.0.1:8000/here/test?v=3');
|
$url = new Url('https://127.0.0.1:8000/here/test?v=3');
|
||||||
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl());
|
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl());
|
||||||
|
|
||||||
|
$url = new Url('http://www.lofibucket.com/articles/oscilloscope_quake.html');
|
||||||
|
$this->assertEquals('http://www.lofibucket.com/articles/oscilloscope_quake.html', $url->getAbsoluteUrl());
|
||||||
|
|
||||||
$url = new Url('test?v=3');
|
$url = new Url('test?v=3');
|
||||||
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/'));
|
$this->assertEquals('https://127.0.0.1:8000/here/test?v=3', $url->getAbsoluteUrl('https://127.0.0.1:8000/here/'));
|
||||||
}
|
}
|
||||||
@ -185,6 +194,27 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
|||||||
$this->assertFalse($url->isRelativePath());
|
$this->assertFalse($url->isRelativePath());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function testGetBasePath()
{
    // URL given to the constructor => expected base path
    $expectations = array(
        'img/quakescope.jpg' => '/img/',
        'http://foo/img/quakescope.jpg' => '/img/',
        'http://foo/bar.html' => '/',
        'http://foo/bar' => '/',
        'http://foo/bar/' => '/bar/',
        'http://website/subfolder/img/foo.png' => '/subfolder/img/',
    );

    foreach ($expectations as $input => $base_path) {
        $url = new Url($input);
        $this->assertEquals($base_path, $url->getBasePath());
    }
}
|
||||||
|
|
||||||
public function testResolve()
|
public function testResolve()
|
||||||
{
|
{
|
||||||
$this->assertEquals(
|
$this->assertEquals(
|
||||||
@ -216,5 +246,10 @@ AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
|
|||||||
'http://website/img/foo.png',
|
'http://website/img/foo.png',
|
||||||
Url::resolve('/img/foo.png', 'http://website/subfolder/')
|
Url::resolve('/img/foo.png', 'http://website/subfolder/')
|
||||||
);
|
);
|
||||||
|
|
||||||
|
$this->assertEquals(
|
||||||
|
'http://www.lofibucket.com/articles/img/quakescope.jpg',
|
||||||
|
Url::resolve('img/quakescope.jpg', 'http://www.lofibucket.com/articles/oscilloscope_quake.html')
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user