Update PicoFeed
This commit is contained in:
parent
165acb0342
commit
2ef48e5f5c
@ -15,7 +15,7 @@
|
||||
"fguillot/simple-validator": "v1.0.0",
|
||||
"fguillot/json-rpc": "v1.0.2",
|
||||
"fguillot/picodb": "v1.0.2",
|
||||
"fguillot/picofeed": "v0.1.20"
|
||||
"fguillot/picofeed": "v0.1.21"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "4.8.3",
|
||||
|
2
vendor/composer/autoload_classmap.php
vendored
2
vendor/composer/autoload_classmap.php
vendored
@ -29,6 +29,7 @@ return array(
|
||||
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
|
||||
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
|
||||
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
|
||||
'PicoFeed\\Client\\ForbiddenException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ForbiddenException.php',
|
||||
'PicoFeed\\Client\\HttpHeaders' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/HttpHeaders.php',
|
||||
'PicoFeed\\Client\\InvalidCertificateException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidCertificateException.php',
|
||||
'PicoFeed\\Client\\InvalidUrlException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidUrlException.php',
|
||||
@ -36,6 +37,7 @@ return array(
|
||||
'PicoFeed\\Client\\MaxSizeException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/MaxSizeException.php',
|
||||
'PicoFeed\\Client\\Stream' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Stream.php',
|
||||
'PicoFeed\\Client\\TimeoutException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/TimeoutException.php',
|
||||
'PicoFeed\\Client\\UnauthorizedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/UnauthorizedException.php',
|
||||
'PicoFeed\\Client\\Url' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Url.php',
|
||||
'PicoFeed\\Config\\Config' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Config/Config.php',
|
||||
'PicoFeed\\Encoding\\Encoding' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Encoding/Encoding.php',
|
||||
|
12
vendor/composer/installed.json
vendored
12
vendor/composer/installed.json
vendored
@ -163,17 +163,17 @@
|
||||
},
|
||||
{
|
||||
"name": "fguillot/picofeed",
|
||||
"version": "v0.1.20",
|
||||
"version_normalized": "0.1.20.0",
|
||||
"version": "v0.1.21",
|
||||
"version_normalized": "0.1.21.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/fguillot/picoFeed.git",
|
||||
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897"
|
||||
"reference": "2baff3240ef187c9f443656ab26b0b626aec5776"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
|
||||
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
|
||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/2baff3240ef187c9f443656ab26b0b626aec5776",
|
||||
"reference": "2baff3240ef187c9f443656ab26b0b626aec5776",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
@ -188,7 +188,7 @@
|
||||
"suggest": {
|
||||
"ext-curl": "PicoFeed will use cURL if present"
|
||||
},
|
||||
"time": "2016-03-24 12:09:56",
|
||||
"time": "2016-03-31 00:39:41",
|
||||
"bin": [
|
||||
"picofeed"
|
||||
],
|
||||
|
@ -211,7 +211,7 @@ abstract class Client
|
||||
|
||||
$this->status_code = $response['status'];
|
||||
$this->handleNotModifiedResponse($response);
|
||||
$this->handleNotFoundResponse($response);
|
||||
$this->handleErrorResponse($response);
|
||||
$this->handleNormalResponse($response);
|
||||
|
||||
return $this;
|
||||
@ -222,7 +222,7 @@ abstract class Client
|
||||
*
|
||||
* @param array $response Client response
|
||||
*/
|
||||
public function handleNotModifiedResponse(array $response)
|
||||
protected function handleNotModifiedResponse(array $response)
|
||||
{
|
||||
if ($response['status'] == 304) {
|
||||
$this->is_modified = false;
|
||||
@ -238,13 +238,18 @@ abstract class Client
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle not found response.
|
||||
* Handle Http Error codes
|
||||
*
|
||||
* @param array $response Client response
|
||||
*/
|
||||
public function handleNotFoundResponse(array $response)
|
||||
protected function handleErrorResponse(array $response)
|
||||
{
|
||||
if ($response['status'] == 404) {
|
||||
$status = $response['status'];
|
||||
if ($status == 401) {
|
||||
throw new UnauthorizedException('Wrong or missing credentials');
|
||||
} else if ($status == 403) {
|
||||
throw new ForbiddenException('Not allowed to access resource');
|
||||
} else if ($status == 404) {
|
||||
throw new InvalidUrlException('Resource not found');
|
||||
}
|
||||
}
|
||||
@ -254,7 +259,7 @@ abstract class Client
|
||||
*
|
||||
* @param array $response Client response
|
||||
*/
|
||||
public function handleNormalResponse(array $response)
|
||||
protected function handleNormalResponse(array $response)
|
||||
{
|
||||
if ($response['status'] == 200) {
|
||||
$this->content = $response['body'];
|
||||
|
@ -108,7 +108,7 @@ class Curl extends Client
|
||||
return $this->handleRedirection($headers['Location']);
|
||||
}
|
||||
|
||||
header($status);
|
||||
header(':', true, $status);
|
||||
|
||||
if (isset($headers['Content-Type'])) {
|
||||
header('Content-Type:' .$headers['Content-Type']);
|
||||
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Client/ForbiddenException.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Client/ForbiddenException.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Client;
|
||||
|
||||
/**
|
||||
* @author Bernhard Posselt
|
||||
*/
|
||||
class ForbiddenException extends ClientException
|
||||
{
|
||||
}
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Client/UnauthorizedException.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Client/UnauthorizedException.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Client;
|
||||
|
||||
/**
|
||||
* @author Bernhard Posselt
|
||||
*/
|
||||
class UnauthorizedException extends ClientException
|
||||
{
|
||||
}
|
@ -319,19 +319,25 @@ abstract class Parser
|
||||
/**
|
||||
* Enable the content grabber.
|
||||
*
|
||||
* @param bool $needs_rule_file true if only pages with rule files should be
|
||||
* @param bool $needsRuleFile true if only pages with rule files should be
|
||||
* scraped
|
||||
* @param null|\Closure $scraperCallback Callback function that gets called for each
|
||||
* scraper execution
|
||||
*
|
||||
* @return \PicoFeed\Parser\Parser
|
||||
*/
|
||||
public function enableContentGrabber($needs_rule_file = false)
|
||||
public function enableContentGrabber($needsRuleFile = false, $scraperCallback = null)
|
||||
{
|
||||
$processor = new ScraperProcessor($this->config);
|
||||
|
||||
if ($needs_rule_file) {
|
||||
if ($needsRuleFile) {
|
||||
$processor->getScraper()->disableCandidateParser();
|
||||
}
|
||||
|
||||
if ($scraperCallback !== null) {
|
||||
$processor->setExecutionCallback($scraperCallback);
|
||||
}
|
||||
|
||||
$this->itemPostProcessor->register($processor);
|
||||
return $this;
|
||||
}
|
||||
|
@ -70,6 +70,18 @@ class ItemPostProcessor extends Base
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks wheather a specific processor is registered or not
|
||||
*
|
||||
* @access public
|
||||
* @param string $class
|
||||
* @return bool
|
||||
*/
|
||||
public function hasProcessor($class)
|
||||
{
|
||||
return isset($this->processors[$class]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Processor instance
|
||||
*
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
namespace PicoFeed\Processor;
|
||||
|
||||
use Closure;
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Parser\Feed;
|
||||
use PicoFeed\Parser\Item;
|
||||
@ -18,6 +19,26 @@ class ScraperProcessor extends Base implements ItemProcessorInterface
|
||||
private $ignoredUrls = array();
|
||||
private $scraper;
|
||||
|
||||
/**
|
||||
* Callback function for each scraper execution
|
||||
*
|
||||
* @var Closure
|
||||
*/
|
||||
private $executionCallback;
|
||||
|
||||
/**
|
||||
* Add a new execution callback
|
||||
*
|
||||
* @access public
|
||||
* @param Closure $executionCallback
|
||||
* @return $this
|
||||
*/
|
||||
public function setExecutionCallback(Closure $executionCallback)
|
||||
{
|
||||
$this->executionCallback = $executionCallback;
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute Item Processor
|
||||
*
|
||||
@ -33,6 +54,10 @@ class ScraperProcessor extends Base implements ItemProcessorInterface
|
||||
$scraper->setUrl($item->getUrl());
|
||||
$scraper->execute();
|
||||
|
||||
if ($this->executionCallback && is_callable($this->executionCallback)) {
|
||||
call_user_func($this->executionCallback, $feed, $item, $scraper);
|
||||
}
|
||||
|
||||
if ($scraper->hasRelevantContent()) {
|
||||
$item->setContent($scraper->getFilteredContent());
|
||||
}
|
||||
|
@ -5,19 +5,19 @@ return array(
|
||||
'%.*%' => array(
|
||||
'test_url' => 'http://www.jsonline.com/news/usandworld/as-many-as-a-million-expected-for-popes-last-mass-in-us-b99585180z1-329688131.html',
|
||||
'body' => array(
|
||||
'//div[@id="mainContent"]',
|
||||
'//div[@id="main"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//script',
|
||||
'//h1',
|
||||
'//h4[@class="credit"]',
|
||||
'//div[@class="columnist_container"]',
|
||||
'//div[@class="storyTimestamp"]',
|
||||
'//ul[@id="sharing-tools"]',
|
||||
'//div[@class="title"]',
|
||||
'//img[@class="floatLeft"]',
|
||||
'//div[@class="first feature"]',
|
||||
'//div[@class="collateral_article_content"]',
|
||||
'div[contains(@class, "header")]',
|
||||
'div[@class="module--headline"]',
|
||||
'div[@class="main--inlinemeta"]',
|
||||
'div[contains(@class, "leftcol--")]',
|
||||
'p[@class="main--author"]',
|
||||
'div[@class="story--rightcol"]',
|
||||
'div[contains(@class, "footer")]',
|
||||
'div[contains(@class, "rightcol--")]',
|
||||
'div[contains(@class, "author")]',
|
||||
),
|
||||
),
|
||||
),
|
||||
|
@ -18,6 +18,7 @@ return array(
|
||||
'//section[@class="ribboned"]',
|
||||
'//div[contains(@class,"sidebar")]',
|
||||
'//aside[@class="article_tag_list"]',
|
||||
'//section[contains(@id, "more_posts")]',
|
||||
),
|
||||
),
|
||||
),
|
||||
|
Loading…
Reference in New Issue
Block a user