Update PicoFeed

This commit is contained in:
Frederic Guillot 2016-03-30 22:43:08 -04:00
parent 165acb0342
commit 2ef48e5f5c
12 changed files with 99 additions and 28 deletions

View File

@ -15,7 +15,7 @@
"fguillot/simple-validator": "v1.0.0",
"fguillot/json-rpc": "v1.0.2",
"fguillot/picodb": "v1.0.2",
"fguillot/picofeed": "v0.1.20"
"fguillot/picofeed": "v0.1.21"
},
"require-dev": {
"phpunit/phpunit": "4.8.3",

View File

@ -29,6 +29,7 @@ return array(
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
'PicoFeed\\Client\\ForbiddenException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ForbiddenException.php',
'PicoFeed\\Client\\HttpHeaders' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/HttpHeaders.php',
'PicoFeed\\Client\\InvalidCertificateException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidCertificateException.php',
'PicoFeed\\Client\\InvalidUrlException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidUrlException.php',
@ -36,6 +37,7 @@ return array(
'PicoFeed\\Client\\MaxSizeException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/MaxSizeException.php',
'PicoFeed\\Client\\Stream' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Stream.php',
'PicoFeed\\Client\\TimeoutException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/TimeoutException.php',
'PicoFeed\\Client\\UnauthorizedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/UnauthorizedException.php',
'PicoFeed\\Client\\Url' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Url.php',
'PicoFeed\\Config\\Config' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Config/Config.php',
'PicoFeed\\Encoding\\Encoding' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Encoding/Encoding.php',

View File

@ -163,17 +163,17 @@
},
{
"name": "fguillot/picofeed",
"version": "v0.1.20",
"version_normalized": "0.1.20.0",
"version": "v0.1.21",
"version_normalized": "0.1.21.0",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897"
"reference": "2baff3240ef187c9f443656ab26b0b626aec5776"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/2baff3240ef187c9f443656ab26b0b626aec5776",
"reference": "2baff3240ef187c9f443656ab26b0b626aec5776",
"shasum": ""
},
"require": {
@ -188,7 +188,7 @@
"suggest": {
"ext-curl": "PicoFeed will use cURL if present"
},
"time": "2016-03-24 12:09:56",
"time": "2016-03-31 00:39:41",
"bin": [
"picofeed"
],

View File

@ -211,7 +211,7 @@ abstract class Client
$this->status_code = $response['status'];
$this->handleNotModifiedResponse($response);
$this->handleNotFoundResponse($response);
$this->handleErrorResponse($response);
$this->handleNormalResponse($response);
return $this;
@ -222,7 +222,7 @@ abstract class Client
*
* @param array $response Client response
*/
public function handleNotModifiedResponse(array $response)
protected function handleNotModifiedResponse(array $response)
{
if ($response['status'] == 304) {
$this->is_modified = false;
@ -238,13 +238,18 @@ abstract class Client
}
/**
* Handle not found response.
* Handle HTTP error codes (401, 403 and 404) by throwing the matching exception.
*
* @param array $response Client response
*/
public function handleNotFoundResponse(array $response)
protected function handleErrorResponse(array $response)
{
if ($response['status'] == 404) {
$status = $response['status'];
if ($status == 401) {
throw new UnauthorizedException('Wrong or missing credentials');
} else if ($status == 403) {
throw new ForbiddenException('Not allowed to access resource');
} else if ($status == 404) {
throw new InvalidUrlException('Resource not found');
}
}
@ -254,7 +259,7 @@ abstract class Client
*
* @param array $response Client response
*/
public function handleNormalResponse(array $response)
protected function handleNormalResponse(array $response)
{
if ($response['status'] == 200) {
$this->content = $response['body'];

View File

@ -108,7 +108,7 @@ class Curl extends Client
return $this->handleRedirection($headers['Location']);
}
header($status);
header(':', true, $status);
if (isset($headers['Content-Type'])) {
header('Content-Type:' .$headers['Content-Type']);

View File

@ -0,0 +1,10 @@
<?php
namespace PicoFeed\Client;
/**
* Exception for a resource the client is not allowed to access.
*
* Raised by the HTTP client when the server answers with a 403 status code.
* It extends ClientException, so code catching ClientException also catches it.
*
* @author Bernhard Posselt
*/
class ForbiddenException extends ClientException
{
}

View File

@ -0,0 +1,10 @@
<?php
namespace PicoFeed\Client;
/**
* Exception for wrong or missing credentials.
*
* Raised by the HTTP client when the server answers with a 401 status code.
* It extends ClientException, so code catching ClientException also catches it.
*
* @author Bernhard Posselt
*/
class UnauthorizedException extends ClientException
{
}

View File

@ -319,19 +319,25 @@ abstract class Parser
/**
* Enable the content grabber.
*
* @param bool $needs_rule_file true if only pages with rule files should be
* @param bool $needsRuleFile true if only pages with rule files should be
* scraped
* @param null|\Closure $scraperCallback Callback function that gets called for each
* scraper execution
*
* @return \PicoFeed\Parser\Parser
*/
public function enableContentGrabber($needs_rule_file = false)
public function enableContentGrabber($needsRuleFile = false, $scraperCallback = null)
{
$processor = new ScraperProcessor($this->config);
if ($needs_rule_file) {
if ($needsRuleFile) {
$processor->getScraper()->disableCandidateParser();
}
if ($scraperCallback !== null) {
$processor->setExecutionCallback($scraperCallback);
}
$this->itemPostProcessor->register($processor);
return $this;
}

View File

@ -70,6 +70,18 @@ class ItemPostProcessor extends Base
return $this;
}
/**
* Check whether a specific processor is registered.
*
* This is a plain isset() lookup on the internal processors array, so an
* entry whose value is null is reported as not registered.
*
* @access public
* @param string $class Key of the processor in the registry (presumably its class name; verify against register())
* @return bool True when a non-null entry exists for $class, false otherwise
*/
public function hasProcessor($class)
{
return isset($this->processors[$class]);
}
/**
* Get Processor instance
*

View File

@ -2,6 +2,7 @@
namespace PicoFeed\Processor;
use Closure;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
@ -18,6 +19,26 @@ class ScraperProcessor extends Base implements ItemProcessorInterface
private $ignoredUrls = array();
private $scraper;
/**
* Callback invoked after each scraper execution (null until one is set)
*
* @var Closure|null
*/
private $executionCallback;
/**
* Set the callback to run after each scraper execution.
*
* Only one callback is stored: calling this method again replaces the
* previous one. During item processing the callback is invoked as
* callback($feed, $item, $scraper) right after the scraper has executed
* and before the scraped content is applied to the item.
*
* @access public
* @param Closure $executionCallback Callback receiving the feed, the item and the scraper
* @return $this
*/
public function setExecutionCallback(Closure $executionCallback)
{
$this->executionCallback = $executionCallback;
return $this;
}
/**
* Execute Item Processor
*
@ -33,6 +54,10 @@ class ScraperProcessor extends Base implements ItemProcessorInterface
$scraper->setUrl($item->getUrl());
$scraper->execute();
if ($this->executionCallback && is_callable($this->executionCallback)) {
call_user_func($this->executionCallback, $feed, $item, $scraper);
}
if ($scraper->hasRelevantContent()) {
$item->setContent($scraper->getFilteredContent());
}

View File

@ -5,19 +5,19 @@ return array(
'%.*%' => array(
'test_url' => 'http://www.jsonline.com/news/usandworld/as-many-as-a-million-expected-for-popes-last-mass-in-us-b99585180z1-329688131.html',
'body' => array(
'//div[@id="mainContent"]',
'//div[@id="main"]',
),
'strip' => array(
'//script',
'//h1',
'//h4[@class="credit"]',
'//div[@class="columnist_container"]',
'//div[@class="storyTimestamp"]',
'//ul[@id="sharing-tools"]',
'//div[@class="title"]',
'//img[@class="floatLeft"]',
'//div[@class="first feature"]',
'//div[@class="collateral_article_content"]',
'div[contains(@class, "header")]',
'div[@class="module--headline"]',
'div[@class="main--inlinemeta"]',
'div[contains(@class, "leftcol--")]',
'p[@class="main--author"]',
'div[@class="story--rightcol"]',
'div[contains(@class, "footer")]',
'div[contains(@class, "rightcol--")]',
'div[contains(@class, "author")]',
),
),
),

View File

@ -18,6 +18,7 @@ return array(
'//section[@class="ribboned"]',
'//div[contains(@class,"sidebar")]',
'//aside[@class="article_tag_list"]',
'//section[contains(@id, "more_posts")]',
),
),
),