Update PicoFeed to have ContentGenerator

This commit is contained in:
Frederic Guillot 2016-03-24 17:49:50 -04:00
parent b8a9c91e79
commit 165acb0342
32 changed files with 649 additions and 318 deletions

View File

@ -15,7 +15,7 @@
"fguillot/simple-validator": "v1.0.0",
"fguillot/json-rpc": "v1.0.2",
"fguillot/picodb": "v1.0.2",
"fguillot/picofeed": "v0.1.19"
"fguillot/picofeed": "v0.1.20"
},
"require-dev": {
"phpunit/phpunit": "4.8.3",

View File

@ -25,6 +25,7 @@ return array(
'PicoDb\\SQLException' => $vendorDir . '/fguillot/picodb/lib/PicoDb/SQLException.php',
'PicoDb\\Schema' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Schema.php',
'PicoDb\\Table' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Table.php',
'PicoFeed\\Base' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Base.php',
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
@ -42,6 +43,9 @@ return array(
'PicoFeed\\Filter\\Filter' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php',
'PicoFeed\\Filter\\Html' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Html.php',
'PicoFeed\\Filter\\Tag' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php',
'PicoFeed\\Generator\\ContentGeneratorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php',
'PicoFeed\\Generator\\FileContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/FileContentGenerator.php',
'PicoFeed\\Generator\\YoutubeContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php',
'PicoFeed\\Logging\\Logger' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Logging/Logger.php',
'PicoFeed\\Parser\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php',
'PicoFeed\\Parser\\DateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php',
@ -57,6 +61,11 @@ return array(
'PicoFeed\\Parser\\XmlEntityException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlEntityException.php',
'PicoFeed\\Parser\\XmlParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php',
'PicoFeed\\PicoFeedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/PicoFeedException.php',
'PicoFeed\\Processor\\ContentFilterProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php',
'PicoFeed\\Processor\\ContentGeneratorProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php',
'PicoFeed\\Processor\\ItemPostProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php',
'PicoFeed\\Processor\\ItemProcessorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php',
'PicoFeed\\Processor\\ScraperProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ScraperProcessor.php',
'PicoFeed\\Reader\\Favicon' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php',
'PicoFeed\\Reader\\Reader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php',
'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php',

View File

@ -163,17 +163,17 @@
},
{
"name": "fguillot/picofeed",
"version": "v0.1.19",
"version_normalized": "0.1.19.0",
"version": "v0.1.20",
"version_normalized": "0.1.20.0",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "c270ef4474a2460d857f99c84612025c5f9975f2"
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/c270ef4474a2460d857f99c84612025c5f9975f2",
"reference": "c270ef4474a2460d857f99c84612025c5f9975f2",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
"shasum": ""
},
"require": {
@ -188,7 +188,7 @@
"suggest": {
"ext-curl": "PicoFeed will use cURL if present"
},
"time": "2016-02-11 19:52:02",
"time": "2016-03-24 12:09:56",
"bin": [
"picofeed"
],

View File

@ -0,0 +1,34 @@
<?php
namespace PicoFeed;
use PicoFeed\Config\Config;
use PicoFeed\Logging\Logger;
/**
 * Common parent for PicoFeed components that carry a configuration object.
 *
 * @package PicoFeed
 * @author  Frederic Guillot
 */
abstract class Base
{
    /**
     * Configuration available to the concrete component.
     *
     * @access protected
     * @var \PicoFeed\Config\Config
     */
    protected $config;

    /**
     * Keep the given configuration (or a freshly created default one) and
     * hand its timezone over to the logger.
     *
     * @param \PicoFeed\Config\Config $config Config class instance; a new Config is created when omitted
     */
    public function __construct(Config $config = null)
    {
        if ($config === null) {
            $config = new Config();
        }

        $this->config = $config;

        Logger::setTimezone($this->config->getTimezone());
    }
}

View File

@ -2,24 +2,17 @@
namespace PicoFeed\Filter;
use DOMXpath;
use DOMXPath;
use PicoFeed\Base;
use PicoFeed\Parser\XmlParser;
use PicoFeed\Config\Config;
/**
* Tag Filter class.
*
* @author Frederic Guillot
*/
class Tag
class Tag extends Base
{
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Tags blacklist (Xpath expressions).
*
@ -76,11 +69,6 @@ class Tag
'q',
);
public function __construct(Config $config)
{
$this->config = $config;
}
/**
* Check if the tag is allowed and is not a pixel tracker.
*

View File

@ -0,0 +1,23 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Parser\Item;
/**
 * Content Generator Interface
 *
 * Contract for classes that can build the content of a feed item themselves.
 * The generators shipped in this namespace return true right after calling
 * Item::setContent() and false when the item is not something they handle.
 *
 * @package PicoFeed\Generator
 * @author Frederic Guillot
 */
interface ContentGeneratorInterface
{
/**
 * Execute Content Generator
 *
 * @access public
 * @param Item $item Item to inspect and, when applicable, to fill with generated content
 * @return bool True when content was generated for the item, false otherwise
 */
public function execute(Item $item);
}

View File

@ -0,0 +1,36 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Base;
use PicoFeed\Parser\Item;
/**
 * File Content Generator
 *
 * Replaces the content of items whose URL ends with a known file extension
 * (currently only PDF) by a plain HTML link to that file.
 *
 * @package PicoFeed\Generator
 * @author  Frederic Guillot
 */
class FileContentGenerator extends Base implements ContentGeneratorInterface
{
    /**
     * Handled file extensions, lowercase and without the leading dot
     *
     * @access private
     * @var array
     */
    private $extensions = array('pdf');

    /**
     * Execute Content Generator
     *
     * The URL must end with ".<extension>" (case-insensitive); a bare "pdf"
     * suffix such as ".../what-is-a-pdf" is not treated as a file.
     *
     * @access public
     * @param  Item $item
     * @return boolean True when the item content was replaced by a link
     */
    public function execute(Item $item)
    {
        $url = (string) $item->getUrl();

        foreach ($this->extensions as $extension) {
            $suffix = '.'.$extension;

            if (strtolower(substr($url, -strlen($suffix))) === $suffix) {
                // The URL comes from the feed and this markup is not passed through
                // the content filter afterwards (the post-processor chain stops at the
                // first processor returning true), so it has to be escaped here.
                $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');

                $item->setContent('<a href="'.$safeUrl.'" target="_blank">'.$safeUrl.'</a>');
                return true;
            }
        }

        return false;
    }
}

View File

@ -0,0 +1,67 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Base;
use PicoFeed\Parser\Item;
/**
 * Youtube Content Generator
 *
 * Replaces the item content by an embedded Youtube player when a video id can
 * be found, either in the item XML (yt namespace) or in the item URL.
 *
 * @package PicoFeed\Generator
 * @author  Frederic Guillot
 */
class YoutubeContentGenerator extends Base implements ContentGeneratorInterface
{
    /**
     * Execute Content Generator
     *
     * Items declaring the "yt" namespace are handled from their XML only;
     * every other item is handled from its URL.
     *
     * @access public
     * @param  Item $item
     * @return boolean True when the item content was replaced by a player
     */
    public function execute(Item $item)
    {
        if ($item->hasNamespace('yt')) {
            return $this->generateHtmlFromXml($item);
        }

        return $this->generateHtmlFromUrl($item);
    }

    /**
     * Generate HTML from the yt:videoId tag of the item
     *
     * @access private
     * @param  Item $item
     * @return boolean
     */
    private function generateHtmlFromXml(Item $item)
    {
        $videoId = $item->getTag('yt:videoId');

        if (! empty($videoId)) {
            $item->setContent($this->buildPlayer($videoId[0]));
            return true;
        }

        return false;
    }

    /**
     * Generate HTML from item URL
     *
     * Only the video id itself is captured, so trailing query parameters
     * (for example "&t=10" or "&feature=...") do not end up in the embed URL.
     *
     * @access public
     * @param  Item $item
     * @return bool
     */
    public function generateHtmlFromUrl(Item $item)
    {
        if (preg_match('/youtube\.com\/watch\?v=([a-zA-Z0-9_-]+)/', $item->getUrl(), $matches)) {
            $item->setContent($this->buildPlayer($matches[1]));
            return true;
        }

        return false;
    }

    /**
     * Build the iframe markup for a video id
     *
     * The id is escaped because it comes from the feed and is written into an
     * HTML attribute.
     *
     * @access private
     * @param  string $videoId
     * @return string
     */
    private function buildPlayer($videoId)
    {
        $safeId = htmlspecialchars((string) $videoId, ENT_QUOTES, 'UTF-8');

        return '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$safeId.'" frameborder="0"></iframe>';
    }
}

View File

@ -150,7 +150,7 @@ class Atom extends Parser
$updated = XmlParser::getXPathResult($xml, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'updated');
$feed->date = $this->date->getDateTime((string) current($updated));
$feed->date = $this->getDateParser()->getDateTime((string) current($updated));
}
/**
@ -168,8 +168,8 @@ class Atom extends Parser
$updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'updated');
$published = !empty($published) ? $this->date->getDateTime((string) current($published)) : null;
$updated = !empty($updated) ? $this->date->getDateTime((string) current($updated)) : null;
$published = !empty($published) ? $this->getDateParser()->getDateTime((string) current($published)) : null;
$updated = !empty($updated) ? $this->getDateParser()->getDateTime((string) current($updated)) : null;
if ($published === null && $updated === null) {
$item->date = $feed->getDate(); // We use the feed date if there is no date for the item

View File

@ -4,20 +4,22 @@ namespace PicoFeed\Parser;
use DateTime;
use DateTimeZone;
use PicoFeed\Base;
/**
* Date Parser.
*
* @author Frederic Guillot
*/
class DateParser
class DateParser extends Base
{
/**
* Timezone used to parse feed dates.
*
* @access private
* @var string
*/
public $timezone = 'UTC';
private $timezone = 'UTC';
/**
* Supported formats [ 'format' => length ].
@ -88,7 +90,7 @@ class DateParser
*/
public function getValidDate($format, $value)
{
$date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));
$date = DateTime::createFromFormat($format, $value, $this->getTimeZone());
if ($date !== false) {
$errors = DateTime::getLastErrors();
@ -108,6 +110,17 @@ class DateParser
*/
public function getCurrentDateTime()
{
return new DateTime('now', new DateTimeZone($this->timezone));
return new DateTime('now', $this->getTimeZone());
}
/**
 * Build the DateTimeZone used when parsing dates
 *
 * The timezone name is read from the config; when that value is empty the
 * parser's own $timezone property is used instead.
 *
 * @access public
 * @return DateTimeZone
 */
public function getTimeZone()
{
    $name = $this->config->getTimezone();

    if (! $name) {
        $name = $this->timezone;
    }

    return new DateTimeZone($name);
}
}

View File

@ -102,6 +102,18 @@ class Item
*/
public $namespaces = array();
/**
 * Tell whether the given key is present in the item's $namespaces array
 *
 * @access public
 * @param  string $namespace Key to look up (presumably the namespace prefix, e.g. "yt")
 * @return bool
 */
public function hasNamespace($namespace)
{
    $declared = $this->namespaces;

    return array_key_exists($namespace, $declared);
}
/**
* Get specific XML tag or attribute value.
*
@ -112,12 +124,10 @@ class Item
*/
public function getTag($tag, $attribute = '')
{
// convert to xPath attribute query
if ($attribute !== '') {
$attribute = '/@'.$attribute;
}
// construct query
$query = './/'.$tag.$attribute;
$elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);
@ -155,13 +165,29 @@ class Item
}
/**
* Get url.
* Get URL
*
* @access public
* @return string
*/
public function getUrl()
{
return $this->url;
}
/**
 * Replace the item URL
 *
 * @access public
 * @param  string $url New URL
 * @return Item   The item itself, to allow chained calls
 */
public function setUrl($url)
{
    $this->url = $url;

    return $this;
}
/**
* Get id.
*/
@ -186,6 +212,19 @@ class Item
return $this->content;
}
/**
 * Replace the item content
 *
 * @access public
 * @param  string $value New content
 * @return Item   The item itself, to allow chained calls
 */
public function setContent($value)
{
    $this->content = $value;

    return $this;
}
/**
* Get enclosure url.
*/

View File

@ -2,12 +2,15 @@
namespace PicoFeed\Parser;
use PicoFeed\Processor\ContentFilterProcessor;
use PicoFeed\Processor\ContentGeneratorProcessor;
use PicoFeed\Processor\ItemPostProcessor;
use PicoFeed\Processor\ScraperProcessor;
use SimpleXMLElement;
use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
use PicoFeed\Scraper\Scraper;
/**
* Base parser class.
@ -28,7 +31,7 @@ abstract class Parser
*
* @var \PicoFeed\Parser\DateParser
*/
protected $date;
private $dateParser;
/**
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
@ -66,32 +69,12 @@ abstract class Parser
protected $used_namespaces = array();
/**
* Enable the content filtering.
* Item Post Processor instance
*
* @var bool
* @access private
* @var ItemPostProcessor
*/
private $enable_filter = true;
/**
* Enable the content grabber.
*
* @var bool
*/
private $enable_grabber = false;
/**
* Enable the content grabber on all pages.
*
* @var bool
*/
private $grabber_needs_rule_file = false;
/**
* Ignore those urls for the content scraper.
*
* @var array
*/
private $grabber_ignore_urls = array();
private $itemPostProcessor;
/**
* Constructor.
@ -102,7 +85,6 @@ abstract class Parser
*/
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
$this->date = new DateParser();
$this->fallback_url = $fallback_url;
$xml_encoding = XmlParser::getEncodingFromXmlTag($content);
@ -112,6 +94,10 @@ abstract class Parser
// Encode everything in UTF-8
Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
$this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
$this->itemPostProcessor = new ItemPostProcessor($this->config);
$this->itemPostProcessor->register(new ContentGeneratorProcessor($this->config));
$this->itemPostProcessor->register(new ContentFilterProcessor($this->config));
}
/**
@ -173,15 +159,11 @@ abstract class Parser
// Id generation can use the item url/title/content (order is important)
$this->findItemId($entry, $item, $feed);
$this->findItemDate($entry, $item, $feed);
$this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed);
// Order is important (avoid double filtering)
$this->filterItemContent($feed, $item);
$this->scrapWebsite($item);
$this->itemPostProcessor->execute($feed, $item);
$feed->items[] = $item;
}
@ -230,43 +212,29 @@ abstract class Parser
}
/**
* Fetch item content with the content grabber.
* Get Item Post Processor instance
*
* @param Item $item Item object
* @access public
* @return ItemPostProcessor
*/
public function scrapWebsite(Item $item)
public function getItemPostProcessor()
{
if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) {
$grabber = new Scraper($this->config);
$grabber->setUrl($item->getUrl());
if ($this->grabber_needs_rule_file) {
$grabber->disableCandidateParser();
}
$grabber->execute();
if ($grabber->hasRelevantContent()) {
$item->content = $grabber->getFilteredContent();
}
}
return $this->itemPostProcessor;
}
/**
* Filter HTML for entry content.
* Get DateParser instance
*
* @param Feed $feed Feed object
* @param Item $item Item object
* @access public
* @return DateParser
*/
public function filterItemContent(Feed $feed, Item $item)
public function getDateParser()
{
if ($this->isFilteringEnabled()) {
$filter = Filter::html($item->getContent(), $feed->getSiteUrl());
$filter->setConfig($this->config);
$item->content = $filter->execute();
} else {
Logger::setMessage(get_called_class().': Content filtering disabled');
if ($this->dateParser === null) {
return new DateParser($this->config);
}
return $this->dateParser;
}
/**
@ -316,31 +284,11 @@ abstract class Parser
* Set Hash algorithm used for id generation.
*
* @param string $algo Algorithm name
*
* @return \PicoFeed\Parser\Parser
*/
public function setHashAlgo($algo)
{
$this->hash_algo = $algo ?: $this->hash_algo;
return $this;
}
/**
* Set a different timezone.
*
* @see http://php.net/manual/en/timezones.php
*
* @param string $timezone Timezone
*
* @return \PicoFeed\Parser\Parser
*/
public function setTimezone($timezone)
{
if ($timezone) {
$this->date->timezone = $timezone;
}
return $this;
}
@ -354,7 +302,6 @@ abstract class Parser
public function setConfig($config)
{
$this->config = $config;
return $this;
}
@ -365,21 +312,8 @@ abstract class Parser
*/
public function disableContentFiltering()
{
$this->enable_filter = false;
}
/**
* Return true if the content filtering is enabled.
*
* @return bool
*/
public function isFilteringEnabled()
{
if ($this->config === null) {
return $this->enable_filter;
}
return $this->config->getContentFiltering($this->enable_filter);
$this->itemPostProcessor->unregister('PicoFeed\Processor\ContentFilterProcessor');
return $this;
}
/**
@ -392,8 +326,14 @@ abstract class Parser
*/
public function enableContentGrabber($needs_rule_file = false)
{
$this->enable_grabber = true;
$this->grabber_needs_rule_file = $needs_rule_file;
$processor = new ScraperProcessor($this->config);
if ($needs_rule_file) {
$processor->getScraper()->disableCandidateParser();
}
$this->itemPostProcessor->register($processor);
return $this;
}
/**
@ -405,7 +345,8 @@ abstract class Parser
*/
public function setGrabberIgnoreUrls(array $urls)
{
$this->grabber_ignore_urls = $urls;
$this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor')->ignoreUrls($urls);
return $this;
}
/**

View File

@ -149,7 +149,7 @@ class Rss10 extends Parser
$date = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
$feed->date = $this->date->getDateTime((string) current($date));
$feed->date = $this->getDateParser()->getDateTime((string) current($date));
}
/**
@ -163,7 +163,7 @@ class Rss10 extends Parser
{
$date = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);
$item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date));
$item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
}
/**

View File

@ -139,11 +139,11 @@ class Rss20 extends Parser
$publish_date = XmlParser::getXPathResult($xml, 'channel/pubDate');
$update_date = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');
$published = !empty($publish_date) ? $this->date->getDateTime((string) current($publish_date)) : null;
$updated = !empty($update_date) ? $this->date->getDateTime((string) current($update_date)) : null;
$published = !empty($publish_date) ? $this->getDateParser()->getDateTime((string) current($publish_date)) : null;
$updated = !empty($update_date) ? $this->getDateParser()->getDateTime((string) current($update_date)) : null;
if ($published === null && $updated === null) {
$feed->date = $this->date->getCurrentDateTime(); // We use the current date if there is no date for the feed
$feed->date = $this->getDateParser()->getCurrentDateTime(); // We use the current date if there is no date for the feed
} elseif ($published !== null && $updated !== null) {
$feed->date = max($published, $updated); // We use the most recent date between published and updated
} else {
@ -162,7 +162,7 @@ class Rss20 extends Parser
{
$date = XmlParser::getXPathResult($entry, 'pubDate');
$item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date));
$item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
}
/**

View File

@ -38,7 +38,7 @@ class XmlParser
*
* @param string $input XML content
*
* @return \DOMNDocument
* @return \DOMDocument
*/
public static function getDomDocument($input)
{
@ -60,10 +60,10 @@ class XmlParser
* Small wrapper around ZendXml to turn their exceptions into picoFeed
* exceptions
* @param $input the xml to load
* @param $dom pass in a dom document or use null/omit if simpleXml should
* @param $dom pass in a dom document or use null/omit if simpleXml should
* be used
*/
private static function scan($input, $dom=null)
private static function scan($input, $dom = null)
{
try {
return Security::scan($input, $dom);

View File

@ -0,0 +1,37 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
 * Item Content Filter
 *
 * Runs the HTML filter on the item content unless content filtering is
 * disabled in the config.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ContentFilterProcessor extends Base implements ItemProcessorInterface
{
    /**
     * Execute Item Processor
     *
     * @access public
     * @param  Feed $feed Feed the item belongs to, its site URL is handed to the filter
     * @param  Item $item Item whose content is filtered in place
     * @return bool Always false, so the processors registered after this one still run
     */
    public function execute(Feed $feed, Item $item)
    {
        if ($this->config->getContentFiltering(true)) {
            $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
            $filter->setConfig($this->config);
            $item->setContent($filter->execute());
        } else {
            Logger::setMessage(get_called_class().': Content filtering disabled');
        }

        // Explicit boolean as required by ItemProcessorInterface. It must be false:
        // ItemPostProcessor stops the chain at the first processor returning true.
        return false;
    }
}

View File

@ -0,0 +1,49 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
 * Item Content Generator
 *
 * Tries every known content generator on an item until one of them
 * reports that it produced the content.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ContentGeneratorProcessor extends Base implements ItemProcessorInterface
{
    /**
     * Generator names, tried in this order; each one is mapped to the class
     * \PicoFeed\Generator\<Name>ContentGenerator
     *
     * @access protected
     * @var array
     */
    protected $generators = array(
        'youtube',
        'file',
    );

    /**
     * Execute Item Processor
     *
     * @access public
     * @param  Feed $feed Not used by this processor
     * @param  Item $item Item handed to each generator
     * @return bool True as soon as one generator returns true, false when none does
     */
    public function execute(Feed $feed, Item $item)
    {
        foreach ($this->generators as $name) {
            $class = '\PicoFeed\Generator\\'.ucfirst($name).'ContentGenerator';
            $instance = new $class($this->config);
            $generated = $instance->execute($item);

            if ($generated) {
                return true;
            }
        }

        return false;
    }
}

View File

@ -0,0 +1,84 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
 * Item Post Processor
 *
 * Registry of item processors, indexed by class name, that are run one
 * after the other on each item.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ItemPostProcessor extends Base
{
    /**
     * Registered processors, keyed by the class name of each instance
     *
     * @access private
     * @var array
     */
    private $processors = array();

    /**
     * Run the processors in registration order
     *
     * The loop ends at the first processor whose execute() returns a truthy
     * value; the remaining processors are not called for that item.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool True when a processor stopped the chain, false otherwise
     */
    public function execute(Feed $feed, Item $item)
    {
        foreach ($this->processors as $current) {
            $handled = $current->execute($feed, $item);

            if ($handled) {
                return true;
            }
        }

        return false;
    }

    /**
     * Add a processor, replacing any instance of the same class already registered
     *
     * @access public
     * @param  ItemProcessorInterface $processor
     * @return ItemPostProcessor
     */
    public function register(ItemProcessorInterface $processor)
    {
        $key = get_class($processor);
        $this->processors[$key] = $processor;

        return $this;
    }

    /**
     * Drop the processor registered under the given class name, if any
     *
     * @access public
     * @param  string $class
     * @return ItemPostProcessor
     */
    public function unregister($class)
    {
        // unset() on a missing key is a silent no-op, no existence check needed
        unset($this->processors[$class]);

        return $this;
    }

    /**
     * Fetch the processor registered under the given class name
     *
     * @access public
     * @param  string $class
     * @return ItemProcessorInterface|null Null when no such processor is registered
     */
    public function getProcessor($class)
    {
        if (! isset($this->processors[$class])) {
            return null;
        }

        return $this->processors[$class];
    }
}

View File

@ -0,0 +1,25 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
 * Item Processor Interface
 *
 * Contract for the processors run on each parsed item by ItemPostProcessor.
 *
 * @package PicoFeed\Processor
 * @author Frederic Guillot
 */
interface ItemProcessorInterface
{
/**
 * Execute Item Processor
 *
 * ItemPostProcessor::execute() stops calling further processors as soon as
 * one of them returns true, so return true only to end the chain for the item.
 *
 * @access public
 * @param Feed $feed Feed the item belongs to
 * @param Item $item Item to process
 * @return bool
 */
public function execute(Feed $feed, Item $item);
}

View File

@ -0,0 +1,71 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
use PicoFeed\Scraper\Scraper;
/**
 * Scraper Processor
 *
 * Runs the scraper on the item URL and, when relevant content is found,
 * uses it as the item content.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ScraperProcessor extends Base implements ItemProcessorInterface
{
    /**
     * URLs that must not be scraped
     *
     * @access private
     * @var array
     */
    private $ignoredUrls = array();

    /**
     * Scraper instance, created on first use
     *
     * @access private
     * @var Scraper
     */
    private $scraper;

    /**
     * Execute Item Processor
     *
     * @access public
     * @param  Feed $feed Not used by this processor
     * @param  Item $item Item whose URL is scraped
     * @return bool Always false
     */
    public function execute(Feed $feed, Item $item)
    {
        $url = $item->getUrl();

        if (in_array($url, $this->ignoredUrls)) {
            return false;
        }

        $scraper = $this->getScraper();
        $scraper->setUrl($url);
        $scraper->execute();

        if ($scraper->hasRelevantContent()) {
            $item->setContent($scraper->getFilteredContent());
        }

        return false;
    }

    /**
     * Replace the list of URLs that must not be scraped
     *
     * @access public
     * @param  array $urls
     * @return $this
     */
    public function ignoreUrls(array $urls)
    {
        $this->ignoredUrls = $urls;

        return $this;
    }

    /**
     * Return the Scraper instance, building it with the current config on the first call
     *
     * @access public
     * @return Scraper
     */
    public function getScraper()
    {
        if ($this->scraper !== null) {
            return $this->scraper;
        }

        $this->scraper = new Scraper($this->config);

        return $this->scraper;
    }
}

View File

@ -2,11 +2,11 @@
namespace PicoFeed\Reader;
use DOMXpath;
use DOMXPath;
use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url;
use PicoFeed\Config\Config;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
@ -17,7 +17,7 @@ use PicoFeed\Parser\XmlParser;
*
* @author Frederic Guillot
*/
class Favicon
class Favicon extends Base
{
/**
* Valid types for favicon (supported by browsers).
@ -33,13 +33,6 @@ class Favicon
'image/svg+xml'
);
/**
* Config class instance.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Icon binary content.
*
@ -54,16 +47,6 @@ class Favicon
*/
private $content_type = '';
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config = null)
{
$this->config = $config ?: new Config();
}
/**
* Get the icon file content (available only after the download).
*

View File

@ -3,7 +3,7 @@
namespace PicoFeed\Reader;
use DOMXPath;
use PicoFeed\Config\Config;
use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\Url;
use PicoFeed\Logging\Logger;
@ -14,7 +14,7 @@ use PicoFeed\Parser\XmlParser;
*
* @author Frederic Guillot
*/
class Reader
class Reader extends Base
{
/**
* Feed formats for detection.
@ -29,24 +29,6 @@ class Reader
'Rss10' => '//rdf',
);
/**
* Config class instance.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config = null)
{
$this->config = $config ?: new Config();
Logger::setTimezone($this->config->getTimezone());
}
/**
* Download a feed (no discovery).
*
@ -163,7 +145,6 @@ class Reader
$parser = new $className($content, $encoding, $url);
$parser->setHashAlgo($this->config->getParserHashAlgo());
$parser->setTimezone($this->config->getTimezone());
$parser->setConfig($this->config);
return $parser;

View File

@ -0,0 +1,11 @@
<?php
// Scraper rule file: returns a 'grabber' map of URL regex => rule.
// '%.*%' matches any URL of the site; 'body' lists XPath expressions
// (presumably the nodes kept as article content) and 'test_url' is a sample page.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://eliascarpe.over-blog.com/2015/12/re-upload-projets-d-avenir.html',
'body' => array(
'//div[contains(concat(" ", normalize-space(@class), " "), " ob-section ")]',
),
)
)
);

View File

@ -0,0 +1,13 @@
<?php
// Scraper rule file: returns a 'grabber' map of URL regex => rule.
// '%.*%' matches any URL of the site; 'body' lists XPath expressions
// (presumably the nodes kept as article content) and 'test_url' is a sample page.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://encyclopedie.naheulbeuk.com/article.php3?id_article=352',
'body' => array(
'//td//h1[@class="titre-texte"]',
'//td//div[@class="surtitre"]',
'//td//div[@class="texte"]',
),
)
),
);

View File

@ -1,12 +0,0 @@
<?php
// Scraper rule file: returns a 'grabber' map of URL regex => rule.
// This rule only applies to URLs matching '%/joyoftech/.*%'; 'body' lists XPath
// expressions (presumably the nodes kept as content) and 'test_url' is a sample page.
return array(
'grabber' => array(
'%/joyoftech/.*%' => array(
'body' => array(
'//img[@width="640"]',
),
'test_url' => 'http://www.geekculture.com/joyoftech/joyarchives/2235.html',
),
),
);

View File

@ -1,18 +0,0 @@
<?php
// Scraper rule file: returns a 'grabber' map of URL regex => rule.
// '%.*%' matches any URL of the site; 'body' lists XPath expressions (presumably
// the nodes kept as article content), 'strip' lists XPath expressions (presumably
// the nodes removed from it) and 'test_url' is a sample page.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://news.sciencemag.org/biology/2015/09/genetic-engineering-turns-common-plant-cancer-fighter',
'body' => array(
'//div[@class="content"]',
),
'strip' => array(
'//h1[@class="snews-article__headline"]',
'//div[contains(@class,"easy_social_box")]',
'//div[@class="author-teaser"]',
'//div[@class="article-byline"]',
),
),
)
);

View File

@ -0,0 +1,20 @@
<?php
// Scraper rule file: returns a 'grabber' map of URL regex => rule.
// '%.*%' matches any URL of the site; 'body' lists XPath expressions (presumably
// the nodes kept as article content), 'strip' lists XPath expressions (presumably
// the nodes removed from it) and 'test_url' is a sample page.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.rugbyrama.fr/rugby/top-14/2015-2016/top-14-hayman-coupe-du-monde-finale-2012-lutte.-voici-levan-chilachava-toulon_sto5283863/story.shtml',
'body' => array(
'//div[@class="story-simple-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="share-buttons"]',
'//*[@class="show-mobile-block"]',
'//*[@class="hide-desktop"]',
'//*[@id="tracking_img"]',
)
)
)
);

View File

@ -4,8 +4,7 @@ return array(
'%.*%' => array(
'test_url' => 'http://www.franceculture.fr/emission-culture-eco-la-finance-aime-toujours-la-france-2016-01-08',
'body' => array(
'//div[@class="heading"]/*/*/div[contains(@class,"player-inline")]',
'//article/div[@class="text-zone"]',
'//div[@class="text-zone"]',
),
'strip' => array(
'//ul[@class="tags"]',

View File

@ -0,0 +1,11 @@
<?php
// Scraper rule file: returns a 'grabber' map of URL regex => rule.
// '%.*%' matches any URL of the site; 'body' lists XPath expressions
// (presumably the nodes kept as content, here the comic strip image)
// and 'test_url' is a sample page.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.monsieur-le-chien.fr/index.php?planche=672',
'body' => array(
'//img[starts-with(@src, "i/planches/")]',
),
)
)
);

View File

@ -0,0 +1,16 @@
<?php
// Scraper rule file: returns a 'grabber' map of URL regex => rule.
// '%.*%' matches any URL of the site; 'body' lists XPath expressions (presumably
// the nodes kept as article content), 'strip' lists XPath expressions (presumably
// the nodes removed from it) and 'test_url' is a sample page.
return array(
    'grabber' => array(
        '%.*%' => array(
            // The string literal was left unterminated (parse error); it is closed here.
            // NOTE(review): the URL itself looks cut off by a terminal editor (trailing "$");
            // presumably the full slug is "could-bright-foamy-wakes-ocean-ships-combat-global-warming"
            // - TODO confirm and restore the complete URL.
            'test_url' => 'http://www.sciencemag.org/news/2016/01/could-bright-foamy-wak$',
            'body' => array(
                '//div[@class="row--hero"]',
                '//article[contains(@class,"primary")]',
            ),
            'strip' => array(
                '//header[@class="article__header"]',
                '//footer[@class="article__foot"]',
            ),
        ),
    ),
);

View File

@ -2,8 +2,8 @@
namespace PicoFeed\Scraper;
use PicoFeed\Base;
use PicoFeed\Logging\Logger;
use PicoFeed\Config\Config;
/**
* RuleLoader class.
@ -11,25 +11,8 @@ use PicoFeed\Config\Config;
* @author Frederic Guillot
* @author Bernhard Posselt
*/
class RuleLoader
class RuleLoader extends Base
{
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config)
{
$this->config = $config;
}
/**
* Get the rules for an URL.
*
@ -111,12 +94,14 @@ class RuleLoader
*/
public function getRulesFolders()
{
$folders = array(__DIR__.'/../Rules');
$folders = array();
if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
$folders[] = $this->config->getGrabberRulesFolder();
}
$folders[] = __DIR__ . '/../Rules';
return $folders;
}
}

View File

@ -2,10 +2,10 @@
namespace PicoFeed\Scraper;
use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url;
use PicoFeed\Config\Config;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
@ -16,7 +16,7 @@ use PicoFeed\Parser\XmlParser;
*
* @author Frederic Guillot
*/
class Scraper
class Scraper extends Base
{
/**
* URL.
@ -53,24 +53,6 @@ class Scraper
*/
private $enableCandidateParser = true;
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config)
{
$this->config = $config;
Logger::setTimezone($this->config->getTimezone());
}
/**
* Disable candidates parsing.
*
@ -79,7 +61,6 @@ class Scraper
public function disableCandidateParser()
{
$this->enableCandidateParser = false;
return $this;
}
@ -227,47 +208,21 @@ class Scraper
*/
public function execute()
{
$this->content = '';
$this->html = '';
$this->encoding = '';
$this->download();
$this->prepareHtml();
if (!$this->skipProcessing()) {
$this->prepareHtml();
$parser = $this->getParser();
$parser = $this->getParser();
if ($parser !== null) {
$this->content = $parser->execute();
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
}
if ($parser !== null) {
$this->content = $parser->execute();
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
}
}
/**
* Returns true if the parsing must be skipped.
*
* @return bool
*/
public function skipProcessing()
{
$handlers = array(
'detectStreamingVideos',
'detectPdfFiles',
);
foreach ($handlers as $handler) {
if ($this->$handler()) {
return true;
}
}
if (empty($this->html)) {
Logger::setMessage(get_called_class().': Raw HTML is empty');
return true;
}
return false;
}
/**
* Get the parser.
*
@ -287,17 +242,14 @@ class Scraper
if (preg_match($pattern, $sub_url)) {
Logger::setMessage(get_called_class().': Matched url '.$sub_url);
return new RuleParser($this->html, $rule);
}
}
} elseif ($this->enableCandidateParser) {
Logger::setMessage(get_called_class().': Parse content with candidates');
return new CandidateParser($this->html);
}
return;
return new CandidateParser($this->html);
}
/**
@ -312,30 +264,4 @@ class Scraper
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
}
/**
* Return the Youtube embed player and skip processing.
*
* @return bool
*/
public function detectStreamingVideos()
{
if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
$this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
return true;
}
return false;
}
/**
* Skip processing for PDF documents.
*
* @return bool
*/
public function detectPdfFiles()
{
return substr($this->url, -3) === 'pdf';
}
}