Update PicoFeed to have ContentGenerator

This commit is contained in:
Frederic Guillot 2016-03-24 17:49:50 -04:00
parent b8a9c91e79
commit 165acb0342
32 changed files with 649 additions and 318 deletions

View File

@ -15,7 +15,7 @@
"fguillot/simple-validator": "v1.0.0", "fguillot/simple-validator": "v1.0.0",
"fguillot/json-rpc": "v1.0.2", "fguillot/json-rpc": "v1.0.2",
"fguillot/picodb": "v1.0.2", "fguillot/picodb": "v1.0.2",
"fguillot/picofeed": "v0.1.19" "fguillot/picofeed": "v0.1.20"
}, },
"require-dev": { "require-dev": {
"phpunit/phpunit": "4.8.3", "phpunit/phpunit": "4.8.3",

View File

@ -25,6 +25,7 @@ return array(
'PicoDb\\SQLException' => $vendorDir . '/fguillot/picodb/lib/PicoDb/SQLException.php', 'PicoDb\\SQLException' => $vendorDir . '/fguillot/picodb/lib/PicoDb/SQLException.php',
'PicoDb\\Schema' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Schema.php', 'PicoDb\\Schema' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Schema.php',
'PicoDb\\Table' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Table.php', 'PicoDb\\Table' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Table.php',
'PicoFeed\\Base' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Base.php',
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php', 'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php', 'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php', 'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
@ -42,6 +43,9 @@ return array(
'PicoFeed\\Filter\\Filter' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php', 'PicoFeed\\Filter\\Filter' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php',
'PicoFeed\\Filter\\Html' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Html.php', 'PicoFeed\\Filter\\Html' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Html.php',
'PicoFeed\\Filter\\Tag' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php', 'PicoFeed\\Filter\\Tag' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php',
'PicoFeed\\Generator\\ContentGeneratorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php',
'PicoFeed\\Generator\\FileContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/FileContentGenerator.php',
'PicoFeed\\Generator\\YoutubeContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php',
'PicoFeed\\Logging\\Logger' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Logging/Logger.php', 'PicoFeed\\Logging\\Logger' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Logging/Logger.php',
'PicoFeed\\Parser\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php', 'PicoFeed\\Parser\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php',
'PicoFeed\\Parser\\DateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php', 'PicoFeed\\Parser\\DateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php',
@ -57,6 +61,11 @@ return array(
'PicoFeed\\Parser\\XmlEntityException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlEntityException.php', 'PicoFeed\\Parser\\XmlEntityException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlEntityException.php',
'PicoFeed\\Parser\\XmlParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php', 'PicoFeed\\Parser\\XmlParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php',
'PicoFeed\\PicoFeedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/PicoFeedException.php', 'PicoFeed\\PicoFeedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/PicoFeedException.php',
'PicoFeed\\Processor\\ContentFilterProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php',
'PicoFeed\\Processor\\ContentGeneratorProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php',
'PicoFeed\\Processor\\ItemPostProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php',
'PicoFeed\\Processor\\ItemProcessorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php',
'PicoFeed\\Processor\\ScraperProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ScraperProcessor.php',
'PicoFeed\\Reader\\Favicon' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php', 'PicoFeed\\Reader\\Favicon' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php',
'PicoFeed\\Reader\\Reader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php', 'PicoFeed\\Reader\\Reader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php',
'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php', 'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php',

View File

@ -163,17 +163,17 @@
}, },
{ {
"name": "fguillot/picofeed", "name": "fguillot/picofeed",
"version": "v0.1.19", "version": "v0.1.20",
"version_normalized": "0.1.19.0", "version_normalized": "0.1.20.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/fguillot/picoFeed.git", "url": "https://github.com/fguillot/picoFeed.git",
"reference": "c270ef4474a2460d857f99c84612025c5f9975f2" "reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/c270ef4474a2460d857f99c84612025c5f9975f2", "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
"reference": "c270ef4474a2460d857f99c84612025c5f9975f2", "reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -188,7 +188,7 @@
"suggest": { "suggest": {
"ext-curl": "PicoFeed will use cURL if present" "ext-curl": "PicoFeed will use cURL if present"
}, },
"time": "2016-02-11 19:52:02", "time": "2016-03-24 12:09:56",
"bin": [ "bin": [
"picofeed" "picofeed"
], ],

View File

@ -0,0 +1,34 @@
<?php
namespace PicoFeed;
use PicoFeed\Config\Config;
use PicoFeed\Logging\Logger;
/**
 * Common parent for PicoFeed components that need the configuration.
 *
 * @package PicoFeed
 * @author  Frederic Guillot
 */
abstract class Base
{
    /**
     * Configuration made available to subclasses
     *
     * @access protected
     * @var \PicoFeed\Config\Config
     */
    protected $config;

    /**
     * Keep the given configuration, or a fresh default one when none is
     * provided, and hand its timezone over to the logger.
     *
     * @param \PicoFeed\Config\Config $config Config class instance (optional)
     */
    public function __construct(Config $config = null)
    {
        if ($config === null) {
            $config = new Config();
        }

        $this->config = $config;

        Logger::setTimezone($config->getTimezone());
    }
}

View File

@ -2,24 +2,17 @@
namespace PicoFeed\Filter; namespace PicoFeed\Filter;
use DOMXpath; use DOMXPath;
use PicoFeed\Base;
use PicoFeed\Parser\XmlParser; use PicoFeed\Parser\XmlParser;
use PicoFeed\Config\Config;
/** /**
* Tag Filter class. * Tag Filter class.
* *
* @author Frederic Guillot * @author Frederic Guillot
*/ */
class Tag class Tag extends Base
{ {
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/** /**
* Tags blacklist (Xpath expressions). * Tags blacklist (Xpath expressions).
* *
@ -76,11 +69,6 @@ class Tag
'q', 'q',
); );
public function __construct(Config $config)
{
$this->config = $config;
}
/** /**
* Check if the tag is allowed and is not a pixel tracker. * Check if the tag is allowed and is not a pixel tracker.
* *

View File

@ -0,0 +1,23 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Parser\Item;
/**
* Content Generator Interface
*
* @package PicoFeed\Generator
* @author Frederic Guillot
*/
interface ContentGeneratorInterface
{
/**
* Execute Content Generator
*
* @access public
* @param Item $item
* @return boolean
*/
public function execute(Item $item);
}

View File

@ -0,0 +1,36 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Base;
use PicoFeed\Parser\Item;
/**
* File Content Generator
*
* @package PicoFeed\Generator
* @author Frederic Guillot
*/
class FileContentGenerator extends Base implements ContentGeneratorInterface
{
private $extensions = array('pdf');
/**
* Execute Content Generator
*
* @access public
* @param Item $item
* @return boolean
*/
public function execute(Item $item)
{
foreach ($this->extensions as $extension) {
if (substr($item->getUrl(), - strlen($extension)) === $extension) {
$item->setContent('<a href="'.$item->getUrl().'" target="_blank">'.$item->getUrl().'</a>');
return true;
}
}
return false;
}
}

View File

@ -0,0 +1,67 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Base;
use PicoFeed\Parser\Item;
/**
* Youtube Content Generator
*
* @package PicoFeed\Generator
* @author Frederic Guillot
*/
class YoutubeContentGenerator extends Base implements ContentGeneratorInterface
{
/**
* Execute Content Generator
*
* @access public
* @param Item $item
* @return boolean
*/
public function execute(Item $item)
{
if ($item->hasNamespace('yt')) {
return $this->generateHtmlFromXml($item);
}
return $this->generateHtmlFromUrl($item);
}
/**
* Generate HTML
*
* @access public
* @param Item $item
* @return boolean
*/
private function generateHtmlFromXml(Item $item)
{
$videoId = $item->getTag('yt:videoId');
if (! empty($videoId)) {
$item->setContent('<iframe width="560" height="315" src="//www.youtube.com/embed/'.$videoId[0].'" frameborder="0"></iframe>');
return true;
}
return false;
}
/**
* Generate HTML from item URL
*
* @access public
* @param Item $item
* @return bool
*/
public function generateHtmlFromUrl(Item $item)
{
if (preg_match('/youtube\.com\/watch\?v=(.*)/', $item->getUrl(), $matches)) {
$item->setContent('<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[1].'" frameborder="0"></iframe>');
return true;
}
return false;
}
}

View File

@ -150,7 +150,7 @@ class Atom extends Parser
$updated = XmlParser::getXPathResult($xml, 'atom:updated', $this->namespaces) $updated = XmlParser::getXPathResult($xml, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'updated'); ?: XmlParser::getXPathResult($xml, 'updated');
$feed->date = $this->date->getDateTime((string) current($updated)); $feed->date = $this->getDateParser()->getDateTime((string) current($updated));
} }
/** /**
@ -168,8 +168,8 @@ class Atom extends Parser
$updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces) $updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'updated'); ?: XmlParser::getXPathResult($entry, 'updated');
$published = !empty($published) ? $this->date->getDateTime((string) current($published)) : null; $published = !empty($published) ? $this->getDateParser()->getDateTime((string) current($published)) : null;
$updated = !empty($updated) ? $this->date->getDateTime((string) current($updated)) : null; $updated = !empty($updated) ? $this->getDateParser()->getDateTime((string) current($updated)) : null;
if ($published === null && $updated === null) { if ($published === null && $updated === null) {
$item->date = $feed->getDate(); // We use the feed date if there is no date for the item $item->date = $feed->getDate(); // We use the feed date if there is no date for the item

View File

@ -4,20 +4,22 @@ namespace PicoFeed\Parser;
use DateTime; use DateTime;
use DateTimeZone; use DateTimeZone;
use PicoFeed\Base;
/** /**
* Date Parser. * Date Parser.
* *
* @author Frederic Guillot * @author Frederic Guillot
*/ */
class DateParser class DateParser extends Base
{ {
/** /**
* Timezone used to parse feed dates. * Timezone used to parse feed dates.
* *
* @access private
* @var string * @var string
*/ */
public $timezone = 'UTC'; private $timezone = 'UTC';
/** /**
* Supported formats [ 'format' => length ]. * Supported formats [ 'format' => length ].
@ -88,7 +90,7 @@ class DateParser
*/ */
public function getValidDate($format, $value) public function getValidDate($format, $value)
{ {
$date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone)); $date = DateTime::createFromFormat($format, $value, $this->getTimeZone());
if ($date !== false) { if ($date !== false) {
$errors = DateTime::getLastErrors(); $errors = DateTime::getLastErrors();
@ -108,6 +110,17 @@ class DateParser
*/ */
public function getCurrentDateTime() public function getCurrentDateTime()
{ {
return new DateTime('now', new DateTimeZone($this->timezone)); return new DateTime('now', $this->getTimeZone());
}
/**
* Get DateTimeZone instance
*
* @access public
* @return DateTimeZone
*/
public function getTimeZone()
{
return new DateTimeZone($this->config->getTimezone() ?: $this->timezone);
} }
} }

View File

@ -102,6 +102,18 @@ class Item
*/ */
public $namespaces = array(); public $namespaces = array();
/**
* Check if a XML namespace exists
*
* @access public
* @param string $namespace
* @return bool
*/
public function hasNamespace($namespace)
{
return array_key_exists($namespace, $this->namespaces);
}
/** /**
* Get specific XML tag or attribute value. * Get specific XML tag or attribute value.
* *
@ -112,12 +124,10 @@ class Item
*/ */
public function getTag($tag, $attribute = '') public function getTag($tag, $attribute = '')
{ {
// convert to xPath attribute query
if ($attribute !== '') { if ($attribute !== '') {
$attribute = '/@'.$attribute; $attribute = '/@'.$attribute;
} }
// construct query
$query = './/'.$tag.$attribute; $query = './/'.$tag.$attribute;
$elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces); $elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);
@ -155,13 +165,29 @@ class Item
} }
/** /**
* Get url. * Get URL
*
* @access public
* @return string
*/ */
public function getUrl() public function getUrl()
{ {
return $this->url; return $this->url;
} }
/**
* Set URL
*
* @access public
* @param string $url
* @return Item
*/
public function setUrl($url)
{
$this->url = $url;
return $this;
}
/** /**
* Get id. * Get id.
*/ */
@ -186,6 +212,19 @@ class Item
return $this->content; return $this->content;
} }
/**
* Set content
*
* @access public
* @param string $value
* @return Item
*/
public function setContent($value)
{
$this->content = $value;
return $this;
}
/** /**
* Get enclosure url. * Get enclosure url.
*/ */

View File

@ -2,12 +2,15 @@
namespace PicoFeed\Parser; namespace PicoFeed\Parser;
use PicoFeed\Processor\ContentFilterProcessor;
use PicoFeed\Processor\ContentGeneratorProcessor;
use PicoFeed\Processor\ItemPostProcessor;
use PicoFeed\Processor\ScraperProcessor;
use SimpleXMLElement; use SimpleXMLElement;
use PicoFeed\Client\Url; use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding; use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter; use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger; use PicoFeed\Logging\Logger;
use PicoFeed\Scraper\Scraper;
/** /**
* Base parser class. * Base parser class.
@ -28,7 +31,7 @@ abstract class Parser
* *
* @var \PicoFeed\Parser\DateParser * @var \PicoFeed\Parser\DateParser
*/ */
protected $date; private $dateParser;
/** /**
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos(). * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
@ -66,32 +69,12 @@ abstract class Parser
protected $used_namespaces = array(); protected $used_namespaces = array();
/** /**
* Enable the content filtering. * Item Post Processor instance
* *
* @var bool * @access private
* @var ItemPostProcessor
*/ */
private $enable_filter = true; private $itemPostProcessor;
/**
* Enable the content grabber.
*
* @var bool
*/
private $enable_grabber = false;
/**
* Enable the content grabber on all pages.
*
* @var bool
*/
private $grabber_needs_rule_file = false;
/**
* Ignore those urls for the content scraper.
*
* @var array
*/
private $grabber_ignore_urls = array();
/** /**
* Constructor. * Constructor.
@ -102,7 +85,6 @@ abstract class Parser
*/ */
public function __construct($content, $http_encoding = '', $fallback_url = '') public function __construct($content, $http_encoding = '', $fallback_url = '')
{ {
$this->date = new DateParser();
$this->fallback_url = $fallback_url; $this->fallback_url = $fallback_url;
$xml_encoding = XmlParser::getEncodingFromXmlTag($content); $xml_encoding = XmlParser::getEncodingFromXmlTag($content);
@ -112,6 +94,10 @@ abstract class Parser
// Encode everything in UTF-8 // Encode everything in UTF-8
Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
$this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding); $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
$this->itemPostProcessor = new ItemPostProcessor($this->config);
$this->itemPostProcessor->register(new ContentGeneratorProcessor($this->config));
$this->itemPostProcessor->register(new ContentFilterProcessor($this->config));
} }
/** /**
@ -173,15 +159,11 @@ abstract class Parser
// Id generation can use the item url/title/content (order is important) // Id generation can use the item url/title/content (order is important)
$this->findItemId($entry, $item, $feed); $this->findItemId($entry, $item, $feed);
$this->findItemDate($entry, $item, $feed); $this->findItemDate($entry, $item, $feed);
$this->findItemEnclosure($entry, $item, $feed); $this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed); $this->findItemLanguage($entry, $item, $feed);
// Order is important (avoid double filtering) $this->itemPostProcessor->execute($feed, $item);
$this->filterItemContent($feed, $item);
$this->scrapWebsite($item);
$feed->items[] = $item; $feed->items[] = $item;
} }
@ -230,43 +212,29 @@ abstract class Parser
} }
/** /**
* Fetch item content with the content grabber. * Get Item Post Processor instance
* *
* @param Item $item Item object * @access public
* @return ItemPostProcessor
*/ */
public function scrapWebsite(Item $item) public function getItemPostProcessor()
{ {
if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) { return $this->itemPostProcessor;
$grabber = new Scraper($this->config);
$grabber->setUrl($item->getUrl());
if ($this->grabber_needs_rule_file) {
$grabber->disableCandidateParser();
}
$grabber->execute();
if ($grabber->hasRelevantContent()) {
$item->content = $grabber->getFilteredContent();
}
}
} }
/** /**
* Filter HTML for entry content. * Get DateParser instance
* *
* @param Feed $feed Feed object * @access public
* @param Item $item Item object * @return DateParser
*/ */
public function filterItemContent(Feed $feed, Item $item) public function getDateParser()
{ {
if ($this->isFilteringEnabled()) { if ($this->dateParser === null) {
$filter = Filter::html($item->getContent(), $feed->getSiteUrl()); return new DateParser($this->config);
$filter->setConfig($this->config);
$item->content = $filter->execute();
} else {
Logger::setMessage(get_called_class().': Content filtering disabled');
} }
return $this->dateParser;
} }
/** /**
@ -316,31 +284,11 @@ abstract class Parser
* Set Hash algorithm used for id generation. * Set Hash algorithm used for id generation.
* *
* @param string $algo Algorithm name * @param string $algo Algorithm name
*
* @return \PicoFeed\Parser\Parser * @return \PicoFeed\Parser\Parser
*/ */
public function setHashAlgo($algo) public function setHashAlgo($algo)
{ {
$this->hash_algo = $algo ?: $this->hash_algo; $this->hash_algo = $algo ?: $this->hash_algo;
return $this;
}
/**
* Set a different timezone.
*
* @see http://php.net/manual/en/timezones.php
*
* @param string $timezone Timezone
*
* @return \PicoFeed\Parser\Parser
*/
public function setTimezone($timezone)
{
if ($timezone) {
$this->date->timezone = $timezone;
}
return $this; return $this;
} }
@ -354,7 +302,6 @@ abstract class Parser
public function setConfig($config) public function setConfig($config)
{ {
$this->config = $config; $this->config = $config;
return $this; return $this;
} }
@ -365,21 +312,8 @@ abstract class Parser
*/ */
public function disableContentFiltering() public function disableContentFiltering()
{ {
$this->enable_filter = false; $this->itemPostProcessor->unregister('PicoFeed\Processor\ContentFilterProcessor');
} return $this;
/**
* Return true if the content filtering is enabled.
*
* @return bool
*/
public function isFilteringEnabled()
{
if ($this->config === null) {
return $this->enable_filter;
}
return $this->config->getContentFiltering($this->enable_filter);
} }
/** /**
@ -392,8 +326,14 @@ abstract class Parser
*/ */
public function enableContentGrabber($needs_rule_file = false) public function enableContentGrabber($needs_rule_file = false)
{ {
$this->enable_grabber = true; $processor = new ScraperProcessor($this->config);
$this->grabber_needs_rule_file = $needs_rule_file;
if ($needs_rule_file) {
$processor->getScraper()->disableCandidateParser();
}
$this->itemPostProcessor->register($processor);
return $this;
} }
/** /**
@ -405,7 +345,8 @@ abstract class Parser
*/ */
public function setGrabberIgnoreUrls(array $urls) public function setGrabberIgnoreUrls(array $urls)
{ {
$this->grabber_ignore_urls = $urls; $this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor')->ignoreUrls($urls);
return $this;
} }
/** /**

View File

@ -149,7 +149,7 @@ class Rss10 extends Parser
$date = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces) $date = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces); ?: XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
$feed->date = $this->date->getDateTime((string) current($date)); $feed->date = $this->getDateParser()->getDateTime((string) current($date));
} }
/** /**
@ -163,7 +163,7 @@ class Rss10 extends Parser
{ {
$date = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces); $date = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);
$item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date)); $item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
} }
/** /**

View File

@ -139,11 +139,11 @@ class Rss20 extends Parser
$publish_date = XmlParser::getXPathResult($xml, 'channel/pubDate'); $publish_date = XmlParser::getXPathResult($xml, 'channel/pubDate');
$update_date = XmlParser::getXPathResult($xml, 'channel/lastBuildDate'); $update_date = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');
$published = !empty($publish_date) ? $this->date->getDateTime((string) current($publish_date)) : null; $published = !empty($publish_date) ? $this->getDateParser()->getDateTime((string) current($publish_date)) : null;
$updated = !empty($update_date) ? $this->date->getDateTime((string) current($update_date)) : null; $updated = !empty($update_date) ? $this->getDateParser()->getDateTime((string) current($update_date)) : null;
if ($published === null && $updated === null) { if ($published === null && $updated === null) {
$feed->date = $this->date->getCurrentDateTime(); // We use the current date if there is no date for the feed $feed->date = $this->getDateParser()->getCurrentDateTime(); // We use the current date if there is no date for the feed
} elseif ($published !== null && $updated !== null) { } elseif ($published !== null && $updated !== null) {
$feed->date = max($published, $updated); // We use the most recent date between published and updated $feed->date = max($published, $updated); // We use the most recent date between published and updated
} else { } else {
@ -162,7 +162,7 @@ class Rss20 extends Parser
{ {
$date = XmlParser::getXPathResult($entry, 'pubDate'); $date = XmlParser::getXPathResult($entry, 'pubDate');
$item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date)); $item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
} }
/** /**

View File

@ -38,7 +38,7 @@ class XmlParser
* *
* @param string $input XML content * @param string $input XML content
* *
* @return \DOMNDocument * @return \DOMDocument
*/ */
public static function getDomDocument($input) public static function getDomDocument($input)
{ {
@ -63,7 +63,7 @@ class XmlParser
* @param $dom pass in a dom document or use null/omit if simpleXml should * @param $dom pass in a dom document or use null/omit if simpleXml should
* be used * be used
*/ */
private static function scan($input, $dom=null) private static function scan($input, $dom = null)
{ {
try { try {
return Security::scan($input, $dom); return Security::scan($input, $dom);

View File

@ -0,0 +1,37 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
* Item Content Filter
*
* @package PicoFeed\Processor
* @author Frederic Guillot
*/
class ContentFilterProcessor extends Base implements ItemProcessorInterface
{
/**
* Execute Item Processor
*
* @access public
* @param Feed $feed
* @param Item $item
* @return bool
*/
public function execute(Feed $feed, Item $item)
{
if ($this->config->getContentFiltering(true)) {
$filter = Filter::html($item->getContent(), $feed->getSiteUrl());
$filter->setConfig($this->config);
$item->setContent($filter->execute());
} else {
Logger::setMessage(get_called_class().': Content filtering disabled');
}
}
}

View File

@ -0,0 +1,49 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
* Item Content Generator
*
* @package PicoFeed\Processor
* @author Frederic Guillot
*/
class ContentGeneratorProcessor extends Base implements ItemProcessorInterface
{
/**
* List of generators
*
* @access protected
* @var array
*/
protected $generators = array(
'youtube',
'file',
);
/**
* Execute Item Processor
*
* @access public
* @param Feed $feed
* @param Item $item
* @return bool
*/
public function execute(Feed $feed, Item $item)
{
foreach ($this->generators as $generator) {
$className = '\PicoFeed\Generator\\'.ucfirst($generator).'ContentGenerator';
$object = new $className($this->config);
if ($object->execute($item)) {
return true;
}
}
return false;
}
}

View File

@ -0,0 +1,84 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
* Item Post Processor
*
* @package PicoFeed\Processor
* @author Frederic Guillot
*/
class ItemPostProcessor extends Base
{
/**
* List of processors
*
* @access private
* @var array
*/
private $processors = array();
/**
* Execute all processors
*
* @access public
* @param Feed $feed
* @param Item $item
* @return bool
*/
public function execute(Feed $feed, Item $item)
{
foreach ($this->processors as $processor) {
if ($processor->execute($feed, $item)) {
return true;
}
}
return false;
}
/**
* Register a new Item post-processor
*
* @access public
* @param ItemProcessorInterface $processor
* @return ItemPostProcessor
*/
public function register(ItemProcessorInterface $processor)
{
$this->processors[get_class($processor)] = $processor;
return $this;
}
/**
* Remove Processor instance
*
* @access public
* @param string $class
* @return ItemPostProcessor
*/
public function unregister($class)
{
if (isset($this->processors[$class])) {
unset($this->processors[$class]);
}
return $this;
}
/**
* Get Processor instance
*
* @access public
* @param string $class
* @return ItemProcessorInterface|null
*/
public function getProcessor($class)
{
return isset($this->processors[$class]) ? $this->processors[$class] : null;
}
}

View File

@ -0,0 +1,25 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
* Item Processor Interface
*
* @package PicoFeed\Processor
* @author Frederic Guillot
*/
interface ItemProcessorInterface
{
/**
* Execute Item Processor
*
* @access public
* @param Feed $feed
* @param Item $item
* @return bool
*/
public function execute(Feed $feed, Item $item);
}

View File

@ -0,0 +1,71 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
use PicoFeed\Scraper\Scraper;
/**
* Scraper Processor
*
* @package PicoFeed\Processor
* @author Frederic Guillot
*/
class ScraperProcessor extends Base implements ItemProcessorInterface
{
private $ignoredUrls = array();
private $scraper;
/**
* Execute Item Processor
*
* @access public
* @param Feed $feed
* @param Item $item
* @return bool
*/
public function execute(Feed $feed, Item $item)
{
if (!in_array($item->getUrl(), $this->ignoredUrls)) {
$scraper = $this->getScraper();
$scraper->setUrl($item->getUrl());
$scraper->execute();
if ($scraper->hasRelevantContent()) {
$item->setContent($scraper->getFilteredContent());
}
}
return false;
}
/**
* Ignore list of URLs
*
* @access public
* @param array $urls
* @return $this
*/
public function ignoreUrls(array $urls)
{
$this->ignoredUrls = $urls;
return $this;
}
/**
* Returns Scraper instance
*
* @access public
* @return Scraper
*/
public function getScraper()
{
if ($this->scraper === null) {
$this->scraper = new Scraper($this->config);
}
return $this->scraper;
}
}

View File

@ -2,11 +2,11 @@
namespace PicoFeed\Reader; namespace PicoFeed\Reader;
use DOMXpath; use DOMXPath;
use PicoFeed\Base;
use PicoFeed\Client\Client; use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException; use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url; use PicoFeed\Client\Url;
use PicoFeed\Config\Config;
use PicoFeed\Logging\Logger; use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser; use PicoFeed\Parser\XmlParser;
@ -17,7 +17,7 @@ use PicoFeed\Parser\XmlParser;
* *
* @author Frederic Guillot * @author Frederic Guillot
*/ */
class Favicon class Favicon extends Base
{ {
/** /**
* Valid types for favicon (supported by browsers). * Valid types for favicon (supported by browsers).
@ -33,13 +33,6 @@ class Favicon
'image/svg+xml' 'image/svg+xml'
); );
/**
* Config class instance.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/** /**
* Icon binary content. * Icon binary content.
* *
@ -54,16 +47,6 @@ class Favicon
*/ */
private $content_type = ''; private $content_type = '';
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config = null)
{
$this->config = $config ?: new Config();
}
/** /**
* Get the icon file content (available only after the download). * Get the icon file content (available only after the download).
* *

View File

@ -3,7 +3,7 @@
namespace PicoFeed\Reader; namespace PicoFeed\Reader;
use DOMXPath; use DOMXPath;
use PicoFeed\Config\Config; use PicoFeed\Base;
use PicoFeed\Client\Client; use PicoFeed\Client\Client;
use PicoFeed\Client\Url; use PicoFeed\Client\Url;
use PicoFeed\Logging\Logger; use PicoFeed\Logging\Logger;
@ -14,7 +14,7 @@ use PicoFeed\Parser\XmlParser;
* *
* @author Frederic Guillot * @author Frederic Guillot
*/ */
class Reader class Reader extends Base
{ {
/** /**
* Feed formats for detection. * Feed formats for detection.
@ -29,24 +29,6 @@ class Reader
'Rss10' => '//rdf', 'Rss10' => '//rdf',
); );
/**
* Config class instance.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config = null)
{
$this->config = $config ?: new Config();
Logger::setTimezone($this->config->getTimezone());
}
/** /**
* Download a feed (no discovery). * Download a feed (no discovery).
* *
@ -163,7 +145,6 @@ class Reader
$parser = new $className($content, $encoding, $url); $parser = new $className($content, $encoding, $url);
$parser->setHashAlgo($this->config->getParserHashAlgo()); $parser->setHashAlgo($this->config->getParserHashAlgo());
$parser->setTimezone($this->config->getTimezone());
$parser->setConfig($this->config); $parser->setConfig($this->config);
return $parser; return $parser;

View File

@ -0,0 +1,11 @@
<?php
// Scraper rule file: returns the content-extraction rules for one site.
// The returned array has a single 'grabber' key mapping URL patterns to rules.
return array(
'grabber' => array(
// Pattern key is a regular expression using '%' as delimiter; '.*' matches any string.
// NOTE(review): presumably tested with preg_match() against the URL path -- confirm in the rule consumer.
'%.*%' => array(
// Sample article URL for this rule (presumably used to test the rule -- confirm).
'test_url' => 'http://eliascarpe.over-blog.com/2015/12/re-upload-projets-d-avenir.html',
// XPath expressions selecting the nodes that hold the article content.
'body' => array(
// Any <div> whose class attribute contains the whole token "ob-section".
'//div[contains(concat(" ", normalize-space(@class), " "), " ob-section ")]',
),
)
)
);

View File

@ -0,0 +1,13 @@
<?php
// Scraper rule file: returns the content-extraction rules for one site.
// The returned array has a single 'grabber' key mapping URL patterns to rules.
return array(
'grabber' => array(
// Pattern key is a regular expression using '%' as delimiter; '.*' matches any string.
// NOTE(review): presumably tested with preg_match() against the URL path -- confirm in the rule consumer.
'%.*%' => array(
// Sample article URL for this rule (presumably used to test the rule -- confirm).
'test_url' => 'http://encyclopedie.naheulbeuk.com/article.php3?id_article=352',
// XPath expressions selecting the content nodes: title, over-title and text,
// each looked up below a table cell (<td>).
'body' => array(
'//td//h1[@class="titre-texte"]',
'//td//div[@class="surtitre"]',
'//td//div[@class="texte"]',
),
)
),
);

View File

@ -1,12 +0,0 @@
<?php
// Scraper rule file: returns the content-extraction rules for one site.
// The returned array has a single 'grabber' key mapping URL patterns to rules.
return array(
'grabber' => array(
// Pattern key is a regular expression using '%' as delimiter; it only matches
// strings containing "/joyoftech/".
// NOTE(review): presumably tested with preg_match() against the URL path -- confirm in the rule consumer.
'%/joyoftech/.*%' => array(
// XPath expressions selecting the content nodes: images declared 640 pixels wide.
'body' => array(
'//img[@width="640"]',
),
// Sample page URL for this rule (presumably used to test the rule -- confirm).
'test_url' => 'http://www.geekculture.com/joyoftech/joyarchives/2235.html',
),
),
);

View File

@ -1,18 +0,0 @@
<?php
// Scraper rule file: returns the content-extraction rules for one site.
// The returned array has a single 'grabber' key mapping URL patterns to rules.
return array(
'grabber' => array(
// Pattern key is a regular expression using '%' as delimiter; '.*' matches any string.
// NOTE(review): presumably tested with preg_match() against the URL path -- confirm in the rule consumer.
'%.*%' => array(
// Sample article URL for this rule (presumably used to test the rule -- confirm).
'test_url' => 'http://news.sciencemag.org/biology/2015/09/genetic-engineering-turns-common-plant-cancer-fighter',
// XPath expressions selecting the nodes that hold the article content.
'body' => array(
'//div[@class="content"]',
),
// XPath expressions for unwanted nodes (headline, social box, author teaser, byline).
// NOTE(review): presumably these nodes are removed from the extracted content -- confirm.
'strip' => array(
'//h1[@class="snews-article__headline"]',
'//div[contains(@class,"easy_social_box")]',
'//div[@class="author-teaser"]',
'//div[@class="article-byline"]',
),
),
)
);

View File

@ -0,0 +1,20 @@
<?php
// Scraper rule file: returns the content-extraction rules for one site.
// The returned array has a single 'grabber' key mapping URL patterns to rules.
return array(
'grabber' => array(
// Pattern key is a regular expression using '%' as delimiter; '.*' matches any string.
// NOTE(review): presumably tested with preg_match() against the URL path -- confirm in the rule consumer.
'%.*%' => array(
// Sample article URL for this rule (presumably used to test the rule -- confirm).
'test_url' => 'http://www.rugbyrama.fr/rugby/top-14/2015-2016/top-14-hayman-coupe-du-monde-finale-2012-lutte.-voici-levan-chilachava-toulon_sto5283863/story.shtml',
// XPath expressions selecting the nodes that hold the article content.
'body' => array(
'//div[@class="story-simple-content"]',
),
// XPath expressions for unwanted nodes: scripts, forms, styles, share buttons,
// mobile-only blocks and the tracking image.
// NOTE(review): presumably these nodes are removed from the extracted content -- confirm.
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="share-buttons"]',
'//*[@class="show-mobile-block"]',
'//*[@class="hide-desktop"]',
'//*[@id="tracking_img"]',
)
)
)
);

View File

@ -4,8 +4,7 @@ return array(
'%.*%' => array( '%.*%' => array(
'test_url' => 'http://www.franceculture.fr/emission-culture-eco-la-finance-aime-toujours-la-france-2016-01-08', 'test_url' => 'http://www.franceculture.fr/emission-culture-eco-la-finance-aime-toujours-la-france-2016-01-08',
'body' => array( 'body' => array(
'//div[@class="heading"]/*/*/div[contains(@class,"player-inline")]', '//div[@class="text-zone"]',
'//article/div[@class="text-zone"]',
), ),
'strip' => array( 'strip' => array(
'//ul[@class="tags"]', '//ul[@class="tags"]',

View File

@ -0,0 +1,11 @@
<?php
// Scraper rule file: returns the content-extraction rules for one site.
// The returned array has a single 'grabber' key mapping URL patterns to rules.
return array(
'grabber' => array(
// Pattern key is a regular expression using '%' as delimiter; '.*' matches any string.
// NOTE(review): presumably tested with preg_match() against the URL path -- confirm in the rule consumer.
'%.*%' => array(
// Sample page URL for this rule (presumably used to test the rule -- confirm).
'test_url' => 'http://www.monsieur-le-chien.fr/index.php?planche=672',
// XPath expressions selecting the content nodes: images whose src attribute
// starts with the relative path "i/planches/".
'body' => array(
'//img[starts-with(@src, "i/planches/")]',
),
)
)
);

View File

@ -0,0 +1,16 @@
<?php
// Scraper rule file: returns the content-extraction rules for one site.
// The returned array has a single 'grabber' key mapping URL patterns to rules.
return array(
    'grabber' => array(
        // Pattern key is a regular expression using '%' as delimiter; '.*' matches any string.
        // NOTE(review): presumably tested with preg_match() against the URL path -- confirm in the rule consumer.
        '%.*%' => array(
            // Sample article URL for this rule (presumably used to test the rule -- confirm).
            // The string literal was previously left unterminated, which made this
            // file a parse error; it is now closed.
            // NOTE(review): the URL itself looks truncated -- the trailing "$" is most
            // likely an editor line-overflow marker. Restore the full article URL.
            'test_url' => 'http://www.sciencemag.org/news/2016/01/could-bright-foamy-wak$',
            // XPath expressions selecting the nodes that hold the article content.
            'body' => array(
                '//div[@class="row--hero"]',
                '//article[contains(@class,"primary")]',
            ),
            // XPath expressions for unwanted nodes (article header and footer).
            // NOTE(review): presumably these nodes are removed from the extracted content -- confirm.
            'strip' => array(
                '//header[@class="article__header"]',
                '//footer[@class="article__foot"]',
            ),
        ),
    )
);

View File

@ -2,8 +2,8 @@
namespace PicoFeed\Scraper; namespace PicoFeed\Scraper;
use PicoFeed\Base;
use PicoFeed\Logging\Logger; use PicoFeed\Logging\Logger;
use PicoFeed\Config\Config;
/** /**
* RuleLoader class. * RuleLoader class.
@ -11,25 +11,8 @@ use PicoFeed\Config\Config;
* @author Frederic Guillot * @author Frederic Guillot
* @author Bernhard Posselt * @author Bernhard Posselt
*/ */
class RuleLoader class RuleLoader extends Base
{ {
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config)
{
$this->config = $config;
}
/** /**
* Get the rules for an URL. * Get the rules for an URL.
* *
@ -111,12 +94,14 @@ class RuleLoader
*/ */
public function getRulesFolders() public function getRulesFolders()
{ {
$folders = array(__DIR__.'/../Rules'); $folders = array();
if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) { if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
$folders[] = $this->config->getGrabberRulesFolder(); $folders[] = $this->config->getGrabberRulesFolder();
} }
$folders[] = __DIR__ . '/../Rules';
return $folders; return $folders;
} }
} }

View File

@ -2,10 +2,10 @@
namespace PicoFeed\Scraper; namespace PicoFeed\Scraper;
use PicoFeed\Base;
use PicoFeed\Client\Client; use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException; use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url; use PicoFeed\Client\Url;
use PicoFeed\Config\Config;
use PicoFeed\Encoding\Encoding; use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter; use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger; use PicoFeed\Logging\Logger;
@ -16,7 +16,7 @@ use PicoFeed\Parser\XmlParser;
* *
* @author Frederic Guillot * @author Frederic Guillot
*/ */
class Scraper class Scraper extends Base
{ {
/** /**
* URL. * URL.
@ -53,24 +53,6 @@ class Scraper
*/ */
private $enableCandidateParser = true; private $enableCandidateParser = true;
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config class instance
*/
public function __construct(Config $config)
{
$this->config = $config;
Logger::setTimezone($this->config->getTimezone());
}
/** /**
* Disable candidates parsing. * Disable candidates parsing.
* *
@ -79,7 +61,6 @@ class Scraper
public function disableCandidateParser() public function disableCandidateParser()
{ {
$this->enableCandidateParser = false; $this->enableCandidateParser = false;
return $this; return $this;
} }
@ -227,9 +208,11 @@ class Scraper
*/ */
public function execute() public function execute()
{ {
$this->download(); $this->content = '';
$this->html = '';
$this->encoding = '';
if (!$this->skipProcessing()) { $this->download();
$this->prepareHtml(); $this->prepareHtml();
$parser = $this->getParser(); $parser = $this->getParser();
@ -239,34 +222,6 @@ class Scraper
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes'); Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
} }
} }
}
/**
* Returns true if the parsing must be skipped.
*
* @return bool
*/
public function skipProcessing()
{
$handlers = array(
'detectStreamingVideos',
'detectPdfFiles',
);
foreach ($handlers as $handler) {
if ($this->$handler()) {
return true;
}
}
if (empty($this->html)) {
Logger::setMessage(get_called_class().': Raw HTML is empty');
return true;
}
return false;
}
/** /**
* Get the parser. * Get the parser.
@ -287,17 +242,14 @@ class Scraper
if (preg_match($pattern, $sub_url)) { if (preg_match($pattern, $sub_url)) {
Logger::setMessage(get_called_class().': Matched url '.$sub_url); Logger::setMessage(get_called_class().': Matched url '.$sub_url);
return new RuleParser($this->html, $rule); return new RuleParser($this->html, $rule);
} }
} }
} elseif ($this->enableCandidateParser) { } elseif ($this->enableCandidateParser) {
Logger::setMessage(get_called_class().': Parse content with candidates'); Logger::setMessage(get_called_class().': Parse content with candidates');
return new CandidateParser($this->html);
} }
return; return new CandidateParser($this->html);
} }
/** /**
@ -312,30 +264,4 @@ class Scraper
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"'); Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
} }
/**
* Return the Youtube embed player and skip processing.
*
* @return bool
*/
public function detectStreamingVideos()
{
if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
$this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
return true;
}
return false;
}
/**
* Skip processing for PDF documents.
*
* @return bool
*/
public function detectPdfFiles()
{
return substr($this->url, -3) === 'pdf';
}
} }