diff --git a/composer.json b/composer.json
index 5bee794..9d45006 100644
--- a/composer.json
+++ b/composer.json
@@ -15,7 +15,7 @@
"fguillot/simple-validator": "v1.0.0",
"fguillot/json-rpc": "v1.0.2",
"fguillot/picodb": "v1.0.2",
- "fguillot/picofeed": "v0.1.19"
+ "fguillot/picofeed": "v0.1.20"
},
"require-dev": {
"phpunit/phpunit": "4.8.3",
diff --git a/vendor/composer/autoload_classmap.php b/vendor/composer/autoload_classmap.php
index cca7d49..ecabc01 100644
--- a/vendor/composer/autoload_classmap.php
+++ b/vendor/composer/autoload_classmap.php
@@ -25,6 +25,7 @@ return array(
'PicoDb\\SQLException' => $vendorDir . '/fguillot/picodb/lib/PicoDb/SQLException.php',
'PicoDb\\Schema' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Schema.php',
'PicoDb\\Table' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Table.php',
+ 'PicoFeed\\Base' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Base.php',
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
@@ -42,6 +43,9 @@ return array(
'PicoFeed\\Filter\\Filter' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php',
'PicoFeed\\Filter\\Html' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Html.php',
'PicoFeed\\Filter\\Tag' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php',
+ 'PicoFeed\\Generator\\ContentGeneratorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php',
+ 'PicoFeed\\Generator\\FileContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/FileContentGenerator.php',
+ 'PicoFeed\\Generator\\YoutubeContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php',
'PicoFeed\\Logging\\Logger' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Logging/Logger.php',
'PicoFeed\\Parser\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php',
'PicoFeed\\Parser\\DateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php',
@@ -57,6 +61,11 @@ return array(
'PicoFeed\\Parser\\XmlEntityException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlEntityException.php',
'PicoFeed\\Parser\\XmlParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php',
'PicoFeed\\PicoFeedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/PicoFeedException.php',
+ 'PicoFeed\\Processor\\ContentFilterProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php',
+ 'PicoFeed\\Processor\\ContentGeneratorProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php',
+ 'PicoFeed\\Processor\\ItemPostProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php',
+ 'PicoFeed\\Processor\\ItemProcessorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php',
+ 'PicoFeed\\Processor\\ScraperProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ScraperProcessor.php',
'PicoFeed\\Reader\\Favicon' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php',
'PicoFeed\\Reader\\Reader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php',
'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php',
diff --git a/vendor/composer/installed.json b/vendor/composer/installed.json
index f72aa48..d65a909 100644
--- a/vendor/composer/installed.json
+++ b/vendor/composer/installed.json
@@ -163,17 +163,17 @@
},
{
"name": "fguillot/picofeed",
- "version": "v0.1.19",
- "version_normalized": "0.1.19.0",
+ "version": "v0.1.20",
+ "version_normalized": "0.1.20.0",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
- "reference": "c270ef4474a2460d857f99c84612025c5f9975f2"
+ "reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/c270ef4474a2460d857f99c84612025c5f9975f2",
- "reference": "c270ef4474a2460d857f99c84612025c5f9975f2",
+ "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
+ "reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
"shasum": ""
},
"require": {
@@ -188,7 +188,7 @@
"suggest": {
"ext-curl": "PicoFeed will use cURL if present"
},
- "time": "2016-02-11 19:52:02",
+ "time": "2016-03-24 12:09:56",
"bin": [
"picofeed"
],
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Base.php b/vendor/fguillot/picofeed/lib/PicoFeed/Base.php
new file mode 100644
index 0000000..4be0985
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Base.php
@@ -0,0 +1,34 @@
+config = $config ?: new Config();
+ Logger::setTimezone($this->config->getTimezone());
+ }
+}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php
index 34e21dc..5fd8d6d 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php
@@ -2,24 +2,17 @@
namespace PicoFeed\Filter;
-use DOMXpath;
+use DOMXPath;
+use PicoFeed\Base;
use PicoFeed\Parser\XmlParser;
-use PicoFeed\Config\Config;
/**
* Tag Filter class.
*
* @author Frederic Guillot
*/
-class Tag
+class Tag extends Base
{
- /**
- * Config object.
- *
- * @var \PicoFeed\Config\Config
- */
- private $config;
-
/**
* Tags blacklist (Xpath expressions).
*
@@ -76,11 +69,6 @@ class Tag
'q',
);
- public function __construct(Config $config)
- {
- $this->config = $config;
- }
-
/**
* Check if the tag is allowed and is not a pixel tracker.
*
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php
new file mode 100644
index 0000000..5c2f205
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php
@@ -0,0 +1,23 @@
+extensions as $extension) {
+ if (substr($item->getUrl(), - strlen($extension)) === $extension) {
+ $item->setContent(''.$item->getUrl().'');
+ return true;
+ }
+ }
+
+ return false;
+ }
+}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php
new file mode 100644
index 0000000..198090d
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php
@@ -0,0 +1,67 @@
+hasNamespace('yt')) {
+ return $this->generateHtmlFromXml($item);
+ }
+
+ return $this->generateHtmlFromUrl($item);
+ }
+
+ /**
+ * Generate HTML from the yt:videoId tag of the item
+ *
+ * @access private
+ * @param Item $item
+ * @return boolean True when a non-empty yt:videoId was found and the content was set
+ */
+ private function generateHtmlFromXml(Item $item)
+ {
+ $videoId = $item->getTag('yt:videoId');
+
+ if (! empty($videoId)) {
+ $item->setContent('');
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Generate HTML from item URL
+ *
+ * Only URLs containing "youtube.com/watch?v=" are handled.
+ * NOTE(review): the (.*) capture is greedy, so everything after "v=" is
+ * captured, including any further query parameters - confirm this is intended.
+ *
+ * @access public
+ * @param Item $item
+ * @return bool True when the URL matched and the content was set
+ */
+ public function generateHtmlFromUrl(Item $item)
+ {
+ if (preg_match('/youtube\.com\/watch\?v=(.*)/', $item->getUrl(), $matches)) {
+ $item->setContent('');
+ return true;
+ }
+
+ return false;
+ }
+}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php
index 356453c..6325923 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php
@@ -150,7 +150,7 @@ class Atom extends Parser
$updated = XmlParser::getXPathResult($xml, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'updated');
- $feed->date = $this->date->getDateTime((string) current($updated));
+ $feed->date = $this->getDateParser()->getDateTime((string) current($updated));
}
/**
@@ -168,8 +168,8 @@ class Atom extends Parser
$updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'updated');
- $published = !empty($published) ? $this->date->getDateTime((string) current($published)) : null;
- $updated = !empty($updated) ? $this->date->getDateTime((string) current($updated)) : null;
+ $published = !empty($published) ? $this->getDateParser()->getDateTime((string) current($published)) : null;
+ $updated = !empty($updated) ? $this->getDateParser()->getDateTime((string) current($updated)) : null;
if ($published === null && $updated === null) {
$item->date = $feed->getDate(); // We use the feed date if there is no date for the item
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php
index e4d08b5..4ad0078 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php
@@ -4,20 +4,22 @@ namespace PicoFeed\Parser;
use DateTime;
use DateTimeZone;
+use PicoFeed\Base;
/**
* Date Parser.
*
* @author Frederic Guillot
*/
-class DateParser
+class DateParser extends Base
{
/**
* Timezone used to parse feed dates.
*
+ * @access private
* @var string
*/
- public $timezone = 'UTC';
+ private $timezone = 'UTC';
/**
* Supported formats [ 'format' => length ].
@@ -88,7 +90,7 @@ class DateParser
*/
public function getValidDate($format, $value)
{
- $date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));
+ $date = DateTime::createFromFormat($format, $value, $this->getTimeZone());
if ($date !== false) {
$errors = DateTime::getLastErrors();
@@ -108,6 +110,17 @@ class DateParser
*/
public function getCurrentDateTime()
{
- return new DateTime('now', new DateTimeZone($this->timezone));
+ return new DateTime('now', $this->getTimeZone());
+ }
+
+ /**
+ * Build a DateTimeZone from the configured timezone
+ *
+ * Falls back to $this->timezone when the config value is empty.
+ * A new DateTimeZone object is created on every call.
+ *
+ * @access public
+ * @return DateTimeZone
+ */
+ public function getTimeZone()
+ {
+ return new DateTimeZone($this->config->getTimezone() ?: $this->timezone);
}
}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php
index 22d7c59..34e557a 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php
@@ -102,6 +102,18 @@ class Item
*/
public $namespaces = array();
+ /**
+ * Check if an XML namespace is declared for this item
+ *
+ * @access public
+ * @param string $namespace Key to look up in $this->namespaces
+ * @return bool True when the key exists, even if its value is null
+ */
+ public function hasNamespace($namespace)
+ {
+ return array_key_exists($namespace, $this->namespaces);
+ }
+
/**
* Get specific XML tag or attribute value.
*
@@ -112,12 +124,10 @@ class Item
*/
public function getTag($tag, $attribute = '')
{
- // convert to xPath attribute query
if ($attribute !== '') {
$attribute = '/@'.$attribute;
}
- // construct query
$query = './/'.$tag.$attribute;
$elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);
@@ -155,13 +165,29 @@ class Item
}
/**
- * Get url.
+ * Get URL
+ *
+ * @access public
+ * @return string
*/
public function getUrl()
{
return $this->url;
}
+ /**
+ * Set the item URL
+ *
+ * @access public
+ * @param string $url
+ * @return Item Current instance, to allow method chaining
+ */
+ public function setUrl($url)
+ {
+ $this->url = $url;
+ return $this;
+ }
+
/**
* Get id.
*/
@@ -186,6 +212,19 @@ class Item
return $this->content;
}
+ /**
+ * Set the item content
+ *
+ * @access public
+ * @param string $value
+ * @return Item Current instance, to allow method chaining
+ */
+ public function setContent($value)
+ {
+ $this->content = $value;
+ return $this;
+ }
+
/**
* Get enclosure url.
*/
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
index 5130b68..433f21a 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
@@ -2,12 +2,15 @@
namespace PicoFeed\Parser;
+use PicoFeed\Processor\ContentFilterProcessor;
+use PicoFeed\Processor\ContentGeneratorProcessor;
+use PicoFeed\Processor\ItemPostProcessor;
+use PicoFeed\Processor\ScraperProcessor;
use SimpleXMLElement;
use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
-use PicoFeed\Scraper\Scraper;
/**
* Base parser class.
@@ -28,7 +31,7 @@ abstract class Parser
*
* @var \PicoFeed\Parser\DateParser
*/
- protected $date;
+ private $dateParser;
/**
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
@@ -66,32 +69,12 @@ abstract class Parser
protected $used_namespaces = array();
/**
- * Enable the content filtering.
+ * Item Post Processor instance
*
- * @var bool
+ * @access private
+ * @var ItemPostProcessor
*/
- private $enable_filter = true;
-
- /**
- * Enable the content grabber.
- *
- * @var bool
- */
- private $enable_grabber = false;
-
- /**
- * Enable the content grabber on all pages.
- *
- * @var bool
- */
- private $grabber_needs_rule_file = false;
-
- /**
- * Ignore those urls for the content scraper.
- *
- * @var array
- */
- private $grabber_ignore_urls = array();
+ private $itemPostProcessor;
/**
* Constructor.
@@ -102,7 +85,6 @@ abstract class Parser
*/
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
- $this->date = new DateParser();
$this->fallback_url = $fallback_url;
$xml_encoding = XmlParser::getEncodingFromXmlTag($content);
@@ -112,6 +94,10 @@ abstract class Parser
// Encode everything in UTF-8
Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
$this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
+
+ $this->itemPostProcessor = new ItemPostProcessor($this->config);
+ $this->itemPostProcessor->register(new ContentGeneratorProcessor($this->config));
+ $this->itemPostProcessor->register(new ContentFilterProcessor($this->config));
}
/**
@@ -173,15 +159,11 @@ abstract class Parser
// Id generation can use the item url/title/content (order is important)
$this->findItemId($entry, $item, $feed);
-
$this->findItemDate($entry, $item, $feed);
$this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed);
- // Order is important (avoid double filtering)
- $this->filterItemContent($feed, $item);
- $this->scrapWebsite($item);
-
+ $this->itemPostProcessor->execute($feed, $item);
$feed->items[] = $item;
}
@@ -230,43 +212,29 @@ abstract class Parser
}
/**
- * Fetch item content with the content grabber.
+ * Get Item Post Processor instance
*
- * @param Item $item Item object
+ * @access public
+ * @return ItemPostProcessor
*/
- public function scrapWebsite(Item $item)
+ public function getItemPostProcessor()
{
- if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) {
- $grabber = new Scraper($this->config);
- $grabber->setUrl($item->getUrl());
-
- if ($this->grabber_needs_rule_file) {
- $grabber->disableCandidateParser();
- }
-
- $grabber->execute();
-
- if ($grabber->hasRelevantContent()) {
- $item->content = $grabber->getFilteredContent();
- }
- }
+ return $this->itemPostProcessor;
}
/**
- * Filter HTML for entry content.
+ * Get DateParser instance (created on the first call, then reused)
+ *
+ * The instance is built with the config held by the parser at the time of
+ * the first call.
*
- * @param Feed $feed Feed object
- * @param Item $item Item object
+ * @access public
+ * @return DateParser
*/
- public function filterItemContent(Feed $feed, Item $item)
+ public function getDateParser()
{
- if ($this->isFilteringEnabled()) {
- $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
- $filter->setConfig($this->config);
- $item->content = $filter->execute();
- } else {
- Logger::setMessage(get_called_class().': Content filtering disabled');
+ if ($this->dateParser === null) {
+ // Store the instance: returning it directly left the property null forever
+ $this->dateParser = new DateParser($this->config);
}
+
+ return $this->dateParser;
}
/**
@@ -316,31 +284,11 @@ abstract class Parser
* Set Hash algorithm used for id generation.
*
* @param string $algo Algorithm name
- *
* @return \PicoFeed\Parser\Parser
*/
public function setHashAlgo($algo)
{
$this->hash_algo = $algo ?: $this->hash_algo;
-
- return $this;
- }
-
- /**
- * Set a different timezone.
- *
- * @see http://php.net/manual/en/timezones.php
- *
- * @param string $timezone Timezone
- *
- * @return \PicoFeed\Parser\Parser
- */
- public function setTimezone($timezone)
- {
- if ($timezone) {
- $this->date->timezone = $timezone;
- }
-
return $this;
}
@@ -354,7 +302,6 @@ abstract class Parser
public function setConfig($config)
{
$this->config = $config;
-
return $this;
}
@@ -365,21 +312,8 @@ abstract class Parser
*/
public function disableContentFiltering()
{
- $this->enable_filter = false;
- }
-
- /**
- * Return true if the content filtering is enabled.
- *
- * @return bool
- */
- public function isFilteringEnabled()
- {
- if ($this->config === null) {
- return $this->enable_filter;
- }
-
- return $this->config->getContentFiltering($this->enable_filter);
+ $this->itemPostProcessor->unregister('PicoFeed\Processor\ContentFilterProcessor');
+ return $this;
}
/**
@@ -392,8 +326,14 @@ abstract class Parser
*/
public function enableContentGrabber($needs_rule_file = false)
{
- $this->enable_grabber = true;
- $this->grabber_needs_rule_file = $needs_rule_file;
+ $processor = new ScraperProcessor($this->config);
+
+ if ($needs_rule_file) {
+ $processor->getScraper()->disableCandidateParser();
+ }
+
+ $this->itemPostProcessor->register($processor);
+ return $this;
}
/**
@@ -405,7 +345,8 @@ abstract class Parser
*/
public function setGrabberIgnoreUrls(array $urls)
{
- $this->grabber_ignore_urls = $urls;
+ // getProcessor() returns null until enableContentGrabber() has registered
+ // a ScraperProcessor; guard so that calling this first is not a fatal error.
+ $processor = $this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor');
+
+ if ($processor !== null) {
+ $processor->ignoreUrls($urls);
+ }
+
+ return $this;
}
/**
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php
index dd2aa7a..315c7db 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php
@@ -149,7 +149,7 @@ class Rss10 extends Parser
$date = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
- $feed->date = $this->date->getDateTime((string) current($date));
+ $feed->date = $this->getDateParser()->getDateTime((string) current($date));
}
/**
@@ -163,7 +163,7 @@ class Rss10 extends Parser
{
$date = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);
- $item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date));
+ $item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
}
/**
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php
index 005691f..b265656 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php
@@ -139,11 +139,11 @@ class Rss20 extends Parser
$publish_date = XmlParser::getXPathResult($xml, 'channel/pubDate');
$update_date = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');
- $published = !empty($publish_date) ? $this->date->getDateTime((string) current($publish_date)) : null;
- $updated = !empty($update_date) ? $this->date->getDateTime((string) current($update_date)) : null;
+ $published = !empty($publish_date) ? $this->getDateParser()->getDateTime((string) current($publish_date)) : null;
+ $updated = !empty($update_date) ? $this->getDateParser()->getDateTime((string) current($update_date)) : null;
if ($published === null && $updated === null) {
- $feed->date = $this->date->getCurrentDateTime(); // We use the current date if there is no date for the feed
+ $feed->date = $this->getDateParser()->getCurrentDateTime(); // We use the current date if there is no date for the feed
} elseif ($published !== null && $updated !== null) {
$feed->date = max($published, $updated); // We use the most recent date between published and updated
} else {
@@ -162,7 +162,7 @@ class Rss20 extends Parser
{
$date = XmlParser::getXPathResult($entry, 'pubDate');
- $item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date));
+ $item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
}
/**
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php
index 7c8ebc6..6ed5a48 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php
@@ -38,7 +38,7 @@ class XmlParser
*
* @param string $input XML content
*
- * @return \DOMNDocument
+ * @return \DOMDocument
*/
public static function getDomDocument($input)
{
@@ -60,10 +60,10 @@ class XmlParser
* Small wrapper around ZendXml to turn their exceptions into picoFeed
* exceptions
* @param $input the xml to load
- * @param $dom pass in a dom document or use null/omit if simpleXml should
+ * @param $dom pass in a dom document or use null/omit if simpleXml should
* be used
*/
- private static function scan($input, $dom=null)
+ private static function scan($input, $dom = null)
{
try {
return Security::scan($input, $dom);
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php
new file mode 100644
index 0000000..9b7ddcc
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php
@@ -0,0 +1,37 @@
+config->getContentFiltering(true)) {
+ $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
+ $filter->setConfig($this->config);
+ $item->setContent($filter->execute());
+ } else {
+ Logger::setMessage(get_called_class().': Content filtering disabled');
+ }
+ }
+}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php
new file mode 100644
index 0000000..49adf9c
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php
@@ -0,0 +1,49 @@
+generators as $generator) {
+ $className = '\PicoFeed\Generator\\'.ucfirst($generator).'ContentGenerator';
+ $object = new $className($this->config);
+
+ if ($object->execute($item)) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php
new file mode 100644
index 0000000..ff9740b
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php
@@ -0,0 +1,84 @@
+processors as $processor) {
+ if ($processor->execute($feed, $item)) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Register a new Item post-processor
+ *
+ * Processors are stored under their class name, so registering a second
+ * instance of the same class replaces the previous one.
+ *
+ * @access public
+ * @param ItemProcessorInterface $processor
+ * @return ItemPostProcessor
+ */
+ public function register(ItemProcessorInterface $processor)
+ {
+ $this->processors[get_class($processor)] = $processor;
+ return $this;
+ }
+
+ /**
+ * Remove the processor stored under the given class name
+ *
+ * Does nothing when no processor is stored under that name.
+ *
+ * @access public
+ * @param string $class Key in $this->processors (the processor class name)
+ * @return ItemPostProcessor
+ */
+ public function unregister($class)
+ {
+ if (isset($this->processors[$class])) {
+ unset($this->processors[$class]);
+ }
+
+ return $this;
+ }
+
+ /**
+ * Get the processor stored under the given class name
+ *
+ * @access public
+ * @param string $class Key in $this->processors (the processor class name)
+ * @return ItemProcessorInterface|null Null when no processor is stored under that name
+ */
+ public function getProcessor($class)
+ {
+ return isset($this->processors[$class]) ? $this->processors[$class] : null;
+ }
+}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php
new file mode 100644
index 0000000..5d53226
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php
@@ -0,0 +1,25 @@
+getUrl(), $this->ignoredUrls)) {
+ $scraper = $this->getScraper();
+ $scraper->setUrl($item->getUrl());
+ $scraper->execute();
+
+ if ($scraper->hasRelevantContent()) {
+ $item->setContent($scraper->getFilteredContent());
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Set the list of URLs to ignore (replaces any previously set list)
+ *
+ * @access public
+ * @param array $urls
+ * @return $this
+ */
+ public function ignoreUrls(array $urls)
+ {
+ $this->ignoredUrls = $urls;
+ return $this;
+ }
+
+ /**
+ * Returns the Scraper instance
+ *
+ * The Scraper is created with $this->config on the first call and the same
+ * object is returned on later calls.
+ *
+ * @access public
+ * @return Scraper
+ */
+ public function getScraper()
+ {
+ if ($this->scraper === null) {
+ $this->scraper = new Scraper($this->config);
+ }
+
+ return $this->scraper;
+ }
+}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php
index 53c5cf7..09feb49 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php
@@ -2,11 +2,11 @@
namespace PicoFeed\Reader;
-use DOMXpath;
+use DOMXPath;
+use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url;
-use PicoFeed\Config\Config;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
@@ -17,7 +17,7 @@ use PicoFeed\Parser\XmlParser;
*
* @author Frederic Guillot
*/
-class Favicon
+class Favicon extends Base
{
/**
* Valid types for favicon (supported by browsers).
@@ -33,13 +33,6 @@ class Favicon
'image/svg+xml'
);
- /**
- * Config class instance.
- *
- * @var \PicoFeed\Config\Config
- */
- private $config;
-
/**
* Icon binary content.
*
@@ -54,16 +47,6 @@ class Favicon
*/
private $content_type = '';
- /**
- * Constructor.
- *
- * @param \PicoFeed\Config\Config $config Config class instance
- */
- public function __construct(Config $config = null)
- {
- $this->config = $config ?: new Config();
- }
-
/**
* Get the icon file content (available only after the download).
*
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php
index cfe5171..7b26dea 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php
@@ -3,7 +3,7 @@
namespace PicoFeed\Reader;
use DOMXPath;
-use PicoFeed\Config\Config;
+use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\Url;
use PicoFeed\Logging\Logger;
@@ -14,7 +14,7 @@ use PicoFeed\Parser\XmlParser;
*
* @author Frederic Guillot
*/
-class Reader
+class Reader extends Base
{
/**
* Feed formats for detection.
@@ -29,24 +29,6 @@ class Reader
'Rss10' => '//rdf',
);
- /**
- * Config class instance.
- *
- * @var \PicoFeed\Config\Config
- */
- private $config;
-
- /**
- * Constructor.
- *
- * @param \PicoFeed\Config\Config $config Config class instance
- */
- public function __construct(Config $config = null)
- {
- $this->config = $config ?: new Config();
- Logger::setTimezone($this->config->getTimezone());
- }
-
/**
* Download a feed (no discovery).
*
@@ -163,7 +145,6 @@ class Reader
$parser = new $className($content, $encoding, $url);
$parser->setHashAlgo($this->config->getParserHashAlgo());
- $parser->setTimezone($this->config->getTimezone());
$parser->setConfig($this->config);
return $parser;
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php
new file mode 100644
index 0000000..cc5d83c
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php
@@ -0,0 +1,11 @@
+ array(
+ '%.*%' => array(
+ 'test_url' => 'http://eliascarpe.over-blog.com/2015/12/re-upload-projets-d-avenir.html',
+ 'body' => array(
+ '//div[contains(concat(" ", normalize-space(@class), " "), " ob-section ")]',
+ ),
+ )
+ )
+);
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php
new file mode 100644
index 0000000..19bcbde
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php
@@ -0,0 +1,13 @@
+ array(
+ '%.*%' => array(
+ 'test_url' => 'http://encyclopedie.naheulbeuk.com/article.php3?id_article=352',
+ 'body' => array(
+ '//td//h1[@class="titre-texte"]',
+ '//td//div[@class="surtitre"]',
+ '//td//div[@class="texte"]',
+ ),
+ )
+ ),
+);
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/greekculture.com.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/greekculture.com.php
deleted file mode 100644
index 9410de9..0000000
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/greekculture.com.php
+++ /dev/null
@@ -1,12 +0,0 @@
- array(
- '%/joyoftech/.*%' => array(
- 'body' => array(
- '//img[@width="640"]',
- ),
- 'test_url' => 'http://www.geekculture.com/joyoftech/joyarchives/2235.html',
- ),
- ),
-);
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/news.sciencemag.org.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/news.sciencemag.org.php
deleted file mode 100644
index 9b572ef..0000000
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/news.sciencemag.org.php
+++ /dev/null
@@ -1,18 +0,0 @@
- array(
- '%.*%' => array(
- 'test_url' => 'http://news.sciencemag.org/biology/2015/09/genetic-engineering-turns-common-plant-cancer-fighter',
- 'body' => array(
- '//div[@class="content"]',
- ),
- 'strip' => array(
- '//h1[@class="snews-article__headline"]',
- '//div[contains(@class,"easy_social_box")]',
- '//div[@class="author-teaser"]',
- '//div[@class="article-byline"]',
- ),
- ),
- )
-);
-
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php
new file mode 100644
index 0000000..2280b66
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php
@@ -0,0 +1,20 @@
+ array(
+ '%.*%' => array(
+ 'test_url' => 'http://www.rugbyrama.fr/rugby/top-14/2015-2016/top-14-hayman-coupe-du-monde-finale-2012-lutte.-voici-levan-chilachava-toulon_sto5283863/story.shtml',
+ 'body' => array(
+ '//div[@class="story-simple-content"]',
+ ),
+ 'strip' => array(
+ '//script',
+ '//form',
+ '//style',
+ '//*[@class="share-buttons"]',
+ '//*[@class="show-mobile-block"]',
+ '//*[@class="hide-desktop"]',
+ '//*[@id="tracking_img"]',
+ )
+ )
+ )
+);
\ No newline at end of file
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php
index 67e3253..f7ec0d8 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php
@@ -4,8 +4,7 @@ return array(
'%.*%' => array(
'test_url' => 'http://www.franceculture.fr/emission-culture-eco-la-finance-aime-toujours-la-france-2016-01-08',
'body' => array(
- '//div[@class="heading"]/*/*/div[contains(@class,"player-inline")]',
- '//article/div[@class="text-zone"]',
+ '//div[@class="text-zone"]',
),
'strip' => array(
'//ul[@class="tags"]',
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php
new file mode 100644
index 0000000..5f5e987
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php
@@ -0,0 +1,11 @@
+ array(
+ '%.*%' => array(
+ 'test_url' => 'http://www.monsieur-le-chien.fr/index.php?planche=672',
+ 'body' => array(
+ '//img[starts-with(@src, "i/planches/")]',
+ ),
+ )
+ )
+);
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php
new file mode 100644
index 0000000..3d34857
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php
@@ -0,0 +1,16 @@
+ array(
+ '%.*%' => array(
+ 'test_url' => 'http://www.sciencemag.org/news/2016/01/could-bright-foamy-wak', // FIXME: URL appears truncated mid-slug; restore the full article path
+ 'body' => array(
+ '//div[@class="row--hero"]',
+ '//article[contains(@class,"primary")]',
+ ),
+ 'strip' => array(
+ '//header[@class="article__header"]',
+ '//footer[@class="article__foot"]',
+ ),
+ ),
+ )
+);
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php
index 0cffbf6..6650682 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php
@@ -2,8 +2,8 @@
namespace PicoFeed\Scraper;
+use PicoFeed\Base;
use PicoFeed\Logging\Logger;
-use PicoFeed\Config\Config;
/**
* RuleLoader class.
@@ -11,25 +11,8 @@ use PicoFeed\Config\Config;
* @author Frederic Guillot
* @author Bernhard Posselt
*/
-class RuleLoader
+class RuleLoader extends Base
{
- /**
- * Config object.
- *
- * @var \PicoFeed\Config\Config
- */
- private $config;
-
- /**
- * Constructor.
- *
- * @param \PicoFeed\Config\Config $config Config class instance
- */
- public function __construct(Config $config)
- {
- $this->config = $config;
- }
-
/**
* Get the rules for an URL.
*
@@ -111,12 +94,14 @@ class RuleLoader
*/
public function getRulesFolders()
{
- $folders = array(__DIR__.'/../Rules');
+ $folders = array(); // start empty so the bundled folder can be appended last
if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
$folders[] = $this->config->getGrabberRulesFolder();
}
+ $folders[] = __DIR__ . '/../Rules'; // bundled rules come after any configured folder
+
return $folders;
}
}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php
index f1d1222..980a88d 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php
@@ -2,10 +2,10 @@
namespace PicoFeed\Scraper;
+use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url;
-use PicoFeed\Config\Config;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
@@ -16,7 +16,7 @@ use PicoFeed\Parser\XmlParser;
*
* @author Frederic Guillot
*/
-class Scraper
+class Scraper extends Base
{
/**
* URL.
@@ -53,24 +53,6 @@ class Scraper
*/
private $enableCandidateParser = true;
- /**
- * Config object.
- *
- * @var \PicoFeed\Config\Config
- */
- private $config;
-
- /**
- * Constructor.
- *
- * @param \PicoFeed\Config\Config $config Config class instance
- */
- public function __construct(Config $config)
- {
- $this->config = $config;
- Logger::setTimezone($this->config->getTimezone());
- }
-
/**
* Disable candidates parsing.
*
@@ -79,7 +61,6 @@ class Scraper
public function disableCandidateParser()
{
$this->enableCandidateParser = false;
-
return $this;
}
@@ -227,47 +208,21 @@ class Scraper
*/
public function execute()
{
+ $this->content = ''; // clear content, raw HTML and encoding before downloading
+ $this->html = '';
+ $this->encoding = '';
+
$this->download();
+ $this->prepareHtml(); // called directly after download(), with no skip check in between
- if (!$this->skipProcessing()) {
- $this->prepareHtml();
+ $parser = $this->getParser();
- $parser = $this->getParser();
-
- if ($parser !== null) {
- $this->content = $parser->execute();
- Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
- }
+ if ($parser !== null) {
+ $this->content = $parser->execute();
+ Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
}
}
- /**
- * Returns true if the parsing must be skipped.
- *
- * @return bool
- */
- public function skipProcessing()
- {
- $handlers = array(
- 'detectStreamingVideos',
- 'detectPdfFiles',
- );
-
- foreach ($handlers as $handler) {
- if ($this->$handler()) {
- return true;
- }
- }
-
- if (empty($this->html)) {
- Logger::setMessage(get_called_class().': Raw HTML is empty');
-
- return true;
- }
-
- return false;
- }
-
/**
* Get the parser.
*
@@ -287,17 +242,14 @@ class Scraper
if (preg_match($pattern, $sub_url)) {
Logger::setMessage(get_called_class().': Matched url '.$sub_url);
-
return new RuleParser($this->html, $rule);
}
}
} elseif ($this->enableCandidateParser) {
Logger::setMessage(get_called_class().': Parse content with candidates');
-
- return new CandidateParser($this->html);
}
- return;
+ return new CandidateParser($this->html);
}
/**
@@ -312,30 +264,4 @@ class Scraper
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
}
-
- /**
- * Return the Youtube embed player and skip processing.
- *
- * @return bool
- */
- public function detectStreamingVideos()
- {
- if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
- $this->content = '';
-
- return true;
- }
-
- return false;
- }
-
- /**
- * Skip processing for PDF documents.
- *
- * @return bool
- */
- public function detectPdfFiles()
- {
- return substr($this->url, -3) === 'pdf';
- }
}