diff --git a/composer.json b/composer.json index 5bee794..9d45006 100644 --- a/composer.json +++ b/composer.json @@ -15,7 +15,7 @@ "fguillot/simple-validator": "v1.0.0", "fguillot/json-rpc": "v1.0.2", "fguillot/picodb": "v1.0.2", - "fguillot/picofeed": "v0.1.19" + "fguillot/picofeed": "v0.1.20" }, "require-dev": { "phpunit/phpunit": "4.8.3", diff --git a/vendor/composer/autoload_classmap.php b/vendor/composer/autoload_classmap.php index cca7d49..ecabc01 100644 --- a/vendor/composer/autoload_classmap.php +++ b/vendor/composer/autoload_classmap.php @@ -25,6 +25,7 @@ return array( 'PicoDb\\SQLException' => $vendorDir . '/fguillot/picodb/lib/PicoDb/SQLException.php', 'PicoDb\\Schema' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Schema.php', 'PicoDb\\Table' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Table.php', + 'PicoFeed\\Base' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Base.php', 'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php', 'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php', 'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php', @@ -42,6 +43,9 @@ return array( 'PicoFeed\\Filter\\Filter' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php', 'PicoFeed\\Filter\\Html' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Html.php', 'PicoFeed\\Filter\\Tag' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php', + 'PicoFeed\\Generator\\ContentGeneratorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php', + 'PicoFeed\\Generator\\FileContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/FileContentGenerator.php', + 'PicoFeed\\Generator\\YoutubeContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php', 'PicoFeed\\Logging\\Logger' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Logging/Logger.php', 'PicoFeed\\Parser\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php', 'PicoFeed\\Parser\\DateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php', @@ -57,6 +61,11 @@ return array( 'PicoFeed\\Parser\\XmlEntityException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlEntityException.php', 'PicoFeed\\Parser\\XmlParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php', 'PicoFeed\\PicoFeedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/PicoFeedException.php', + 'PicoFeed\\Processor\\ContentFilterProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php', + 'PicoFeed\\Processor\\ContentGeneratorProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php', + 'PicoFeed\\Processor\\ItemPostProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php', + 'PicoFeed\\Processor\\ItemProcessorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php', + 'PicoFeed\\Processor\\ScraperProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ScraperProcessor.php', 'PicoFeed\\Reader\\Favicon' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php', 'PicoFeed\\Reader\\Reader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php', 'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php', diff --git a/vendor/composer/installed.json b/vendor/composer/installed.json index f72aa48..d65a909 100644 --- a/vendor/composer/installed.json +++ b/vendor/composer/installed.json @@ -163,17 +163,17 @@ }, { "name": "fguillot/picofeed", - "version": "v0.1.19", - "version_normalized": "0.1.19.0", + "version": "v0.1.20", + "version_normalized": "0.1.20.0", "source": { "type": "git", "url": "https://github.com/fguillot/picoFeed.git", - "reference": "c270ef4474a2460d857f99c84612025c5f9975f2" + "reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/c270ef4474a2460d857f99c84612025c5f9975f2", - "reference": "c270ef4474a2460d857f99c84612025c5f9975f2", + "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/d6bbdd248fa4a3eef7831ffaae0491a2ea58f897", + "reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897", "shasum": "" }, "require": { @@ -188,7 +188,7 @@ "suggest": { "ext-curl": "PicoFeed will use cURL if present" }, - "time": "2016-02-11 19:52:02", + "time": "2016-03-24 12:09:56", "bin": [ "picofeed" ], diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Base.php b/vendor/fguillot/picofeed/lib/PicoFeed/Base.php new file mode 100644 index 0000000..4be0985 --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Base.php @@ -0,0 +1,34 @@ +config = $config ?: new Config(); + Logger::setTimezone($this->config->getTimezone()); + } +} diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php index 34e21dc..5fd8d6d 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php @@ -2,24 +2,17 @@ namespace PicoFeed\Filter; -use DOMXpath; +use DOMXPath; +use PicoFeed\Base; use PicoFeed\Parser\XmlParser; -use PicoFeed\Config\Config; /** * Tag Filter class. * * @author Frederic Guillot */ -class Tag +class Tag extends Base { - /** - * Config object. - * - * @var \PicoFeed\Config\Config - */ - private $config; - /** * Tags blacklist (Xpath expressions). * @@ -76,11 +69,6 @@ class Tag 'q', ); - public function __construct(Config $config) - { - $this->config = $config; - } - /** * Check if the tag is allowed and is not a pixel tracker. * diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php new file mode 100644 index 0000000..5c2f205 --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php @@ -0,0 +1,23 @@ +extensions as $extension) { + if (substr($item->getUrl(), - strlen($extension)) === $extension) { + $item->setContent(''.$item->getUrl().''); + return true; + } + } + + return false; + } +} diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php new file mode 100644 index 0000000..198090d --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php @@ -0,0 +1,67 @@ +hasNamespace('yt')) { + return $this->generateHtmlFromXml($item); + } + + return $this->generateHtmlFromUrl($item); + } + + /** + * Generate HTML + * + * @access public + * @param Item $item + * @return boolean + */ + private function generateHtmlFromXml(Item $item) + { + $videoId = $item->getTag('yt:videoId'); + + if (! empty($videoId)) { + $item->setContent(''); + return true; + } + + return false; + } + + /** + * Generate HTML from item URL + * + * @access public + * @param Item $item + * @return bool + */ + public function generateHtmlFromUrl(Item $item) + { + if (preg_match('/youtube\.com\/watch\?v=(.*)/', $item->getUrl(), $matches)) { + $item->setContent(''); + return true; + } + + return false; + } +} diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php index 356453c..6325923 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php @@ -150,7 +150,7 @@ class Atom extends Parser $updated = XmlParser::getXPathResult($xml, 'atom:updated', $this->namespaces) ?: XmlParser::getXPathResult($xml, 'updated'); - $feed->date = $this->date->getDateTime((string) current($updated)); + $feed->date = $this->getDateParser()->getDateTime((string) current($updated)); } /** @@ -168,8 +168,8 @@ class Atom extends Parser $updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces) ?: XmlParser::getXPathResult($entry, 'updated'); - $published = !empty($published) ? $this->date->getDateTime((string) current($published)) : null; - $updated = !empty($updated) ? $this->date->getDateTime((string) current($updated)) : null; + $published = !empty($published) ? $this->getDateParser()->getDateTime((string) current($published)) : null; + $updated = !empty($updated) ? $this->getDateParser()->getDateTime((string) current($updated)) : null; if ($published === null && $updated === null) { $item->date = $feed->getDate(); // We use the feed date if there is no date for the item diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php index e4d08b5..4ad0078 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php @@ -4,20 +4,22 @@ namespace PicoFeed\Parser; use DateTime; use DateTimeZone; +use PicoFeed\Base; /** * Date Parser. * * @author Frederic Guillot */ -class DateParser +class DateParser extends Base { /** * Timezone used to parse feed dates. * + * @access private * @var string */ - public $timezone = 'UTC'; + private $timezone = 'UTC'; /** * Supported formats [ 'format' => length ]. @@ -88,7 +90,7 @@ class DateParser */ public function getValidDate($format, $value) { - $date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone)); + $date = DateTime::createFromFormat($format, $value, $this->getTimeZone()); if ($date !== false) { $errors = DateTime::getLastErrors(); @@ -108,6 +110,17 @@ class DateParser */ public function getCurrentDateTime() { - return new DateTime('now', new DateTimeZone($this->timezone)); + return new DateTime('now', $this->getTimeZone()); + } + + /** + * Get DateTimeZone instance + * + * @access public + * @return DateTimeZone + */ + public function getTimeZone() + { + return new DateTimeZone($this->config->getTimezone() ?: $this->timezone); } } diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php index 22d7c59..34e557a 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php @@ -102,6 +102,18 @@ class Item */ public $namespaces = array(); + /** + * Check if a XML namespace exists + * + * @access public + * @param string $namespace + * @return bool + */ + public function hasNamespace($namespace) + { + return array_key_exists($namespace, $this->namespaces); + } + /** * Get specific XML tag or attribute value. * @@ -112,12 +124,10 @@ class Item */ public function getTag($tag, $attribute = '') { - // convert to xPath attribute query if ($attribute !== '') { $attribute = '/@'.$attribute; } - // construct query $query = './/'.$tag.$attribute; $elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces); @@ -155,13 +165,29 @@ class Item } /** - * Get url. + * Get URL + * + * @access public + * @return string */ public function getUrl() { return $this->url; } + /** + * Set URL + * + * @access public + * @param string $url + * @return Item + */ + public function setUrl($url) + { + $this->url = $url; + return $this; + } + /** * Get id. */ @@ -186,6 +212,19 @@ class Item return $this->content; } + /** + * Set content + * + * @access public + * @param string $value + * @return Item + */ + public function setContent($value) + { + $this->content = $value; + return $this; + } + /** * Get enclosure url. */ diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php index 5130b68..433f21a 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php @@ -2,12 +2,15 @@ namespace PicoFeed\Parser; +use PicoFeed\Processor\ContentFilterProcessor; +use PicoFeed\Processor\ContentGeneratorProcessor; +use PicoFeed\Processor\ItemPostProcessor; +use PicoFeed\Processor\ScraperProcessor; use SimpleXMLElement; use PicoFeed\Client\Url; use PicoFeed\Encoding\Encoding; use PicoFeed\Filter\Filter; use PicoFeed\Logging\Logger; -use PicoFeed\Scraper\Scraper; /** * Base parser class. @@ -28,7 +31,7 @@ abstract class Parser * * @var \PicoFeed\Parser\DateParser */ - protected $date; + private $dateParser; /** * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos(). @@ -66,32 +69,12 @@ abstract class Parser protected $used_namespaces = array(); /** - * Enable the content filtering. + * Item Post Processor instance * - * @var bool + * @access private + * @var ItemPostProcessor */ - private $enable_filter = true; - - /** - * Enable the content grabber. - * - * @var bool - */ - private $enable_grabber = false; - - /** - * Enable the content grabber on all pages. - * - * @var bool - */ - private $grabber_needs_rule_file = false; - - /** - * Ignore those urls for the content scraper. - * - * @var array - */ - private $grabber_ignore_urls = array(); + private $itemPostProcessor; /** * Constructor. @@ -102,7 +85,6 @@ abstract class Parser */ public function __construct($content, $http_encoding = '', $fallback_url = '') { - $this->date = new DateParser(); $this->fallback_url = $fallback_url; $xml_encoding = XmlParser::getEncodingFromXmlTag($content); @@ -112,6 +94,10 @@ abstract class Parser // Encode everything in UTF-8 Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding); + + $this->itemPostProcessor = new ItemPostProcessor($this->config); + $this->itemPostProcessor->register(new ContentGeneratorProcessor($this->config)); + $this->itemPostProcessor->register(new ContentFilterProcessor($this->config)); } /** @@ -173,15 +159,11 @@ abstract class Parser // Id generation can use the item url/title/content (order is important) $this->findItemId($entry, $item, $feed); - $this->findItemDate($entry, $item, $feed); $this->findItemEnclosure($entry, $item, $feed); $this->findItemLanguage($entry, $item, $feed); - // Order is important (avoid double filtering) - $this->filterItemContent($feed, $item); - $this->scrapWebsite($item); - + $this->itemPostProcessor->execute($feed, $item); $feed->items[] = $item; } @@ -230,43 +212,29 @@ abstract class Parser } /** - * Fetch item content with the content grabber. + * Get Item Post Processor instance * - * @param Item $item Item object + * @access public + * @return ItemPostProcessor */ - public function scrapWebsite(Item $item) + public function getItemPostProcessor() { - if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) { - $grabber = new Scraper($this->config); - $grabber->setUrl($item->getUrl()); - - if ($this->grabber_needs_rule_file) { - $grabber->disableCandidateParser(); - } - - $grabber->execute(); - - if ($grabber->hasRelevantContent()) { - $item->content = $grabber->getFilteredContent(); - } - } + return $this->itemPostProcessor; } /** - * Filter HTML for entry content. + * Get DateParser instance * - * @param Feed $feed Feed object - * @param Item $item Item object + * @access public + * @return DateParser */ - public function filterItemContent(Feed $feed, Item $item) + public function getDateParser() { - if ($this->isFilteringEnabled()) { - $filter = Filter::html($item->getContent(), $feed->getSiteUrl()); - $filter->setConfig($this->config); - $item->content = $filter->execute(); - } else { - Logger::setMessage(get_called_class().': Content filtering disabled'); + if ($this->dateParser === null) { + return new DateParser($this->config); } + + return $this->dateParser; } /** @@ -316,31 +284,11 @@ abstract class Parser * Set Hash algorithm used for id generation. * * @param string $algo Algorithm name - * * @return \PicoFeed\Parser\Parser */ public function setHashAlgo($algo) { $this->hash_algo = $algo ?: $this->hash_algo; - - return $this; - } - - /** - * Set a different timezone. - * - * @see http://php.net/manual/en/timezones.php - * - * @param string $timezone Timezone - * - * @return \PicoFeed\Parser\Parser - */ - public function setTimezone($timezone) - { - if ($timezone) { - $this->date->timezone = $timezone; - } - return $this; } @@ -354,7 +302,6 @@ abstract class Parser public function setConfig($config) { $this->config = $config; - return $this; } @@ -365,21 +312,8 @@ abstract class Parser */ public function disableContentFiltering() { - $this->enable_filter = false; - } - - /** - * Return true if the content filtering is enabled. - * - * @return bool - */ - public function isFilteringEnabled() - { - if ($this->config === null) { - return $this->enable_filter; - } - - return $this->config->getContentFiltering($this->enable_filter); + $this->itemPostProcessor->unregister('PicoFeed\Processor\ContentFilterProcessor'); + return $this; } /** @@ -392,8 +326,14 @@ abstract class Parser */ public function enableContentGrabber($needs_rule_file = false) { - $this->enable_grabber = true; - $this->grabber_needs_rule_file = $needs_rule_file; + $processor = new ScraperProcessor($this->config); + + if ($needs_rule_file) { + $processor->getScraper()->disableCandidateParser(); + } + + $this->itemPostProcessor->register($processor); + return $this; } /** @@ -405,7 +345,8 @@ abstract class Parser */ public function setGrabberIgnoreUrls(array $urls) { - $this->grabber_ignore_urls = $urls; + $this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor')->ignoreUrls($urls); + return $this; } /** diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php index dd2aa7a..315c7db 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss10.php @@ -149,7 +149,7 @@ class Rss10 extends Parser $date = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces) ?: XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces); - $feed->date = $this->date->getDateTime((string) current($date)); + $feed->date = $this->getDateParser()->getDateTime((string) current($date)); } /** @@ -163,7 +163,7 @@ class Rss10 extends Parser { $date = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces); - $item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date)); + $item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date)); } /** diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php index 005691f..b265656 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Rss20.php @@ -139,11 +139,11 @@ class Rss20 extends Parser $publish_date = XmlParser::getXPathResult($xml, 'channel/pubDate'); $update_date = XmlParser::getXPathResult($xml, 'channel/lastBuildDate'); - $published = !empty($publish_date) ? $this->date->getDateTime((string) current($publish_date)) : null; - $updated = !empty($update_date) ? $this->date->getDateTime((string) current($update_date)) : null; + $published = !empty($publish_date) ? $this->getDateParser()->getDateTime((string) current($publish_date)) : null; + $updated = !empty($update_date) ? $this->getDateParser()->getDateTime((string) current($update_date)) : null; if ($published === null && $updated === null) { - $feed->date = $this->date->getCurrentDateTime(); // We use the current date if there is no date for the feed + $feed->date = $this->getDateParser()->getCurrentDateTime(); // We use the current date if there is no date for the feed } elseif ($published !== null && $updated !== null) { $feed->date = max($published, $updated); // We use the most recent date between published and updated } else { @@ -162,7 +162,7 @@ class Rss20 extends Parser { $date = XmlParser::getXPathResult($entry, 'pubDate'); - $item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date)); + $item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date)); } /** diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php index 7c8ebc6..6ed5a48 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php @@ -38,7 +38,7 @@ class XmlParser * * @param string $input XML content * - * @return \DOMNDocument + * @return \DOMDocument */ public static function getDomDocument($input) { @@ -60,10 +60,10 @@ class XmlParser * Small wrapper around ZendXml to turn their exceptions into picoFeed * exceptions * @param $input the xml to load - * @param $dom pass in a dom document or use null/omit if simpleXml should + * @param $dom pass in a dom document or use null/omit if simpleXml should * be used */ - private static function scan($input, $dom=null) + private static function scan($input, $dom = null) { try { return Security::scan($input, $dom); diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php new file mode 100644 index 0000000..9b7ddcc --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php @@ -0,0 +1,37 @@ +config->getContentFiltering(true)) { + $filter = Filter::html($item->getContent(), $feed->getSiteUrl()); + $filter->setConfig($this->config); + $item->setContent($filter->execute()); + } else { + Logger::setMessage(get_called_class().': Content filtering disabled'); + } + } +} diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php new file mode 100644 index 0000000..49adf9c --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php @@ -0,0 +1,49 @@ +generators as $generator) { + $className = '\PicoFeed\Generator\\'.ucfirst($generator).'ContentGenerator'; + $object = new $className($this->config); + + if ($object->execute($item)) { + return true; + } + } + + return false; + } +} diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php new file mode 100644 index 0000000..ff9740b --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php @@ -0,0 +1,84 @@ +processors as $processor) { + if ($processor->execute($feed, $item)) { + return true; + } + } + + return false; + } + + /** + * Register a new Item post-processor + * + * @access public + * @param ItemProcessorInterface $processor + * @return ItemPostProcessor + */ + public function register(ItemProcessorInterface $processor) + { + $this->processors[get_class($processor)] = $processor; + return $this; + } + + /** + * Remove Processor instance + * + * @access public + * @param string $class + * @return ItemPostProcessor + */ + public function unregister($class) + { + if (isset($this->processors[$class])) { + unset($this->processors[$class]); + } + + return $this; + } + + /** + * Get Processor instance + * + * @access public + * @param string $class + * @return ItemProcessorInterface|null + */ + public function getProcessor($class) + { + return isset($this->processors[$class]) ? $this->processors[$class] : null; + } +} diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php new file mode 100644 index 0000000..5d53226 --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php @@ -0,0 +1,25 @@ +getUrl(), $this->ignoredUrls)) { + $scraper = $this->getScraper(); + $scraper->setUrl($item->getUrl()); + $scraper->execute(); + + if ($scraper->hasRelevantContent()) { + $item->setContent($scraper->getFilteredContent()); + } + } + + return false; + } + + /** + * Ignore list of URLs + * + * @access public + * @param array $urls + * @return $this + */ + public function ignoreUrls(array $urls) + { + $this->ignoredUrls = $urls; + return $this; + } + + /** + * Returns Scraper instance + * + * @access public + * @return Scraper + */ + public function getScraper() + { + if ($this->scraper === null) { + $this->scraper = new Scraper($this->config); + } + + return $this->scraper; + } +} diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php index 53c5cf7..09feb49 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php @@ -2,11 +2,11 @@ namespace PicoFeed\Reader; -use DOMXpath; +use DOMXPath; +use PicoFeed\Base; use PicoFeed\Client\Client; use PicoFeed\Client\ClientException; use PicoFeed\Client\Url; -use PicoFeed\Config\Config; use PicoFeed\Logging\Logger; use PicoFeed\Parser\XmlParser; @@ -17,7 +17,7 @@ use PicoFeed\Parser\XmlParser; * * @author Frederic Guillot */ -class Favicon +class Favicon extends Base { /** * Valid types for favicon (supported by browsers). @@ -33,13 +33,6 @@ class Favicon 'image/svg+xml' ); - /** - * Config class instance. - * - * @var \PicoFeed\Config\Config - */ - private $config; - /** * Icon binary content. * @@ -54,16 +47,6 @@ class Favicon */ private $content_type = ''; - /** - * Constructor. - * - * @param \PicoFeed\Config\Config $config Config class instance - */ - public function __construct(Config $config = null) - { - $this->config = $config ?: new Config(); - } - /** * Get the icon file content (available only after the download). * diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php index cfe5171..7b26dea 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php @@ -3,7 +3,7 @@ namespace PicoFeed\Reader; use DOMXPath; -use PicoFeed\Config\Config; +use PicoFeed\Base; use PicoFeed\Client\Client; use PicoFeed\Client\Url; use PicoFeed\Logging\Logger; @@ -14,7 +14,7 @@ use PicoFeed\Parser\XmlParser; * * @author Frederic Guillot */ -class Reader +class Reader extends Base { /** * Feed formats for detection. @@ -29,24 +29,6 @@ class Reader 'Rss10' => '//rdf', ); - /** - * Config class instance. - * - * @var \PicoFeed\Config\Config - */ - private $config; - - /** - * Constructor. - * - * @param \PicoFeed\Config\Config $config Config class instance - */ - public function __construct(Config $config = null) - { - $this->config = $config ?: new Config(); - Logger::setTimezone($this->config->getTimezone()); - } - /** * Download a feed (no discovery). * @@ -163,7 +145,6 @@ class Reader $parser = new $className($content, $encoding, $url); $parser->setHashAlgo($this->config->getParserHashAlgo()); - $parser->setTimezone($this->config->getTimezone()); $parser->setConfig($this->config); return $parser; diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php new file mode 100644 index 0000000..cc5d83c --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php @@ -0,0 +1,11 @@ + array( + '%.*%' => array( + 'test_url' => 'http://eliascarpe.over-blog.com/2015/12/re-upload-projets-d-avenir.html', + 'body' => array( + '//div[contains(concat(" ", normalize-space(@class), " "), " ob-section ")]', + ), + ) + ) +); diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php new file mode 100644 index 0000000..19bcbde --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php @@ -0,0 +1,13 @@ + array( + '%.*%' => array( + 'test_url' => 'http://encyclopedie.naheulbeuk.com/article.php3?id_article=352', + 'body' => array( + '//td//h1[@class="titre-texte"]', + '//td//div[@class="surtitre"]', + '//td//div[@class="texte"]', + ), + ) + ), +); diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/greekculture.com.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/greekculture.com.php deleted file mode 100644 index 9410de9..0000000 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/greekculture.com.php +++ /dev/null @@ -1,12 +0,0 @@ - array( - '%/joyoftech/.*%' => array( - 'body' => array( - '//img[@width="640"]', - ), - 'test_url' => 'http://www.geekculture.com/joyoftech/joyarchives/2235.html', - ), - ), -); diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/news.sciencemag.org.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/news.sciencemag.org.php deleted file mode 100644 index 9b572ef..0000000 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/news.sciencemag.org.php +++ /dev/null @@ -1,18 +0,0 @@ - array( - '%.*%' => array( - 'test_url' => 'http://news.sciencemag.org/biology/2015/09/genetic-engineering-turns-common-plant-cancer-fighter', - 'body' => array( - '//div[@class="content"]', - ), - 'strip' => array( - '//h1[@class="snews-article__headline"]', - '//div[contains(@class,"easy_social_box")]', - '//div[@class="author-teaser"]', - '//div[@class="article-byline"]', - ), - ), - ) -); - diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php new file mode 100644 index 0000000..2280b66 --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php @@ -0,0 +1,20 @@ + array( + '%.*%' => array( + 'test_url' => 'http://www.rugbyrama.fr/rugby/top-14/2015-2016/top-14-hayman-coupe-du-monde-finale-2012-lutte.-voici-levan-chilachava-toulon_sto5283863/story.shtml', + 'body' => array( + '//div[@class="story-simple-content"]', + ), + 'strip' => array( + '//script', + '//form', + '//style', + '//*[@class="share-buttons"]', + '//*[@class="show-mobile-block"]', + '//*[@class="hide-desktop"]', + '//*[@id="tracking_img"]', + ) + ) + ) +); \ No newline at end of file diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php index 67e3253..f7ec0d8 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.franceculture.fr.php @@ -4,8 +4,7 @@ return array( '%.*%' => array( 'test_url' => 'http://www.franceculture.fr/emission-culture-eco-la-finance-aime-toujours-la-france-2016-01-08', 'body' => array( - '//div[@class="heading"]/*/*/div[contains(@class,"player-inline")]', - '//article/div[@class="text-zone"]', + '//div[@class="text-zone"]', ), 'strip' => array( '//ul[@class="tags"]', diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php new file mode 100644 index 0000000..5f5e987 --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php @@ -0,0 +1,11 @@ + array( + '%.*%' => array( + 'test_url' => 'http://www.monsieur-le-chien.fr/index.php?planche=672', + 'body' => array( + '//img[starts-with(@src, "i/planches/")]', + ), + ) + ) +); diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php new file mode 100644 index 0000000..3d34857 --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php @@ -0,0 +1,16 @@ + array( + '%.*%' => array( + 'test_url' => 'http://www.sciencemag.org/news/2016/01/could-bright-foamy-wak$ + 'body' => array( + '//div[@class="row--hero"]', + '//article[contains(@class,"primary")]', + ), + 'strip' => array( + '//header[@class="article__header"]', + '//footer[@class="article__foot"]', + ), + ), + ) +); diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php index 0cffbf6..6650682 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php @@ -2,8 +2,8 @@ namespace PicoFeed\Scraper; +use PicoFeed\Base; use PicoFeed\Logging\Logger; -use PicoFeed\Config\Config; /** * RuleLoader class. @@ -11,25 +11,8 @@ use PicoFeed\Config\Config; * @author Frederic Guillot * @author Bernhard Posselt */ -class RuleLoader +class RuleLoader extends Base { - /** - * Config object. - * - * @var \PicoFeed\Config\Config - */ - private $config; - - /** - * Constructor. - * - * @param \PicoFeed\Config\Config $config Config class instance - */ - public function __construct(Config $config) - { - $this->config = $config; - } - /** * Get the rules for an URL. * @@ -111,12 +94,14 @@ class RuleLoader */ public function getRulesFolders() { - $folders = array(__DIR__.'/../Rules'); + $folders = array(); if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) { $folders[] = $this->config->getGrabberRulesFolder(); } + $folders[] = __DIR__ . '/../Rules'; + return $folders; } } diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php index f1d1222..980a88d 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php @@ -2,10 +2,10 @@ namespace PicoFeed\Scraper; +use PicoFeed\Base; use PicoFeed\Client\Client; use PicoFeed\Client\ClientException; use PicoFeed\Client\Url; -use PicoFeed\Config\Config; use PicoFeed\Encoding\Encoding; use PicoFeed\Filter\Filter; use PicoFeed\Logging\Logger; @@ -16,7 +16,7 @@ use PicoFeed\Parser\XmlParser; * * @author Frederic Guillot */ -class Scraper +class Scraper extends Base { /** * URL. @@ -53,24 +53,6 @@ class Scraper */ private $enableCandidateParser = true; - /** - * Config object. - * - * @var \PicoFeed\Config\Config - */ - private $config; - - /** - * Constructor. - * - * @param \PicoFeed\Config\Config $config Config class instance - */ - public function __construct(Config $config) - { - $this->config = $config; - Logger::setTimezone($this->config->getTimezone()); - } - /** * Disable candidates parsing. * @@ -79,7 +61,6 @@ class Scraper public function disableCandidateParser() { $this->enableCandidateParser = false; - return $this; } @@ -227,47 +208,21 @@ class Scraper */ public function execute() { + $this->content = ''; + $this->html = ''; + $this->encoding = ''; + $this->download(); + $this->prepareHtml(); - if (!$this->skipProcessing()) { - $this->prepareHtml(); + $parser = $this->getParser(); - $parser = $this->getParser(); - - if ($parser !== null) { - $this->content = $parser->execute(); - Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes'); - } + if ($parser !== null) { + $this->content = $parser->execute(); + Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes'); } } - /** - * Returns true if the parsing must be skipped. - * - * @return bool - */ - public function skipProcessing() - { - $handlers = array( - 'detectStreamingVideos', - 'detectPdfFiles', - ); - - foreach ($handlers as $handler) { - if ($this->$handler()) { - return true; - } - } - - if (empty($this->html)) { - Logger::setMessage(get_called_class().': Raw HTML is empty'); - - return true; - } - - return false; - } - /** * Get the parser. * @@ -287,17 +242,14 @@ class Scraper if (preg_match($pattern, $sub_url)) { Logger::setMessage(get_called_class().': Matched url '.$sub_url); - return new RuleParser($this->html, $rule); } } } elseif ($this->enableCandidateParser) { Logger::setMessage(get_called_class().': Parse content with candidates'); - - return new CandidateParser($this->html); } - return; + return new CandidateParser($this->html); } /** @@ -312,30 +264,4 @@ class Scraper Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"'); } - - /** - * Return the Youtube embed player and skip processing. - * - * @return bool - */ - public function detectStreamingVideos() - { - if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) { - $this->content = ''; - - return true; - } - - return false; - } - - /** - * Skip processing for PDF documents. - * - * @return bool - */ - public function detectPdfFiles() - { - return substr($this->url, -3) === 'pdf'; - } }