From 3840a871287544c06ee7b4d9114c8b7e766120e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Tue, 20 May 2014 14:20:27 -0400 Subject: [PATCH] Update to the last version of PicoFeed --- common.php | 17 +- controllers/bookmark.php | 2 - controllers/common.php | 2 +- cronjob.php | 2 +- models/config.php | 60 ++- models/database.php | 8 - models/feed.php | 87 ++-- models/item.php | 133 +++--- models/user.php | 5 - vendor/PicoFeed/Client.php | 409 ++++++++++++++++--- vendor/PicoFeed/Clients/Curl.php | 100 ++++- vendor/PicoFeed/Clients/Stream.php | 33 +- vendor/PicoFeed/Encoding.php | 424 +++++++------------- vendor/PicoFeed/Export.php | 47 ++- vendor/PicoFeed/Feed.php | 150 +++++++ vendor/PicoFeed/Filter.php | 230 +++++++++-- vendor/PicoFeed/Grabber.php | 265 +++++++++--- vendor/PicoFeed/Import.php | 70 +++- vendor/PicoFeed/Item.php | 202 ++++++++++ vendor/PicoFeed/Logging.php | 78 +++- vendor/PicoFeed/Parser.php | 314 +++++++++++---- vendor/PicoFeed/Parsers/Atom.php | 292 ++++++++++---- vendor/PicoFeed/Parsers/Rss10.php | 146 +++---- vendor/PicoFeed/Parsers/Rss20.php | 358 +++++++++++------ vendor/PicoFeed/Parsers/Rss91.php | 12 +- vendor/PicoFeed/Parsers/Rss92.php | 12 +- vendor/PicoFeed/PicoFeed.php | 20 + vendor/PicoFeed/Reader.php | 239 ++++++----- vendor/PicoFeed/Rules/journaldugeek.com.php | 10 + vendor/PicoFeed/Writer.php | 47 ++- vendor/PicoFeed/Writers/Atom.php | 71 +++- vendor/PicoFeed/Writers/Rss20.php | 62 ++- vendor/PicoFeed/XmlParser.php | 136 +++++++ 33 files changed, 2920 insertions(+), 1123 deletions(-) create mode 100644 vendor/PicoFeed/Feed.php create mode 100644 vendor/PicoFeed/Item.php create mode 100644 vendor/PicoFeed/PicoFeed.php create mode 100644 vendor/PicoFeed/Rules/journaldugeek.com.php create mode 100644 vendor/PicoFeed/XmlParser.php diff --git a/common.php b/common.php index 65b0a45..daf62d6 100644 --- a/common.php +++ b/common.php @@ -3,8 +3,19 @@ require __DIR__.'/check_setup.php'; require 
__DIR__.'/lib/Translator.php'; require __DIR__.'/vendor/PicoDb/Database.php'; -require __DIR__.'/vendor/PicoFeed/Client.php'; -require __DIR__.'/vendor/PicoFeed/Parser.php'; +require __DIR__.'/vendor/PicoFeed/PicoFeed.php'; +require __DIR__.'/vendor/Readability/Readability.php'; + +require __DIR__.'/vendor/SimpleValidator/Validator.php'; +require __DIR__.'/vendor/SimpleValidator/Base.php'; +require __DIR__.'/vendor/SimpleValidator/Validators/Required.php'; +require __DIR__.'/vendor/SimpleValidator/Validators/Unique.php'; +require __DIR__.'/vendor/SimpleValidator/Validators/MaxLength.php'; +require __DIR__.'/vendor/SimpleValidator/Validators/MinLength.php'; +require __DIR__.'/vendor/SimpleValidator/Validators/Integer.php'; +require __DIR__.'/vendor/SimpleValidator/Validators/Equals.php'; +require __DIR__.'/vendor/SimpleValidator/Validators/AlphaNumeric.php'; + require __DIR__.'/models/config.php'; require __DIR__.'/models/user.php'; require __DIR__.'/models/feed.php'; @@ -40,8 +51,6 @@ defined('AUTO_UPDATE_DOWNLOAD_DIRECTORY') or define('AUTO_UPDATE_DOWNLOAD_DIRECT defined('AUTO_UPDATE_ARCHIVE_DIRECTORY') or define('AUTO_UPDATE_ARCHIVE_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'archive'); defined('AUTO_UPDATE_BACKUP_DIRECTORY') or define('AUTO_UPDATE_BACKUP_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'backup'); -PicoFeed\Client::proxy(PROXY_HOSTNAME, PROXY_PORT, PROXY_USERNAME, PROXY_PASSWORD); - PicoDb\Database::bootstrap('db', function() { $db = new PicoDb\Database(array( diff --git a/controllers/bookmark.php b/controllers/bookmark.php index ebd351a..9e178e4 100644 --- a/controllers/bookmark.php +++ b/controllers/bookmark.php @@ -1,7 +1,5 @@ setTimezone(get('timezone')); + + $config->setClientTimeout(HTTP_TIMEOUT); + $config->setClientUserAgent(HTTP_USER_AGENT); + $config->setGrabberUserAgent(HTTP_USER_AGENT); + + $config->setProxyHostname(PROXY_HOSTNAME); + $config->setProxyPort(PROXY_PORT); + $config->setProxyUsername(PROXY_USERNAME); + 
$config->setProxyPassword(PROXY_PASSWORD); + + $config->setFilterIframeWhitelist(get_iframe_whitelist()); + + return $config; +} + +function get_iframe_whitelist() +{ + return array( + '//www.youtube.com', + 'http://www.youtube.com', + 'https://www.youtube.com', + 'http://player.vimeo.com', + 'https://player.vimeo.com', + 'http://www.dailymotion.com', + 'https://www.dailymotion.com', + ); +} // Send a debug message to the console function debug($line) { - \PicoFeed\Logging::log($line); + Logging::setMessage($line); write_debug(); } @@ -32,14 +55,7 @@ function debug($line) function write_debug() { if (DEBUG) { - - $data = ''; - - foreach (\PicoFeed\Logging::$messages as $line) { - $data .= $line.PHP_EOL; - } - - file_put_contents(DEBUG_FILENAME, $data); + file_put_contents(DEBUG_FILENAME, implode(PHP_EOL, Logging::getMessages())); } } diff --git a/models/database.php b/models/database.php index f15b8cd..2497387 100644 --- a/models/database.php +++ b/models/database.php @@ -2,14 +2,6 @@ namespace Model\Database; -require_once __DIR__.'/../vendor/SimpleValidator/Validator.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Base.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/MinLength.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/Equals.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/AlphaNumeric.php'; - use SimpleValidator\Validator; use SimpleValidator\Validators; diff --git a/models/feed.php b/models/feed.php index 7c650aa..324aeb4 100644 --- a/models/feed.php +++ b/models/feed.php @@ -2,17 +2,15 @@ namespace Model\Feed; -require_once __DIR__.'/../vendor/PicoFeed/Filter.php'; -require_once __DIR__.'/../vendor/PicoFeed/Export.php'; -require_once __DIR__.'/../vendor/PicoFeed/Import.php'; -require_once __DIR__.'/../vendor/PicoFeed/Reader.php'; 
-require_once __DIR__.'/../vendor/SimpleValidator/Validator.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Base.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php'; - use SimpleValidator\Validator; use SimpleValidator\Validators; use PicoDb\Database; +use PicoFeed\Export; +use PicoFeed\Import; +use PicoFeed\Reader; +use PicoFeed\Logging; +use Model\Config; +use Model\Item; const LIMIT_ALL = -1; @@ -32,14 +30,15 @@ function update(array $values) // Export all feeds function export_opml() { - $opml = new \PicoFeed\Export(get_all()); + $opml = new Export(get_all()); return $opml->execute(); } // Import OPML file function import_opml($content) { - $import = new \PicoFeed\Import($content); + Logging::setTimezone(Config\get('timezone')); + $import = new Import($content); $feeds = $import->execute(); if ($feeds) { @@ -61,65 +60,71 @@ function import_opml($content) $db->closeTransaction(); - \Model\Config\write_debug(); + Config\write_debug(); return true; } - \Model\Config\write_debug(); + Config\write_debug(); return false; } // Add a new feed from an URL -function create($url, $grabber = false) +function create($url, $enable_grabber = false) { - $reader = new \PicoFeed\Reader; - $resource = $reader->download($url, '', '', HTTP_TIMEOUT, \Model\Config\HTTP_USERAGENT); + $reader = new Reader(Config\get_reader_config()); + $resource = $reader->download($url); $parser = $reader->getParser(); if ($parser !== false) { - $parser->grabber = $grabber; + if ($enable_grabber) { + $parser->enableContentGrabber(); + } + $feed = $parser->execute(); if ($feed === false) { - \Model\Config\write_debug(); + Config\write_debug(); return false; } - if (! $feed->url) $feed->url = $reader->getUrl(); + if (! $feed->getUrl()) { + $feed->url = $reader->getUrl(); + } - if (! $feed->title) { - \Model\Config\write_debug(); + if (! 
$feed->getTitle()) { + Config\write_debug(); return false; } $db = Database::get('db'); + // Check if the feed is already there if (! $db->table('feeds')->eq('feed_url', $reader->getUrl())->count()) { // Etag and LastModified are added the next update $rs = $db->table('feeds')->save(array( - 'title' => $feed->title, - 'site_url' => $feed->url, + 'title' => $feed->getTitle(), + 'site_url' => $feed->getUrl(), 'feed_url' => $reader->getUrl(), - 'download_content' => $grabber ? 1 : 0 + 'download_content' => $enable_grabber ? 1 : 0 )); if ($rs) { $feed_id = $db->getConnection()->getLastId(); - \Model\Item\update_all($feed_id, $feed->items, $grabber); - \Model\Config\write_debug(); + Item\update_all($feed_id, $feed->getItems(), $enable_grabber); + Config\write_debug(); return (int) $feed_id; } } } - \Model\Config\write_debug(); + Config\write_debug(); return false; } @@ -143,16 +148,17 @@ function refresh_all($limit = LIMIT_ALL) function refresh($feed_id) { $feed = get($feed_id); - if (empty($feed)) return false; - $reader = new \PicoFeed\Reader; + if (empty($feed)) { + return false; + } + + $reader = new Reader(Config\get_reader_config()); $resource = $reader->download( $feed['feed_url'], $feed['last_modified'], - $feed['etag'], - HTTP_TIMEOUT, - \Model\Config\HTTP_USERAGENT + $feed['etag'] ); // Update the `last_checked` column each time, HTTP cache or not @@ -160,7 +166,7 @@ function refresh($feed_id) if (! 
$resource->isModified()) { update_parsing_error($feed_id, 0); - \Model\Config\write_debug(); + Config\write_debug(); return true; } @@ -171,14 +177,8 @@ function refresh($feed_id) if ($feed['download_content']) { // Don't fetch previous items, only new one - $parser->grabber_ignore_urls = Database::get('db') - ->table('items') - ->eq('feed_id', $feed_id) - ->findAllByColumn('url'); - - $parser->grabber = true; - $parser->grabber_timeout = HTTP_TIMEOUT; - $parser->grabber_user_agent = \Model\Config\HTTP_FAKE_USERAGENT; + $parser->enableContentGrabber(); + $parser->setGrabberIgnoreUrls(Database::get('db')->table('items')->eq('feed_id', $feed_id)->findAllByColumn('url')); } $result = $parser->execute(); @@ -187,15 +187,16 @@ function refresh($feed_id) update_parsing_error($feed_id, 0); update_cache($feed_id, $resource->getLastModified(), $resource->getEtag()); - \Model\Item\update_all($feed_id, $result->items, $parser->grabber); - \Model\Config\write_debug(); + + Item\update_all($feed_id, $result->getItems(), $feed['download_content']); + Config\write_debug(); return true; } } update_parsing_error($feed_id, 1); - \Model\Config\write_debug(); + Config\write_debug(); return false; } diff --git a/models/item.php b/models/item.php index e02105e..6cdc889 100644 --- a/models/item.php +++ b/models/item.php @@ -2,11 +2,13 @@ namespace Model\Item; -require_once __DIR__.'/../vendor/Readability/Readability.php'; -require_once __DIR__.'/../vendor/PicoFeed/Grabber.php'; -require_once __DIR__.'/../vendor/PicoFeed/Filter.php'; - +use Model\Config; use PicoDb\Database; +use PicoFeed\Logging; +use PicoFeed\Grabber; +use PicoFeed\Client; +use PicoFeed\Filter; +use Readability; // Get all items without filtering function get_everything() @@ -141,7 +143,7 @@ function get_bookmarks($offset = null, $limit = null) ->join('feeds', 'id', 'feed_id') ->in('status', array('read', 'unread')) ->eq('bookmark', 1) - ->orderBy('updated', \Model\Config\get('items_sorting_direction')) + 
->orderBy('updated', Config\get('items_sorting_direction')) ->offset($offset) ->limit($limit) ->findAll(); @@ -201,7 +203,7 @@ function get_nav($item, $status = array('unread'), $bookmark = array(1, 0), $fee ->table('items') ->columns('id', 'status', 'title', 'bookmark') ->neq('status', 'removed') - ->orderBy('updated', \Model\Config\get('items_sorting_direction')); + ->orderBy('updated', Config\get('items_sorting_direction')); if ($feed_id) $query->eq('feed_id', $feed_id); @@ -377,7 +379,7 @@ function mark_feed_as_read($feed_id) // Mark all read items to removed after X days function autoflush() { - $autoflush = (int) \Model\Config\get('autoflush'); + $autoflush = (int) Config\get('autoflush'); if ($autoflush > 0) { @@ -401,9 +403,9 @@ function autoflush() } // Update all items -function update_all($feed_id, array $items, $grabber = false) +function update_all($feed_id, array $items, $enable_grabber = false) { - $nocontent = (bool) \Model\Config\get('nocontent'); + $nocontent = (bool) Config\get('nocontent'); $items_in_feed = array(); @@ -412,54 +414,55 @@ function update_all($feed_id, array $items, $grabber = false) foreach ($items as $item) { - \PicoFeed\Logging::log('Item => '.$item->id.' '.$item->url); + Logging::setMessage('Item => '.$item->getId().' '.$item->getUrl()); // Item parsed correctly? - if ($item->id && $item->url) { + if ($item->getId() && $item->getUrl()) { - \PicoFeed\Logging::log('Item parsed correctly'); + Logging::setMessage('Item parsed correctly'); // Get item record in database, if any $itemrec = $db ->table('items') ->columns('enclosure') - ->eq('id', $item->id)->findOne(); + ->eq('id', $item->getId()) + ->findOne(); // Insert a new item if ($itemrec === null) { - \PicoFeed\Logging::log('Item added to the database'); + Logging::setMessage('Item added to the database'); - if (! $item->content && ! $nocontent && $grabber) { - $item->content = download_content_url($item->url); + if ($enable_grabber && ! $nocontent && ! 
$item->getContent()) { + $item->content = download_content_url($item->getUrl()); } $db->table('items')->save(array( - 'id' => $item->id, - 'title' => $item->title, - 'url' => $item->url, - 'updated' => $item->updated, - 'author' => $item->author, - 'content' => $nocontent ? '' : $item->content, + 'id' => $item->getId(), + 'title' => $item->getTitle(), + 'url' => $item->getUrl(), + 'updated' => $item->getDate(), + 'author' => $item->getAuthor(), + 'content' => $nocontent ? '' : $item->getContent(), 'status' => 'unread', 'feed_id' => $feed_id, - 'enclosure' => isset($item->enclosure) ? $item->enclosure : null, - 'enclosure_type' => isset($item->enclosure_type) ? $item->enclosure_type : null, - 'language' => $item->language, + 'enclosure' => $item->getEnclosureUrl(), + 'enclosure_type' => $item->getEnclosureType(), + 'language' => $item->getLanguage(), )); } - else if (isset($item->enclosure) && $item->enclosure && !$itemrec['enclosure']) { + else if (! $itemrec['enclosure'] && $item->getEnclosureUrl()) { - \PicoFeed\Logging::log('Update item enclosure'); + Logging::setMessage('Update item enclosure'); - $db->table('items')->eq('id', $item->id)->save(array( + $db->table('items')->eq('id', $item->getId())->save(array( 'status' => 'unread', - 'enclosure' => $item->enclosure, - 'enclosure_type' => isset($item->enclosure_type) ? 
$item->enclosure_type : null, + 'enclosure' => $item->getEnclosureUrl(), + 'enclosure_type' => $item->getEnclosureType(), )); } else { - \PicoFeed\Logging::log('Item already in the database'); + Logging::setMessage('Item already in the database'); } // Items inside this feed @@ -467,10 +470,20 @@ function update_all($feed_id, array $items, $grabber = false) } } - // Remove from the database items marked as "removed" - // and not present inside the feed + // Cleanup old items + cleanup($feed_id, $items_in_feed); + + $db->closeTransaction(); +} + +// Remove from the database items marked as "removed" +// and not present inside the feed +function cleanup($feed_id, array $items_in_feed) +{ if (! empty($items_in_feed)) { + $db = Database::get('db'); + $removed_items = $db ->table('items') ->columns('id') @@ -489,7 +502,7 @@ function update_all($feed_id, array $items, $grabber = false) if (! empty($items_to_remove)) { $nb_items = count($items_to_remove); - \PicoFeed\Logging::log('There is '.$nb_items.' items to remove'); + Logging::setMessage('There is '.$nb_items.' items to remove'); // Handle the case when there is a huge number of items to remove // Sqlite have a limit of 1000 sql variables by default @@ -508,43 +521,31 @@ function update_all($feed_id, array $items, $grabber = false) } } } - - \PicoFeed\Logging::log('Db transaction => '.($db->getConnection()->inTransaction() ? 'ok' : 'rollback')); - - $db->closeTransaction(); } // Download content from an URL function download_content_url($url) { - $client = \PicoFeed\Client::create(); - $client->url = $url; - $client->timeout = HTTP_TIMEOUT; - $client->user_agent = \Model\Config\HTTP_FAKE_USERAGENT; - $client->execute(); + $content = ''; - $html = $client->getContent(); + $grabber = new Grabber($url); + $grabber->setConfig(Config\get_reader_config()); + $grabber->download(); - if (! 
empty($html)) { - - // Try first with PicoFeed grabber and with Readability after - $grabber = new \PicoFeed\Grabber($url, $html, $client->getEncoding()); - $content = ''; - - if ($grabber->parse()) { - $content = $grabber->content; - } - - if (empty($content)) { - $content = download_content_readability($grabber->html, $url); - } - - // Filter content - $filter = new \PicoFeed\Filter($content, $url); - return $filter->execute(); + if ($grabber->parse()) { + $content = $grabber->getcontent(); + } + else { + $content = download_content_readability($grabber->getRawContent(), $url); } - return ''; + if (! empty($content)) { + $filter = new Filter($content, $url); + $filter->setConfig(Config\get_reader_config()); + $content = $filter->execute(); + } + + return $content; } // Download content from item ID @@ -555,7 +556,7 @@ function download_content_id($item_id) if (! empty($content)) { - if (! \Model\Config\get('nocontent')) { + if (! Config\get('nocontent')) { // Save content Database::get('db') @@ -564,7 +565,7 @@ function download_content_id($item_id) ->save(array('content' => $content)); } - \Model\Config\write_debug(); + Config\write_debug(); return array( 'result' => true, @@ -572,7 +573,7 @@ function download_content_id($item_id) ); } - \Model\Config\write_debug(); + Config\write_debug(); return array( 'result' => false, @@ -585,7 +586,7 @@ function download_content_readability($content, $url) { if (! 
empty($content)) { - $readability = new \Readability($content, $url); + $readability = new Readability($content, $url); if ($readability->init()) { return $readability->getContent()->innerHTML; diff --git a/models/user.php b/models/user.php index 5fcfbb3..cad2945 100644 --- a/models/user.php +++ b/models/user.php @@ -2,11 +2,6 @@ namespace Model\User; -require_once __DIR__.'/../vendor/SimpleValidator/Validator.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Base.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php'; -require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php'; - use SimpleValidator\Validator; use SimpleValidator\Validators; use PicoDb\Database; diff --git a/vendor/PicoFeed/Client.php b/vendor/PicoFeed/Client.php index a0912de..a79840c 100644 --- a/vendor/PicoFeed/Client.php +++ b/vendor/PicoFeed/Client.php @@ -2,59 +2,170 @@ namespace PicoFeed; -require_once __DIR__.'/Logging.php'; +use LogicException; +use Clients\Curl; +use Clients\Stream; +use PicoFeed\Logging; +/** + * Client class + * + * @author Frederic Guillot + * @package client + */ abstract class Client { - protected static $proxy_hostname = null; - protected static $proxy_port = null; - protected static $proxy_username = null; - protected static $proxy_password = null; + /** + * Flag that say if the resource have been modified + * + * @access private + * @var bool + */ + private $is_modified = true; - public $encoding = ''; - public $etag = ''; - public $last_modified = ''; - public $is_modified = true; - public $content = ''; - public $url = ''; - public $timeout = 10; - public $max_redirects = 5; - public $max_body_size = 2097152; // 2MB - public $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)'; + /** + * HTTP encoding + * + * @access private + * @var string + */ + private $encoding = ''; + /** + * HTTP Etag header + * + * @access protected + * @var string + */ + protected $etag = ''; - public static function 
create($adapter = null) - { - return $adapter ?: self::chooseAdapter(); - } + /** + * HTTP Last-Modified header + * + * @access protected + * @var string + */ + protected $last_modified = ''; + /** + * Proxy hostname + * + * @access protected + * @var string + */ + protected $proxy_hostname = ''; - public static function chooseAdapter() + /** + * Proxy port + * + * @access protected + * @var integer + */ + protected $proxy_port = 3128; + + /** + * Proxy username + * + * @access protected + * @var string + */ + protected $proxy_username = ''; + + /** + * Proxy password + * + * @access protected + * @var string + */ + protected $proxy_password = ''; + + /** + * Client connection timeout + * + * @access protected + * @var integer + */ + protected $timeout = 10; + + /** + * User-agent + * + * @access protected + * @var string + */ + protected $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)'; + + /** + * Real URL used (can be changed after a HTTP redirect) + * + * @access protected + * @var string + */ + protected $url = ''; + + /** + * Page/Feed content + * + * @access protected + * @var string + */ + protected $content = ''; + + /** + * Number maximum of HTTP redirections to avoid infinite loops + * + * @access protected + * @var integer + */ + protected $max_redirects = 5; + + /** + * Maximum size of the HTTP body response + * + * @access protected + * @var integer + */ + protected $max_body_size = 2097152; // 2MB + + /** + * Get client instance: curl or stream driver + * + * @static + * @access public + * @return \PicoFeed\Client + */ + public static function getInstance() { if (function_exists('curl_init')) { require_once __DIR__.'/Clients/Curl.php'; return new Clients\Curl; - - } else if (ini_get('allow_url_fopen')) { + } + else if (ini_get('allow_url_fopen')) { require_once __DIR__.'/Clients/Stream.php'; return new Clients\Stream; } - throw new \LogicException('You must have "allow_url_fopen=1" or curl extension installed'); + throw new 
LogicException('You must have "allow_url_fopen=1" or curl extension installed'); } - - public function execute() + /** + * Perform the HTTP request + * + * @access public + * @param string $url URL + * @return bool + */ + public function execute($url = '') { - if ($this->url === '') { - throw new \LogicException('The URL is missing'); + if ($url !== '') { + $this->url = $url; } - Logging::log(\get_called_class().' Fetch URL: '.$this->url); - Logging::log(\get_called_class().' Etag provided: '.$this->etag); - Logging::log(\get_called_class().' Last-Modified provided: '.$this->last_modified); + Logging::setMessage(get_called_class().' Fetch URL: '.$this->url); + Logging::setMessage(get_called_class().' Etag provided: '.$this->etag); + Logging::setMessage(get_called_class().' Last-Modified provided: '.$this->last_modified); $response = $this->doRequest(); @@ -62,25 +173,42 @@ abstract class Client if ($response['status'] == 304) { $this->is_modified = false; - Logging::log(\get_called_class().' Resource not modified'); + Logging::setMessage(get_called_class().' Resource not modified'); } else if ($response['status'] == 404) { - Logging::log(\get_called_class().' Resource not found'); + Logging::setMessage(get_called_class().' Resource not found'); } else { - $this->etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : ''; - $this->last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : ''; + $etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : ''; + $last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : ''; $this->content = $response['body']; if (isset($response['headers']['Content-Type'])) { $result = explode('charset=', strtolower($response['headers']['Content-Type'])); $this->encoding = isset($result[1]) ? 
$result[1] : ''; } + + if (($this->etag && $this->etag === $etag) || ($this->last_modified && $last_modified === $this->last_modified)) { + $this->is_modified = false; + } + + $this->etag = $etag; + $this->last_modified = $last_modified; } + + return true; } + + return false; } - + /** + * Parse HTTP headers + * + * @access public + * @param array $lines List of headers + * @return array + */ public function parseHeaders(array $lines) { $status = 200; @@ -88,7 +216,7 @@ abstract class Client foreach ($lines as $line) { - if (strpos($line, 'HTTP') === 0/* && strpos($line, '301') === false && strpos($line, '302') === false*/) { + if (strpos($line, 'HTTP') === 0) { $status = (int) substr($line, 9, 3); } else if (strpos($line, ':') !== false) { @@ -98,71 +226,242 @@ abstract class Client } } - Logging::log(\get_called_class().' HTTP status code: '.$status); + Logging::setMessage(get_called_class().' HTTP status code: '.$status); foreach ($headers as $name => $value) { - Logging::log(\get_called_class().' HTTP header: '.$name.' => '.$value); + Logging::setMessage(get_called_class().' HTTP header: '.$name.' 
=> '.$value); } return array($status, $headers); } - - public static function proxy($hostname, $port = 3128, $username = '', $password = '') - { - self::$proxy_hostname = $hostname; - self::$proxy_port = $port; - self::$proxy_username = $username; - self::$proxy_password = $password; - } - - + /** + * Set the Last-Modified HTTP header + * + * @access public + * @param string $last_modified Header value + * @return \PicoFeed\Client + */ public function setLastModified($last_modified) { $this->last_modified = $last_modified; return $this; } - + /** + * Get the value of the Last-Modified HTTP header + * + * @access public + * @return string + */ public function getLastModified() { return $this->last_modified; } - + /** + * Set the value of the Etag HTTP header + * + * @access public + * @param string $etag Etag HTTP header value + * @return \PicoFeed\Client + */ public function setEtag($etag) { $this->etag = $etag; return $this; } - + /** + * Get the Etag HTTP header value + * + * @access public + * @return string + */ public function getEtag() { return $this->etag; } - + /** + * Get the final url value + * + * @access public + * @return string + */ public function getUrl() { return $this->url; } + /** + * Set the url + * + * @access public + * @param string $url URL + * @return \PicoFeed\Client + */ + public function setUrl($url) + { + $this->url = $url; + return $this; + } + /** + * Get the body of the HTTP response + * + * @access public + * @return string + */ public function getContent() { return $this->content; } - + /** + * Get the encoding value from HTTP headers + * + * @access public + * @return string + */ public function getEncoding() { return $this->encoding; } - + /** + * Return true if the remote resource has changed + * + * @access public + * @return bool + */ public function isModified() { return $this->is_modified; } -} \ No newline at end of file + + /** + * Set connection timeout + * + * @access public + * @param integer $timeout Connection timeout + * 
@return \PicoFeed\Client + */ + public function setTimeout($timeout) + { + $this->timeout = $timeout ?: $this->timeout; + return $this; + } + + /** + * Set a custom user agent + * + * @access public + * @param string $user_agent User Agent + * @return \PicoFeed\Client + */ + public function setUserAgent($user_agent) + { + $this->user_agent = $user_agent ?: $this->user_agent; + return $this; + } + + /** + * Set the maximum number of HTTP redirections + * + * @access public + * @param integer $max Maximum + * @return \PicoFeed\Client + */ + public function setMaxRedirections($max) + { + $this->max_redirects = $max ?: $this->max_redirects; + return $this; + } + + /** + * Set the maximum size of the HTTP body + * + * @access public + * @param integer $max Maximum + * @return \PicoFeed\Client + */ + public function setMaxBodySize($max) + { + $this->max_body_size = $max ?: $this->max_body_size; + return $this; + } + + /** + * Set the proxy hostname + * + * @access public + * @param string $hostname Proxy hostname + * @return \PicoFeed\Client + */ + public function setProxyHostname($hostname) + { + $this->proxy_hostname = $hostname ?: $this->proxy_hostname; + return $this; + } + + /** + * Set the proxy port + * + * @access public + * @param integer $port Proxy port + * @return \PicoFeed\Client + */ + public function setProxyPort($port) + { + $this->proxy_port = $port ?: $this->proxy_port; + return $this; + } + + /** + * Set the proxy username + * + * @access public + * @param string $username Proxy username + * @return \PicoFeed\Client + */ + public function setProxyUsername($username) + { + $this->proxy_username = $username ?: $this->proxy_username; + return $this; + } + + /** + * Set the proxy password + * + * @access public + * @param string $password Password + * @return \PicoFeed\Client + */ + public function setProxyPassword($password) + { + $this->proxy_password = $password ?: $this->proxy_password; + return $this; + } + + /** + * Set config object + * + * @access 
public + * @param \PicoFeed\Config $config Config instance + * @return \PicoFeed\Client + */ + public function setConfig($config) + { + $this->setTimeout($config->getClientTimeout()); + $this->setUserAgent($config->getClientUserAgent()); + $this->setMaxRedirections($config->getMaxRedirections()); + $this->setMaxBodySize($config->getMaxBodySize()); + $this->setProxyHostname($config->getProxyHostname()); + $this->setProxyPort($config->getProxyPort()); + $this->setProxyUsername($config->getProxyUsername()); + $this->setProxyPassword($config->getProxyPassword()); + + return $this; + } +} diff --git a/vendor/PicoFeed/Clients/Curl.php b/vendor/PicoFeed/Clients/Curl.php index 47eaae0..66a4773 100644 --- a/vendor/PicoFeed/Clients/Curl.php +++ b/vendor/PicoFeed/Clients/Curl.php @@ -3,27 +3,80 @@ namespace PicoFeed\Clients; use \PicoFeed\Logging; +use \PicoFeed\Client; -class Curl extends \PicoFeed\Client +/** + * cURL HTTP client + * + * @author Frederic Guillot + * @package client + */ +class Curl extends Client { + /** + * HTTP response body + * + * @access private + * @var string + */ private $body = ''; + + /** + * Body size + * + * @access private + * @var integer + */ private $body_length = 0; + + /** + * HTTP response headers + * + * @access private + * @var array + */ private $headers = array(); + + /** + * Counter on the number of header received + * + * @access private + * @var integer + */ private $headers_counter = 0; - + /** + * cURL callback to read the HTTP body + * + * If the function return -1, curl stop to read the HTTP response + * + * @access public + * @param resource $ch cURL handler + * @param string $buffer Chunk of data + * @return integer Length of the buffer + */ public function readBody($ch, $buffer) { $length = strlen($buffer); $this->body_length += $length; - if ($this->body_length > $this->max_body_size) return -1; + if ($this->body_length > $this->max_body_size) { + return -1; + } + $this->body .= $buffer; return $length; } - + /** + * cURL
callback to read HTTP headers + * + * @access public + * @param resource $ch cURL handler + * @param string $buffer Header line + * @return integer Length of the buffer + */ public function readHeaders($ch, $buffer) { $length = strlen($buffer); @@ -43,7 +96,13 @@ class Curl extends \PicoFeed\Client return $length; } - + /** + * Do the HTTP request + * + * @access public + * @param bool $follow_location Flag used when there is an open_basedir restriction + * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] + */ public function doRequest($follow_location = true) { $request_headers = array('Connection: close'); @@ -54,6 +113,7 @@ class Curl extends \PicoFeed\Client $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $this->url); + curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout); curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent); @@ -67,28 +127,34 @@ class Curl extends \PicoFeed\Client curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory'); curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory'); - if (parent::$proxy_hostname) { + if ($this->proxy_hostname) { - curl_setopt($ch, CURLOPT_PROXYPORT, parent::$proxy_port); + Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); + + curl_setopt($ch, CURLOPT_PROXYPORT, $this->proxy_port); curl_setopt($ch, CURLOPT_PROXYTYPE, 'HTTP'); - curl_setopt($ch, CURLOPT_PROXY, parent::$proxy_hostname); + curl_setopt($ch, CURLOPT_PROXY, $this->proxy_hostname); - if (parent::$proxy_username) { - curl_setopt($ch, CURLOPT_PROXYUSERPWD, parent::$proxy_username.':'.parent::$proxy_password); + if ($this->proxy_username) { + Logging::setMessage(get_called_class().' Proxy credentials: Yes'); + curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->proxy_username.':'.$this->proxy_password); + } + else { + Logging::setMessage(get_called_class().' 
Proxy credentials: No'); } } curl_exec($ch); - Logging::log(\get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME)); - Logging::log(\get_called_class().' cURL dns lookup time: '.curl_getinfo($ch, CURLINFO_NAMELOOKUP_TIME)); - Logging::log(\get_called_class().' cURL connect time: '.curl_getinfo($ch, CURLINFO_CONNECT_TIME)); - Logging::log(\get_called_class().' cURL speed download: '.curl_getinfo($ch, CURLINFO_SPEED_DOWNLOAD)); - Logging::log(\get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)); + Logging::setMessage(get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME)); + Logging::setMessage(get_called_class().' cURL dns lookup time: '.curl_getinfo($ch, CURLINFO_NAMELOOKUP_TIME)); + Logging::setMessage(get_called_class().' cURL connect time: '.curl_getinfo($ch, CURLINFO_CONNECT_TIME)); + Logging::setMessage(get_called_class().' cURL speed download: '.curl_getinfo($ch, CURLINFO_SPEED_DOWNLOAD)); + Logging::setMessage(get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)); if (curl_errno($ch)) { - Logging::log(\get_called_class().' cURL error: '.curl_error($ch)); + Logging::setMessage(get_called_class().' 
cURL error: '.curl_error($ch)); curl_close($ch); return false; @@ -133,4 +199,4 @@ class Curl extends \PicoFeed\Client 'headers' => $headers ); } -} \ No newline at end of file +} diff --git a/vendor/PicoFeed/Clients/Stream.php b/vendor/PicoFeed/Clients/Stream.php index e004b2f..5da7e11 100644 --- a/vendor/PicoFeed/Clients/Stream.php +++ b/vendor/PicoFeed/Clients/Stream.php @@ -3,6 +3,7 @@ namespace PicoFeed\Clients; use \PicoFeed\Logging; +use \PicoFeed\Client; /** * Stream context HTTP client @@ -10,7 +11,7 @@ use \PicoFeed\Logging; * @author Frederic Guillot * @package client */ -class Stream extends \PicoFeed\Client +class Stream extends Client { /** * Do the HTTP request @@ -24,11 +25,19 @@ class Stream extends \PicoFeed\Client $headers = array( 'Connection: close', 'User-Agent: '.$this->user_agent, - 'Accept-Encoding: gzip', ); - if ($this->etag) $headers[] = 'If-None-Match: '.$this->etag; - if ($this->last_modified) $headers[] = 'If-Modified-Since: '.$this->last_modified; + if (function_exists('gzdecode')) { + $headers[] = 'Accept-Encoding: gzip'; + } + + if ($this->etag) { + $headers[] = 'If-None-Match: '.$this->etag; + } + + if ($this->last_modified) { + $headers[] = 'If-Modified-Since: '.$this->last_modified; + } // Create context $context_options = array( @@ -41,14 +50,22 @@ class Stream extends \PicoFeed\Client ) ); - if (parent::$proxy_hostname) { - $context_options['http']['proxy'] = 'tcp://'.parent::$proxy_hostname.':'.parent::$proxy_port; + if ($this->proxy_hostname) { + + Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); + + $context_options['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port; $context_options['http']['request_fulluri'] = true; - if (parent::$proxy_username) { - $headers[] = 'Proxy-Authorization: Basic '.base64_encode(parent::$proxy_username.':'.parent::$proxy_password); + if ($this->proxy_username) { + Logging::setMessage(get_called_class().' 
Proxy credentials: Yes'); + + $headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password); $context_options['http']['header'] = implode("\r\n", $headers); } + else { + Logging::setMessage(get_called_class().' Proxy credentials: No'); + } } $context = stream_context_create($context_options); diff --git a/vendor/PicoFeed/Encoding.php b/vendor/PicoFeed/Encoding.php index ebfa9a3..1f87c30 100644 --- a/vendor/PicoFeed/Encoding.php +++ b/vendor/PicoFeed/Encoding.php @@ -1,32 +1,6 @@ @@ -35,15 +9,11 @@ POSSIBILITY OF SUCH DAMAGE. * @link https://github.com/neitanod/forceutf8 * @example https://github.com/neitanod/forceutf8 * @license Revised BSD - */ - -namespace PicoFeed; - -class Encoding { - - protected static $win1252ToUtf8 = array( + */ +class Encoding +{ + protected static $win1252ToUtf8 = array( 128 => "\xe2\x82\xac", - 130 => "\xe2\x80\x9a", 131 => "\xc6\x92", 132 => "\xe2\x80\x9e", @@ -55,10 +25,7 @@ class Encoding { 138 => "\xc5\xa0", 139 => "\xe2\x80\xb9", 140 => "\xc5\x92", - 142 => "\xc5\xbd", - - 145 => "\xe2\x80\x98", 146 => "\xe2\x80\x99", 147 => "\xe2\x80\x9c", @@ -71,260 +38,155 @@ class Encoding { 154 => "\xc5\xa1", 155 => "\xe2\x80\xba", 156 => "\xc5\x93", - 158 => "\xc5\xbe", 159 => "\xc5\xb8" - ); - - protected static $brokenUtf8ToUtf8 = array( - "\xc2\x80" => "\xe2\x82\xac", - - "\xc2\x82" => "\xe2\x80\x9a", - "\xc2\x83" => "\xc6\x92", - "\xc2\x84" => "\xe2\x80\x9e", - "\xc2\x85" => "\xe2\x80\xa6", - "\xc2\x86" => "\xe2\x80\xa0", - "\xc2\x87" => "\xe2\x80\xa1", - "\xc2\x88" => "\xcb\x86", - "\xc2\x89" => "\xe2\x80\xb0", - "\xc2\x8a" => "\xc5\xa0", - "\xc2\x8b" => "\xe2\x80\xb9", - "\xc2\x8c" => "\xc5\x92", - - "\xc2\x8e" => "\xc5\xbd", - - - "\xc2\x91" => "\xe2\x80\x98", - "\xc2\x92" => "\xe2\x80\x99", - "\xc2\x93" => "\xe2\x80\x9c", - "\xc2\x94" => "\xe2\x80\x9d", - "\xc2\x95" => "\xe2\x80\xa2", - "\xc2\x96" => "\xe2\x80\x93", - "\xc2\x97" => "\xe2\x80\x94", - "\xc2\x98" => "\xcb\x9c", - "\xc2\x99" => 
"\xe2\x84\xa2", - "\xc2\x9a" => "\xc5\xa1", - "\xc2\x9b" => "\xe2\x80\xba", - "\xc2\x9c" => "\xc5\x93", - - "\xc2\x9e" => "\xc5\xbe", - "\xc2\x9f" => "\xc5\xb8" - ); - - protected static $utf8ToWin1252 = array( - "\xe2\x82\xac" => "\x80", - - "\xe2\x80\x9a" => "\x82", - "\xc6\x92" => "\x83", - "\xe2\x80\x9e" => "\x84", - "\xe2\x80\xa6" => "\x85", - "\xe2\x80\xa0" => "\x86", - "\xe2\x80\xa1" => "\x87", - "\xcb\x86" => "\x88", - "\xe2\x80\xb0" => "\x89", - "\xc5\xa0" => "\x8a", - "\xe2\x80\xb9" => "\x8b", - "\xc5\x92" => "\x8c", - - "\xc5\xbd" => "\x8e", - - - "\xe2\x80\x98" => "\x91", - "\xe2\x80\x99" => "\x92", - "\xe2\x80\x9c" => "\x93", - "\xe2\x80\x9d" => "\x94", - "\xe2\x80\xa2" => "\x95", - "\xe2\x80\x93" => "\x96", - "\xe2\x80\x94" => "\x97", - "\xcb\x9c" => "\x98", - "\xe2\x84\xa2" => "\x99", - "\xc5\xa1" => "\x9a", - "\xe2\x80\xba" => "\x9b", - "\xc5\x93" => "\x9c", - - "\xc5\xbe" => "\x9e", - "\xc5\xb8" => "\x9f" ); - static function toUTF8($text){ - /** - * Function Encoding::toUTF8 - * - * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8. - * - * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1. - * - * It may fail to convert characters to UTF-8 if they fall into one of these scenarios: - * - * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß - * are followed by any of these: ("group B") - * ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿ - * For example: %ABREPRESENT%C9%BB. «REPRESENTÉ» - * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) - * is also a valid unicode character, and will be left unchanged. - * - * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B, - * 3) when any of these: ðñòó are followed by THREE chars from group B. - * - * @name toUTF8 - * @param string $text Any string. 
- * @return string The same string, UTF8 encoded - * - */ + protected static $utf8ToWin1252 = array( + "\xe2\x82\xac" => "\x80", + "\xe2\x80\x9a" => "\x82", + "\xc6\x92" => "\x83", + "\xe2\x80\x9e" => "\x84", + "\xe2\x80\xa6" => "\x85", + "\xe2\x80\xa0" => "\x86", + "\xe2\x80\xa1" => "\x87", + "\xcb\x86" => "\x88", + "\xe2\x80\xb0" => "\x89", + "\xc5\xa0" => "\x8a", + "\xe2\x80\xb9" => "\x8b", + "\xc5\x92" => "\x8c", + "\xc5\xbd" => "\x8e", + "\xe2\x80\x98" => "\x91", + "\xe2\x80\x99" => "\x92", + "\xe2\x80\x9c" => "\x93", + "\xe2\x80\x9d" => "\x94", + "\xe2\x80\xa2" => "\x95", + "\xe2\x80\x93" => "\x96", + "\xe2\x80\x94" => "\x97", + "\xcb\x9c" => "\x98", + "\xe2\x84\xa2" => "\x99", + "\xc5\xa1" => "\x9a", + "\xe2\x80\xba" => "\x9b", + "\xc5\x93" => "\x9c", + "\xc5\xbe" => "\x9e", + "\xc5\xb8" => "\x9f" + ); - if(is_array($text)) + /** + * Function Encoding::toUTF8 + * + * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8. + * + * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1. + * + * It may fail to convert characters to UTF-8 if they fall into one of these scenarios: + * + * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß + * are followed by any of these: ("group B") + * ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿ + * For example: %ABREPRESENT%C9%BB. «REPRESENTÉ» + * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) + * is also a valid unicode character, and will be left unchanged. + * + * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B, + * 3) when any of these: ðñòó are followed by THREE chars from group B. + * + * @name toUTF8 + * @param string $text Any string. 
+ * @return string The same string, UTF8 encoded + * + */ + public static function toUTF8($text) { - foreach($text as $k => $v) - { - $text[$k] = self::toUTF8($v); - } - return $text; - } elseif(is_string($text)) { + if (is_array($text)) { + foreach ($text as $k => $v) { + $text[$k] = self::toUTF8($v); + } - $max = strlen($text); - $buf = ""; - for($i = 0; $i < $max; $i++){ - $c1 = $text{$i}; - if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already - $c2 = $i+1 >= $max? "\x00" : $text{$i+1}; - $c3 = $i+2 >= $max? "\x00" : $text{$i+2}; - $c4 = $i+3 >= $max? "\x00" : $text{$i+3}; - if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2; - $i++; - } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; - } - } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3; - $i = $i + 2; - } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; - } - } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3; - $i = $i + 2; - } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; - } - } else { //doesn't look like UTF8, but should be converted - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = (($c1 & "\x3f") | "\x80"); - $buf .= $cc1 . 
$cc2; - } - } elseif(($c1 & "\xc0") == "\x80"){ // needs conversion - if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases - $buf .= self::$win1252ToUtf8[ord($c1)]; - } else { - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = (($c1 & "\x3f") | "\x80"); - $buf .= $cc1 . $cc2; + return $text; + } + elseif (is_string($text)) { + + $max = strlen($text); + $buf = ""; + + for ($i = 0; $i < $max; $i++) { + + $c1 = $text{$i}; + + if ($c1>="\xc0") { //Should be converted to UTF8, if it's not UTF8 already + + $c2 = $i+1 >= $max? "\x00" : $text{$i+1}; + $c3 = $i+2 >= $max? "\x00" : $text{$i+2}; + $c4 = $i+3 >= $max? "\x00" : $text{$i+3}; + + if ($c1 >= "\xc0" & $c1 <= "\xdf") { //looks like 2 bytes UTF8 + + if ($c2 >= "\x80" && $c2 <= "\xbf") { //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2; + $i++; + } + else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } + else if ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8 + + if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3; + $i = $i + 2; + } + else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } + else if ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8 + + if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3 . $c4; + $i = $i + 3; + } + else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } + else { //doesn't look like UTF8, but should be converted + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 .
$cc2; + } } - } else { // it doesn't need convesion - $buf .= $c1; - } - } - return $buf; - } else { - return $text; - } - } + elseif (($c1 & "\xc0") == "\x80") { // needs conversion - static function toWin1252($text) { - if(is_array($text)) { - foreach($text as $k => $v) { - $text[$k] = self::toWin1252($v); - } - return $text; - } elseif(is_string($text)) { - return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text))); - } else { - return $text; - } - } + if (isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases + $buf .= self::$win1252ToUtf8[ord($c1)]; + } + else { + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 . $cc2; + } + } + else { // it doesn't need convesion + $buf .= $c1; + } + } - static function toISO8859($text) { - return self::toWin1252($text); - } - - static function toLatin1($text) { - return self::toWin1252($text); - } - - static function fixUTF8($text){ - if(is_array($text)) { - foreach($text as $k => $v) { - $text[$k] = self::fixUTF8($v); - } - return $text; + return $buf; + } + else { + return $text; + } } - $last = ""; - while($last <> $text){ - $last = $text; - $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text))); + public static function cp1251ToUtf8($input) + { + return iconv('CP1251', 'UTF-8//TRANSLIT', $input); } - $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text))); - return $text; - } - - static function UTF8FixWin1252Chars($text){ - // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1 - // (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it. 
- // See: http://en.wikipedia.org/wiki/Windows-1252 - - return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text); - } - - static function removeBOM($str=""){ - if(substr($str, 0,3) == pack("CCC",0xef,0xbb,0xbf)) { - $str=substr($str, 3); - } - return $str; - } - - public static function normalizeEncoding($encodingLabel) - { - $encoding = strtoupper($encodingLabel); - $enc = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding); - $equivalences = array( - 'ISO88591' => 'ISO-8859-1', - 'ISO8859' => 'ISO-8859-1', - 'ISO' => 'ISO-8859-1', - 'LATIN1' => 'ISO-8859-1', - 'LATIN' => 'ISO-8859-1', - 'UTF8' => 'UTF-8', - 'UTF' => 'UTF-8', - 'WIN1252' => 'ISO-8859-1', - 'WINDOWS1252' => 'ISO-8859-1' - ); - - if(empty($equivalences[$encoding])){ - return 'UTF-8'; - } - - return $equivalences[$encoding]; - } - - public static function encode($encodingLabel, $text) - { - $encodingLabel = self::normalizeEncoding($encodingLabel); - if($encodingLabel == 'UTF-8') return Encoding::toUTF8($text); - if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text); - } - - - public static function cp1251ToUtf8($input) - { - return iconv('CP1251', 'UTF-8//TRANSLIT', $input); - } } diff --git a/vendor/PicoFeed/Export.php b/vendor/PicoFeed/Export.php index 4601f0a..df03f98 100644 --- a/vendor/PicoFeed/Export.php +++ b/vendor/PicoFeed/Export.php @@ -2,26 +2,56 @@ namespace PicoFeed; +use SimpleXMLElement; + +/** + * OPML export class + * + * @author Frederic Guillot + * @package picofeed + */ class Export { + /** + * List of feeds to exports + * + * @access private + * @var array + */ private $content = array(); - public $required_fields = array( + /** + * List of required properties for each feed + * + * @access private + * @var array + */ + private $required_fields = array( 'title', 'site_url', - 'feed_url' + 'feed_url', ); - + /** + * Constructor + * + * @access public + * @param array $content List of feeds + */ public function __construct(array 
$content) { $this->content = $content; } - + /** + * Get the OPML document + * + * @access public + * @return string + */ public function execute() { - $xml = new \SimpleXMLElement(''); + $xml = new SimpleXMLElement(''); $head = $xml->addChild('head'); $head->addChild('title', 'OPML Export'); @@ -35,13 +65,14 @@ class Export foreach ($this->required_fields as $field) { if (! isset($feed[$field])) { - $valid = false; break; } } - if (! $valid) continue; + if (! $valid) { + continue; + } $outline = $body->addChild('outline'); $outline->addAttribute('xmlUrl', $feed['feed_url']); @@ -55,4 +86,4 @@ class Export return $xml->asXML(); } -} \ No newline at end of file +} diff --git a/vendor/PicoFeed/Feed.php b/vendor/PicoFeed/Feed.php new file mode 100644 index 0000000..90ce0d8 --- /dev/null +++ b/vendor/PicoFeed/Feed.php @@ -0,0 +1,150 @@ +$property.PHP_EOL; + } + + $output .= 'Feed::items = '.count($this->items).' items'.PHP_EOL; + + foreach ($this->items as $item) { + $output .= '----'.PHP_EOL; + $output .= $item; + } + + return $output; + } + + /** + * Get title + * + * @access public + * @return string + */ + public function getTitle() + { + return $this->title; + } + + /** + * Get url + * + * @access public + * @return string + */ + public function getUrl() + { + return $this->url; + } + + /** + * Get date + * + * @access public + * @return integer + */ + public function getDate() + { + return $this->date; + } + + /** + * Get language + * + * @access public + * @return string + */ + public function getLanguage() + { + return $this->language; + } + + /** + * Get id + * + * @access public + * @return string + */ + public function getId() + { + return $this->id; + } + + /** + * Get feed items + * + * @access public + * @return array + */ + public function getItems() + { + return $this->items; + } +} diff --git a/vendor/PicoFeed/Filter.php b/vendor/PicoFeed/Filter.php index bbf5b7c..af1a877 100644 --- a/vendor/PicoFeed/Filter.php +++ b/vendor/PicoFeed/Filter.php @@ -2,14
+2,24 @@ namespace PicoFeed; +use DOMDocument; + /** * Filter class * * @author Frederic Guillot - * @package parser + * @package picofeed */ class Filter { + /** + * Config object + * + * @access private + * @var \PicoFeed\Config + */ + private $config = null; + /** * Filtered XML data * @@ -61,11 +71,10 @@ class Filter /** * Tags and attribute whitelist * - * @static - * @access public + * @access private * @var array */ - public static $whitelist_tags = array( + private $whitelist_tags = array( 'audio' => array('controls', 'src'), 'video' => array('poster', 'controls', 'height', 'width', 'src'), 'source' => array('src', 'type'), @@ -109,11 +118,10 @@ class Filter /** * Tags blacklist, strip the content of those tags * - * @static - * @access public + * @access private * @var array */ - public static $blacklist_tags = array( + private $blacklisted_tags = array( 'script' ); @@ -121,11 +129,10 @@ class Filter * Scheme whitelist * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme * - * @static - * @access public + * @access private * @var array */ - public static $scheme_whitelist = array( + private $scheme_whitelist = array( '//', 'data:image/png;base64,', 'data:image/gif;base64,', @@ -164,11 +171,10 @@ class Filter /** * Attributes used for external resources * - * @static - * @access public + * @access private * @var array */ - public static $media_attributes = array( + private $media_attributes = array( 'src', 'href', 'poster', @@ -177,11 +183,10 @@ class Filter /** * Blacklisted resources * - * @static - * @access public + * @access private * @var array */ - public static $media_blacklist = array( + private $media_blacklist = array( 'feeds.feedburner.com', 'share.feedsportal.com', 'da.feedsportal.com', @@ -209,11 +214,10 @@ class Filter /** * Mandatory attributes for specified tags * - * @static - * @access public + * @access private * @var array */ - public static $required_attributes = array( + private $required_attributes = array( 'a' => 
array('href'), 'img' => array('src'), 'iframe' => array('src'), @@ -224,22 +228,20 @@ class Filter /** * Add attributes to specified tags * - * @static - * @access public + * @access private * @var array */ - public static $add_attributes = array( + private $add_attributes = array( 'a' => 'rel="noreferrer" target="_blank"' ); /** * Attributes that must be integer * - * @static - * @access public + * @access private * @var array */ - public static $integer_attributes = array( + private $integer_attributes = array( 'width', 'height', 'frameborder', @@ -248,11 +250,10 @@ class Filter /** * Iframe source whitelist, everything else is ignored * - * @static - * @access public + * @access private * @var array */ - public static $iframe_whitelist = array( + private $iframe_whitelist = array( '//www.youtube.com', 'http://www.youtube.com', 'https://www.youtube.com', @@ -273,10 +274,10 @@ class Filter { $this->url = $site_url; - \libxml_use_internal_errors(true); + libxml_use_internal_errors(true); // Convert bad formatted documents to XML - $dom = new \DOMDocument; + $dom = new DOMDocument; $dom->loadHTML(''.$data); $this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0)); } @@ -300,7 +301,7 @@ class Filter $this->data = $this->removeEmptyTags($this->data); $this->data = $this->removeMultipleTags($this->data); - return $this->data; + return trim($this->data); } /** @@ -372,9 +373,9 @@ class Filter } // Check for required attributes - if (isset(self::$required_attributes[$name])) { + if (isset($this->required_attributes[$name])) { - foreach (self::$required_attributes[$name] as $required_attribute) { + foreach ($this->required_attributes[$name] as $required_attribute) { if (! in_array($required_attribute, $used_attributes)) { @@ -389,9 +390,9 @@ class Filter $this->data .= '<'.$name.$attr_data; // Add custom attributes - if (isset(self::$add_attributes[$name])) { + if (isset($this->add_attributes[$name])) { - $this->data .= ' '.self::$add_attributes[$name].' 
'; + $this->data .= ' '.$this->add_attributes[$name].' '; } // If img or br, we don't close it here @@ -399,7 +400,7 @@ class Filter } } - if (in_array($name, self::$blacklist_tags)) { + if (in_array($name, $this->blacklisted_tags)) { $this->strip_content = true; } @@ -530,7 +531,7 @@ class Filter */ public function isAllowedTag($name) { - return isset(self::$whitelist_tags[$name]); + return isset($this->whitelist_tags[$name]); } /** @@ -543,7 +544,7 @@ class Filter */ public function isAllowedAttribute($tag, $attribute) { - return in_array($attribute, self::$whitelist_tags[$tag]); + return in_array($attribute, $this->whitelist_tags[$tag]); } /** @@ -555,7 +556,7 @@ class Filter */ public function isResource($attribute) { - return in_array($attribute, self::$media_attributes); + return in_array($attribute, $this->media_attributes); } /** @@ -567,7 +568,7 @@ class Filter */ public function isAllowedIframeResource($value) { - foreach (self::$iframe_whitelist as $url) { + foreach ($this->iframe_whitelist as $url) { if (strpos($value, $url) === 0) { return true; @@ -586,7 +587,7 @@ class Filter */ public function isAllowedProtocol($value) { - foreach (self::$scheme_whitelist as $protocol) { + foreach ($this->scheme_whitelist as $protocol) { if (strpos($value, $protocol) === 0) { return true; @@ -605,7 +606,7 @@ class Filter */ public function isBlacklistedMedia($resource) { - foreach (self::$media_blacklist as $name) { + foreach ($this->media_blacklist as $name) { if (strpos($resource, $name) !== false) { return true; @@ -640,7 +641,7 @@ class Filter */ public function validateAttributeValue($attribute, $value) { - if (in_array($attribute, self::$integer_attributes)) { + if (in_array($attribute, $this->integer_attributes)) { return ctype_digit($value); } @@ -758,4 +759,147 @@ class Filter return $encoding; } + + /** + * Set whitelisted tags and attributes for each tag + * + * @access public + * @param array $values List of tags: ['video' => ['src', 'cover'], 'img' =>
['src']] + * @return \PicoFeed\Filter + */ + public function setWhitelistedTags(array $values) + { + $this->whitelist_tags = $values ?: $this->whitelist_tags; + return $this; + } + + /** + * Set blacklisted tags + * + * @access public + * @param array $values List of tags: ['video', 'img'] + * @return \PicoFeed\Filter + */ + public function setBlacklistedTags(array $values) + { + $this->blacklisted_tags = $values ?: $this->blacklisted_tags; + return $this; + } + + /** + * Set scheme whitelist + * + * @access public + * @param array $values List of scheme: ['http://', 'ftp://'] + * @return \PicoFeed\Filter + */ + public function setSchemeWhitelist(array $values) + { + $this->scheme_whitelist = $values ?: $this->scheme_whitelist; + return $this; + } + + /** + * Set media attributes (used to load external resources) + * + * @access public + * @param array $values List of values: ['src', 'href'] + * @return \PicoFeed\Filter + */ + public function setMediaAttributes(array $values) + { + $this->media_attributes = $values ?: $this->media_attributes; + return $this; + } + + /** + * Set blacklisted external resources + * + * @access public + * @param array $values List of tags: ['http://google.com/', '...'] + * @return \PicoFeed\Filter + */ + public function setMediaBlacklist(array $values) + { + $this->media_blacklist = $values ?: $this->media_blacklist; + return $this; + } + + /** + * Set mandatory attributes for whitelisted tags + * + * @access public + * @param array $values List of tags: ['img' => 'src'] + * @return \PicoFeed\Filter + */ + public function setRequiredAttributes(array $values) + { + $this->required_attributes = $values ?: $this->required_attributes; + return $this; + } + + /** + * Set attributes to automatically to specific tags + * + * @access public + * @param array $values List of tags: ['a' => 'target="_blank"'] + * @return \PicoFeed\Filter + */ + public function setAttributeOverrides(array $values) + { + $this->add_attributes = $values ?: 
$this->add_attributes; + return $this; + } + + /** + * Set attributes that must be an integer + * + * @access public + * @param array $values List of attributes: ['width', 'height'] + * @return \PicoFeed\Filter + */ + public function setIntegerAttributes(array $values) + { + $this->integer_attributes = $values ?: $this->integer_attributes; + return $this; + } + + /** + * Set allowed iframe resources + * + * @access public + * @param array $values List of URL prefixes: ['http://www.youtube.com'] + * @return \PicoFeed\Filter + */ + public function setIframeWhitelist(array $values) + { + $this->iframe_whitelist = $values ?: $this->iframe_whitelist; + return $this; + } + + /** + * Set config object + * + * @access public + * @param \PicoFeed\Config $config Config instance + * @return \PicoFeed\Filter + */ + public function setConfig($config) + { + $this->config = $config; + + if ($this->config !== null) { + $this->setIframeWhitelist($this->config->getFilterIframeWhitelist(array())); + $this->setIntegerAttributes($this->config->getFilterIntegerAttributes(array())); + $this->setAttributeOverrides($this->config->getFilterAttributeOverrides(array())); + $this->setRequiredAttributes($this->config->getFilterRequiredAttributes(array())); + $this->setMediaBlacklist($this->config->getFilterMediaBlacklist(array())); + $this->setMediaAttributes($this->config->getFilterMediaAttributes(array())); + $this->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array())); + $this->setBlacklistedTags($this->config->getFilterBlacklistedTags(array())); + $this->setWhitelistedTags($this->config->getFilterWhitelistedTags(array())); + } + + return $this; + } } diff --git a/vendor/PicoFeed/Grabber.php b/vendor/PicoFeed/Grabber.php index 329d291..33244cd 100644 --- a/vendor/PicoFeed/Grabber.php +++ b/vendor/PicoFeed/Grabber.php @@ -2,19 +2,59 @@ namespace PicoFeed; -require_once __DIR__.'/Client.php'; -require_once __DIR__.'/Encoding.php'; -require_once __DIR__.'/Logging.php'; -require_once
__DIR__.'/Filter.php'; +use DOMXPath; +use PicoFeed\Logging; +use PicoFeed\Client; +use PicoFeed\Encoding; +use PicoFeed\Filter; +/** + * Grabber class + * + * @author Frederic Guillot + * @package picofeed + */ class Grabber { - public $content = ''; - public $html = ''; - public $encoding = ''; + /** + * URL + * + * @access private + * @var string + */ + private $url = ''; - // Order is important, generic terms at the end - public $candidatesAttributes = array( + /** + * Relevant content + * + * @access private + * @var string + */ + private $content = ''; + + /** + * HTML content + * + * @access private + * @var string + */ + private $html = ''; + + /** + * HTML content encoding + * + * @access private + * @var string + */ + private $encoding = ''; + + /** + * List of attributes to try to get the content, order is important, generic terms at the end + * + * @access private + * @var array + */ + private $candidatesAttributes = array( 'articleBody', 'articlebody', 'article-body', @@ -37,7 +77,13 @@ class Grabber 'main', ); - public $stripAttributes = array( + /** + * List of attributes to strip + * + * @access private + * @var array + */ + private $stripAttributes = array( 'comment', 'share', 'links', @@ -57,7 +103,13 @@ class Grabber 'categories', ); - public $stripTags = array( + /** + * Tags to remove + * + * @access private + * @var array + */ + private $stripTags = array( 'script', 'style', 'nav', @@ -67,7 +119,22 @@ class Grabber 'form', ); + /** + * Config object + * + * @access private + * @var \PicoFeed\Config + */ + private $config = null; + /** + * Constructor + * + * @access public + * @param string $url Url + * @param string $html HTML content + * @param string $encoding Charset + */ public function __construct($url, $html = '', $encoding = 'utf-8') { $this->url = $url; @@ -75,13 +142,53 @@ class Grabber $this->encoding = $encoding; } + /** + * Set config object + * + * @access public + * @param \PicoFeed\Config $config Config instance + * @return 
\PicoFeed\Grabber + */ + public function setConfig($config) + { + $this->config = $config; + return $this; + } + /** + * Get relevant content + * + * @access public + * @return string + */ + public function getContent() + { + return $this->content; + } + + /** + * Get raw content (unfiltered) + * + * @access public + * @return string + */ + public function getRawContent() + { + return $this->html; + } + + /** + * Parse the HTML content + * + * @access public + * @return bool + */ public function parse() { if ($this->html) { - Logging::log(\get_called_class().' Fix encoding'); - Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"'); + Logging::setMessage(get_called_class().' Fix encoding'); + Logging::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"'); $this->html = Filter::stripHeadTags($this->html); @@ -92,42 +199,63 @@ class Grabber $this->html = Encoding::toUTF8($this->html); } - Logging::log(\get_called_class().' Content length: '.strlen($this->html).' bytes'); + Logging::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes'); $rules = $this->getRules(); if (is_array($rules)) { - Logging::log(\get_called_class().' Parse content with rules'); + Logging::setMessage(get_called_class().' Parse content with rules'); $this->parseContentWithRules($rules); } else { - Logging::log(\get_called_class().' Parse content with candidates'); + Logging::setMessage(get_called_class().' Parse content with candidates'); $this->parseContentWithCandidates(); } } else { - Logging::log(\get_called_class().' No content fetched'); + Logging::setMessage(get_called_class().' No content fetched'); } - Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes'); - Logging::log(\get_called_class().' Grabber done'); + Logging::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes'); + Logging::setMessage(get_called_class().' 
Grabber done'); return $this->content !== ''; } - - public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36') + /** + * Download the HTML content + * + * @access public + * @return HTML content + */ + public function download() { - $client = Client::create(); - $client->url = $this->url; - $client->timeout = $timeout; - $client->user_agent = $user_agent; - $client->execute(); + $client = Client::getInstance(); + + if ($this->config !== null) { + + $client->setTimeout($this->config->getGrabberTimeout()) + ->setUserAgent($this->config->getGrabberUserAgent()) + ->setMaxRedirections($this->config->getMaxRedirections()) + ->setMaxBodySize($this->config->getMaxBodySize()) + ->setProxyHostname($this->config->getProxyHostname()) + ->setProxyPort($this->config->getProxyPort()) + ->setProxyUsername($this->config->getProxyUsername()) + ->setProxyPassword($this->config->getProxyPassword()); + } + + $client->execute($this->url); $this->html = $client->getContent(); + $this->encoding = $client->getEncoding(); return $this->html; } - + /** + * Try to find a predefined rule + * + * @access public + * @return mixed + */ public function getRules() { $hostname = parse_url($this->url, PHP_URL_HOST); @@ -147,7 +275,7 @@ class Grabber $filename = __DIR__.'/Rules/'.$file.'.php'; if (file_exists($filename)) { - Logging::log(\get_called_class().' Load rule: '.$file); + Logging::setMessage(get_called_class().' 
Load rule: '.$file); return include $filename; } } @@ -155,13 +283,16 @@ class Grabber return false; } - + /** + * Get the relevant content with predefined rules + * + * @access public + * @param array $rules Rules + */ public function parseContentWithRules(array $rules) { - \libxml_use_internal_errors(true); - $dom = new \DOMDocument; - $dom->loadHTML(''.$this->html); - $xpath = new \DOMXPath($dom); + $dom = XmlParser::getHtmlDocument(''.$this->html); + $xpath = new DOMXPath($dom); if (isset($rules['strip']) && is_array($rules['strip'])) { @@ -192,24 +323,26 @@ class Grabber } } - + /** + * Get the relevant content with the list of potential attributes + * + * @access public + */ public function parseContentWithCandidates() { - \libxml_use_internal_errors(true); - $dom = new \DOMDocument; - $dom->loadHTML(''.$this->html); - $xpath = new \DOMXPath($dom); + $dom = XmlParser::getHtmlDocument(''.$this->html); + $xpath = new DOMXPath($dom); // Try to lookup in each tag foreach ($this->candidatesAttributes as $candidate) { - Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"'); + Logging::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"'); $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); if ($nodes !== false && $nodes->length > 0) { $this->content = $dom->saveXML($nodes->item(0)); - Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)'); + Logging::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)'); break; } } @@ -221,51 +354,57 @@ class Grabber if ($nodes !== false && $nodes->length > 0) { $this->content = $dom->saveXML($nodes->item(0)); - Logging::log(\get_called_class().' Find
tag ('.strlen($this->content).' bytes)'); + Logging::setMessage(get_called_class().' Find
tag ('.strlen($this->content).' bytes)'); } } if (strlen($this->content) < 50) { - Logging::log(\get_called_class().' No enought content fetched, get the full body'); + Logging::setMessage(get_called_class().' No enought content fetched, get the full body'); $this->content = $dom->saveXML($dom->firstChild); } - Logging::log(\get_called_class().' Strip garbage'); + Logging::setMessage(get_called_class().' Strip garbage'); $this->stripGarbage(); } - + /** + * Strip useless tags + * + * @access public + */ public function stripGarbage() { - \libxml_use_internal_errors(true); - $dom = new \DOMDocument; - $dom->loadXML($this->content); - $xpath = new \DOMXPath($dom); + $dom = XmlParser::getDomDocument($this->content); - foreach ($this->stripTags as $tag) { + if ($dom !== false) { - $nodes = $xpath->query('//'.$tag); + $xpath = new DOMXPath($dom); - if ($nodes !== false && $nodes->length > 0) { - Logging::log(\get_called_class().' Strip tag: "'.$tag.'"'); - foreach ($nodes as $node) { - $node->parentNode->removeChild($node); + foreach ($this->stripTags as $tag) { + + $nodes = $xpath->query('//'.$tag); + + if ($nodes !== false && $nodes->length > 0) { + Logging::setMessage(get_called_class().' Strip tag: "'.$tag.'"'); + foreach ($nodes as $node) { + $node->parentNode->removeChild($node); + } } } - } - foreach ($this->stripAttributes as $attribute) { + foreach ($this->stripAttributes as $attribute) { - $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); + $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); - if ($nodes !== false && $nodes->length > 0) { - Logging::log(\get_called_class().' Strip attribute: "'.$tag.'"'); - foreach ($nodes as $node) { - $node->parentNode->removeChild($node); + if ($nodes !== false && $nodes->length > 0) { + Logging::setMessage(get_called_class().' 
Strip attribute: "'.$attribute.'"'); + foreach ($nodes as $node) { + $node->parentNode->removeChild($node); + } } } - } - $this->content = $dom->saveXML($dom->documentElement); + $this->content = $dom->saveXML($dom->documentElement); + } } } diff --git a/vendor/PicoFeed/Import.php b/vendor/PicoFeed/Import.php index 096dffc..7992b18 100644 --- a/vendor/PicoFeed/Import.php +++ b/vendor/PicoFeed/Import.php @@ -3,47 +3,75 @@ namespace PicoFeed; require_once __DIR__.'/Logging.php'; +require_once __DIR__.'/XmlParser.php'; +use PicoFeed\Logging; +use PicoFeed\XmlParser; + +/** + * OPML Import + * + * @author Frederic Guillot + * @package picofeed + */ class Import { + /** + * OPML file content + * + * @access private + * @var string + */ private $content = ''; + + /** + * Subscriptions + * + * @access private + * @var array + */ private $items = array(); - + /** + * Constructor + * + * @access public + * @param string $content OPML file content + */ public function __construct($content) { $this->content = $content; } - + /** + * Parse the OPML file + * + * @access public + * @return array|false + */ public function execute() { - \PicoFeed\Logging::log(\get_called_class().': start importation'); + Logging::setMessage(get_called_class().': start importation'); - try { + $xml = XmlParser::getSimpleXml(trim($this->content)); - \libxml_use_internal_errors(true); - - $xml = new \SimpleXMLElement(trim($this->content)); - - if ($xml->getName() !== 'opml' || ! isset($xml->body)) { - \PicoFeed\Logging::log(\get_called_class().': OPML tag not found'); - return false; - } - - $this->parseEntries($xml->body); - - \PicoFeed\Logging::log(\get_called_class().': '.count($this->items).' subscriptions found'); - } - catch (\Exception $e) { - \PicoFeed\Logging::log(\get_called_class().': '.$e->getMessage()); + if ($xml === false || $xml->getName() !== 'opml' || ! 
isset($xml->body)) { + Logging::setMessage(get_called_class().': OPML tag not found or malformed XML document'); return false; } + $this->parseEntries($xml->body); + Logging::setMessage(get_called_class().': '.count($this->items).' subscriptions found'); + return $this->items; } - + /** + * Parse each entries of the subscription list + * + * @access public + * @param SimpleXMLElement $tree XML node + */ public function parseEntries($tree) { if (isset($tree->outline)) { @@ -68,4 +96,4 @@ class Import } } } -} \ No newline at end of file +} diff --git a/vendor/PicoFeed/Item.php b/vendor/PicoFeed/Item.php new file mode 100644 index 0000000..4a446d4 --- /dev/null +++ b/vendor/PicoFeed/Item.php @@ -0,0 +1,202 @@ +$property.PHP_EOL; + } + + $output .= 'Item::content = '.strlen($this->content).' bytes'.PHP_EOL; + + return $output; + } + + /** + * Get title + * + * @access public + * $return string + */ + public function getTitle() + { + return $this->title; + } + + /** + * Get url + * + * @access public + * $return string + */ + public function getUrl() + { + return $this->url; + } + + /** + * Get id + * + * @access public + * $return string + */ + public function getId() + { + return $this->id; + } + + /** + * Get date + * + * @access public + * $return integer + */ + public function getDate() + { + return $this->date; + } + + /** + * Get content + * + * @access public + * $return string + */ + public function getContent() + { + return $this->content; + } + + /** + * Get enclosure url + * + * @access public + * $return string + */ + public function getEnclosureUrl() + { + return $this->enclosure_url; + } + + /** + * Get enclosure type + * + * @access public + * $return string + */ + public function getEnclosureType() + { + return $this->enclosure_type; + } + + /** + * Get language + * + * @access public + * $return string + */ + public function getLanguage() + { + return $this->language; + } + + /** + * Get author + * + * @access public + * $return string + */ + public 
function getAuthor() + { + return $this->author; + } +} diff --git a/vendor/PicoFeed/Logging.php b/vendor/PicoFeed/Logging.php index c753fe4..f7d6c96 100644 --- a/vendor/PicoFeed/Logging.php +++ b/vendor/PicoFeed/Logging.php @@ -2,12 +2,82 @@ namespace PicoFeed; +use DateTime; +use DateTimeZone; + +/** + * Logging class + * + * @author Frederic Guillot + * @package picofeed + */ class Logging { - public static $messages = array(); + /** + * List of messages + * + * @static + * @access private + * @var array + */ + private static $messages = array(); - public static function log($message) + /** + * Default timezone + * + * @static + * @access private + * @var array + */ + private static $timezone = 'UTC'; + + /** + * Add a new message + * + * @static + * @access public + * @param string $message Message + */ + public static function setMessage($message) { - self::$messages[] = '['.date('Y-m-d H:i:s').'] '.$message; + $date = new DateTime('now', new DateTimeZone(self::$timezone)); + + self::$messages[] = '['.$date->format('Y-m-d H:i:s').'] '.$message; } -} \ No newline at end of file + + /** + * Get all logged messages + * + * @static + * @access public + * @return array + */ + public static function getMessages() + { + return self::$messages; + } + + /** + * Remove all logged messages + * + * @static + * @access public + */ + public static function deleteMessages() + { + self::$messages = array(); + } + + /** + * Set a different timezone + * + * @static + * @see http://php.net/manual/en/timezones.php + * @access public + * @param string $timezone Timezone + */ + public static function setTimeZone($timezone) + { + self::$timezone = $timezone ?: self::$timezone; + } +} diff --git a/vendor/PicoFeed/Parser.php b/vendor/PicoFeed/Parser.php index 991c52b..edc2e81 100644 --- a/vendor/PicoFeed/Parser.php +++ b/vendor/PicoFeed/Parser.php @@ -2,10 +2,16 @@ namespace PicoFeed; -require_once __DIR__.'/Logging.php'; -require_once __DIR__.'/Filter.php'; -require_once 
__DIR__.'/Encoding.php'; -require_once __DIR__.'/Grabber.php'; +use DateTime; +use DateTimeZone; +use DOMXPath; +use SimpleXMLElement; +use PicoFeed\Config; +use PicoFeed\Encoding; +use PicoFeed\Filter; +use PicoFeed\Grabber; +use PicoFeed\Logging; +use PicoFeed\XmlParser; /** * Base parser class @@ -15,14 +21,29 @@ require_once __DIR__.'/Grabber.php'; */ abstract class Parser { + /** + * Config object + * + * @access private + * @var \PicoFeed\Config + */ + private $config = null; + /** * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos() * - * @access public - * @static + * @access private * @var string */ - public static $hashAlgo = 'crc32b'; // crc32b seems to be faster and shorter than other hash algorithms + private $hash_algo = 'crc32b'; // crc32b seems to be faster and shorter than other hash algorithms + + /** + * Timezone used to parse feed dates + * + * @access private + * @var string + */ + private $timezone = 'UTC'; /** * Feed content (XML data) @@ -33,35 +54,28 @@ abstract class Parser protected $content = ''; /** - * Feed properties (values parsed) + * XML namespaces * - * @access public + * @access protected + * @var array */ - public $id = ''; - public $url = ''; - public $title = ''; - public $updated = ''; - public $language = ''; - public $items = array(); + protected $namespaces = array(); /** - * Content grabber parameters + * Enable the content grabber * - * @access public + * @access private + * @var bool */ - public $grabber = false; - public $grabber_ignore_urls = array(); - public $grabber_timeout = null; - public $grabber_user_agent = null; + public $enable_grabber = false; /** - * Parse feed content + * Ignore those urls for the content scraper * - * @abstract - * @access public - * @return mixed + * @access private + * @var array */ - abstract public function execute(); + private $grabber_ignore_urls = array(); /** * Constructor @@ -73,7 +87,7 @@ abstract class Parser public function 
__construct($content, $http_encoding = '') { $xml_encoding = Filter::getEncodingFromXmlTag($content); - Logging::log(\get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); + Logging::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); // Strip XML tag to avoid multiple encoding/decoding in the next XML processing $this->content = Filter::stripXmlTag($content); @@ -90,6 +104,52 @@ abstract class Parser $this->content = $this->normalizeData($this->content); } + /** + * Parse the document + * + * @access public + * @return mixed \PicoFeed\Feed instance or false + */ + public function execute() + { + Logging::setMessage(get_called_class().': begin parsing'); + + $xml = XmlParser::getSimpleXml($this->content); + + if ($xml === false) { + Logging::setMessage(get_called_class().': XML parsing error'); + Logging::setMessage(XmlParser::getErrors()); + return false; + } + + $this->namespaces = $xml->getNamespaces(true); + + $feed = new Feed; + $this->findFeedUrl($xml, $feed); + $this->findFeedTitle($xml, $feed); + $this->findFeedLanguage($xml, $feed); + $this->findFeedId($xml, $feed); + $this->findFeedDate($xml, $feed); + + foreach ($this->getItemsTree($xml) as $entry) { + + $item = new Item; + $this->findItemAuthor($xml, $entry, $item); + $this->findItemUrl($entry, $item); + $this->findItemTitle($entry, $item); + $this->findItemId($entry, $item, $feed); + $this->findItemDate($entry, $item); + $this->findItemContent($entry, $item); + $this->findItemEnclosure($entry, $item, $feed); + $this->findItemLanguage($entry, $item, $feed); + $feed->items[] = $item; + } + + Logging::setMessage(get_called_class().PHP_EOL.$feed); + + return $feed; + } + /** * Filter HTML for entry content * @@ -102,43 +162,40 @@ abstract class Parser { $content = ''; - if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) { + // Setup the content scraper + if ($this->enable_grabber && ! 
in_array($item_url, $this->grabber_ignore_urls)) { + $grabber = new Grabber($item_url); - $grabber->download($this->grabber_timeout, $this->grabber_user_agent); - if ($grabber->parse()) $item_content = $grabber->content; + $grabber->setConfig($this->config); + $grabber->download(); + + if ($grabber->parse()) { + $item_content = $grabber->getContent(); + } } + // Content filtering if ($item_content) { - $filter = new Filter($item_content, $item_url); - $content = $filter->execute(); + + if ($this->config !== null) { + + $callback = $this->config->getContentFilteringCallback(); + + if (is_callable($callback)) { + $content = $callback($item_content, $item_url); + } + } + + if (! $content) { + $filter = new Filter($item_content, $item_url); + $filter->setConfig($this->config); + $content = $filter->execute(); + } } return $content; } - /** - * Get XML parser errors - * - * @access public - * @return string - */ - public function getXmlErrors() - { - $errors = array(); - - foreach(\libxml_get_errors() as $error) { - - $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)', - $error->message, - $error->line, - $error->column, - $error->code - ); - } - - return implode(', ', $errors); - } - /** * Dirty quickfixes before XML parsing * @@ -148,6 +205,7 @@ abstract class Parser */ public function normalizeData($data) { + $data = str_replace("\x10", '', $data); $data = str_replace("\xc3\x20", '', $data); $data = str_replace("", '', $data); $data = $this->replaceEntityAttribute($data); @@ -194,7 +252,7 @@ abstract class Parser */ public function generateId() { - return hash(self::$hashAlgo, implode(func_get_args())); + return hash($this->hash_algo, implode(func_get_args())); } /** @@ -249,7 +307,8 @@ abstract class Parser } } - return time(); + $date = new DateTime('now', new DateTimeZone($this->timezone)); + return $date->getTimestamp(); } /** @@ -262,11 +321,15 @@ abstract class Parser */ public function getValidDate($format, $value) { - $date = 
\DateTime::createFromFormat($format, $value); + $date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone)); if ($date !== false) { - $errors = \DateTime::getLastErrors(); - if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) return $date->getTimestamp(); + + $errors = DateTime::getLastErrors(); + + if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) { + return $date->getTimestamp(); + } } return 0; @@ -299,10 +362,13 @@ abstract class Parser */ public function getXmlLang($xml) { - $dom = new \DOMDocument; - $dom->loadXML($this->content); + $dom = XmlParser::getDomDocument($this->content); - $xpath = new \DOMXPath($dom); + if ($dom === false) { + return ''; + } + + $xpath = new DOMXPath($dom); return $xpath->evaluate('string(//@xml:lang[1])') ?: ''; } @@ -318,30 +384,108 @@ abstract class Parser { $language = strtolower($language); - // Arabic (ar-**) - if (strpos($language, 'ar') === 0) return true; + $rtl_languages = array( + 'ar', // Arabic (ar-**) + 'fa', // Farsi (fa-**) + 'ur', // Urdu (ur-**) + 'ps', // Pashtu (ps-**) + 'syr', // Syriac (syr-**) + 'dv', // Divehi (dv-**) + 'he', // Hebrew (he-**) + 'yi', // Yiddish (yi-**) + ); - // Farsi (fa-**) - if (strpos($language, 'fa') === 0) return true; - - // Urdu (ur-**) - if (strpos($language, 'ur') === 0) return true; - - // Pashtu (ps-**) - if (strpos($language, 'ps') === 0) return true; - - // Syriac (syr-**) - if (strpos($language, 'syr') === 0) return true; - - // Divehi (dv-**) - if (strpos($language, 'dv') === 0) return true; - - // Hebrew (he-**) - if (strpos($language, 'he') === 0) return true; - - // Yiddish (yi-**) - if (strpos($language, 'yi') === 0) return true; + foreach ($rtl_languages as $prefix) { + if (strpos($language, $prefix) === 0) { + return true; + } + } return false; } + + /** + * Set Hash algorithm used for id generation + * + * @access public + * @param string $algo Algorithm name + * @return \PicoFeed\Parser + */ + public 
function setHashAlgo($algo) + { + $this->hash_algo = $algo ?: $this->hash_algo; + return $this; + } + + /** + * Set a different timezone + * + * @see http://php.net/manual/en/timezones.php + * @access public + * @param string $timezone Timezone + * @return \PicoFeed\Parser + */ + public function setTimezone($timezone) + { + $this->timezone = $timezone ?: $this->timezone; + return $this; + } + + /** + * Set config object + * + * @access public + * @param \PicoFeed\Config $config Config instance + * @return \PicoFeed\Parser + */ + public function setConfig($config) + { + $this->config = $config; + return $this; + } + + /** + * Enable the content grabber + * + * @access public + * @return \PicoFeed\Parser + */ + public function enableContentGrabber() + { + $this->enable_grabber = true; + } + + /** + * Set ignored URLs for the content grabber + * + * @access public + * @param array $urls URLs + * @return \PicoFeed\Parser + */ + public function setGrabberIgnoreUrls(array $urls) + { + $this->grabber_ignore_urls = $urls; + } + + /** + * Get a value from a XML namespace + * + * @access public + * @param SimpleXMLElement $xml XML element + * @param array $namespaces XML namespaces + * @param string $property XML tag name + * @return string + */ + public function getNamespaceValue(SimpleXMLElement $xml, array $namespaces, $property) + { + foreach ($namespaces as $name => $url) { + $namespace = $xml->children($namespaces[$name]); + + if ($namespace->$property->count() > 0) { + return (string) $namespace->$property; + } + } + + return ''; + } } diff --git a/vendor/PicoFeed/Parsers/Atom.php b/vendor/PicoFeed/Parsers/Atom.php index 1d068e1..7d228e4 100644 --- a/vendor/PicoFeed/Parsers/Atom.php +++ b/vendor/PicoFeed/Parsers/Atom.php @@ -2,91 +2,257 @@ namespace PicoFeed\Parsers; +use SimpleXMLElement; +use PicoFeed\Parser; +use PicoFeed\XmlParser; +use PicoFeed\Logging; +use PicoFeed\Filter; +use PicoFeed\Feed; +use PicoFeed\Item; + /** * Atom parser * * @author Frederic Guillot 
* @package parser */ -class Atom extends \PicoFeed\Parser +class Atom extends Parser { /** - * Parse the document + * Get the path to the items XML tree * * @access public - * @return mixed Atom instance or false + * @param SimpleXMLElement $xml Feed xml + * @return SimpleXMLElement */ - public function execute() + public function getItemsTree(SimpleXMLElement $xml) { - \PicoFeed\Logging::log(\get_called_class().': begin parsing'); + return $xml->entry; + } - \libxml_use_internal_errors(true); - $xml = \simplexml_load_string($this->content); + /** + * Find the feed url + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedUrl(SimpleXMLElement $xml, Feed $feed) + { + $feed->url = $this->getLink($xml); + } - if ($xml === false) { - \PicoFeed\Logging::log(\get_called_class().': XML parsing error'); - \PicoFeed\Logging::log($this->getXmlErrors()); - return false; + /** + * Find the feed title + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedTitle(SimpleXMLElement $xml, Feed $feed) + { + $feed->title = $this->stripWhiteSpace((string) $xml->title) ?: $feed->url; + } + + /** + * Find the feed language + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed) + { + $feed->language = $this->getXmlLang($this->content); + } + + /** + * Find the feed id + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedId(SimpleXMLElement $xml, Feed $feed) + { + $feed->id = (string) $xml->id; + } + + /** + * Find the feed date + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedDate(SimpleXMLElement $xml, Feed $feed) + { 
+ $feed->date = $this->parseDate((string) $xml->updated); + } + + /** + * Find the item date + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param Item $item Item object + */ + public function findItemDate(SimpleXMLElement $entry, Item $item) + { + $item->date = $this->parseDate((string) $entry->updated); + } + + /** + * Find the item title + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param Item $item Item object + */ + public function findItemTitle(SimpleXMLElement $entry, Item $item) + { + $item->title = $this->stripWhiteSpace((string) $entry->title); + + if (empty($item->title)) { + $item->title = $item->url; + } + } + + /** + * Find the item author + * + * @access public + * @param SimpleXMLElement $xml Feed + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item) + { + if (isset($entry->author->name)) { + $item->author = (string) $entry->author->name; + } + else { + $item->author = (string) $xml->author->name; + } + } + + /** + * Find the item content + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemContent(SimpleXMLElement $entry, Item $item) + { + $item->content = $this->filterHtml($this->getContent($entry), $item->url); + } + + /** + * Find the item URL + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemUrl(SimpleXMLElement $entry, Item $item) + { + $item->url = $this->getLink($entry); + } + + /** + * Genereate the item id + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed) + { + $id = (string) $entry->id; + + 
if ($id !== $item->url) { + $item_permalink = $id; + } + else { + $item_permalink = $item->url; } - $this->language = $this->getXmlLang($this->content); - $this->url = $this->getUrl($xml); - $this->title = $this->stripWhiteSpace((string) $xml->title) ?: $this->url; - $this->id = (string) $xml->id; - $this->updated = $this->parseDate((string) $xml->updated); - $author = (string) $xml->author->name; + if ($this->isExcludedFromId($feed->url)) { + $feed_permalink = ''; + } + else { + $feed_permalink = $feed->url; + } - \PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title); - \PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url); + $item->id = $this->generateId($item_permalink, $feed_permalink); + } - foreach ($xml->entry as $entry) { + /** + * Find the item enclosure + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed) + { + foreach ($entry->link as $link) { + if ((string) $link['rel'] === 'enclosure') { - if (isset($entry->author->name)) { - $author = (string) $entry->author->name; - } + $item->enclosure_url = (string) $link['href']; + $item->enclosure_type = (string) $link['type']; - $id = (string) $entry->id; - - $item = new \StdClass; - $item->url = $this->getUrl($entry); - $item->id = $this->generateId($id !== $item->url ? $id : $item->url, $this->isExcludedFromId($this->url) ? 
'' : $this->url); - $item->title = $this->stripWhiteSpace((string) $entry->title); - $item->updated = $this->parseDate((string) $entry->updated); - $item->author = $author; - $item->content = $this->filterHtml($this->getContent($entry), $item->url); - $item->language = $this->language; - - if (empty($item->title)) $item->title = $item->url; - - // Try to find an enclosure - foreach ($entry->link as $link) { - if ((string) $link['rel'] === 'enclosure') { - $item->enclosure = (string) $link['href']; - $item->enclosure_type = (string) $link['type']; - - if (\PicoFeed\Filter::isRelativePath($item->enclosure)) { - $item->enclosure = \PicoFeed\Filter::getAbsoluteUrl($item->enclosure, $this->url); - } - break; + if (Filter::isRelativePath($item->enclosure_url)) { + $item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url); } - } - $this->items[] = $item; + break; + } + } + } + + /** + * Find the item language + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed) + { + $item->language = $feed->language; + } + + /** + * Get the URL from a link tag + * + * @access public + * @param SimpleXMLElement $xml XML tag + * @return string + */ + public function getLink(SimpleXMLElement $xml) + { + foreach ($xml->link as $link) { + if ((string) $link['type'] === 'text/html' || (string) $link['type'] === 'application/xhtml+xml') { + return (string) $link['href']; + } } - \PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' 
items)'); - - return $this; + return (string) $xml->link['href']; } /** * Get the entry content * * @access public - * @param SimpleXMLElement $entry XML Entry + * @param SimpleXMLElement $entry XML Entry * @return string */ - public function getContent($entry) + public function getContent(SimpleXMLElement $entry) { if (isset($entry->content) && ! empty($entry->content)) { @@ -103,22 +269,4 @@ class Atom extends \PicoFeed\Parser return ''; } - - /** - * Get the URL from a link tag - * - * @access public - * @param SimpleXMLElement $xml XML tag - * @return string - */ - public function getUrl($xml) - { - foreach ($xml->link as $link) { - if ((string) $link['type'] === 'text/html' || (string) $link['type'] === 'application/xhtml+xml') { - return (string) $link['href']; - } - } - - return (string) $xml->link['href']; - } -} \ No newline at end of file +} diff --git a/vendor/PicoFeed/Parsers/Rss10.php b/vendor/PicoFeed/Parsers/Rss10.php index c106ee3..748597a 100644 --- a/vendor/PicoFeed/Parsers/Rss10.php +++ b/vendor/PicoFeed/Parsers/Rss10.php @@ -2,86 +2,86 @@ namespace PicoFeed\Parsers; -class Rss10 extends \PicoFeed\Parser +require_once __DIR__.'/Rss20.php'; + +use SimpleXMLElement; +use PicoFeed\Feed; +use PicoFeed\Item; +use PicoFeed\Parsers\Rss20; + +/** + * RSS 1.0 parser + * + * @author Frederic Guillot + * @package parser + */ +class Rss10 extends Rss20 { - public function execute() + /** + * Get the path to the items XML tree + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @return SimpleXMLElement + */ + public function getItemsTree(SimpleXMLElement $xml) { - \PicoFeed\Logging::log(\get_called_class().': begin parsing'); + return $xml->item; + } - \libxml_use_internal_errors(true); - $xml = \simplexml_load_string($this->content); + /** + * Find the feed date + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedDate(SimpleXMLElement $xml, Feed $feed) + 
{ + $feed->date = $this->parseDate($this->getNamespaceValue($xml->channel, $this->namespaces, 'date')); + } - if ($xml === false) { - \PicoFeed\Logging::log(\get_called_class().': XML parsing error'); - \PicoFeed\Logging::log($this->getXmlErrors()); - return false; - } + /** + * Find the feed language + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed) + { + $feed->language = $this->getNamespaceValue($xml->channel, $this->namespaces, 'language'); + } - $namespaces = $xml->getNamespaces(true); - - $this->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $this->url; - $this->url = (string) $xml->channel->link; - $this->id = $this->url; - $this->language = ''; - - \PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title); - \PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url); - - if (isset($namespaces['dc'])) { - $ns_dc = $xml->channel->children($namespaces['dc']); - $this->updated = isset($ns_dc->date) ? $this->parseDate($ns_dc->date) : time(); + /** + * Genereate the item id + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed) + { + if ($this->isExcludedFromId($feed->url)) { + $feed_permalink = ''; } else { - $this->updated = time(); + $feed_permalink = $feed->url; } - foreach ($xml->item as $entry) { - - $item = new \StdClass; - $item->title = $this->stripWhiteSpace((string) $entry->title); - $item->url = ''; - $item->author= ''; - $item->updated = ''; - $item->content = ''; - $item->language = ''; - - foreach ($namespaces as $name => $url) { - - $namespace = $entry->children($namespaces[$name]); - - if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink; - if (! 
$item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator; - if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date); - if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated); - if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded; - } - - if (empty($item->url)) $item->url = (string) $entry->link; - if (empty($item->updated)) $item->updated = $this->updated; - - if (empty($item->content)) { - $item->content = isset($entry->description) ? (string) $entry->description : ''; - } - - if (empty($item->author)) { - - if (isset($entry->author)) { - $item->author = (string) $entry->author; - } - else if (isset($xml->channel->webMaster)) { - $item->author = (string) $xml->channel->webMaster; - } - } - - if (empty($item->title)) $item->title = $item->url; - - $item->id = $this->generateId($item->url, $this->isExcludedFromId($this->url) ? '' : $this->url); - $item->content = $this->filterHtml($item->content, $item->url); - $this->items[] = $item; - } - - \PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' 
items)'); - - return $this; + $item->id = $this->generateId($item->url, $feed_permalink); } -} \ No newline at end of file + + /** + * Find the item enclosure + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed) + { + } +} diff --git a/vendor/PicoFeed/Parsers/Rss20.php b/vendor/PicoFeed/Parsers/Rss20.php index 8feb985..7af0b9a 100644 --- a/vendor/PicoFeed/Parsers/Rss20.php +++ b/vendor/PicoFeed/Parsers/Rss20.php @@ -2,35 +2,43 @@ namespace PicoFeed\Parsers; +use SimpleXMLElement; +use PicoFeed\Parser; +use PicoFeed\XmlParser; +use PicoFeed\Logging; +use PicoFeed\Filter; +use PicoFeed\Feed; +use PicoFeed\Item; + /** * RSS 2.0 Parser * * @author Frederic Guillot * @package parser */ -class Rss20 extends \PicoFeed\Parser +class Rss20 extends Parser { /** - * Parse the document + * Get the path to the items XML tree * * @access public - * @return mixed Rss20 instance or false + * @param SimpleXMLElement $xml Feed xml + * @return SimpleXMLElement */ - public function execute() + public function getItemsTree(SimpleXMLElement $xml) { - \PicoFeed\Logging::log(\get_called_class().': begin parsing'); - - \libxml_use_internal_errors(true); - $xml = \simplexml_load_string($this->content); - - if ($xml === false) { - \PicoFeed\Logging::log(\get_called_class().': XML parsing error'); - \PicoFeed\Logging::log($this->getXmlErrors()); - return false; - } - - $namespaces = $xml->getNamespaces(true); + return $xml->channel->item; + } + /** + * Find the feed url + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedUrl(SimpleXMLElement $xml, Feed $feed) + { if ($xml->channel->link && $xml->channel->link->count() > 1) { foreach ($xml->channel->link as $xml_link) { @@ -38,112 +46,228 @@ class Rss20 extends 
\PicoFeed\Parser $link = (string) $xml_link; if ($link !== '') { - $this->url = (string) $link; + $feed->url = $link; break; } } } else { - $this->url = (string) $xml->channel->link; + $feed->url = (string) $xml->channel->link; } - - $this->language = isset($xml->channel->language) ? (string) $xml->channel->language : ''; - $this->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $this->url; - $this->id = $this->url; - $this->updated = $this->parseDate(isset($xml->channel->pubDate) ? (string) $xml->channel->pubDate : (string) $xml->channel->lastBuildDate); - - \PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title); - \PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url); - - // RSS feed might be empty - if (! $xml->channel->item) { - \PicoFeed\Logging::log(\get_called_class().': feed empty or malformed'); - return $this; - } - - foreach ($xml->channel->item as $entry) { - - $item = new \StdClass; - $item->title = $this->stripWhiteSpace((string) $entry->title); - $item->url = ''; - $item->author= ''; - $item->updated = ''; - $item->content = ''; - $item->enclosure = ''; - $item->enclosure_type = ''; - $item->language = $this->language; - - foreach ($namespaces as $name => $url) { - - $namespace = $entry->children($namespaces[$name]); - - if (! $item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator; - if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date); - if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated); - if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded; - - // Get FeedBurner original links - if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink; - if (! $item->enclosure && ! 
empty($namespace->origEnclosureLink)) $item->enclosure = (string) $namespace->origEnclosureLink; - } - - if (empty($item->url)) { - - if (isset($entry->link)) { - $item->url = (string) $entry->link; - } - else if (isset($entry->guid)) { - $item->url = (string) $entry->guid; - } - } - - if (empty($item->updated)) $item->updated = $this->parseDate((string) $entry->pubDate) ?: $this->updated; - - if (empty($item->content)) { - $item->content = isset($entry->description) ? (string) $entry->description : ''; - } - - if (empty($item->author)) { - - if (isset($entry->author)) { - $item->author = (string) $entry->author; - } - else if (isset($xml->channel->webMaster)) { - $item->author = (string) $xml->channel->webMaster; - } - } - - if (isset($entry->guid) && isset($entry->guid['isPermaLink']) && (string) $entry->guid['isPermaLink'] != 'false') { - $id = (string) $entry->guid; - $item->id = $this->generateId($id !== '' && $id !== $item->url ? $id : $item->url, $this->isExcludedFromId($this->url) ? '' : $this->url); - } - else { - $item->id = $this->generateId($item->url, $this->isExcludedFromId($this->url) ? '' : $this->url); - } - - if (empty($item->title)) $item->title = $item->url; - - // if optional enclosure tag with multimedia provided, capture here - if (isset($entry->enclosure)) { - - if (! $item->enclosure) { - $item->enclosure = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : ''; - } - - $item->enclosure_type = isset($entry->enclosure['type']) ? (string) $entry->enclosure['type'] : ''; - - if (\PicoFeed\Filter::isRelativePath($item->enclosure)) { - $item->enclosure = \PicoFeed\Filter::getAbsoluteUrl($item->enclosure, $this->url); - } - } - - $item->content = $this->filterHtml($item->content, $item->url); - $this->items[] = $item; - } - - \PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' 
items)'); - - return $this; } -} \ No newline at end of file + + /** + * Find the feed title + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedTitle(SimpleXMLElement $xml, Feed $feed) + { + $feed->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $feed->url; + } + + /** + * Find the feed language + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed) + { + $feed->language = isset($xml->channel->language) ? (string) $xml->channel->language : ''; + } + + /** + * Find the feed id + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedId(SimpleXMLElement $xml, Feed $feed) + { + $feed->id = $feed->url; + } + + /** + * Find the feed date + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedDate(SimpleXMLElement $xml, Feed $feed) + { + $date = isset($xml->channel->pubDate) ? 
$xml->channel->pubDate : $xml->channel->lastBuildDate; + $feed->date = $this->parseDate((string) $date); + } + + /** + * Find the item date + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemDate(SimpleXMLElement $entry, Item $item) + { + $date = $this->getNamespaceValue($entry, $this->namespaces, 'date'); + + if (empty($date)) { + $date = $this->getNamespaceValue($entry, $this->namespaces, 'updated'); + } + + if (empty($date)) { + $date = (string) $entry->pubDate; + } + + $item->date = $this->parseDate($date); + } + + /** + * Find the item title + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemTitle(SimpleXMLElement $entry, Item $item) + { + $item->title = $this->stripWhiteSpace((string) $entry->title); + + if (empty($item->title)) { + $item->title = $item->url; + } + } + + /** + * Find the item author + * + * @access public + * @param SimpleXMLElement $xml Feed + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item) + { + $item->author = $this->getNamespaceValue($entry, $this->namespaces, 'creator'); + + if (empty($item->author)) { + if (isset($entry->author)) { + $item->author = (string) $entry->author; + } + else if (isset($xml->channel->webMaster)) { + $item->author = (string) $xml->channel->webMaster; + } + } + } + + /** + * Find the item content + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemContent(SimpleXMLElement $entry, Item $item) + { + $content = $this->getNamespaceValue($entry, $this->namespaces, 'encoded'); + + if (empty($content) && $entry->description->count() > 0) { + $content = (string) $entry->description; + } + + $item->content = 
$this->filterHtml($content, $item->url); + } + + /** + * Find the item URL + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + */ + public function findItemUrl(SimpleXMLElement $entry, Item $item) + { + $item->url = $this->getNamespaceValue($entry, $this->namespaces, 'origLink'); + + if (empty($item->url)) { + if (isset($entry->link)) { + $item->url = (string) $entry->link; + } + else if (isset($entry->guid)) { + $item->url = (string) $entry->guid; + } + } + } + + /** + * Genereate the item id + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed) + { + if ($entry->guid->count() > 0 && (string) $entry->guid['isPermaLink'] !== 'false') { + $item_permalink = (string) $entry->guid; + } + else { + $item_permalink = $item->url; + } + + if ($this->isExcludedFromId($feed->url)) { + $feed_permalink = ''; + } + else { + $feed_permalink = $feed->url; + } + + $item->id = $this->generateId($item_permalink, $feed_permalink); + } + + /** + * Find the item enclosure + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed) + { + if (isset($entry->enclosure)) { + + $item->enclosure_url = $this->getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink'); + + if (empty($item->enclosure_url)) { + $item->enclosure_url = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : ''; + } + + $item->enclosure_type = isset($entry->enclosure['type']) ? 
(string) $entry->enclosure['type'] : ''; + + if (Filter::isRelativePath($item->enclosure_url)) { + $item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url); + } + } + } + + /** + * Find the item language + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Item $item Item object + * @param \PicoFeed\Feed $feed Feed object + */ + public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed) + { + $item->language = $feed->language; + } +} diff --git a/vendor/PicoFeed/Parsers/Rss91.php b/vendor/PicoFeed/Parsers/Rss91.php index abedf85..8df3ce0 100644 --- a/vendor/PicoFeed/Parsers/Rss91.php +++ b/vendor/PicoFeed/Parsers/Rss91.php @@ -4,4 +4,14 @@ namespace PicoFeed\Parsers; require_once __DIR__.'/Rss20.php'; -class Rss91 extends Rss20 {} \ No newline at end of file +use PicoFeed\Parsers\Rss20; + +/** + * RSS 0.91 Parser + * + * @author Frederic Guillot + * @package parser + */ +class Rss91 extends Rss20 +{ +} diff --git a/vendor/PicoFeed/Parsers/Rss92.php b/vendor/PicoFeed/Parsers/Rss92.php index 6fd6e15..71478a0 100644 --- a/vendor/PicoFeed/Parsers/Rss92.php +++ b/vendor/PicoFeed/Parsers/Rss92.php @@ -4,4 +4,14 @@ namespace PicoFeed\Parsers; require_once __DIR__.'/Rss20.php'; -class Rss92 extends Rss20 {} \ No newline at end of file +use PicoFeed\Parsers\Rss20; + +/** + * RSS 0.92 Parser + * + * @author Frederic Guillot + * @package parser + */ +class Rss92 extends Rss20 +{ +} diff --git a/vendor/PicoFeed/PicoFeed.php b/vendor/PicoFeed/PicoFeed.php new file mode 100644 index 0000000..89be939 --- /dev/null +++ b/vendor/PicoFeed/PicoFeed.php @@ -0,0 +1,20 @@ +content = $content; - $this->encoding = ''; - return $this; + $this->config = $config ?: new Config; + Logging::setTimezone($this->config->getTimezone()); } /** @@ -61,59 +69,60 @@ class Reader * @param string $url Feed content * @param string $last_modified Last modified HTTP header * @param string $etag Etag HTTP header - * @param 
string $timeout Client connection timeout - * @param string $user_agent HTTP user-agent - * @return Client + * @return \PicoFeed\Client */ - public function download($url, $last_modified = '', $etag = '', $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)') + public function download($url, $last_modified = '', $etag = '') { if (strpos($url, 'http') !== 0) { - $url = 'http://'.$url; } - $client = Client::create(); - $client->url = $url; - $client->timeout = $timeout; - $client->user_agent = $user_agent; - $client->last_modified = $last_modified; - $client->etag = $etag; - $client->execute(); + $client = Client::getInstance(); + $client->setTimeout($this->config->getClientTimeout()) + ->setUserAgent($this->config->getClientUserAgent()) + ->setMaxRedirections($this->config->getMaxRedirections()) + ->setMaxBodySize($this->config->getMaxBodySize()) + ->setProxyHostname($this->config->getProxyHostname()) + ->setProxyPort($this->config->getProxyPort()) + ->setProxyUsername($this->config->getProxyUsername()) + ->setProxyPassword($this->config->getProxyPassword()) + ->setLastModified($last_modified) + ->setEtag($etag); - $this->content = $client->getContent(); - $this->url = $client->getUrl(); - $this->encoding = $client->getEncoding(); + if ($client->execute($url)) { + $this->content = $client->getContent(); + $this->url = $client->getUrl(); + $this->encoding = $client->getEncoding(); + } return $client; } /** - * Get the download content + * Get a parser instance with a custom config * * @access public - * @return string + * @param string $name Parser name + * @return \PicoFeed\Parser */ - public function getContent() + public function getParserInstance($name) { - return $this->content; - } + require_once __DIR__.'/Parsers/'.ucfirst($name).'.php'; + $name = '\PicoFeed\Parsers\\'.$name; - /** - * Get finale URL - * - * @access public - * @return string - */ - public function getUrl() - { - return $this->url; + $parser = new $name($this->content, 
$this->encoding); + $parser->setHashAlgo($this->config->getParserHashAlgo()); + $parser->setTimezone($this->config->getTimezone()); + $parser->setConfig($this->config); + + return $parser; } /** * Get the first XML tag * * @access public - * @param string $data Feed content + * @param string $data Feed content * @return string */ public function getFirstTag($data) @@ -138,6 +147,31 @@ class Reader return substr($data, $open_tag, $close_tag); } + /** + * Detect the feed format + * + * @access public + * @param string $parser_name Parser name + * @param string $haystack First XML tag + * @param array $needles List of strings that need to be there + * @return mixed False on failure or Parser instance + */ + public function detectFormat($parser_name, $haystack, array $needles) + { + $results = array(); + + foreach ($needles as $needle) { + $results[] = strpos($haystack, $needle) !== false; + } + + if (! in_array(false, $results, true)) { + Logging::setMessage(get_called_class().': Format detected => '.$parser_name); + return $this->getParserInstance($parser_name); + } + + return false; + } + /** * Discover feed format and return a parser instance * @@ -147,66 +181,44 @@ class Reader */ public function getParser($discover = false) { + $formats = array( + array('parser' => 'Atom', 'needles' => array(' 'Rss20', 'needles' => array(' 'Rss92', 'needles' => array(' 'Rss91', 'needles' => array(' 'Rss10', 'needles' => array('getFirstTag($this->content); - if (strpos($first_tag, 'detectFormat($format['parser'], $first_tag, $format['needles']); - require_once __DIR__.'/Parsers/Atom.php'; - return new Parsers\Atom($this->content, $this->encoding); + if ($parser !== false) { + return $parser; + } } - else if (strpos($first_tag, 'content, $this->encoding); - } - else if (strpos($first_tag, 'content, $this->encoding); - } - else if (strpos($first_tag, 'content, $this->encoding); - } - else if (strpos($first_tag, 'content, $this->encoding); - } - else if ($discover === true) { - - 
Logging::log(\get_called_class().': Format not supported or malformed'); - Logging::log(\get_called_class().':'.PHP_EOL.$this->content); + Logging::setMessage(get_called_class().': Format not supported or feed malformed'); + Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content); return false; } else if ($this->discover()) { - return $this->getParser(true); } - Logging::log(\get_called_class().': Subscription not found'); - Logging::log(\get_called_class().': Content => '.PHP_EOL.$this->content); + Logging::setMessage(get_called_class().': Subscription not found'); + Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content); return false; } /** - * Discover feed url inside a HTML document and download the feed + * Discover the feed url inside a HTML document and download the feed * * @access public * @return boolean @@ -214,18 +226,13 @@ class Reader public function discover() { if (! $this->content) { - return false; } - Logging::log(\get_called_class().': Try to discover a subscription'); + Logging::setMessage(get_called_class().': Try to discover a subscription'); - \libxml_use_internal_errors(true); - - $dom = new \DOMDocument; - $dom->loadHTML($this->content); - - $xpath = new \DOMXPath($dom); + $dom = XmlParser::getHtmlDocument($this->content); + $xpath = new DOMXPath($dom); $queries = array( "//link[@type='application/atom+xml']", @@ -251,7 +258,7 @@ class Reader $link = $this->url.$link; } - Logging::log(\get_called_class().': Find subscription link: '.$link); + Logging::setMessage(get_called_class().': Find subscription link: '.$link); $this->download($link); return true; @@ -261,4 +268,52 @@ class Reader return false; } + + /** + * Get the downloaded content + * + * @access public + * @return string + */ + public function getContent() + { + return $this->content; + } + + /** + * Set the page content + * + * @access public + * @param string $content Page content + * @return \PicoFeed\Reader + */ + public 
function setContent($content) + { + $this->content = $content; + return $this; + } + + /** + * Get final URL + * + * @access public + * @return string + */ + public function getUrl() + { + return $this->url; + } + + /** + * Set the URL + * + * @access public + * @param string $url URL + * @return \PicoFeed\Reader + */ + public function setUrl($url) + { + $this->url = $url; + return $this; + } } diff --git a/vendor/PicoFeed/Rules/journaldugeek.com.php b/vendor/PicoFeed/Rules/journaldugeek.com.php new file mode 100644 index 0000000..72de691 --- /dev/null +++ b/vendor/PicoFeed/Rules/journaldugeek.com.php @@ -0,0 +1,10 @@ + 'http://www./2014/05/20/le-playstation-now-arrive-en-beta-fermee-aux-etats-unis/', + 'body' => array( + '//div[@class="post-content"]', + ), + 'strip' => array( + '//style' + ) +); \ No newline at end of file diff --git a/vendor/PicoFeed/Writer.php b/vendor/PicoFeed/Writer.php index 3049968..9c73a92 100644 --- a/vendor/PicoFeed/Writer.php +++ b/vendor/PicoFeed/Writer.php @@ -2,22 +2,55 @@ namespace PicoFeed; +use RuntimeException; + +/** + * Base writer class + * + * @author Frederic Guillot + * @package picofeed + */ abstract class Writer { + /** + * Dom object + * + * @access protected + * @var DomDocument + */ + protected $dom; + + /** + * Items + * + * @access public + * @var array + */ public $items = array(); - + /** + * Generate the XML document + * + * @abstract + * @access public + * @param string $filename Optional filename + * @return string + */ abstract public function execute($filename = ''); - - public function checkRequiredProperties($properties, $container) + /** + * Check required properties to generate the output + * + * @access public + * @param array $properties List of properties + * @param mixed $container Object or array container + */ + public function checkRequiredProperties(array $properties, $container) { foreach ($properties as $property) { - if ((is_object($container) && ! 
isset($container->$property)) || (is_array($container) && ! isset($container[$property]))) { - - throw new \RuntimeException('Required property missing: '.$property); + throw new RuntimeException('Required property missing: '.$property); } } } -} \ No newline at end of file +} diff --git a/vendor/PicoFeed/Writers/Atom.php b/vendor/PicoFeed/Writers/Atom.php index 8c59f5e..e5be76e 100644 --- a/vendor/PicoFeed/Writers/Atom.php +++ b/vendor/PicoFeed/Writers/Atom.php @@ -2,32 +2,59 @@ namespace PicoFeed\Writers; -require_once __DIR__.'/../Writer.php'; +use DomDocument; +use DomElement; +use DomAttr; +use PicoFeed\Writer; -class Atom extends \PicoFeed\Writer +/** + * Atom writer class + * + * @author Frederic Guillot + * @package picofeed + */ +class Atom extends Writer { + /** + * List of required properties for each feed + * + * @access private + * @var array + */ private $required_feed_properties = array( 'title', 'site_url', 'feed_url', ); + /** + * List of required properties for each item + * + * @access private + * @var array + */ private $required_item_properties = array( 'title', 'url', ); - + /** + * Get the Atom document + * + * @access public + * @param string $filename Optional filename + * @return string + */ public function execute($filename = '') { $this->checkRequiredProperties($this->required_feed_properties, $this); - $this->dom = new \DomDocument('1.0', 'UTF-8'); + $this->dom = new DomDocument('1.0', 'UTF-8'); $this->dom->formatOutput = true; // $feed = $this->dom->createElement('feed'); - $feed->setAttributeNodeNS(new \DomAttr('xmlns', 'http://www.w3.org/2005/Atom')); + $feed->setAttributeNodeNS(new DomAttr('xmlns', 'http://www.w3.org/2005/Atom')); // $generator = $this->dom->createElement('generator', 'PicoFeed'); @@ -115,8 +142,16 @@ class Atom extends \PicoFeed\Writer } } - - public function addLink($xml, $url, $rel = 'alternate', $type = 'text/html') + /** + * Add Link + * + * @access public + * @param DomElement $xml XML node + * @param string 
$url URL + * @param string $rel Link rel attribute + * @param string $type Link type attribute + */ + public function addLink(DomElement $xml, $url, $rel = 'alternate', $type = 'text/html') { $link = $this->dom->createElement('link'); $link->setAttribute('rel', $rel); @@ -125,8 +160,14 @@ class Atom extends \PicoFeed\Writer $xml->appendChild($link); } - - public function addUpdated($xml, $value = '') + /** + * Add publication date + * + * @access public + * @param DomElement $xml XML node + * @param string $value Timestamp + */ + public function addUpdated(DomElement $xml, $value = '') { $xml->appendChild($this->dom->createElement( 'updated', @@ -134,8 +175,14 @@ class Atom extends \PicoFeed\Writer )); } - - public function addAuthor($xml, array $values) + /** + * Add author + * + * @access public + * @param DomElement $xml XML node + * @param array $values Author name and email + */ + public function addAuthor(DomElement $xml, array $values) { $author = $this->dom->createElement('author'); @@ -159,4 +206,4 @@ class Atom extends \PicoFeed\Writer $xml->appendChild($author); } -} \ No newline at end of file +} diff --git a/vendor/PicoFeed/Writers/Rss20.php b/vendor/PicoFeed/Writers/Rss20.php index e20a552..506e3c8 100644 --- a/vendor/PicoFeed/Writers/Rss20.php +++ b/vendor/PicoFeed/Writers/Rss20.php @@ -2,34 +2,61 @@ namespace PicoFeed\Writers; -require_once __DIR__.'/../Writer.php'; +use DomDocument; +use DomAttr; +use DomElement; +use PicoFeed\Writer; -class Rss20 extends \PicoFeed\Writer +/** + * Rss 2.0 writer class + * + * @author Frederic Guillot + * @package picofeed + */ +class Rss20 extends Writer { + /** + * List of required properties for each feed + * + * @access private + * @var array + */ private $required_feed_properties = array( 'title', 'site_url', 'feed_url', ); + /** + * List of required properties for each item + * + * @access private + * @var array + */ private $required_item_properties = array( 'title', 'url', ); - + /** + * Get the Rss 2.0 
document + * + * @access public + * @param string $filename Optional filename + * @return string + */ public function execute($filename = '') { $this->checkRequiredProperties($this->required_feed_properties, $this); - $this->dom = new \DomDocument('1.0', 'UTF-8'); + $this->dom = new DomDocument('1.0', 'UTF-8'); $this->dom->formatOutput = true; // $rss = $this->dom->createElement('rss'); $rss->setAttribute('version', '2.0'); - $rss->setAttributeNodeNS(new \DomAttr('xmlns:content', 'http://purl.org/rss/1.0/modules/content/')); - $rss->setAttributeNodeNS(new \DomAttr('xmlns:atom', 'http://www.w3.org/2005/Atom')); + $rss->setAttributeNodeNS(new DomAttr('xmlns:content', 'http://purl.org/rss/1.0/modules/content/')); + $rss->setAttributeNodeNS(new DomAttr('xmlns:atom', 'http://www.w3.org/2005/Atom')); $channel = $this->dom->createElement('channel'); @@ -130,8 +157,14 @@ class Rss20 extends \PicoFeed\Writer } } - - public function addPubDate($xml, $value = '') + /** + * Add publication date + * + * @access public + * @param DomElement $xml XML node + * @param string $value Timestamp + */ + public function addPubDate(DomElement $xml, $value = '') { $xml->appendChild($this->dom->createElement( 'pubDate', @@ -139,8 +172,15 @@ class Rss20 extends \PicoFeed\Writer )); } - - public function addAuthor($xml, $tag, array $values) + /** + * Add author + * + * @access public + * @param DomElement $xml XML node + * @param string $tag Tag name + * @param array $values Author name and email + */ + public function addAuthor(DomElement $xml, $tag, array $values) { $value = ''; @@ -153,4 +193,4 @@ class Rss20 extends \PicoFeed\Writer $xml->appendChild($author); } } -} \ No newline at end of file +} diff --git a/vendor/PicoFeed/XmlParser.php b/vendor/PicoFeed/XmlParser.php new file mode 100644 index 0000000..be063a1 --- /dev/null +++ b/vendor/PicoFeed/XmlParser.php @@ -0,0 +1,136 @@ +loadXml($input, LIBXML_NONET); + + // The document is empty, there is probably some parsing errors + if 
($dom->childNodes->length === 0) { + return false; + } + + // Scan for potential XEE attacks using ENTITY + foreach ($dom->childNodes as $child) { + if ($child->nodeType === XML_DOCUMENT_TYPE_NODE) { + if ($child->entities->length > 0) { + return false; + } + } + } + + return $dom; + } + + /** + * Load HTML document by using a DomDocument instance or return false on failure + * + * @static + * @access public + * @param string $input XML content + * @return mixed + */ + public static function getHtmlDocument($input) + { + libxml_use_internal_errors(true); + + $dom = new DomDocument; + + if (version_compare(PHP_VERSION, '5.4.0', '>=')) { + $dom->loadHTML($input, LIBXML_NONET); + } + else { + $dom->loadHTML($input); + } + + return $dom; + } + + /** + * Get XML parser errors + * + * @static + * @access public + * @return string + */ + public static function getErrors() + { + $errors = array(); + + foreach(libxml_get_errors() as $error) { + + $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)', + $error->message, + $error->line, + $error->column, + $error->code + ); + } + + return implode(', ', $errors); + } +}