Update to the last version of PicoFeed

This commit is contained in:
Frédéric Guillot 2014-05-20 14:20:27 -04:00
parent 58cb6979a8
commit 3840a87128
33 changed files with 2920 additions and 1123 deletions

View File

@ -3,8 +3,19 @@
require __DIR__.'/check_setup.php'; require __DIR__.'/check_setup.php';
require __DIR__.'/lib/Translator.php'; require __DIR__.'/lib/Translator.php';
require __DIR__.'/vendor/PicoDb/Database.php'; require __DIR__.'/vendor/PicoDb/Database.php';
require __DIR__.'/vendor/PicoFeed/Client.php'; require __DIR__.'/vendor/PicoFeed/PicoFeed.php';
require __DIR__.'/vendor/PicoFeed/Parser.php'; require __DIR__.'/vendor/Readability/Readability.php';
require __DIR__.'/vendor/SimpleValidator/Validator.php';
require __DIR__.'/vendor/SimpleValidator/Base.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Required.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Unique.php';
require __DIR__.'/vendor/SimpleValidator/Validators/MaxLength.php';
require __DIR__.'/vendor/SimpleValidator/Validators/MinLength.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Integer.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Equals.php';
require __DIR__.'/vendor/SimpleValidator/Validators/AlphaNumeric.php';
require __DIR__.'/models/config.php'; require __DIR__.'/models/config.php';
require __DIR__.'/models/user.php'; require __DIR__.'/models/user.php';
require __DIR__.'/models/feed.php'; require __DIR__.'/models/feed.php';
@ -40,8 +51,6 @@ defined('AUTO_UPDATE_DOWNLOAD_DIRECTORY') or define('AUTO_UPDATE_DOWNLOAD_DIRECT
defined('AUTO_UPDATE_ARCHIVE_DIRECTORY') or define('AUTO_UPDATE_ARCHIVE_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'archive'); defined('AUTO_UPDATE_ARCHIVE_DIRECTORY') or define('AUTO_UPDATE_ARCHIVE_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'archive');
defined('AUTO_UPDATE_BACKUP_DIRECTORY') or define('AUTO_UPDATE_BACKUP_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'backup'); defined('AUTO_UPDATE_BACKUP_DIRECTORY') or define('AUTO_UPDATE_BACKUP_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'backup');
PicoFeed\Client::proxy(PROXY_HOSTNAME, PROXY_PORT, PROXY_USERNAME, PROXY_PASSWORD);
PicoDb\Database::bootstrap('db', function() { PicoDb\Database::bootstrap('db', function() {
$db = new PicoDb\Database(array( $db = new PicoDb\Database(array(

View File

@ -1,7 +1,5 @@
<?php <?php
require __DIR__.'/../vendor/PicoFeed/Writers/Atom.php';
use PicoFarad\Router; use PicoFarad\Router;
use PicoFarad\Response; use PicoFarad\Response;
use PicoFarad\Request; use PicoFarad\Request;

View File

@ -31,7 +31,7 @@ Router\before(function($action) {
date_default_timezone_set(Model\Config\get('timezone') ?: 'UTC'); date_default_timezone_set(Model\Config\get('timezone') ?: 'UTC');
// HTTP secure headers // HTTP secure headers
$frame_src = \PicoFeed\Filter::$iframe_whitelist; $frame_src = Model\Config\get_iframe_whitelist();;
$frame_src[] = 'https://login.persona.org'; $frame_src[] = 'https://login.persona.org';
Response\csp(array( Response\csp(array(

View File

@ -17,7 +17,7 @@ else {
} }
if (! empty($options['database'])) { if (! empty($options['database'])) {
\Model\Database\select($options['database']); Model\Database\select($options['database']);
} }
$limit = ! empty($options['limit']) && ctype_digit($options['limit']) ? (int) $options['limit'] : Model\Feed\LIMIT_ALL; $limit = ! empty($options['limit']) && ctype_digit($options['limit']) ? (int) $options['limit'] : Model\Feed\LIMIT_ALL;

View File

@ -2,29 +2,52 @@
namespace Model\Config; namespace Model\Config;
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Unique.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MinLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Integer.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Equals.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Integer.php';
use SimpleValidator\Validator; use SimpleValidator\Validator;
use SimpleValidator\Validators; use SimpleValidator\Validators;
use PicoDb\Database; use PicoDb\Database;
use PicoFeed\Config as ReaderConfig;
use PicoFeed\Logging;
const DB_VERSION = 24; const DB_VERSION = 24;
const HTTP_USERAGENT = 'Miniflux - http://miniflux.net'; const HTTP_USER_AGENT = 'Miniflux (http://miniflux.net)';
const HTTP_FAKE_USERAGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36';
// Get PicoFeed config
function get_reader_config()
{
$config = new ReaderConfig;
$config->setTimezone(get('timezone'));
$config->setClientTimeout(HTTP_TIMEOUT);
$config->setClientUserAgent(HTTP_USER_AGENT);
$config->setGrabberUserAgent(HTTP_USER_AGENT);
$config->setProxyHostname(PROXY_HOSTNAME);
$config->setProxyPort(PROXY_PORT);
$config->setProxyUsername(PROXY_USERNAME);
$config->setProxyPassword(PROXY_PASSWORD);
$config->setFilterIframeWhitelist(get_iframe_whitelist());
return $config;
}
function get_iframe_whitelist()
{
return array(
'//www.youtube.com',
'http://www.youtube.com',
'https://www.youtube.com',
'http://player.vimeo.com',
'https://player.vimeo.com',
'http://www.dailymotion.com',
'https://www.dailymotion.com',
);
}
// Send a debug message to the console // Send a debug message to the console
function debug($line) function debug($line)
{ {
\PicoFeed\Logging::log($line); Logging::setMessage($line);
write_debug(); write_debug();
} }
@ -32,14 +55,7 @@ function debug($line)
function write_debug() function write_debug()
{ {
if (DEBUG) { if (DEBUG) {
file_put_contents(DEBUG_FILENAME, implode(PHP_EOL, Logging::getMessages()));
$data = '';
foreach (\PicoFeed\Logging::$messages as $line) {
$data .= $line.PHP_EOL;
}
file_put_contents(DEBUG_FILENAME, $data);
} }
} }

View File

@ -2,14 +2,6 @@
namespace Model\Database; namespace Model\Database;
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MinLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Equals.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/AlphaNumeric.php';
use SimpleValidator\Validator; use SimpleValidator\Validator;
use SimpleValidator\Validators; use SimpleValidator\Validators;

View File

@ -2,17 +2,15 @@
namespace Model\Feed; namespace Model\Feed;
require_once __DIR__.'/../vendor/PicoFeed/Filter.php';
require_once __DIR__.'/../vendor/PicoFeed/Export.php';
require_once __DIR__.'/../vendor/PicoFeed/Import.php';
require_once __DIR__.'/../vendor/PicoFeed/Reader.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
use SimpleValidator\Validator; use SimpleValidator\Validator;
use SimpleValidator\Validators; use SimpleValidator\Validators;
use PicoDb\Database; use PicoDb\Database;
use PicoFeed\Export;
use PicoFeed\Import;
use PicoFeed\Reader;
use PicoFeed\Logging;
use Model\Config;
use Model\Item;
const LIMIT_ALL = -1; const LIMIT_ALL = -1;
@ -32,14 +30,15 @@ function update(array $values)
// Export all feeds // Export all feeds
function export_opml() function export_opml()
{ {
$opml = new \PicoFeed\Export(get_all()); $opml = new Export(get_all());
return $opml->execute(); return $opml->execute();
} }
// Import OPML file // Import OPML file
function import_opml($content) function import_opml($content)
{ {
$import = new \PicoFeed\Import($content); Logging::setTimezone(Config\get('timezone'));
$import = new Import($content);
$feeds = $import->execute(); $feeds = $import->execute();
if ($feeds) { if ($feeds) {
@ -61,65 +60,71 @@ function import_opml($content)
$db->closeTransaction(); $db->closeTransaction();
\Model\Config\write_debug(); Config\write_debug();
return true; return true;
} }
\Model\Config\write_debug(); Config\write_debug();
return false; return false;
} }
// Add a new feed from an URL // Add a new feed from an URL
function create($url, $grabber = false) function create($url, $enable_grabber = false)
{ {
$reader = new \PicoFeed\Reader; $reader = new Reader(Config\get_reader_config());
$resource = $reader->download($url, '', '', HTTP_TIMEOUT, \Model\Config\HTTP_USERAGENT); $resource = $reader->download($url);
$parser = $reader->getParser(); $parser = $reader->getParser();
if ($parser !== false) { if ($parser !== false) {
$parser->grabber = $grabber; if ($enable_grabber) {
$parser->enableContentGrabber();
}
$feed = $parser->execute(); $feed = $parser->execute();
if ($feed === false) { if ($feed === false) {
\Model\Config\write_debug(); Config\write_debug();
return false; return false;
} }
if (! $feed->url) $feed->url = $reader->getUrl(); if (! $feed->getUrl()) {
$feed->url = $reader->getUrl();
}
if (! $feed->title) { if (! $feed->getTitle()) {
\Model\Config\write_debug(); Config\write_debug();
return false; return false;
} }
$db = Database::get('db'); $db = Database::get('db');
// Check if the feed is already there
if (! $db->table('feeds')->eq('feed_url', $reader->getUrl())->count()) { if (! $db->table('feeds')->eq('feed_url', $reader->getUrl())->count()) {
// Etag and LastModified are added the next update // Etag and LastModified are added the next update
$rs = $db->table('feeds')->save(array( $rs = $db->table('feeds')->save(array(
'title' => $feed->title, 'title' => $feed->getTitle(),
'site_url' => $feed->url, 'site_url' => $feed->getUrl(),
'feed_url' => $reader->getUrl(), 'feed_url' => $reader->getUrl(),
'download_content' => $grabber ? 1 : 0 'download_content' => $enable_grabber ? 1 : 0
)); ));
if ($rs) { if ($rs) {
$feed_id = $db->getConnection()->getLastId(); $feed_id = $db->getConnection()->getLastId();
\Model\Item\update_all($feed_id, $feed->items, $grabber); Item\update_all($feed_id, $feed->getItems(), $enable_grabber);
\Model\Config\write_debug(); Config\write_debug();
return (int) $feed_id; return (int) $feed_id;
} }
} }
} }
\Model\Config\write_debug(); Config\write_debug();
return false; return false;
} }
@ -143,16 +148,17 @@ function refresh_all($limit = LIMIT_ALL)
function refresh($feed_id) function refresh($feed_id)
{ {
$feed = get($feed_id); $feed = get($feed_id);
if (empty($feed)) return false;
$reader = new \PicoFeed\Reader; if (empty($feed)) {
return false;
}
$reader = new Reader(Config\get_reader_config());
$resource = $reader->download( $resource = $reader->download(
$feed['feed_url'], $feed['feed_url'],
$feed['last_modified'], $feed['last_modified'],
$feed['etag'], $feed['etag']
HTTP_TIMEOUT,
\Model\Config\HTTP_USERAGENT
); );
// Update the `last_checked` column each time, HTTP cache or not // Update the `last_checked` column each time, HTTP cache or not
@ -160,7 +166,7 @@ function refresh($feed_id)
if (! $resource->isModified()) { if (! $resource->isModified()) {
update_parsing_error($feed_id, 0); update_parsing_error($feed_id, 0);
\Model\Config\write_debug(); Config\write_debug();
return true; return true;
} }
@ -171,14 +177,8 @@ function refresh($feed_id)
if ($feed['download_content']) { if ($feed['download_content']) {
// Don't fetch previous items, only new one // Don't fetch previous items, only new one
$parser->grabber_ignore_urls = Database::get('db') $parser->enableContentGrabber();
->table('items') $parser->setGrabberIgnoreUrls(Database::get('db')->table('items')->eq('feed_id', $feed_id)->findAllByColumn('url'));
->eq('feed_id', $feed_id)
->findAllByColumn('url');
$parser->grabber = true;
$parser->grabber_timeout = HTTP_TIMEOUT;
$parser->grabber_user_agent = \Model\Config\HTTP_FAKE_USERAGENT;
} }
$result = $parser->execute(); $result = $parser->execute();
@ -187,15 +187,16 @@ function refresh($feed_id)
update_parsing_error($feed_id, 0); update_parsing_error($feed_id, 0);
update_cache($feed_id, $resource->getLastModified(), $resource->getEtag()); update_cache($feed_id, $resource->getLastModified(), $resource->getEtag());
\Model\Item\update_all($feed_id, $result->items, $parser->grabber);
\Model\Config\write_debug(); Item\update_all($feed_id, $result->getItems(), $feed['download_content']);
Config\write_debug();
return true; return true;
} }
} }
update_parsing_error($feed_id, 1); update_parsing_error($feed_id, 1);
\Model\Config\write_debug(); Config\write_debug();
return false; return false;
} }

View File

@ -2,11 +2,13 @@
namespace Model\Item; namespace Model\Item;
require_once __DIR__.'/../vendor/Readability/Readability.php'; use Model\Config;
require_once __DIR__.'/../vendor/PicoFeed/Grabber.php';
require_once __DIR__.'/../vendor/PicoFeed/Filter.php';
use PicoDb\Database; use PicoDb\Database;
use PicoFeed\Logging;
use PicoFeed\Grabber;
use PicoFeed\Client;
use PicoFeed\Filter;
use Readability;
// Get all items without filtering // Get all items without filtering
function get_everything() function get_everything()
@ -141,7 +143,7 @@ function get_bookmarks($offset = null, $limit = null)
->join('feeds', 'id', 'feed_id') ->join('feeds', 'id', 'feed_id')
->in('status', array('read', 'unread')) ->in('status', array('read', 'unread'))
->eq('bookmark', 1) ->eq('bookmark', 1)
->orderBy('updated', \Model\Config\get('items_sorting_direction')) ->orderBy('updated', Config\get('items_sorting_direction'))
->offset($offset) ->offset($offset)
->limit($limit) ->limit($limit)
->findAll(); ->findAll();
@ -201,7 +203,7 @@ function get_nav($item, $status = array('unread'), $bookmark = array(1, 0), $fee
->table('items') ->table('items')
->columns('id', 'status', 'title', 'bookmark') ->columns('id', 'status', 'title', 'bookmark')
->neq('status', 'removed') ->neq('status', 'removed')
->orderBy('updated', \Model\Config\get('items_sorting_direction')); ->orderBy('updated', Config\get('items_sorting_direction'));
if ($feed_id) $query->eq('feed_id', $feed_id); if ($feed_id) $query->eq('feed_id', $feed_id);
@ -377,7 +379,7 @@ function mark_feed_as_read($feed_id)
// Mark all read items to removed after X days // Mark all read items to removed after X days
function autoflush() function autoflush()
{ {
$autoflush = (int) \Model\Config\get('autoflush'); $autoflush = (int) Config\get('autoflush');
if ($autoflush > 0) { if ($autoflush > 0) {
@ -401,9 +403,9 @@ function autoflush()
} }
// Update all items // Update all items
function update_all($feed_id, array $items, $grabber = false) function update_all($feed_id, array $items, $enable_grabber = false)
{ {
$nocontent = (bool) \Model\Config\get('nocontent'); $nocontent = (bool) Config\get('nocontent');
$items_in_feed = array(); $items_in_feed = array();
@ -412,54 +414,55 @@ function update_all($feed_id, array $items, $grabber = false)
foreach ($items as $item) { foreach ($items as $item) {
\PicoFeed\Logging::log('Item => '.$item->id.' '.$item->url); Logging::setMessage('Item => '.$item->getId().' '.$item->getUrl());
// Item parsed correctly? // Item parsed correctly?
if ($item->id && $item->url) { if ($item->getId() && $item->getUrl()) {
\PicoFeed\Logging::log('Item parsed correctly'); Logging::setMessage('Item parsed correctly');
// Get item record in database, if any // Get item record in database, if any
$itemrec = $db $itemrec = $db
->table('items') ->table('items')
->columns('enclosure') ->columns('enclosure')
->eq('id', $item->id)->findOne(); ->eq('id', $item->getId())
->findOne();
// Insert a new item // Insert a new item
if ($itemrec === null) { if ($itemrec === null) {
\PicoFeed\Logging::log('Item added to the database'); Logging::setMessage('Item added to the database');
if (! $item->content && ! $nocontent && $grabber) { if ($enable_grabber && ! $nocontent && ! $item->getContent()) {
$item->content = download_content_url($item->url); $item->content = download_content_url($item->getUrl());
} }
$db->table('items')->save(array( $db->table('items')->save(array(
'id' => $item->id, 'id' => $item->getId(),
'title' => $item->title, 'title' => $item->getTitle(),
'url' => $item->url, 'url' => $item->getUrl(),
'updated' => $item->updated, 'updated' => $item->getDate(),
'author' => $item->author, 'author' => $item->getAuthor(),
'content' => $nocontent ? '' : $item->content, 'content' => $nocontent ? '' : $item->getContent(),
'status' => 'unread', 'status' => 'unread',
'feed_id' => $feed_id, 'feed_id' => $feed_id,
'enclosure' => isset($item->enclosure) ? $item->enclosure : null, 'enclosure' => $item->getEnclosureUrl(),
'enclosure_type' => isset($item->enclosure_type) ? $item->enclosure_type : null, 'enclosure_type' => $item->getEnclosureType(),
'language' => $item->language, 'language' => $item->getLanguage(),
)); ));
} }
else if (isset($item->enclosure) && $item->enclosure && !$itemrec['enclosure']) { else if (! $itemrec['enclosure'] && $item->getEnclosureUrl()) {
\PicoFeed\Logging::log('Update item enclosure'); Logging::setMessage('Update item enclosure');
$db->table('items')->eq('id', $item->id)->save(array( $db->table('items')->eq('id', $item->getId())->save(array(
'status' => 'unread', 'status' => 'unread',
'enclosure' => $item->enclosure, 'enclosure' => $item->getEnclosureUrl(),
'enclosure_type' => isset($item->enclosure_type) ? $item->enclosure_type : null, 'enclosure_type' => $item->getEnclosureType(),
)); ));
} }
else { else {
\PicoFeed\Logging::log('Item already in the database'); Logging::setMessage('Item already in the database');
} }
// Items inside this feed // Items inside this feed
@ -467,10 +470,20 @@ function update_all($feed_id, array $items, $grabber = false)
} }
} }
// Cleanup old items
cleanup($feed_id, $items_in_feed);
$db->closeTransaction();
}
// Remove from the database items marked as "removed" // Remove from the database items marked as "removed"
// and not present inside the feed // and not present inside the feed
function cleanup($feed_id, array $items_in_feed)
{
if (! empty($items_in_feed)) { if (! empty($items_in_feed)) {
$db = Database::get('db');
$removed_items = $db $removed_items = $db
->table('items') ->table('items')
->columns('id') ->columns('id')
@ -489,7 +502,7 @@ function update_all($feed_id, array $items, $grabber = false)
if (! empty($items_to_remove)) { if (! empty($items_to_remove)) {
$nb_items = count($items_to_remove); $nb_items = count($items_to_remove);
\PicoFeed\Logging::log('There is '.$nb_items.' items to remove'); Logging::setMessage('There is '.$nb_items.' items to remove');
// Handle the case when there is a huge number of items to remove // Handle the case when there is a huge number of items to remove
// Sqlite have a limit of 1000 sql variables by default // Sqlite have a limit of 1000 sql variables by default
@ -508,43 +521,31 @@ function update_all($feed_id, array $items, $grabber = false)
} }
} }
} }
\PicoFeed\Logging::log('Db transaction => '.($db->getConnection()->inTransaction() ? 'ok' : 'rollback'));
$db->closeTransaction();
} }
// Download content from an URL // Download content from an URL
function download_content_url($url) function download_content_url($url)
{ {
$client = \PicoFeed\Client::create();
$client->url = $url;
$client->timeout = HTTP_TIMEOUT;
$client->user_agent = \Model\Config\HTTP_FAKE_USERAGENT;
$client->execute();
$html = $client->getContent();
if (! empty($html)) {
// Try first with PicoFeed grabber and with Readability after
$grabber = new \PicoFeed\Grabber($url, $html, $client->getEncoding());
$content = ''; $content = '';
$grabber = new Grabber($url);
$grabber->setConfig(Config\get_reader_config());
$grabber->download();
if ($grabber->parse()) { if ($grabber->parse()) {
$content = $grabber->content; $content = $grabber->getcontent();
}
else {
$content = download_content_readability($grabber->getRawContent(), $url);
} }
if (empty($content)) { if (! empty($content)) {
$content = download_content_readability($grabber->html, $url); $filter = new Filter($content, $url);
$filter->setConfig(Config\get_reader_config());
$content = $filter->execute();
} }
// Filter content return $content;
$filter = new \PicoFeed\Filter($content, $url);
return $filter->execute();
}
return '';
} }
// Download content from item ID // Download content from item ID
@ -555,7 +556,7 @@ function download_content_id($item_id)
if (! empty($content)) { if (! empty($content)) {
if (! \Model\Config\get('nocontent')) { if (! Config\get('nocontent')) {
// Save content // Save content
Database::get('db') Database::get('db')
@ -564,7 +565,7 @@ function download_content_id($item_id)
->save(array('content' => $content)); ->save(array('content' => $content));
} }
\Model\Config\write_debug(); Config\write_debug();
return array( return array(
'result' => true, 'result' => true,
@ -572,7 +573,7 @@ function download_content_id($item_id)
); );
} }
\Model\Config\write_debug(); Config\write_debug();
return array( return array(
'result' => false, 'result' => false,
@ -585,7 +586,7 @@ function download_content_readability($content, $url)
{ {
if (! empty($content)) { if (! empty($content)) {
$readability = new \Readability($content, $url); $readability = new Readability($content, $url);
if ($readability->init()) { if ($readability->init()) {
return $readability->getContent()->innerHTML; return $readability->getContent()->innerHTML;

View File

@ -2,11 +2,6 @@
namespace Model\User; namespace Model\User;
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php';
use SimpleValidator\Validator; use SimpleValidator\Validator;
use SimpleValidator\Validators; use SimpleValidator\Validators;
use PicoDb\Database; use PicoDb\Database;

View File

@ -2,59 +2,170 @@
namespace PicoFeed; namespace PicoFeed;
require_once __DIR__.'/Logging.php'; use LogicException;
use Clients\Curl;
use Clients\Stream;
use PicoFeed\Logging;
/**
* Client class
*
* @author Frederic Guillot
* @package client
*/
abstract class Client abstract class Client
{ {
protected static $proxy_hostname = null; /**
protected static $proxy_port = null; * Flag that say if the resource have been modified
protected static $proxy_username = null; *
protected static $proxy_password = null; * @access private
* @var bool
*/
private $is_modified = true;
public $encoding = ''; /**
public $etag = ''; * HTTP encoding
public $last_modified = ''; *
public $is_modified = true; * @access private
public $content = ''; * @var string
public $url = ''; */
public $timeout = 10; private $encoding = '';
public $max_redirects = 5;
public $max_body_size = 2097152; // 2MB
public $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
/**
* HTTP Etag header
*
* @access protected
* @var string
*/
protected $etag = '';
public static function create($adapter = null) /**
{ * HTTP Last-Modified header
return $adapter ?: self::chooseAdapter(); *
} * @access protected
* @var string
*/
protected $last_modified = '';
/**
* Proxy hostname
*
* @access protected
* @var string
*/
protected $proxy_hostname = '';
public static function chooseAdapter() /**
* Proxy port
*
* @access protected
* @var integer
*/
protected $proxy_port = 3128;
/**
* Proxy username
*
* @access protected
* @var string
*/
protected $proxy_username = '';
/**
* Proxy password
*
* @access protected
* @var string
*/
protected $proxy_password = '';
/**
* Client connection timeout
*
* @access protected
* @var integer
*/
protected $timeout = 10;
/**
* User-agent
*
* @access protected
* @var string
*/
protected $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
/**
* Real URL used (can be changed after a HTTP redirect)
*
* @access protected
* @var string
*/
protected $url = '';
/**
* Page/Feed content
*
* @access protected
* @var string
*/
protected $content = '';
/**
* Maximum number of HTTP redirections to avoid infinite loops
*
* @access protected
* @var integer
*/
protected $max_redirects = 5;
/**
* Maximum size of the HTTP body response
*
* @access protected
* @var integer
*/
protected $max_body_size = 2097152; // 2MB
/**
* Get client instance: curl or stream driver
*
* @static
* @access public
* @return \PicoFeed\Client
*/
public static function getInstance()
{ {
if (function_exists('curl_init')) { if (function_exists('curl_init')) {
require_once __DIR__.'/Clients/Curl.php'; require_once __DIR__.'/Clients/Curl.php';
return new Clients\Curl; return new Clients\Curl;
}
} else if (ini_get('allow_url_fopen')) { else if (ini_get('allow_url_fopen')) {
require_once __DIR__.'/Clients/Stream.php'; require_once __DIR__.'/Clients/Stream.php';
return new Clients\Stream; return new Clients\Stream;
} }
throw new \LogicException('You must have "allow_url_fopen=1" or curl extension installed'); throw new LogicException('You must have "allow_url_fopen=1" or curl extension installed');
} }
/**
public function execute() * Perform the HTTP request
*
* @access public
* @param string $url URL
* @return bool
*/
public function execute($url = '')
{ {
if ($this->url === '') { if ($url !== '') {
throw new \LogicException('The URL is missing'); $this->url = $url;
} }
Logging::log(\get_called_class().' Fetch URL: '.$this->url); Logging::setMessage(get_called_class().' Fetch URL: '.$this->url);
Logging::log(\get_called_class().' Etag provided: '.$this->etag); Logging::setMessage(get_called_class().' Etag provided: '.$this->etag);
Logging::log(\get_called_class().' Last-Modified provided: '.$this->last_modified); Logging::setMessage(get_called_class().' Last-Modified provided: '.$this->last_modified);
$response = $this->doRequest(); $response = $this->doRequest();
@ -62,25 +173,42 @@ abstract class Client
if ($response['status'] == 304) { if ($response['status'] == 304) {
$this->is_modified = false; $this->is_modified = false;
Logging::log(\get_called_class().' Resource not modified'); Logging::setMessage(get_called_class().' Resource not modified');
} }
else if ($response['status'] == 404) { else if ($response['status'] == 404) {
Logging::log(\get_called_class().' Resource not found'); Logging::setMessage(get_called_class().' Resource not found');
} }
else { else {
$this->etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : ''; $etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : '';
$this->last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : ''; $last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : '';
$this->content = $response['body']; $this->content = $response['body'];
if (isset($response['headers']['Content-Type'])) { if (isset($response['headers']['Content-Type'])) {
$result = explode('charset=', strtolower($response['headers']['Content-Type'])); $result = explode('charset=', strtolower($response['headers']['Content-Type']));
$this->encoding = isset($result[1]) ? $result[1] : ''; $this->encoding = isset($result[1]) ? $result[1] : '';
} }
}
} if (($this->etag && $this->etag === $etag) || ($this->last_modified && $last_modified === $this->last_modified)) {
$this->is_modified = false;
} }
$this->etag = $etag;
$this->last_modified = $last_modified;
}
return true;
}
return false;
}
/**
* Parse HTTP headers
*
* @access public
* @param array $lines List of headers
* @return array
*/
public function parseHeaders(array $lines) public function parseHeaders(array $lines)
{ {
$status = 200; $status = 200;
@ -88,7 +216,7 @@ abstract class Client
foreach ($lines as $line) { foreach ($lines as $line) {
if (strpos($line, 'HTTP') === 0/* && strpos($line, '301') === false && strpos($line, '302') === false*/) { if (strpos($line, 'HTTP') === 0) {
$status = (int) substr($line, 9, 3); $status = (int) substr($line, 9, 3);
} }
else if (strpos($line, ':') !== false) { else if (strpos($line, ':') !== false) {
@ -98,71 +226,242 @@ abstract class Client
} }
} }
Logging::log(\get_called_class().' HTTP status code: '.$status); Logging::setMessage(get_called_class().' HTTP status code: '.$status);
foreach ($headers as $name => $value) { foreach ($headers as $name => $value) {
Logging::log(\get_called_class().' HTTP header: '.$name.' => '.$value); Logging::setMessage(get_called_class().' HTTP header: '.$name.' => '.$value);
} }
return array($status, $headers); return array($status, $headers);
} }
/**
public static function proxy($hostname, $port = 3128, $username = '', $password = '') * Set the Last-Modified HTTP header
{ *
self::$proxy_hostname = $hostname; * @access public
self::$proxy_port = $port; * @param string $last_modified Header value
self::$proxy_username = $username; * @return \PicoFeed\Client
self::$proxy_password = $password; */
}
public function setLastModified($last_modified) public function setLastModified($last_modified)
{ {
$this->last_modified = $last_modified; $this->last_modified = $last_modified;
return $this; return $this;
} }
/**
* Get the value of the Last-Modified HTTP header
*
* @access public
* @return string
*/
public function getLastModified() public function getLastModified()
{ {
return $this->last_modified; return $this->last_modified;
} }
/**
* Set the value of the Etag HTTP header
*
* @access public
* @param string $etag Etag HTTP header value
* @return \PicoFeed\Client
*/
public function setEtag($etag) public function setEtag($etag)
{ {
$this->etag = $etag; $this->etag = $etag;
return $this; return $this;
} }
/**
* Get the Etag HTTP header value
*
* @access public
* @return string
*/
public function getEtag() public function getEtag()
{ {
return $this->etag; return $this->etag;
} }
/**
* Get the final url value
*
* @access public
* @return string
*/
public function getUrl() public function getUrl()
{ {
return $this->url; return $this->url;
} }
/**
* Set the url
*
* @access public
* @param string $url URL
* @return \PicoFeed\Client
*/
public function setUrl($url)
{
$this->url = $url;
return $this;
}
/**
* Get the body of the HTTP response
*
* @access public
* @return string
*/
public function getContent() public function getContent()
{ {
return $this->content; return $this->content;
} }
/**
* Get the encoding value from HTTP headers
*
* @access public
* @return string
*/
public function getEncoding() public function getEncoding()
{ {
return $this->encoding; return $this->encoding;
} }
/**
* Return true if the remote resource has changed
*
* @access public
* @return bool
*/
public function isModified() public function isModified()
{ {
return $this->is_modified; return $this->is_modified;
} }
/**
* Set connection timeout
*
* @access public
* @param integer $timeout Connection timeout
* @return \PicoFeed\Client
*/
public function setTimeout($timeout)
{
$this->timeout = $timeout ?: $this->timeout;
return $this;
}
/**
* Set a custom user agent
*
* @access public
* @param string $user_agent User Agent
* @return \PicoFeed\Client
*/
public function setUserAgent($user_agent)
{
$this->user_agent = $user_agent ?: $this->user_agent;
return $this;
}
/**
* Set the maximum number of HTTP redirections
*
* @access public
* @param integer $max Maximum
* @return \PicoFeed\Client
*/
public function setMaxRedirections($max)
{
$this->max_redirects = $max ?: $this->max_redirects;
return $this;
}
/**
* Set the maximum size of the HTTP body
*
* @access public
* @param integer $max Maximum
* @return \PicoFeed\Client
*/
public function setMaxBodySize($max)
{
$this->max_body_size = $max ?: $this->max_body_size;
return $this;
}
/**
* Set the proxy hostname
*
* @access public
* @param string $hostname Proxy hostname
* @return \PicoFeed\Client
*/
public function setProxyHostname($hostname)
{
$this->proxy_hostname = $hostname ?: $this->proxy_hostname;
return $this;
}
/**
 * Override the proxy port
 *
 * A falsy argument (0, null) is ignored and the current port is kept.
 *
 * @access public
 * @param  integer  $port  Proxy port
 * @return \PicoFeed\Client
 */
public function setProxyPort($port)
{
    if ($port) {
        $this->proxy_port = $port;
    }

    return $this;
}
/**
 * Override the proxy username
 *
 * A falsy argument (null, '') is ignored and the current username is kept.
 *
 * @access public
 * @param  string  $username  Proxy username
 * @return \PicoFeed\Client
 */
public function setProxyUsername($username)
{
    if ($username) {
        $this->proxy_username = $username;
    }

    return $this;
}
/**
 * Override the proxy password
 *
 * A falsy argument (null, '') is ignored and the current password is kept.
 *
 * @access public
 * @param  string  $password  Proxy password
 * @return \PicoFeed\Client
 */
public function setProxyPassword($password)
{
    if ($password) {
        $this->proxy_password = $password;
    }

    return $this;
}
/**
 * Apply the client-related options of a config object
 *
 * Each value read from the config is handed to the matching setter, in
 * the same order as listed below. The setters ignore falsy values, so
 * options missing from the config keep their current value.
 *
 * @access public
 * @param  \PicoFeed\Config  $config  Config instance
 * @return \PicoFeed\Client
 */
public function setConfig($config)
{
    // Every setter returns the client itself, which allows a single fluent chain
    return $this
        ->setTimeout($config->getGrabberTimeout())
        ->setUserAgent($config->getGrabberUserAgent())
        ->setMaxRedirections($config->getMaxRedirections())
        ->setMaxBodySize($config->getMaxBodySize())
        ->setProxyHostname($config->getProxyHostname())
        ->setProxyPort($config->getProxyPort())
        ->setProxyUsername($config->getProxyUsername())
        ->setProxyPassword($config->getProxyPassword());
}
} }

View File

@ -3,27 +3,80 @@
namespace PicoFeed\Clients; namespace PicoFeed\Clients;
use \PicoFeed\Logging; use \PicoFeed\Logging;
use \PicoFeed\Client;
class Curl extends \PicoFeed\Client /**
* cURL HTTP client
*
* @author Frederic Guillot
* @package client
*/
class Curl extends Client
{ {
/**
* HTTP response body
*
* @access private
* @var string
*/
private $body = ''; private $body = '';
/**
* Body size
*
* @access private
* @var integer
*/
private $body_length = 0; private $body_length = 0;
/**
* HTTP response headers
*
* @access private
* @var array
*/
private $headers = array(); private $headers = array();
/**
* Counter of the number of headers received
*
* @access private
* @var integer
*/
private $headers_counter = 0; private $headers_counter = 0;
/**
* cURL callback to read the HTTP body
*
* If the function returns -1, cURL stops reading the HTTP response
*
* @access public
* @param resource $ch cURL handler
* @param string $buffer Chunk of data
* @return integer Length of the buffer
*/
public function readBody($ch, $buffer) public function readBody($ch, $buffer)
{ {
$length = strlen($buffer); $length = strlen($buffer);
$this->body_length += $length; $this->body_length += $length;
if ($this->body_length > $this->max_body_size) return -1; if ($this->body_length > $this->max_body_size) {
return -1;
}
$this->body .= $buffer; $this->body .= $buffer;
return $length; return $length;
} }
/**
* cURL callback to read HTTP headers
*
* @access public
* @param resource $ch cURL handler
* @param string $buffer Header line
* @return integer Length of the buffer
*/
public function readHeaders($ch, $buffer) public function readHeaders($ch, $buffer)
{ {
$length = strlen($buffer); $length = strlen($buffer);
@ -43,7 +96,13 @@ class Curl extends \PicoFeed\Client
return $length; return $length;
} }
/**
* Do the HTTP request
*
* @access public
* @param bool $follow_location Flag used when there is an open_basedir restriction
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
*/
public function doRequest($follow_location = true) public function doRequest($follow_location = true)
{ {
$request_headers = array('Connection: close'); $request_headers = array('Connection: close');
@ -54,6 +113,7 @@ class Curl extends \PicoFeed\Client
$ch = curl_init(); $ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $this->url); curl_setopt($ch, CURLOPT_URL, $this->url);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent); curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
@ -67,28 +127,34 @@ class Curl extends \PicoFeed\Client
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory'); curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory'); curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
if (parent::$proxy_hostname) { if ($this->proxy_hostname) {
curl_setopt($ch, CURLOPT_PROXYPORT, parent::$proxy_port); Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);
curl_setopt($ch, CURLOPT_PROXYPORT, $this->proxy_port);
curl_setopt($ch, CURLOPT_PROXYTYPE, 'HTTP'); curl_setopt($ch, CURLOPT_PROXYTYPE, 'HTTP');
curl_setopt($ch, CURLOPT_PROXY, parent::$proxy_hostname); curl_setopt($ch, CURLOPT_PROXY, $this->proxy_hostname);
if (parent::$proxy_username) { if ($this->proxy_username) {
curl_setopt($ch, CURLOPT_PROXYUSERPWD, parent::$proxy_username.':'.parent::$proxy_password); Logging::setMessage(get_called_class().' Proxy credentials: Yes');
curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->proxy_username.':'.$this->proxy_password);
}
else {
Logging::setMessage(get_called_class().' Proxy credentials: No');
} }
} }
curl_exec($ch); curl_exec($ch);
Logging::log(\get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME)); Logging::setMessage(get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME));
Logging::log(\get_called_class().' cURL dns lookup time: '.curl_getinfo($ch, CURLINFO_NAMELOOKUP_TIME)); Logging::setMessage(get_called_class().' cURL dns lookup time: '.curl_getinfo($ch, CURLINFO_NAMELOOKUP_TIME));
Logging::log(\get_called_class().' cURL connect time: '.curl_getinfo($ch, CURLINFO_CONNECT_TIME)); Logging::setMessage(get_called_class().' cURL connect time: '.curl_getinfo($ch, CURLINFO_CONNECT_TIME));
Logging::log(\get_called_class().' cURL speed download: '.curl_getinfo($ch, CURLINFO_SPEED_DOWNLOAD)); Logging::setMessage(get_called_class().' cURL speed download: '.curl_getinfo($ch, CURLINFO_SPEED_DOWNLOAD));
Logging::log(\get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)); Logging::setMessage(get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
if (curl_errno($ch)) { if (curl_errno($ch)) {
Logging::log(\get_called_class().' cURL error: '.curl_error($ch)); Logging::setMessage(get_called_class().' cURL error: '.curl_error($ch));
curl_close($ch); curl_close($ch);
return false; return false;

View File

@ -3,6 +3,7 @@
namespace PicoFeed\Clients; namespace PicoFeed\Clients;
use \PicoFeed\Logging; use \PicoFeed\Logging;
use \PicoFeed\Client;
/** /**
* Stream context HTTP client * Stream context HTTP client
@ -10,7 +11,7 @@ use \PicoFeed\Logging;
* @author Frederic Guillot * @author Frederic Guillot
* @package client * @package client
*/ */
class Stream extends \PicoFeed\Client class Stream extends Client
{ {
/** /**
* Do the HTTP request * Do the HTTP request
@ -24,11 +25,19 @@ class Stream extends \PicoFeed\Client
$headers = array( $headers = array(
'Connection: close', 'Connection: close',
'User-Agent: '.$this->user_agent, 'User-Agent: '.$this->user_agent,
'Accept-Encoding: gzip',
); );
if ($this->etag) $headers[] = 'If-None-Match: '.$this->etag; if (function_exists('gzdecode')) {
if ($this->last_modified) $headers[] = 'If-Modified-Since: '.$this->last_modified; $headers[] = 'Accept-Encoding: gzip';
}
if ($this->etag) {
$headers[] = 'If-None-Match: '.$this->etag;
}
if ($this->last_modified) {
$headers[] = 'If-Modified-Since: '.$this->last_modified;
}
// Create context // Create context
$context_options = array( $context_options = array(
@ -41,14 +50,22 @@ class Stream extends \PicoFeed\Client
) )
); );
if (parent::$proxy_hostname) { if ($this->proxy_hostname) {
$context_options['http']['proxy'] = 'tcp://'.parent::$proxy_hostname.':'.parent::$proxy_port;
Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);
$context_options['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port;
$context_options['http']['request_fulluri'] = true; $context_options['http']['request_fulluri'] = true;
if (parent::$proxy_username) { if ($this->proxy_username) {
$headers[] = 'Proxy-Authorization: Basic '.base64_encode(parent::$proxy_username.':'.parent::$proxy_password); Logging::setMessage(get_called_class().' Proxy credentials: Yes');
$headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password);
$context_options['http']['header'] = implode("\r\n", $headers); $context_options['http']['header'] = implode("\r\n", $headers);
} }
else {
Logging::setMessage(get_called_class().' Proxy credentials: No');
}
} }
$context = stream_context_create($context_options); $context = stream_context_create($context_options);

View File

@ -1,32 +1,6 @@
<?php <?php
/*
Copyright (c) 2008 Sebastián Grignoli
All rights reserved.
Redistribution and use in source and binary forms, with or without namespace PicoFeed;
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of copyright holders nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
/** /**
* @author "Sebastián Grignoli" <grignoli@framework2.com.ar> * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
@ -36,14 +10,10 @@ POSSIBILITY OF SUCH DAMAGE.
* @example https://github.com/neitanod/forceutf8 * @example https://github.com/neitanod/forceutf8
* @license Revised BSD * @license Revised BSD
*/ */
class Encoding
namespace PicoFeed; {
class Encoding {
protected static $win1252ToUtf8 = array( protected static $win1252ToUtf8 = array(
128 => "\xe2\x82\xac", 128 => "\xe2\x82\xac",
130 => "\xe2\x80\x9a", 130 => "\xe2\x80\x9a",
131 => "\xc6\x92", 131 => "\xc6\x92",
132 => "\xe2\x80\x9e", 132 => "\xe2\x80\x9e",
@ -55,10 +25,7 @@ class Encoding {
138 => "\xc5\xa0", 138 => "\xc5\xa0",
139 => "\xe2\x80\xb9", 139 => "\xe2\x80\xb9",
140 => "\xc5\x92", 140 => "\xc5\x92",
142 => "\xc5\xbd", 142 => "\xc5\xbd",
145 => "\xe2\x80\x98", 145 => "\xe2\x80\x98",
146 => "\xe2\x80\x99", 146 => "\xe2\x80\x99",
147 => "\xe2\x80\x9c", 147 => "\xe2\x80\x9c",
@ -71,49 +38,12 @@ class Encoding {
154 => "\xc5\xa1", 154 => "\xc5\xa1",
155 => "\xe2\x80\xba", 155 => "\xe2\x80\xba",
156 => "\xc5\x93", 156 => "\xc5\x93",
158 => "\xc5\xbe", 158 => "\xc5\xbe",
159 => "\xc5\xb8" 159 => "\xc5\xb8"
); );
protected static $brokenUtf8ToUtf8 = array(
"\xc2\x80" => "\xe2\x82\xac",
"\xc2\x82" => "\xe2\x80\x9a",
"\xc2\x83" => "\xc6\x92",
"\xc2\x84" => "\xe2\x80\x9e",
"\xc2\x85" => "\xe2\x80\xa6",
"\xc2\x86" => "\xe2\x80\xa0",
"\xc2\x87" => "\xe2\x80\xa1",
"\xc2\x88" => "\xcb\x86",
"\xc2\x89" => "\xe2\x80\xb0",
"\xc2\x8a" => "\xc5\xa0",
"\xc2\x8b" => "\xe2\x80\xb9",
"\xc2\x8c" => "\xc5\x92",
"\xc2\x8e" => "\xc5\xbd",
"\xc2\x91" => "\xe2\x80\x98",
"\xc2\x92" => "\xe2\x80\x99",
"\xc2\x93" => "\xe2\x80\x9c",
"\xc2\x94" => "\xe2\x80\x9d",
"\xc2\x95" => "\xe2\x80\xa2",
"\xc2\x96" => "\xe2\x80\x93",
"\xc2\x97" => "\xe2\x80\x94",
"\xc2\x98" => "\xcb\x9c",
"\xc2\x99" => "\xe2\x84\xa2",
"\xc2\x9a" => "\xc5\xa1",
"\xc2\x9b" => "\xe2\x80\xba",
"\xc2\x9c" => "\xc5\x93",
"\xc2\x9e" => "\xc5\xbe",
"\xc2\x9f" => "\xc5\xb8"
);
protected static $utf8ToWin1252 = array( protected static $utf8ToWin1252 = array(
"\xe2\x82\xac" => "\x80", "\xe2\x82\xac" => "\x80",
"\xe2\x80\x9a" => "\x82", "\xe2\x80\x9a" => "\x82",
"\xc6\x92" => "\x83", "\xc6\x92" => "\x83",
"\xe2\x80\x9e" => "\x84", "\xe2\x80\x9e" => "\x84",
@ -125,10 +55,7 @@ class Encoding {
"\xc5\xa0" => "\x8a", "\xc5\xa0" => "\x8a",
"\xe2\x80\xb9" => "\x8b", "\xe2\x80\xb9" => "\x8b",
"\xc5\x92" => "\x8c", "\xc5\x92" => "\x8c",
"\xc5\xbd" => "\x8e", "\xc5\xbd" => "\x8e",
"\xe2\x80\x98" => "\x91", "\xe2\x80\x98" => "\x91",
"\xe2\x80\x99" => "\x92", "\xe2\x80\x99" => "\x92",
"\xe2\x80\x9c" => "\x93", "\xe2\x80\x9c" => "\x93",
@ -141,12 +68,10 @@ class Encoding {
"\xc5\xa1" => "\x9a", "\xc5\xa1" => "\x9a",
"\xe2\x80\xba" => "\x9b", "\xe2\x80\xba" => "\x9b",
"\xc5\x93" => "\x9c", "\xc5\x93" => "\x9c",
"\xc5\xbe" => "\x9e", "\xc5\xbe" => "\x9e",
"\xc5\xb8" => "\x9f" "\xc5\xb8" => "\x9f"
); );
static function toUTF8($text){
/** /**
* Function Encoding::toUTF8 * Function Encoding::toUTF8
* *
@ -171,158 +96,95 @@ class Encoding {
* @return string The same string, UTF8 encoded * @return string The same string, UTF8 encoded
* *
*/ */
public static function toUTF8($text)
{
    // Arrays are converted element by element, keys are preserved
    if (is_array($text)) {
        foreach ($text as $k => $v) {
            $text[$k] = self::toUTF8($v);
        }

        return $text;
    }

    // Anything that is neither an array nor a string is returned untouched
    if (! is_string($text)) {
        return $text;
    }

    $max = strlen($text);
    $buf = '';

    for ($i = 0; $i < $max; $i++) {

        $c1 = $text[$i];

        if ($c1 >= "\xc0") {

            // Possible UTF-8 lead byte: peek at the next 3 bytes ("\x00" past the end of the string)
            $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
            $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
            $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];

            // A continuation byte is in the range 0x80..0xBF
            $c2_ok = $c2 >= "\x80" && $c2 <= "\xbf";
            $c3_ok = $c3 >= "\x80" && $c3 <= "\xbf";
            $c4_ok = $c4 >= "\x80" && $c4 <= "\xbf";

            if ($c1 <= "\xdf" && $c2_ok) {
                // Looks like a 2 bytes UTF-8 sequence: keep it as is
                $buf .= $c1.$c2;
                $i += 1;
            }
            elseif ($c1 >= "\xe0" && $c1 <= "\xef" && $c2_ok && $c3_ok) {
                // Looks like a 3 bytes UTF-8 sequence: keep it as is
                $buf .= $c1.$c2.$c3;
                $i += 2;
            }
            elseif ($c1 >= "\xf0" && $c1 <= "\xf7" && $c2_ok && $c3_ok && $c4_ok) {
                // Looks like a 4 bytes UTF-8 sequence: keep ALL four bytes and skip
                // the three continuation bytes, otherwise the last byte would be
                // re-read as a stray Windows-1252 character and corrupt the output
                $buf .= $c1.$c2.$c3.$c4;
                $i += 3;
            }
            else {
                // Not a valid UTF-8 sequence: treat the byte as a single 8-bit character
                $buf .= self::latin1ByteToUtf8($c1);
            }
        }
        elseif (($c1 & "\xc0") === "\x80") {

            // Byte in the range 0x80..0xBF outside of any sequence: needs conversion
            if (isset(self::$win1252ToUtf8[ord($c1)])) {
                // Found in the Windows-1252 special cases
                $buf .= self::$win1252ToUtf8[ord($c1)];
            }
            else {
                $buf .= self::latin1ByteToUtf8($c1);
            }
        }
        else {
            // Plain ASCII byte: it doesn't need conversion
            $buf .= $c1;
        }
    }

    return $buf;
}

/**
 * Encode a single 8-bit byte (0x80..0xFF) as a 2 bytes UTF-8 sequence
 *
 * @static
 * @access private
 * @param  string  $byte  One byte string
 * @return string         Two bytes string
 */
private static function latin1ByteToUtf8($byte)
{
    // High 2 bits go in the lead byte (110000xx), low 6 bits in the continuation byte (10xxxxxx)
    $lead = chr(ord($byte) >> 6) | "\xc0";
    $tail = ($byte & "\x3f") | "\x80";

    return $lead.$tail;
}
static function toWin1252($text) {
if(is_array($text)) {
foreach($text as $k => $v) {
$text[$k] = self::toWin1252($v);
}
return $text;
} elseif(is_string($text)) {
return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)));
} else {
return $text;
}
}
static function toISO8859($text) {
return self::toWin1252($text);
}
static function toLatin1($text) {
return self::toWin1252($text);
}
static function fixUTF8($text){
if(is_array($text)) {
foreach($text as $k => $v) {
$text[$k] = self::fixUTF8($v);
}
return $text;
}
$last = "";
while($last <> $text){
$last = $text;
$text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
}
$text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
return $text;
}
static function UTF8FixWin1252Chars($text){
// If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
// (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
// See: http://en.wikipedia.org/wiki/Windows-1252
return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
}
static function removeBOM($str=""){
if(substr($str, 0,3) == pack("CCC",0xef,0xbb,0xbf)) {
$str=substr($str, 3);
}
return $str;
}
public static function normalizeEncoding($encodingLabel)
{
$encoding = strtoupper($encodingLabel);
$enc = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
$equivalences = array(
'ISO88591' => 'ISO-8859-1',
'ISO8859' => 'ISO-8859-1',
'ISO' => 'ISO-8859-1',
'LATIN1' => 'ISO-8859-1',
'LATIN' => 'ISO-8859-1',
'UTF8' => 'UTF-8',
'UTF' => 'UTF-8',
'WIN1252' => 'ISO-8859-1',
'WINDOWS1252' => 'ISO-8859-1'
);
if(empty($equivalences[$encoding])){
return 'UTF-8';
}
return $equivalences[$encoding];
}
public static function encode($encodingLabel, $text)
{
$encodingLabel = self::normalizeEncoding($encodingLabel);
if($encodingLabel == 'UTF-8') return Encoding::toUTF8($text);
if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text);
}
public static function cp1251ToUtf8($input) public static function cp1251ToUtf8($input)
{ {
return iconv('CP1251', 'UTF-8//TRANSLIT', $input); return iconv('CP1251', 'UTF-8//TRANSLIT', $input);

View File

@ -2,26 +2,56 @@
namespace PicoFeed; namespace PicoFeed;
use SimpleXMLElement;
/**
* OPML export class
*
* @author Frederic Guillot
* @package picofeed
*/
class Export class Export
{ {
/**
* List of feeds to export
*
* @access private
* @var array
*/
private $content = array(); private $content = array();
public $required_fields = array( /**
* List of required properties for each feed
*
* @access private
* @var array
*/
private $required_fields = array(
'title', 'title',
'site_url', 'site_url',
'feed_url' 'feed_url',
); );
/**
* Constructor
*
* @access public
* @param array $content List of feeds
*/
public function __construct(array $content) public function __construct(array $content)
{ {
$this->content = $content; $this->content = $content;
} }
/**
* Get the OPML document
*
* @access public
* @return string
*/
public function execute() public function execute()
{ {
$xml = new \SimpleXMLElement('<?xml version="1.0" encoding="utf-8"?><opml/>'); $xml = new SimpleXMLElement('<?xml version="1.0" encoding="utf-8"?><opml/>');
$head = $xml->addChild('head'); $head = $xml->addChild('head');
$head->addChild('title', 'OPML Export'); $head->addChild('title', 'OPML Export');
@ -35,13 +65,14 @@ class Export
foreach ($this->required_fields as $field) { foreach ($this->required_fields as $field) {
if (! isset($feed[$field])) { if (! isset($feed[$field])) {
$valid = false; $valid = false;
break; break;
} }
} }
if (! $valid) continue; if (! $valid) {
continue;
}
$outline = $body->addChild('outline'); $outline = $body->addChild('outline');
$outline->addAttribute('xmlUrl', $feed['feed_url']); $outline->addAttribute('xmlUrl', $feed['feed_url']);

150
vendor/PicoFeed/Feed.php vendored Normal file
View File

@ -0,0 +1,150 @@
<?php
namespace PicoFeed;
/**
* Feed
*
* @author Frederic Guillot
* @package picofeed
*/
class Feed
{
    /**
     * Items of the feed
     *
     * @access public
     * @var array
     */
    public $items = array();

    /**
     * Unique identifier of the feed
     *
     * @access public
     * @var string
     */
    public $id = '';

    /**
     * Title of the feed
     *
     * @access public
     * @var string
     */
    public $title = '';

    /**
     * Url of the feed
     *
     * @access public
     * @var string
     */
    public $url = '';

    /**
     * Date of the feed
     *
     * @access public
     * @var integer
     */
    public $date = 0;

    /**
     * Language of the feed
     *
     * @access public
     * @var string
     */
    public $language = '';

    /**
     * Dump the feed properties followed by every item, one per block
     *
     * Each item is converted to a string and prefixed by a "----" separator line.
     *
     * @access public
     * @return string
     */
    public function __toString()
    {
        $parts = array();

        foreach (array('id', 'title', 'url', 'date', 'language') as $name) {
            $parts[] = 'Feed::'.$name.' = '.$this->$name.PHP_EOL;
        }

        $parts[] = 'Feed::items = '.count($this->items).' items'.PHP_EOL;

        foreach ($this->items as $entry) {
            $parts[] = '----'.PHP_EOL;
            $parts[] = (string) $entry;
        }

        return implode('', $parts);
    }

    /**
     * Return the feed title
     *
     * @access public
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Return the feed url
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Return the feed date
     *
     * @access public
     * @return integer
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Return the feed language
     *
     * @access public
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Return the feed identifier
     *
     * @access public
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Return the list of feed items
     *
     * @access public
     * @return array
     */
    public function getItems()
    {
        return $this->items;
    }
}

View File

@ -2,14 +2,24 @@
namespace PicoFeed; namespace PicoFeed;
use DOMDocument;
/** /**
* Filter class * Filter class
* *
* @author Frederic Guillot * @author Frederic Guillot
* @package parser * @package picofeed
*/ */
class Filter class Filter
{ {
/**
* Config object
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/** /**
* Filtered XML data * Filtered XML data
* *
@ -61,11 +71,10 @@ class Filter
/** /**
* Tags and attribute whitelist * Tags and attribute whitelist
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $whitelist_tags = array( private $whitelist_tags = array(
'audio' => array('controls', 'src'), 'audio' => array('controls', 'src'),
'video' => array('poster', 'controls', 'height', 'width', 'src'), 'video' => array('poster', 'controls', 'height', 'width', 'src'),
'source' => array('src', 'type'), 'source' => array('src', 'type'),
@ -109,11 +118,10 @@ class Filter
/** /**
* Tags blacklist, strip the content of those tags * Tags blacklist, strip the content of those tags
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $blacklist_tags = array( private $blacklisted_tags = array(
'script' 'script'
); );
@ -121,11 +129,10 @@ class Filter
* Scheme whitelist * Scheme whitelist
* For a complete list go to http://en.wikipedia.org/wiki/URI_scheme * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $scheme_whitelist = array( private $scheme_whitelist = array(
'//', '//',
'data:image/png;base64,', 'data:image/png;base64,',
'data:image/gif;base64,', 'data:image/gif;base64,',
@ -164,11 +171,10 @@ class Filter
/** /**
* Attributes used for external resources * Attributes used for external resources
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $media_attributes = array( private $media_attributes = array(
'src', 'src',
'href', 'href',
'poster', 'poster',
@ -177,11 +183,10 @@ class Filter
/** /**
* Blacklisted resources * Blacklisted resources
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $media_blacklist = array( private $media_blacklist = array(
'feeds.feedburner.com', 'feeds.feedburner.com',
'share.feedsportal.com', 'share.feedsportal.com',
'da.feedsportal.com', 'da.feedsportal.com',
@ -209,11 +214,10 @@ class Filter
/** /**
* Mandatory attributes for specified tags * Mandatory attributes for specified tags
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $required_attributes = array( private $required_attributes = array(
'a' => array('href'), 'a' => array('href'),
'img' => array('src'), 'img' => array('src'),
'iframe' => array('src'), 'iframe' => array('src'),
@ -224,22 +228,20 @@ class Filter
/** /**
* Add attributes to specified tags * Add attributes to specified tags
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $add_attributes = array( private $add_attributes = array(
'a' => 'rel="noreferrer" target="_blank"' 'a' => 'rel="noreferrer" target="_blank"'
); );
/** /**
* Attributes that must be integer * Attributes that must be integer
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $integer_attributes = array( private $integer_attributes = array(
'width', 'width',
'height', 'height',
'frameborder', 'frameborder',
@ -248,11 +250,10 @@ class Filter
/** /**
* Iframe source whitelist, everything else is ignored * Iframe source whitelist, everything else is ignored
* *
* @static * @access private
* @access public
* @var array * @var array
*/ */
public static $iframe_whitelist = array( private $iframe_whitelist = array(
'//www.youtube.com', '//www.youtube.com',
'http://www.youtube.com', 'http://www.youtube.com',
'https://www.youtube.com', 'https://www.youtube.com',
@ -273,10 +274,10 @@ class Filter
{ {
$this->url = $site_url; $this->url = $site_url;
\libxml_use_internal_errors(true); libxml_use_internal_errors(true);
// Convert bad formatted documents to XML // Convert bad formatted documents to XML
$dom = new \DOMDocument; $dom = new DOMDocument;
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$data); $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$data);
$this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0)); $this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
} }
@ -300,7 +301,7 @@ class Filter
$this->data = $this->removeEmptyTags($this->data); $this->data = $this->removeEmptyTags($this->data);
$this->data = $this->removeMultipleTags($this->data); $this->data = $this->removeMultipleTags($this->data);
return $this->data; return trim($this->data);
} }
/** /**
@ -372,9 +373,9 @@ class Filter
} }
// Check for required attributes // Check for required attributes
if (isset(self::$required_attributes[$name])) { if (isset($this->required_attributes[$name])) {
foreach (self::$required_attributes[$name] as $required_attribute) { foreach ($this->required_attributes[$name] as $required_attribute) {
if (! in_array($required_attribute, $used_attributes)) { if (! in_array($required_attribute, $used_attributes)) {
@ -389,9 +390,9 @@ class Filter
$this->data .= '<'.$name.$attr_data; $this->data .= '<'.$name.$attr_data;
// Add custom attributes // Add custom attributes
if (isset(self::$add_attributes[$name])) { if (isset($this->add_attributes[$name])) {
$this->data .= ' '.self::$add_attributes[$name].' '; $this->data .= ' '.$this->add_attributes[$name].' ';
} }
// If img or br, we don't close it here // If img or br, we don't close it here
@ -399,7 +400,7 @@ class Filter
} }
} }
if (in_array($name, self::$blacklist_tags)) { if (in_array($name, $this->blacklisted_tags)) {
$this->strip_content = true; $this->strip_content = true;
} }
@ -530,7 +531,7 @@ class Filter
*/ */
public function isAllowedTag($name) public function isAllowedTag($name)
{ {
return isset(self::$whitelist_tags[$name]); return isset($this->whitelist_tags[$name]);
} }
/** /**
@ -543,7 +544,7 @@ class Filter
*/ */
public function isAllowedAttribute($tag, $attribute) public function isAllowedAttribute($tag, $attribute)
{ {
return in_array($attribute, self::$whitelist_tags[$tag]); return in_array($attribute, $this->whitelist_tags[$tag]);
} }
/** /**
@ -555,7 +556,7 @@ class Filter
*/ */
public function isResource($attribute) public function isResource($attribute)
{ {
return in_array($attribute, self::$media_attributes); return in_array($attribute, $this->media_attributes);
} }
/** /**
@ -567,7 +568,7 @@ class Filter
*/ */
public function isAllowedIframeResource($value) public function isAllowedIframeResource($value)
{ {
foreach (self::$iframe_whitelist as $url) { foreach ($this->iframe_whitelist as $url) {
if (strpos($value, $url) === 0) { if (strpos($value, $url) === 0) {
return true; return true;
@ -586,7 +587,7 @@ class Filter
*/ */
public function isAllowedProtocol($value) public function isAllowedProtocol($value)
{ {
foreach (self::$scheme_whitelist as $protocol) { foreach ($this->scheme_whitelist as $protocol) {
if (strpos($value, $protocol) === 0) { if (strpos($value, $protocol) === 0) {
return true; return true;
@ -605,7 +606,7 @@ class Filter
*/ */
public function isBlacklistedMedia($resource) public function isBlacklistedMedia($resource)
{ {
foreach (self::$media_blacklist as $name) { foreach ($this->media_blacklist as $name) {
if (strpos($resource, $name) !== false) { if (strpos($resource, $name) !== false) {
return true; return true;
@ -640,7 +641,7 @@ class Filter
*/ */
public function validateAttributeValue($attribute, $value) public function validateAttributeValue($attribute, $value)
{ {
if (in_array($attribute, self::$integer_attributes)) { if (in_array($attribute, $this->integer_attributes)) {
return ctype_digit($value); return ctype_digit($value);
} }
@ -758,4 +759,147 @@ class Filter
return $encoding; return $encoding;
} }
/**
 * Replace the whitelist of tags and, for each tag, its allowed attributes
 *
 * An empty array is ignored and the current whitelist is kept.
 *
 * @access public
 * @param  array  $values  Map of tag => attributes: ['video' => ['src', 'cover'], 'img' => ['src']]
 * @return \PicoFeed\Filter
 */
public function setWhitelistedTags(array $values)
{
    if ($values) {
        $this->whitelist_tags = $values;
    }

    return $this;
}
/**
 * Replace the list of blacklisted tags (tags whose content is stripped)
 *
 * An empty array is ignored and the current blacklist is kept.
 *
 * @access public
 * @param  array  $values  List of tags: ['video', 'img']
 * @return \PicoFeed\Filter
 */
public function setBlacklistedTags(array $values)
{
    if ($values) {
        $this->blacklisted_tags = $values;
    }

    return $this;
}
/**
 * Replace the scheme whitelist
 *
 * Each entry is matched against the beginning of the attribute value.
 * An empty array is ignored and the current whitelist is kept.
 *
 * @access public
 * @param  array  $values  List of schemes: ['http://', 'ftp://']
 * @return \PicoFeed\Filter
 */
public function setSchemeWhitelist(array $values)
{
    if ($values) {
        $this->scheme_whitelist = $values;
    }

    return $this;
}
/**
 * Set media attributes (used to load external resources)
 *
 * An empty list is ignored: the current list is kept.
 *
 * @access public
 * @param  array   $values   List of attribute names: ['src', 'href']
 * @return \PicoFeed\Filter
 */
public function setMediaAttributes(array $values)
{
    if (! empty($values)) {
        $this->media_attributes = $values;
    }

    return $this;
}
/**
 * Set blacklisted external resources
 *
 * isBlacklistedMedia() rejects a resource when any entry appears anywhere in it.
 * An empty list is ignored: the current blacklist is kept.
 *
 * @access public
 * @param  array   $values   List of resources: ['http://google.com/', '...']
 * @return \PicoFeed\Filter
 */
public function setMediaBlacklist(array $values)
{
    if (! empty($values)) {
        $this->media_blacklist = $values;
    }

    return $this;
}
/**
 * Set mandatory attributes for whitelisted tags
 *
 * An empty list is ignored: the current mapping is kept.
 *
 * @access public
 * @param  array   $values   List of tags: ['img' => 'src']
 * @return \PicoFeed\Filter
 */
public function setRequiredAttributes(array $values)
{
    if (! empty($values)) {
        $this->required_attributes = $values;
    }

    return $this;
}
/**
 * Set attributes to add automatically to specific tags
 *
 * An empty list is ignored: the current mapping is kept.
 *
 * @access public
 * @param  array   $values   List of tags: ['a' => 'target="_blank"']
 * @return \PicoFeed\Filter
 */
public function setAttributeOverrides(array $values)
{
    if (! empty($values)) {
        $this->add_attributes = $values;
    }

    return $this;
}
/**
 * Set attributes that must be an integer
 *
 * validateAttributeValue() checks the values of these attributes with ctype_digit().
 * An empty list is ignored: the current list is kept.
 *
 * @access public
 * @param  array   $values   List of attribute names: ['width', 'height']
 * @return \PicoFeed\Filter
 */
public function setIntegerAttributes(array $values)
{
    if (! empty($values)) {
        $this->integer_attributes = $values;
    }

    return $this;
}
/**
 * Set allowed iframe resources
 *
 * Each entry is compared as a prefix of the iframe URL by isAllowedIframeResource().
 * An empty list is ignored: the current whitelist is kept.
 *
 * @access public
 * @param  array   $values   List of URL prefixes: ['http://www.youtube.com']
 * @return \PicoFeed\Filter
 */
public function setIframeWhitelist(array $values)
{
    if (! empty($values)) {
        $this->iframe_whitelist = $values;
    }

    return $this;
}
/**
 * Set config object
 *
 * Stores the config and, when it is not null, feeds every filter list from it.
 * Each getter is called with an empty array as default, and the setters ignore
 * empty lists, so options missing from the config leave the built-in lists untouched.
 *
 * @access public
 * @param  \PicoFeed\Config  $config   Config instance
 * @return \PicoFeed\Filter
 */
public function setConfig($config)
{
    $this->config = $config;

    // Nothing to apply without a config object
    if ($config === null) {
        return $this;
    }

    $this->setIframeWhitelist($config->getFilterIframeWhitelist(array()));
    $this->setIntegerAttributes($config->getFilterIntegerAttributes(array()));
    $this->setAttributeOverrides($config->getFilterAttributeOverrides(array()));
    $this->setRequiredAttributes($config->getFilterRequiredAttributes(array()));
    $this->setMediaBlacklist($config->getFilterMediaBlacklist(array()));
    $this->setMediaAttributes($config->getFilterMediaAttributes(array()));
    $this->setSchemeWhitelist($config->getFilterSchemeWhitelist(array()));
    $this->setBlacklistedTags($config->getFilterBlacklistedTags(array()));
    $this->setWhitelistedTags($config->getFilterWhitelistedTags(array()));

    return $this;
}
} }

View File

@ -2,19 +2,59 @@
namespace PicoFeed; namespace PicoFeed;
require_once __DIR__.'/Client.php'; use DOMXPath;
require_once __DIR__.'/Encoding.php'; use PicoFeed\Logging;
require_once __DIR__.'/Logging.php'; use PicoFeed\Client;
require_once __DIR__.'/Filter.php'; use PicoFeed\Encoding;
use PicoFeed\Filter;
/**
* Grabber class
*
* @author Frederic Guillot
* @package picofeed
*/
class Grabber class Grabber
{ {
public $content = ''; /**
public $html = ''; * URL
public $encoding = ''; *
* @access private
* @var string
*/
private $url = '';
// Order is important, generic terms at the end /**
public $candidatesAttributes = array( * Relevant content
*
* @access private
* @var string
*/
private $content = '';
/**
* HTML content
*
* @access private
* @var string
*/
private $html = '';
/**
* HTML content encoding
*
* @access private
* @var string
*/
private $encoding = '';
/**
* List of attributes to try to get the content, order is important, generic terms at the end
*
* @access private
* @var array
*/
private $candidatesAttributes = array(
'articleBody', 'articleBody',
'articlebody', 'articlebody',
'article-body', 'article-body',
@ -37,7 +77,13 @@ class Grabber
'main', 'main',
); );
public $stripAttributes = array( /**
* List of attributes to strip
*
* @access private
* @var array
*/
private $stripAttributes = array(
'comment', 'comment',
'share', 'share',
'links', 'links',
@ -57,7 +103,13 @@ class Grabber
'categories', 'categories',
); );
public $stripTags = array( /**
* Tags to remove
*
* @access private
* @var array
*/
private $stripTags = array(
'script', 'script',
'style', 'style',
'nav', 'nav',
@ -67,7 +119,22 @@ class Grabber
'form', 'form',
); );
/**
* Config object
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/**
* Constructor
*
* @access public
* @param string $url Url
* @param string $html HTML content
* @param string $encoding Charset
*/
public function __construct($url, $html = '', $encoding = 'utf-8') public function __construct($url, $html = '', $encoding = 'utf-8')
{ {
$this->url = $url; $this->url = $url;
@ -75,13 +142,53 @@ class Grabber
$this->encoding = $encoding; $this->encoding = $encoding;
} }
/**
 * Set config object
 *
 * The stored instance is read by download() to configure the HTTP client
 * (timeout, user agent, redirections, body size and proxy settings).
 * While it stays null, download() applies none of these settings.
 *
 * @access public
 * @param  \PicoFeed\Config  $config   Config instance
 * @return \PicoFeed\Grabber
 */
public function setConfig($config)
{
$this->config = $config;
return $this;
}
/**
 * Get relevant content
 *
 * Returns the content extracted by parse(); the value is an empty string
 * until an extraction has stored something.
 *
 * @access public
 * @return string
 */
public function getContent()
{
return $this->content;
}
/**
 * Get raw content (unfiltered)
 *
 * Returns the HTML given to the constructor or fetched by download().
 * Note that parse() rewrites this value (head tags stripped, converted to UTF-8).
 *
 * @access public
 * @return string
 */
public function getRawContent()
{
return $this->html;
}
/**
* Parse the HTML content
*
* @access public
* @return bool
*/
public function parse() public function parse()
{ {
if ($this->html) { if ($this->html) {
Logging::log(\get_called_class().' Fix encoding'); Logging::setMessage(get_called_class().' Fix encoding');
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"'); Logging::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Filter::stripHeadTags($this->html); $this->html = Filter::stripHeadTags($this->html);
@ -92,42 +199,63 @@ class Grabber
$this->html = Encoding::toUTF8($this->html); $this->html = Encoding::toUTF8($this->html);
} }
Logging::log(\get_called_class().' Content length: '.strlen($this->html).' bytes'); Logging::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');
$rules = $this->getRules(); $rules = $this->getRules();
if (is_array($rules)) { if (is_array($rules)) {
Logging::log(\get_called_class().' Parse content with rules'); Logging::setMessage(get_called_class().' Parse content with rules');
$this->parseContentWithRules($rules); $this->parseContentWithRules($rules);
} }
else { else {
Logging::log(\get_called_class().' Parse content with candidates'); Logging::setMessage(get_called_class().' Parse content with candidates');
$this->parseContentWithCandidates(); $this->parseContentWithCandidates();
} }
} }
else { else {
Logging::log(\get_called_class().' No content fetched'); Logging::setMessage(get_called_class().' No content fetched');
} }
Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes'); Logging::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');
Logging::log(\get_called_class().' Grabber done'); Logging::setMessage(get_called_class().' Grabber done');
return $this->content !== ''; return $this->content !== '';
} }
/**
public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36') * Download the HTML content
*
* @access public
* @return string HTML content
*/
public function download()
{ {
$client = Client::create(); $client = Client::getInstance();
$client->url = $this->url;
$client->timeout = $timeout; if ($this->config !== null) {
$client->user_agent = $user_agent;
$client->execute(); $client->setTimeout($this->config->getGrabberTimeout())
->setUserAgent($this->config->getGrabberUserAgent())
->setMaxRedirections($this->config->getMaxRedirections())
->setMaxBodySize($this->config->getMaxBodySize())
->setProxyHostname($this->config->getProxyHostname())
->setProxyPort($this->config->getProxyPort())
->setProxyUsername($this->config->getProxyUsername())
->setProxyPassword($this->config->getProxyPassword());
}
$client->execute($this->url);
$this->html = $client->getContent(); $this->html = $client->getContent();
$this->encoding = $client->getEncoding();
return $this->html; return $this->html;
} }
/**
* Try to find a predefined rule
*
* @access public
* @return mixed
*/
public function getRules() public function getRules()
{ {
$hostname = parse_url($this->url, PHP_URL_HOST); $hostname = parse_url($this->url, PHP_URL_HOST);
@ -147,7 +275,7 @@ class Grabber
$filename = __DIR__.'/Rules/'.$file.'.php'; $filename = __DIR__.'/Rules/'.$file.'.php';
if (file_exists($filename)) { if (file_exists($filename)) {
Logging::log(\get_called_class().' Load rule: '.$file); Logging::setMessage(get_called_class().' Load rule: '.$file);
return include $filename; return include $filename;
} }
} }
@ -155,13 +283,16 @@ class Grabber
return false; return false;
} }
/**
* Get the relevant content with predefined rules
*
* @access public
* @param array $rules Rules
*/
public function parseContentWithRules(array $rules) public function parseContentWithRules(array $rules)
{ {
\libxml_use_internal_errors(true); $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$dom = new \DOMDocument; $xpath = new DOMXPath($dom);
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom);
if (isset($rules['strip']) && is_array($rules['strip'])) { if (isset($rules['strip']) && is_array($rules['strip'])) {
@ -192,24 +323,26 @@ class Grabber
} }
} }
/**
* Get the relevant content with the list of potential attributes
*
* @access public
*/
public function parseContentWithCandidates() public function parseContentWithCandidates()
{ {
\libxml_use_internal_errors(true); $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$dom = new \DOMDocument; $xpath = new DOMXPath($dom);
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom);
// Try to lookup in each tag // Try to lookup in each tag
foreach ($this->candidatesAttributes as $candidate) { foreach ($this->candidatesAttributes as $candidate) {
Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"'); Logging::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0)); $this->content = $dom->saveXML($nodes->item(0));
Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)'); Logging::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
break; break;
} }
} }
@ -221,33 +354,38 @@ class Grabber
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0)); $this->content = $dom->saveXML($nodes->item(0));
Logging::log(\get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)'); Logging::setMessage(get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
} }
} }
if (strlen($this->content) < 50) { if (strlen($this->content) < 50) {
Logging::log(\get_called_class().' No enought content fetched, get the full body'); Logging::setMessage(get_called_class().' No enought content fetched, get the full body');
$this->content = $dom->saveXML($dom->firstChild); $this->content = $dom->saveXML($dom->firstChild);
} }
Logging::log(\get_called_class().' Strip garbage'); Logging::setMessage(get_called_class().' Strip garbage');
$this->stripGarbage(); $this->stripGarbage();
} }
/**
* Strip useless tags
*
* @access public
*/
public function stripGarbage() public function stripGarbage()
{ {
\libxml_use_internal_errors(true); $dom = XmlParser::getDomDocument($this->content);
$dom = new \DOMDocument;
$dom->loadXML($this->content); if ($dom !== false) {
$xpath = new \DOMXPath($dom);
$xpath = new DOMXPath($dom);
foreach ($this->stripTags as $tag) { foreach ($this->stripTags as $tag) {
$nodes = $xpath->query('//'.$tag); $nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
Logging::log(\get_called_class().' Strip tag: "'.$tag.'"'); Logging::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
foreach ($nodes as $node) { foreach ($nodes as $node) {
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
@ -259,7 +397,7 @@ class Grabber
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
Logging::log(\get_called_class().' Strip attribute: "'.$tag.'"'); Logging::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
foreach ($nodes as $node) { foreach ($nodes as $node) {
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
@ -269,3 +407,4 @@ class Grabber
$this->content = $dom->saveXML($dom->documentElement); $this->content = $dom->saveXML($dom->documentElement);
} }
} }
}

View File

@ -3,47 +3,75 @@
namespace PicoFeed; namespace PicoFeed;
require_once __DIR__.'/Logging.php'; require_once __DIR__.'/Logging.php';
require_once __DIR__.'/XmlParser.php';
use PicoFeed\Logging;
use PicoFeed\XmlParser;
/**
* OPML Import
*
* @author Frederic Guillot
* @package picofeed
*/
class Import class Import
{ {
/**
* OPML file content
*
* @access private
* @var string
*/
private $content = ''; private $content = '';
/**
* Subscriptions
*
* @access private
* @var array
*/
private $items = array(); private $items = array();
/**
* Constructor
*
* @access public
* @param string $content OPML file content
*/
public function __construct($content) public function __construct($content)
{ {
$this->content = $content; $this->content = $content;
} }
/**
* Parse the OPML file
*
* @access public
* @return array|false
*/
public function execute() public function execute()
{ {
\PicoFeed\Logging::log(\get_called_class().': start importation'); Logging::setMessage(get_called_class().': start importation');
try { $xml = XmlParser::getSimpleXml(trim($this->content));
\libxml_use_internal_errors(true); if ($xml === false || $xml->getName() !== 'opml' || ! isset($xml->body)) {
Logging::setMessage(get_called_class().': OPML tag not found or malformed XML document');
$xml = new \SimpleXMLElement(trim($this->content));
if ($xml->getName() !== 'opml' || ! isset($xml->body)) {
\PicoFeed\Logging::log(\get_called_class().': OPML tag not found');
return false; return false;
} }
$this->parseEntries($xml->body); $this->parseEntries($xml->body);
Logging::setMessage(get_called_class().': '.count($this->items).' subscriptions found');
\PicoFeed\Logging::log(\get_called_class().': '.count($this->items).' subscriptions found');
}
catch (\Exception $e) {
\PicoFeed\Logging::log(\get_called_class().': '.$e->getMessage());
return false;
}
return $this->items; return $this->items;
} }
/**
* Parse each entries of the subscription list
*
* @access public
* @param SimpleXMLElement $tree XML node
*/
public function parseEntries($tree) public function parseEntries($tree)
{ {
if (isset($tree->outline)) { if (isset($tree->outline)) {

202
vendor/PicoFeed/Item.php vendored Normal file
View File

@ -0,0 +1,202 @@
<?php
namespace PicoFeed;
/**
 * Feed Item
 *
 * Plain value holder for one feed entry, with public properties and matching getters.
 *
 * @author  Frederic Guillot
 * @package picofeed
 */
class Item
{
    /**
     * Item id
     *
     * @access public
     * @var string
     */
    public $id = '';

    /**
     * Item title
     *
     * @access public
     * @var string
     */
    public $title = '';

    /**
     * Item url
     *
     * @access public
     * @var string
     */
    public $url = '';

    /**
     * Item author
     *
     * @access public
     * @var string
     */
    public $author = '';

    /**
     * Item date
     *
     * @access public
     * @var integer
     */
    public $date = 0;

    /**
     * Item content
     *
     * @access public
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url
     *
     * @access public
     * @var string
     */
    public $enclosure_url = '';

    /**
     * Item enclosure type
     *
     * @access public
     * @var string
     */
    public $enclosure_type = '';

    /**
     * Item language
     *
     * @access public
     * @var string
     */
    public $language = '';

    /**
     * Return item information
     *
     * One "Item::<property> = <value>" line per property; the content is
     * reported by its size in bytes only. Every line ends with PHP_EOL.
     *
     * @access public
     * @return string
     */
    public function __toString()
    {
        $names = array('id', 'title', 'url', 'date', 'language', 'author', 'enclosure_url', 'enclosure_type');
        $lines = array();

        foreach ($names as $name) {
            $lines[] = 'Item::'.$name.' = '.$this->$name;
        }

        $lines[] = 'Item::content = '.strlen($this->content).' bytes';

        return implode(PHP_EOL, $lines).PHP_EOL;
    }

    /**
     * Get title
     *
     * @access public
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get id
     *
     * @access public
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get date
     *
     * @access public
     * @return integer
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get enclosure url
     *
     * @access public
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosure_url;
    }

    /**
     * Get enclosure type
     *
     * @access public
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosure_type;
    }

    /**
     * Get language
     *
     * @access public
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get author
     *
     * @access public
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }
}

View File

@ -2,12 +2,82 @@
namespace PicoFeed; namespace PicoFeed;
use DateTime;
use DateTimeZone;
/**
* Logging class
*
* @author Frederic Guillot
* @package picofeed
*/
class Logging class Logging
{ {
public static $messages = array(); /**
* List of messages
*
* @static
* @access private
* @var array
*/
private static $messages = array();
public static function log($message) /**
* Default timezone
*
* @static
* @access private
* @var string
*/
private static $timezone = 'UTC';
/**
* Add a new message
*
* @static
* @access public
* @param string $message Message
*/
public static function setMessage($message)
{ {
self::$messages[] = '['.date('Y-m-d H:i:s').'] '.$message; $date = new DateTime('now', new DateTimeZone(self::$timezone));
self::$messages[] = '['.$date->format('Y-m-d H:i:s').'] '.$message;
}
/**
 * Get all logged messages
 *
 * Messages are returned in the order they were added by setMessage(),
 * each one already prefixed with its "[Y-m-d H:i:s]" timestamp.
 *
 * @static
 * @access public
 * @return array
 */
public static function getMessages()
{
return self::$messages;
}
/**
 * Remove all logged messages
 *
 * Resets the static message list to an empty array.
 *
 * @static
 * @access public
 */
public static function deleteMessages()
{
self::$messages = array();
}
/**
* Set a different timezone
*
* @static
* @see http://php.net/manual/en/timezones.php
* @access public
* @param string $timezone Timezone
*/
public static function setTimeZone($timezone)
{
self::$timezone = $timezone ?: self::$timezone;
} }
} }

View File

@ -2,10 +2,16 @@
namespace PicoFeed; namespace PicoFeed;
require_once __DIR__.'/Logging.php'; use DateTime;
require_once __DIR__.'/Filter.php'; use DateTimeZone;
require_once __DIR__.'/Encoding.php'; use DOMXPath;
require_once __DIR__.'/Grabber.php'; use SimpleXMLElement;
use PicoFeed\Config;
use PicoFeed\Encoding;
use PicoFeed\Filter;
use PicoFeed\Grabber;
use PicoFeed\Logging;
use PicoFeed\XmlParser;
/** /**
* Base parser class * Base parser class
@ -15,14 +21,29 @@ require_once __DIR__.'/Grabber.php';
*/ */
abstract class Parser abstract class Parser
{ {
/**
* Config object
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/** /**
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos() * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos()
* *
* @access public * @access private
* @static
* @var string * @var string
*/ */
public static $hashAlgo = 'crc32b'; // crc32b seems to be faster and shorter than other hash algorithms private $hash_algo = 'crc32b'; // crc32b seems to be faster and shorter than other hash algorithms
/**
* Timezone used to parse feed dates
*
* @access private
* @var string
*/
private $timezone = 'UTC';
/** /**
* Feed content (XML data) * Feed content (XML data)
@ -33,35 +54,28 @@ abstract class Parser
protected $content = ''; protected $content = '';
/** /**
* Feed properties (values parsed) * XML namespaces
* *
* @access public * @access protected
* @var array
*/ */
public $id = ''; protected $namespaces = array();
public $url = '';
public $title = '';
public $updated = '';
public $language = '';
public $items = array();
/** /**
* Content grabber parameters * Enable the content grabber
* *
* @access public * @access private
* @var bool
*/ */
public $grabber = false; public $enable_grabber = false;
public $grabber_ignore_urls = array();
public $grabber_timeout = null;
public $grabber_user_agent = null;
/** /**
* Parse feed content * Ignore those urls for the content scraper
* *
* @abstract * @access private
* @access public * @var array
* @return mixed
*/ */
abstract public function execute(); private $grabber_ignore_urls = array();
/** /**
* Constructor * Constructor
@ -73,7 +87,7 @@ abstract class Parser
public function __construct($content, $http_encoding = '') public function __construct($content, $http_encoding = '')
{ {
$xml_encoding = Filter::getEncodingFromXmlTag($content); $xml_encoding = Filter::getEncodingFromXmlTag($content);
Logging::log(\get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); Logging::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
// Strip XML tag to avoid multiple encoding/decoding in the next XML processing // Strip XML tag to avoid multiple encoding/decoding in the next XML processing
$this->content = Filter::stripXmlTag($content); $this->content = Filter::stripXmlTag($content);
@ -90,6 +104,52 @@ abstract class Parser
$this->content = $this->normalizeData($this->content); $this->content = $this->normalizeData($this->content);
} }
/**
 * Parse the document
 *
 * Loads the feed content as XML, fills a Feed object through the findFeed*()
 * methods, then builds one Item per entry returned by getItemsTree() through
 * the findItem*() methods. The finders are called in a fixed order.
 *
 * @access public
 * @return mixed   \PicoFeed\Feed instance or false when the XML cannot be parsed
 */
public function execute()
{
    $class = get_called_class();

    Logging::setMessage($class.': begin parsing');

    $xml = XmlParser::getSimpleXml($this->content);

    if ($xml === false) {
        Logging::setMessage($class.': XML parsing error');
        Logging::setMessage(XmlParser::getErrors());
        return false;
    }

    $this->namespaces = $xml->getNamespaces(true);

    // Feed level properties
    $feed = new Feed;
    $this->findFeedUrl($xml, $feed);
    $this->findFeedTitle($xml, $feed);
    $this->findFeedLanguage($xml, $feed);
    $this->findFeedId($xml, $feed);
    $this->findFeedDate($xml, $feed);

    // Entry level properties
    foreach ($this->getItemsTree($xml) as $node) {

        $current = new Item;
        $this->findItemAuthor($xml, $node, $current);
        $this->findItemUrl($node, $current);
        $this->findItemTitle($node, $current);
        $this->findItemId($node, $current, $feed);
        $this->findItemDate($node, $current);
        $this->findItemContent($node, $current);
        $this->findItemEnclosure($node, $current, $feed);
        $this->findItemLanguage($node, $current, $feed);

        $feed->items[] = $current;
    }

    Logging::setMessage($class.PHP_EOL.$feed);

    return $feed;
}
/** /**
* Filter HTML for entry content * Filter HTML for entry content
* *
@ -102,43 +162,40 @@ abstract class Parser
{ {
$content = ''; $content = '';
if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) { // Setup the content scraper
if ($this->enable_grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
$grabber = new Grabber($item_url); $grabber = new Grabber($item_url);
$grabber->download($this->grabber_timeout, $this->grabber_user_agent); $grabber->setConfig($this->config);
if ($grabber->parse()) $item_content = $grabber->content; $grabber->download();
if ($grabber->parse()) {
$item_content = $grabber->getContent();
}
} }
// Content filtering
if ($item_content) { if ($item_content) {
if ($this->config !== null) {
$callback = $this->config->getContentFilteringCallback();
if (is_callable($callback)) {
$content = $callback($item_content, $item_url);
}
}
if (! $content) {
$filter = new Filter($item_content, $item_url); $filter = new Filter($item_content, $item_url);
$filter->setConfig($this->config);
$content = $filter->execute(); $content = $filter->execute();
} }
}
return $content; return $content;
} }
/**
* Get XML parser errors
*
* @access public
* @return string
*/
public function getXmlErrors()
{
$errors = array();
foreach(\libxml_get_errors() as $error) {
$errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)',
$error->message,
$error->line,
$error->column,
$error->code
);
}
return implode(', ', $errors);
}
/** /**
* Dirty quickfixes before XML parsing * Dirty quickfixes before XML parsing
* *
@ -148,6 +205,7 @@ abstract class Parser
*/ */
public function normalizeData($data) public function normalizeData($data)
{ {
$data = str_replace("\x10", '', $data);
$data = str_replace("\xc3\x20", '', $data); $data = str_replace("\xc3\x20", '', $data);
$data = str_replace("&#x1F;", '', $data); $data = str_replace("&#x1F;", '', $data);
$data = $this->replaceEntityAttribute($data); $data = $this->replaceEntityAttribute($data);
@ -194,7 +252,7 @@ abstract class Parser
*/ */
public function generateId() public function generateId()
{ {
return hash(self::$hashAlgo, implode(func_get_args())); return hash($this->hash_algo, implode(func_get_args()));
} }
/** /**
@ -249,7 +307,8 @@ abstract class Parser
} }
} }
return time(); $date = new DateTime('now', new DateTimeZone($this->timezone));
return $date->getTimestamp();
} }
/** /**
@ -262,11 +321,15 @@ abstract class Parser
*/ */
public function getValidDate($format, $value) public function getValidDate($format, $value)
{ {
$date = \DateTime::createFromFormat($format, $value); $date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));
if ($date !== false) { if ($date !== false) {
$errors = \DateTime::getLastErrors();
if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) return $date->getTimestamp(); $errors = DateTime::getLastErrors();
if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) {
return $date->getTimestamp();
}
} }
return 0; return 0;
@ -299,10 +362,13 @@ abstract class Parser
*/ */
public function getXmlLang($xml) public function getXmlLang($xml)
{ {
$dom = new \DOMDocument; $dom = XmlParser::getDomDocument($this->content);
$dom->loadXML($this->content);
$xpath = new \DOMXPath($dom); if ($dom === false) {
return '';
}
$xpath = new DOMXPath($dom);
return $xpath->evaluate('string(//@xml:lang[1])') ?: ''; return $xpath->evaluate('string(//@xml:lang[1])') ?: '';
} }
@ -318,30 +384,108 @@ abstract class Parser
{ {
$language = strtolower($language); $language = strtolower($language);
// Arabic (ar-**) $rtl_languages = array(
if (strpos($language, 'ar') === 0) return true; 'ar', // Arabic (ar-**)
'fa', // Farsi (fa-**)
'ur', // Urdu (ur-**)
'ps', // Pashtu (ps-**)
'syr', // Syriac (syr-**)
'dv', // Divehi (dv-**)
'he', // Hebrew (he-**)
'yi', // Yiddish (yi-**)
);
// Farsi (fa-**) foreach ($rtl_languages as $prefix) {
if (strpos($language, 'fa') === 0) return true; if (strpos($language, $prefix) === 0) {
return true;
// Urdu (ur-**) }
if (strpos($language, 'ur') === 0) return true; }
// Pashtu (ps-**)
if (strpos($language, 'ps') === 0) return true;
// Syriac (syr-**)
if (strpos($language, 'syr') === 0) return true;
// Divehi (dv-**)
if (strpos($language, 'dv') === 0) return true;
// Hebrew (he-**)
if (strpos($language, 'he') === 0) return true;
// Yiddish (yi-**)
if (strpos($language, 'yi') === 0) return true;
return false; return false;
} }
/**
 * Set Hash algorithm used for id generation
 *
 * A falsy value (empty string, null, ...) is ignored and the current algorithm is kept.
 *
 * @access public
 * @param  string   $algo   Algorithm name
 * @return \PicoFeed\Parser
 */
public function setHashAlgo($algo)
{
    if ($algo) {
        $this->hash_algo = $algo;
    }

    return $this;
}
/**
 * Set a different timezone
 *
 * A falsy value (empty string, null, ...) is ignored and the current timezone is kept.
 *
 * @see    http://php.net/manual/en/timezones.php
 * @access public
 * @param  string   $timezone   Timezone
 * @return \PicoFeed\Parser
 */
public function setTimezone($timezone)
{
    if ($timezone) {
        $this->timezone = $timezone;
    }

    return $this;
}
/**
 * Set config object
 *
 * The stored instance is used by filterHtml(): it is handed to the Grabber
 * and to the Filter, and its content filtering callback (if callable) is
 * tried before the default Filter.
 *
 * @access public
 * @param  \PicoFeed\Config  $config   Config instance
 * @return \PicoFeed\Parser
 */
public function setConfig($config)
{
$this->config = $config;
return $this;
}
/**
 * Enable the content grabber
 *
 * Once enabled, filterHtml() downloads each item url with the Grabber,
 * except for the urls registered with setGrabberIgnoreUrls().
 *
 * @access public
 * @return \PicoFeed\Parser
 */
public function enableContentGrabber()
{
    $this->enable_grabber = true;

    // Return the instance as documented, so the call can be chained like the other setters
    return $this;
}
/**
 * Set ignored URLs for the content grabber
 *
 * filterHtml() skips the grabber for any item whose url is in this list.
 * Unlike the other setters, an empty list is applied as-is and clears the list.
 *
 * @access public
 * @param  array   $urls   URLs
 * @return \PicoFeed\Parser
 */
public function setGrabberIgnoreUrls(array $urls)
{
    $this->grabber_ignore_urls = $urls;

    // Return the instance as documented, so the call can be chained like the other setters
    return $this;
}
/**
 * Get a value from a XML namespace
 *
 * Looks for the tag in each namespace, in the order given, and returns the
 * value of the first namespace that contains at least one such tag.
 *
 * @access public
 * @param  SimpleXMLElement   $xml          XML element
 * @param  array              $namespaces   XML namespaces (prefix => URI)
 * @param  string             $property     XML tag name
 * @return string             Tag value, or an empty string when no namespace has the tag
 */
public function getNamespaceValue(SimpleXMLElement $xml, array $namespaces, $property)
{
    foreach ($namespaces as $uri) {

        $children = $xml->children($uri);

        if ($children->$property->count() > 0) {
            return (string) $children->$property;
        }
    }

    return '';
}
} }

View File

@ -2,81 +2,247 @@
namespace PicoFeed\Parsers; namespace PicoFeed\Parsers;
use SimpleXMLElement;
use PicoFeed\Parser;
use PicoFeed\XmlParser;
use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Feed;
use PicoFeed\Item;
/** /**
* Atom parser * Atom parser
* *
* @author Frederic Guillot * @author Frederic Guillot
* @package parser * @package parser
*/ */
class Atom extends \PicoFeed\Parser class Atom extends Parser
{ {
/** /**
* Parse the document * Get the path to the items XML tree
* *
* @access public * @access public
* @return mixed Atom instance or false * @param SimpleXMLElement $xml Feed xml
* @return SimpleXMLElement
*/ */
public function execute() public function getItemsTree(SimpleXMLElement $xml)
{ {
\PicoFeed\Logging::log(\get_called_class().': begin parsing'); return $xml->entry;
\libxml_use_internal_errors(true);
$xml = \simplexml_load_string($this->content);
if ($xml === false) {
\PicoFeed\Logging::log(\get_called_class().': XML parsing error');
\PicoFeed\Logging::log($this->getXmlErrors());
return false;
} }
$this->language = $this->getXmlLang($this->content); /**
$this->url = $this->getUrl($xml); * Find the feed url
$this->title = $this->stripWhiteSpace((string) $xml->title) ?: $this->url; *
$this->id = (string) $xml->id; * @access public
$this->updated = $this->parseDate((string) $xml->updated); * @param SimpleXMLElement $xml Feed xml
$author = (string) $xml->author->name; * @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
{
    // The feed url is whatever getLink() resolves for the root element
    $link = $this->getLink($xml);
    $feed->url = $link;
}
\PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title); /**
\PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url); * Find the feed title
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{
    $title = $this->stripWhiteSpace((string) $xml->title);

    // Fall back to the feed url when the title is empty
    if ($title) {
        $feed->title = $title;
    }
    else {
        $feed->title = $feed->url;
    }
}
foreach ($xml->entry as $entry) { /**
* Find the feed language
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    // The language is looked up in the raw document content, not in $xml
    $language = $this->getXmlLang($this->content);
    $feed->language = $language;
}
/**
 * Store the feed identifier
 *
 * The value is the <id> tag of the root element, converted to a string.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedId(SimpleXMLElement $xml, Feed $feed)
{
    $feed_id = (string) $xml->id;
    $feed->id = $feed_id;
}
/**
 * Store the feed date
 *
 * The <updated> tag of the root element is passed to parseDate().
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    $updated = (string) $xml->updated;
    $feed->date = $this->parseDate($updated);
}
/**
 * Store the item date
 *
 * The <updated> tag of the entry is passed to parseDate().
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemDate(SimpleXMLElement $entry, Item $item)
{
    $updated = (string) $entry->updated;
    $item->date = $this->parseDate($updated);
}
/**
 * Store the item title
 *
 * When the cleaned <title> tag is empty, the item url is used as title.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    $title = $this->stripWhiteSpace((string) $entry->title);

    // Same emptiness rule as empty(): '', '0', null... fall back to the url
    $item->title = empty($title) ? $item->url : $title;
}
/**
* Find the item author
*
* @access public
* @param SimpleXMLElement $xml Feed
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
*/
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{
if (isset($entry->author->name)) { if (isset($entry->author->name)) {
$author = (string) $entry->author->name; $item->author = (string) $entry->author->name;
}
else {
$item->author = (string) $xml->author->name;
}
} }
/**
 * Store the item content
 *
 * The value returned by getContent() goes through filterHtml() together
 * with the item url before being stored.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemContent(SimpleXMLElement $entry, Item $item)
{
    $raw_content = $this->getContent($entry);
    $item->content = $this->filterHtml($raw_content, $item->url);
}
/**
 * Store the item URL
 *
 * The url is resolved by getLink() from the <link> tags of the entry.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemUrl(SimpleXMLElement $entry, Item $item)
{
    $entry_link = $this->getLink($entry);
    $item->url = $entry_link;
}
/**
* Generate the item id
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$id = (string) $entry->id; $id = (string) $entry->id;
$item = new \StdClass; if ($id !== $item->url) {
$item->url = $this->getUrl($entry); $item_permalink = $id;
$item->id = $this->generateId($id !== $item->url ? $id : $item->url, $this->isExcludedFromId($this->url) ? '' : $this->url); }
$item->title = $this->stripWhiteSpace((string) $entry->title); else {
$item->updated = $this->parseDate((string) $entry->updated); $item_permalink = $item->url;
$item->author = $author; }
$item->content = $this->filterHtml($this->getContent($entry), $item->url);
$item->language = $this->language;
if (empty($item->title)) $item->title = $item->url; if ($this->isExcludedFromId($feed->url)) {
$feed_permalink = '';
}
else {
$feed_permalink = $feed->url;
}
// Try to find an enclosure $item->id = $this->generateId($item_permalink, $feed_permalink);
}
/**
* Find the item enclosure
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
foreach ($entry->link as $link) { foreach ($entry->link as $link) {
if ((string) $link['rel'] === 'enclosure') { if ((string) $link['rel'] === 'enclosure') {
$item->enclosure = (string) $link['href'];
$item->enclosure_url = (string) $link['href'];
$item->enclosure_type = (string) $link['type']; $item->enclosure_type = (string) $link['type'];
if (\PicoFeed\Filter::isRelativePath($item->enclosure)) { if (Filter::isRelativePath($item->enclosure_url)) {
$item->enclosure = \PicoFeed\Filter::getAbsoluteUrl($item->enclosure, $this->url); $item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url);
} }
break; break;
} }
} }
$this->items[] = $item;
} }
\PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' items)'); /**
* Find the item language
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    // Each item simply takes the language already stored on the feed
    $feed_language = $feed->language;
    $item->language = $feed_language;
}
return $this; /**
* Get the URL from a link tag
*
* @access public
* @param SimpleXMLElement $xml XML tag
* @return string
*/
public function getLink(SimpleXMLElement $xml)
{
foreach ($xml->link as $link) {
if ((string) $link['type'] === 'text/html' || (string) $link['type'] === 'application/xhtml+xml') {
return (string) $link['href'];
}
}
return (string) $xml->link['href'];
} }
/** /**
@ -86,7 +252,7 @@ class Atom extends \PicoFeed\Parser
* @param SimpleXMLElement $entry XML Entry * @param SimpleXMLElement $entry XML Entry
* @return string * @return string
*/ */
public function getContent($entry) public function getContent(SimpleXMLElement $entry)
{ {
if (isset($entry->content) && ! empty($entry->content)) { if (isset($entry->content) && ! empty($entry->content)) {
@ -103,22 +269,4 @@ class Atom extends \PicoFeed\Parser
return ''; return '';
} }
/**
* Get the URL from a link tag
*
* @access public
* @param SimpleXMLElement $xml XML tag
* @return string
*/
public function getUrl($xml)
{
foreach ($xml->link as $link) {
if ((string) $link['type'] === 'text/html' || (string) $link['type'] === 'application/xhtml+xml') {
return (string) $link['href'];
}
}
return (string) $xml->link['href'];
}
} }

View File

@ -2,86 +2,86 @@
namespace PicoFeed\Parsers; namespace PicoFeed\Parsers;
class Rss10 extends \PicoFeed\Parser require_once __DIR__.'/Rss20.php';
{
public function execute()
{
\PicoFeed\Logging::log(\get_called_class().': begin parsing');
\libxml_use_internal_errors(true); use SimpleXMLElement;
$xml = \simplexml_load_string($this->content); use PicoFeed\Feed;
use PicoFeed\Item;
use PicoFeed\Parsers\Rss20;
if ($xml === false) { /**
\PicoFeed\Logging::log(\get_called_class().': XML parsing error'); * RSS 1.0 parser
\PicoFeed\Logging::log($this->getXmlErrors()); *
return false; * @author Frederic Guillot
* @package parser
*/
class Rss10 extends Rss20
{
/**
 * Return the XML nodes that hold the feed items
 *
 * Here the <item> tags are read directly under the root element.
 *
 * @access public
 * @param  SimpleXMLElement   $xml   Feed xml
 * @return SimpleXMLElement
 */
public function getItemsTree(SimpleXMLElement $xml)
{
    $items = $xml->item;

    return $items;
}
$namespaces = $xml->getNamespaces(true); /**
* Find the feed date
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    // Look for a namespaced "date" tag under <channel>
    $date_value = $this->getNamespaceValue($xml->channel, $this->namespaces, 'date');
    $feed->date = $this->parseDate($date_value);
}
$this->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $this->url; /**
$this->url = (string) $xml->channel->link; * Find the feed language
$this->id = $this->url; *
$this->language = ''; * @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    // Look for a namespaced "language" tag under <channel>
    $language = $this->getNamespaceValue($xml->channel, $this->namespaces, 'language');
    $feed->language = $language;
}
\PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title); /**
\PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url); * Genereate the item id
*
if (isset($namespaces['dc'])) { * @access public
$ns_dc = $xml->channel->children($namespaces['dc']); * @param SimpleXMLElement $entry Feed item
$this->updated = isset($ns_dc->date) ? $this->parseDate($ns_dc->date) : time(); * @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
if ($this->isExcludedFromId($feed->url)) {
$feed_permalink = '';
} }
else { else {
$this->updated = time(); $feed_permalink = $feed->url;
} }
foreach ($xml->item as $entry) { $item->id = $this->generateId($item->url, $feed_permalink);
$item = new \StdClass;
$item->title = $this->stripWhiteSpace((string) $entry->title);
$item->url = '';
$item->author= '';
$item->updated = '';
$item->content = '';
$item->language = '';
foreach ($namespaces as $name => $url) {
$namespace = $entry->children($namespaces[$name]);
if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink;
if (! $item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator;
if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated);
if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded;
} }
if (empty($item->url)) $item->url = (string) $entry->link; /**
if (empty($item->updated)) $item->updated = $this->updated; * Find the item enclosure
*
if (empty($item->content)) { * @access public
$item->content = isset($entry->description) ? (string) $entry->description : ''; * @param SimpleXMLElement $entry Feed item
} * @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
if (empty($item->author)) { */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
if (isset($entry->author)) { {
$item->author = (string) $entry->author;
}
else if (isset($xml->channel->webMaster)) {
$item->author = (string) $xml->channel->webMaster;
}
}
if (empty($item->title)) $item->title = $item->url;
$item->id = $this->generateId($item->url, $this->isExcludedFromId($this->url) ? '' : $this->url);
$item->content = $this->filterHtml($item->content, $item->url);
$this->items[] = $item;
}
\PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' items)');
return $this;
} }
} }

View File

@ -2,35 +2,43 @@
namespace PicoFeed\Parsers; namespace PicoFeed\Parsers;
use SimpleXMLElement;
use PicoFeed\Parser;
use PicoFeed\XmlParser;
use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Feed;
use PicoFeed\Item;
/** /**
* RSS 2.0 Parser * RSS 2.0 Parser
* *
* @author Frederic Guillot * @author Frederic Guillot
* @package parser * @package parser
*/ */
class Rss20 extends \PicoFeed\Parser class Rss20 extends Parser
{ {
/** /**
* Parse the document * Get the path to the items XML tree
* *
* @access public * @access public
* @return mixed Rss20 instance or false * @param SimpleXMLElement $xml Feed xml
* @return SimpleXMLElement
*/ */
public function execute() public function getItemsTree(SimpleXMLElement $xml)
{ {
\PicoFeed\Logging::log(\get_called_class().': begin parsing'); return $xml->channel->item;
\libxml_use_internal_errors(true);
$xml = \simplexml_load_string($this->content);
if ($xml === false) {
\PicoFeed\Logging::log(\get_called_class().': XML parsing error');
\PicoFeed\Logging::log($this->getXmlErrors());
return false;
} }
$namespaces = $xml->getNamespaces(true); /**
* Find the feed url
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
{
if ($xml->channel->link && $xml->channel->link->count() > 1) { if ($xml->channel->link && $xml->channel->link->count() > 1) {
foreach ($xml->channel->link as $xml_link) { foreach ($xml->channel->link as $xml_link) {
@ -38,74 +46,117 @@ class Rss20 extends \PicoFeed\Parser
$link = (string) $xml_link; $link = (string) $xml_link;
if ($link !== '') { if ($link !== '') {
$this->url = (string) $link; $feed->url = $link;
break; break;
} }
} }
} }
else { else {
$this->url = (string) $xml->channel->link; $feed->url = (string) $xml->channel->link;
}
} }
$this->language = isset($xml->channel->language) ? (string) $xml->channel->language : ''; /**
$this->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $this->url; * Find the feed title
$this->id = $this->url; *
$this->updated = $this->parseDate(isset($xml->channel->pubDate) ? (string) $xml->channel->pubDate : (string) $xml->channel->lastBuildDate); * @access public
* @param SimpleXMLElement $xml Feed xml
\PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title); * @param \PicoFeed\Feed $feed Feed object
\PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url); */
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
// RSS feed might be empty {
if (! $xml->channel->item) { $feed->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $feed->url;
\PicoFeed\Logging::log(\get_called_class().': feed empty or malformed');
return $this;
} }
foreach ($xml->channel->item as $entry) { /**
* Find the feed language
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    // The <language> tag is optional: default to an empty string
    if (isset($xml->channel->language)) {
        $feed->language = (string) $xml->channel->language;
    }
    else {
        $feed->language = '';
    }
}
$item = new \StdClass; /**
* Find the feed id
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedId(SimpleXMLElement $xml, Feed $feed)
{
    // The feed url is reused as identifier, $xml is not read here
    $feed_url = $feed->url;
    $feed->id = $feed_url;
}
/**
 * Store the feed date
 *
 * The <pubDate> tag of the channel is used when it is set,
 * otherwise <lastBuildDate> is passed to parseDate().
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    if (isset($xml->channel->pubDate)) {
        $date = $xml->channel->pubDate;
    }
    else {
        $date = $xml->channel->lastBuildDate;
    }

    $feed->date = $this->parseDate((string) $date);
}
/**
 * Store the item date
 *
 * Candidates are tried in this order: namespaced "date" tag,
 * namespaced "updated" tag, then the <pubDate> tag of the item.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemDate(SimpleXMLElement $entry, Item $item)
{
    $date = '';

    // Stop at the first namespaced tag that yields a non-empty value
    foreach (array('date', 'updated') as $tag) {

        $date = $this->getNamespaceValue($entry, $this->namespaces, $tag);

        if (! empty($date)) {
            break;
        }
    }

    if (empty($date)) {
        $date = (string) $entry->pubDate;
    }

    $item->date = $this->parseDate($date);
}
/**
* Find the item title
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
*/
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
$item->title = $this->stripWhiteSpace((string) $entry->title); $item->title = $this->stripWhiteSpace((string) $entry->title);
$item->url = '';
$item->author= '';
$item->updated = '';
$item->content = '';
$item->enclosure = '';
$item->enclosure_type = '';
$item->language = $this->language;
foreach ($namespaces as $name => $url) { if (empty($item->title)) {
$item->title = $item->url;
$namespace = $entry->children($namespaces[$name]);
if (! $item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator;
if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated);
if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded;
// Get FeedBurner original links
if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink;
if (! $item->enclosure && ! empty($namespace->origEnclosureLink)) $item->enclosure = (string) $namespace->origEnclosureLink;
}
if (empty($item->url)) {
if (isset($entry->link)) {
$item->url = (string) $entry->link;
}
else if (isset($entry->guid)) {
$item->url = (string) $entry->guid;
} }
} }
if (empty($item->updated)) $item->updated = $this->parseDate((string) $entry->pubDate) ?: $this->updated; /**
* Find the item author
if (empty($item->content)) { *
$item->content = isset($entry->description) ? (string) $entry->description : ''; * @access public
} * @param SimpleXMLElement $xml Feed
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
*/
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{
$item->author = $this->getNamespaceValue($entry, $this->namespaces, 'creator');
if (empty($item->author)) { if (empty($item->author)) {
if (isset($entry->author)) { if (isset($entry->author)) {
$item->author = (string) $entry->author; $item->author = (string) $entry->author;
} }
@ -113,37 +164,110 @@ class Rss20 extends \PicoFeed\Parser
$item->author = (string) $xml->channel->webMaster; $item->author = (string) $xml->channel->webMaster;
} }
} }
}
if (isset($entry->guid) && isset($entry->guid['isPermaLink']) && (string) $entry->guid['isPermaLink'] != 'false') { /**
$id = (string) $entry->guid; * Find the item content
$item->id = $this->generateId($id !== '' && $id !== $item->url ? $id : $item->url, $this->isExcludedFromId($this->url) ? '' : $this->url); *
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
*/
public function findItemContent(SimpleXMLElement $entry, Item $item)
{
    $content = $this->getNamespaceValue($entry, $this->namespaces, 'encoded');

    // No namespaced "encoded" value: use <description> when the tag exists
    if (empty($content)) {
        if ($entry->description->count() > 0) {
            $content = (string) $entry->description;
        }
    }

    $item->content = $this->filterHtml($content, $item->url);
}
/**
 * Store the item URL
 *
 * Candidates are tried in this order: namespaced "origLink" tag,
 * <link> tag, then <guid> tag (only when there is no <link> tag).
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemUrl(SimpleXMLElement $entry, Item $item)
{
    $item->url = $this->getNamespaceValue($entry, $this->namespaces, 'origLink');

    if (! empty($item->url)) {
        return;
    }

    if (isset($entry->link)) {
        $item->url = (string) $entry->link;
        return;
    }

    if (isset($entry->guid)) {
        $item->url = (string) $entry->guid;
    }
}
/**
* Generate the item id
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
if ($entry->guid->count() > 0 && (string) $entry->guid['isPermaLink'] !== 'false') {
$item_permalink = (string) $entry->guid;
} }
else { else {
$item->id = $this->generateId($item->url, $this->isExcludedFromId($this->url) ? '' : $this->url); $item_permalink = $item->url;
} }
if (empty($item->title)) $item->title = $item->url; if ($this->isExcludedFromId($feed->url)) {
$feed_permalink = '';
}
else {
$feed_permalink = $feed->url;
}
// if optional enclosure tag with multimedia provided, capture here $item->id = $this->generateId($item_permalink, $feed_permalink);
}
/**
* Find the item enclosure
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
if (isset($entry->enclosure)) { if (isset($entry->enclosure)) {
if (! $item->enclosure) { $item->enclosure_url = $this->getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink');
$item->enclosure = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : '';
if (empty($item->enclosure_url)) {
$item->enclosure_url = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : '';
} }
$item->enclosure_type = isset($entry->enclosure['type']) ? (string) $entry->enclosure['type'] : ''; $item->enclosure_type = isset($entry->enclosure['type']) ? (string) $entry->enclosure['type'] : '';
if (\PicoFeed\Filter::isRelativePath($item->enclosure)) { if (Filter::isRelativePath($item->enclosure_url)) {
$item->enclosure = \PicoFeed\Filter::getAbsoluteUrl($item->enclosure, $this->url); $item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url);
}
} }
} }
$item->content = $this->filterHtml($item->content, $item->url); /**
$this->items[] = $item; * Find the item language
} *
* @access public
\PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' items)'); * @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
return $this; * @param \PicoFeed\Feed $feed Feed object
*/
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$item->language = $feed->language;
} }
} }

View File

@ -4,4 +4,14 @@ namespace PicoFeed\Parsers;
require_once __DIR__.'/Rss20.php'; require_once __DIR__.'/Rss20.php';
class Rss91 extends Rss20 {} use PicoFeed\Parsers\Rss20;
/**
* RSS 0.91 Parser
*
* @author Frederic Guillot
* @package parser
*/
class Rss91 extends Rss20
{
}

View File

@ -4,4 +4,14 @@ namespace PicoFeed\Parsers;
require_once __DIR__.'/Rss20.php'; require_once __DIR__.'/Rss20.php';
class Rss92 extends Rss20 {} use PicoFeed\Parsers\Rss20;
/**
* RSS 0.92 Parser
*
* @author Frederic Guillot
* @package parser
*/
class Rss92 extends Rss20
{
}

20
vendor/PicoFeed/PicoFeed.php vendored Normal file
View File

@ -0,0 +1,20 @@
<?php
// Include this file if you don't want to use an autoloader
require __DIR__.'/Config.php';
require __DIR__.'/Logging.php';
require __DIR__.'/Item.php';
require __DIR__.'/Feed.php';
require __DIR__.'/Client.php';
require __DIR__.'/Filter.php';
require __DIR__.'/XmlParser.php';
require __DIR__.'/Encoding.php';
require __DIR__.'/Grabber.php';
require __DIR__.'/Reader.php';
require __DIR__.'/Import.php';
require __DIR__.'/Export.php';
require __DIR__.'/Writer.php';
require __DIR__.'/Writers/Rss20.php';
require __DIR__.'/Writers/Atom.php';
require __DIR__.'/Parser.php';

View File

@ -2,16 +2,19 @@
namespace PicoFeed; namespace PicoFeed;
require_once __DIR__.'/Logging.php'; use DOMXPath;
require_once __DIR__.'/Parser.php'; use PicoFeed\Config;
require_once __DIR__.'/Client.php'; use PicoFeed\XmlParser;
require_once __DIR__.'/Filter.php'; use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Client;
use PicoFeed\Parser;
/** /**
* Reader class * Reader class
* *
* @author Frederic Guillot * @author Frederic Guillot
* @package parser * @package picofeed
*/ */
class Reader class Reader
{ {
@ -39,19 +42,24 @@ class Reader
*/ */
private $encoding = ''; private $encoding = '';
/**
* Config class instance
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/** /**
* Constructor * Constructor
* *
* @access public * @access public
* @param string $content Feed content * @param \PicoFeed\Config $config Config class instance
* @param string $encoding Feed encoding
* @return Reader
*/ */
public function __construct($content = '', $encoding = '') public function __construct(Config $config = null)
{ {
$this->content = $content; $this->config = $config ?: new Config;
$this->encoding = ''; Logging::setTimezone($this->config->getTimezone());
return $this;
} }
/** /**
@ -61,52 +69,53 @@ class Reader
* @param string $url Feed content * @param string $url Feed content
* @param string $last_modified Last modified HTTP header * @param string $last_modified Last modified HTTP header
* @param string $etag Etag HTTP header * @param string $etag Etag HTTP header
* @param string $timeout Client connection timeout * @return \PicoFeed\Client
* @param string $user_agent HTTP user-agent
* @return Client
*/ */
public function download($url, $last_modified = '', $etag = '', $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)') public function download($url, $last_modified = '', $etag = '')
{ {
if (strpos($url, 'http') !== 0) { if (strpos($url, 'http') !== 0) {
$url = 'http://'.$url; $url = 'http://'.$url;
} }
$client = Client::create(); $client = Client::getInstance();
$client->url = $url; $client->setTimeout($this->config->getClientTimeout())
$client->timeout = $timeout; ->setUserAgent($this->config->getClientUserAgent())
$client->user_agent = $user_agent; ->setMaxRedirections($this->config->getMaxRedirections())
$client->last_modified = $last_modified; ->setMaxBodySize($this->config->getMaxBodySize())
$client->etag = $etag; ->setProxyHostname($this->config->getProxyHostname())
$client->execute(); ->setProxyPort($this->config->getProxyPort())
->setProxyUsername($this->config->getProxyUsername())
->setProxyPassword($this->config->getProxyPassword())
->setLastModified($last_modified)
->setEtag($etag);
if ($client->execute($url)) {
$this->content = $client->getContent(); $this->content = $client->getContent();
$this->url = $client->getUrl(); $this->url = $client->getUrl();
$this->encoding = $client->getEncoding(); $this->encoding = $client->getEncoding();
}
return $client; return $client;
} }
/** /**
* Get the download content * Get a parser instance with a custom config
* *
* @access public * @access public
* @return string * @param string $name Parser name
* @return \PicoFeed\Parser
*/ */
public function getContent() public function getParserInstance($name)
{ {
return $this->content; require_once __DIR__.'/Parsers/'.ucfirst($name).'.php';
} $name = '\PicoFeed\Parsers\\'.$name;
/** $parser = new $name($this->content, $this->encoding);
* Get finale URL $parser->setHashAlgo($this->config->getParserHashAlgo());
* $parser->setTimezone($this->config->getTimezone());
* @access public $parser->setConfig($this->config);
* @return string
*/ return $parser;
public function getUrl()
{
return $this->url;
} }
/** /**
@ -138,6 +147,31 @@ class Reader
return substr($data, $open_tag, $close_tag); return substr($data, $open_tag, $close_tag);
} }
/**
 * Check if the first XML tag matches a feed format
 *
 * The format is accepted only when every needle is found in the haystack,
 * in that case the detection is logged and a parser is requested
 * from getParserInstance().
 *
 * @access public
 * @param  string    $parser_name   Parser name
 * @param  string    $haystack      First XML tag
 * @param  array     $needles       List of strings that need to be there
 * @return mixed                    False on failure or Parser instance
 */
public function detectFormat($parser_name, $haystack, array $needles)
{
    $all_found = true;

    // Every needle is tested, a single miss rejects the format
    foreach ($needles as $needle) {
        if (strpos($haystack, $needle) === false) {
            $all_found = false;
        }
    }

    if (! $all_found) {
        return false;
    }

    Logging::setMessage(get_called_class().': Format detected => '.$parser_name);

    return $this->getParserInstance($parser_name);
}
/** /**
* Discover feed format and return a parser instance * Discover feed format and return a parser instance
* *
@ -147,66 +181,44 @@ class Reader
*/ */
public function getParser($discover = false) public function getParser($discover = false)
{ {
$formats = array(
array('parser' => 'Atom', 'needles' => array('<feed')),
array('parser' => 'Rss20', 'needles' => array('<rss', '2.0')),
array('parser' => 'Rss92', 'needles' => array('<rss', '0.92')),
array('parser' => 'Rss91', 'needles' => array('<rss', '0.91')),
array('parser' => 'Rss10', 'needles' => array('<rdf:', 'xmlns="http://purl.org/rss/1.0/"')),
);
$first_tag = $this->getFirstTag($this->content); $first_tag = $this->getFirstTag($this->content);
if (strpos($first_tag, '<feed') !== false) { foreach ($formats as $format) {
Logging::log(\get_called_class().': discover Atom feed'); $parser = $this->detectFormat($format['parser'], $first_tag, $format['needles']);
require_once __DIR__.'/Parsers/Atom.php'; if ($parser !== false) {
return new Parsers\Atom($this->content, $this->encoding); return $parser;
} }
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) {
Logging::log(\get_called_class().': discover RSS 2.0 feed');
require_once __DIR__.'/Parsers/Rss20.php';
return new Parsers\Rss20($this->content, $this->encoding);
} }
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) {
Logging::log(\get_called_class().': discover RSS 0.92 feed'); if ($discover === true) {
require_once __DIR__.'/Parsers/Rss92.php'; Logging::setMessage(get_called_class().': Format not supported or feed malformed');
return new Parsers\Rss92($this->content, $this->encoding); Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content);
}
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) {
Logging::log(\get_called_class().': discover RSS 0.91 feed');
require_once __DIR__.'/Parsers/Rss91.php';
return new Parsers\Rss91($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rdf:') !== false && strpos($first_tag, 'xmlns="http://purl.org/rss/1.0/"') !== false) {
Logging::log(\get_called_class().': discover RSS 1.0 feed');
require_once __DIR__.'/Parsers/Rss10.php';
return new Parsers\Rss10($this->content, $this->encoding);
}
else if ($discover === true) {
Logging::log(\get_called_class().': Format not supported or malformed');
Logging::log(\get_called_class().':'.PHP_EOL.$this->content);
return false; return false;
} }
else if ($this->discover()) { else if ($this->discover()) {
return $this->getParser(true); return $this->getParser(true);
} }
Logging::log(\get_called_class().': Subscription not found'); Logging::setMessage(get_called_class().': Subscription not found');
Logging::log(\get_called_class().': Content => '.PHP_EOL.$this->content); Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content);
return false; return false;
} }
/** /**
* Discover feed url inside a HTML document and download the feed * Discover the feed url inside a HTML document and download the feed
* *
* @access public * @access public
* @return boolean * @return boolean
@ -214,18 +226,13 @@ class Reader
public function discover() public function discover()
{ {
if (! $this->content) { if (! $this->content) {
return false; return false;
} }
Logging::log(\get_called_class().': Try to discover a subscription'); Logging::setMessage(get_called_class().': Try to discover a subscription');
\libxml_use_internal_errors(true); $dom = XmlParser::getHtmlDocument($this->content);
$xpath = new DOMXPath($dom);
$dom = new \DOMDocument;
$dom->loadHTML($this->content);
$xpath = new \DOMXPath($dom);
$queries = array( $queries = array(
"//link[@type='application/atom+xml']", "//link[@type='application/atom+xml']",
@ -251,7 +258,7 @@ class Reader
$link = $this->url.$link; $link = $this->url.$link;
} }
Logging::log(\get_called_class().': Find subscription link: '.$link); Logging::setMessage(get_called_class().': Find subscription link: '.$link);
$this->download($link); $this->download($link);
return true; return true;
@ -261,4 +268,52 @@ class Reader
return false; return false;
} }
/**
 * Return the content currently stored in the reader
 *
 * @access public
 * @return string
 */
public function getContent()
{
    $content = $this->content;

    return $content;
}
/**
 * Replace the content stored in the reader
 *
 * @access public
 * @param  string   $content   Page content
 * @return \PicoFeed\Reader    Current instance, to allow method chaining
 */
public function setContent($content)
{
    $reader = $this;
    $reader->content = $content;

    return $reader;
}
/**
 * Return the URL currently stored in the reader
 *
 * @access public
 * @return string
 */
public function getUrl()
{
    $url = $this->url;

    return $url;
}
/**
 * Replace the URL stored in the reader
 *
 * @access public
 * @param  string   $url     URL
 * @return \PicoFeed\Reader  Current instance, to allow method chaining
 */
public function setUrl($url)
{
    $reader = $this;
    $reader->url = $url;

    return $reader;
}
} }

View File

@ -0,0 +1,10 @@
<?php
return array(
'test_url' => 'http://www./2014/05/20/le-playstation-now-arrive-en-beta-fermee-aux-etats-unis/',
'body' => array(
'//div[@class="post-content"]',
),
'strip' => array(
'//style'
)
);

View File

@ -2,21 +2,54 @@
namespace PicoFeed; namespace PicoFeed;
use RuntimeException;
/**
* Base writer class
*
* @author Frederic Guillot
* @package picofeed
*/
abstract class Writer abstract class Writer
{ {
/**
* Dom object
*
* @access protected
* @var DomDocument
*/
protected $dom;
/**
* Items
*
* @access public
* @var array
*/
public $items = array(); public $items = array();
/**
* Generate the XML document
*
* @abstract
* @access public
* @param string $filename Optional filename
* @return string
*/
abstract public function execute($filename = ''); abstract public function execute($filename = '');
/**
public function checkRequiredProperties($properties, $container) * Check required properties to generate the output
*
* @access public
* @param array $properties List of properties
* @param mixed $container Object or array container
*/
public function checkRequiredProperties(array $properties, $container)
{ {
foreach ($properties as $property) { foreach ($properties as $property) {
if ((is_object($container) && ! isset($container->$property)) || (is_array($container) && ! isset($container[$property]))) { if ((is_object($container) && ! isset($container->$property)) || (is_array($container) && ! isset($container[$property]))) {
throw new RuntimeException('Required property missing: '.$property);
throw new \RuntimeException('Required property missing: '.$property);
} }
} }
} }

View File

@ -2,32 +2,59 @@
namespace PicoFeed\Writers; namespace PicoFeed\Writers;
require_once __DIR__.'/../Writer.php'; use DomDocument;
use DomElement;
use DomAttr;
use PicoFeed\Writer;
class Atom extends \PicoFeed\Writer /**
* Atom writer class
*
* @author Frederic Guillot
* @package picofeed
*/
class Atom extends Writer
{ {
/**
* List of required properties for each feed
*
* @access private
* @var array
*/
private $required_feed_properties = array( private $required_feed_properties = array(
'title', 'title',
'site_url', 'site_url',
'feed_url', 'feed_url',
); );
/**
* List of required properties for each item
*
* @access private
* @var array
*/
private $required_item_properties = array( private $required_item_properties = array(
'title', 'title',
'url', 'url',
); );
/**
* Get the Atom document
*
* @access public
* @param string $filename Optional filename
* @return string
*/
public function execute($filename = '') public function execute($filename = '')
{ {
$this->checkRequiredProperties($this->required_feed_properties, $this); $this->checkRequiredProperties($this->required_feed_properties, $this);
$this->dom = new \DomDocument('1.0', 'UTF-8'); $this->dom = new DomDocument('1.0', 'UTF-8');
$this->dom->formatOutput = true; $this->dom->formatOutput = true;
// <feed/> // <feed/>
$feed = $this->dom->createElement('feed'); $feed = $this->dom->createElement('feed');
$feed->setAttributeNodeNS(new \DomAttr('xmlns', 'http://www.w3.org/2005/Atom')); $feed->setAttributeNodeNS(new DomAttr('xmlns', 'http://www.w3.org/2005/Atom'));
// <generator/> // <generator/>
$generator = $this->dom->createElement('generator', 'PicoFeed'); $generator = $this->dom->createElement('generator', 'PicoFeed');
@ -115,8 +142,16 @@ class Atom extends \PicoFeed\Writer
} }
} }
/**
public function addLink($xml, $url, $rel = 'alternate', $type = 'text/html') * Add Link
*
* @access public
* @param DomElement $xml XML node
* @param string $url URL
* @param string $rel Link rel attribute
* @param string $type Link type attribute
*/
public function addLink(DomElement $xml, $url, $rel = 'alternate', $type = 'text/html')
{ {
$link = $this->dom->createElement('link'); $link = $this->dom->createElement('link');
$link->setAttribute('rel', $rel); $link->setAttribute('rel', $rel);
@ -125,8 +160,14 @@ class Atom extends \PicoFeed\Writer
$xml->appendChild($link); $xml->appendChild($link);
} }
/**
public function addUpdated($xml, $value = '') * Add publication date
*
* @access public
* @param DomElement $xml XML node
* @param string $value Timestamp
*/
public function addUpdated(DomElement $xml, $value = '')
{ {
$xml->appendChild($this->dom->createElement( $xml->appendChild($this->dom->createElement(
'updated', 'updated',
@ -134,8 +175,14 @@ class Atom extends \PicoFeed\Writer
)); ));
} }
/**
public function addAuthor($xml, array $values) * Add author
*
* @access public
* @param DomElement $xml XML node
* @param array $values Author name and email
*/
public function addAuthor(DomElement $xml, array $values)
{ {
$author = $this->dom->createElement('author'); $author = $this->dom->createElement('author');

View File

@ -2,34 +2,61 @@
namespace PicoFeed\Writers; namespace PicoFeed\Writers;
require_once __DIR__.'/../Writer.php'; use DomDocument;
use DomAttr;
use DomElement;
use PicoFeed\Writer;
class Rss20 extends \PicoFeed\Writer /**
* Rss 2.0 writer class
*
* @author Frederic Guillot
* @package picofeed
*/
class Rss20 extends Writer
{ {
/**
* List of required properties for each feed
*
* @access private
* @var array
*/
private $required_feed_properties = array( private $required_feed_properties = array(
'title', 'title',
'site_url', 'site_url',
'feed_url', 'feed_url',
); );
/**
* List of required properties for each item
*
* @access private
* @var array
*/
private $required_item_properties = array( private $required_item_properties = array(
'title', 'title',
'url', 'url',
); );
/**
* Get the Rss 2.0 document
*
* @access public
* @param string $filename Optional filename
* @return string
*/
public function execute($filename = '') public function execute($filename = '')
{ {
$this->checkRequiredProperties($this->required_feed_properties, $this); $this->checkRequiredProperties($this->required_feed_properties, $this);
$this->dom = new \DomDocument('1.0', 'UTF-8'); $this->dom = new DomDocument('1.0', 'UTF-8');
$this->dom->formatOutput = true; $this->dom->formatOutput = true;
// <rss/> // <rss/>
$rss = $this->dom->createElement('rss'); $rss = $this->dom->createElement('rss');
$rss->setAttribute('version', '2.0'); $rss->setAttribute('version', '2.0');
$rss->setAttributeNodeNS(new \DomAttr('xmlns:content', 'http://purl.org/rss/1.0/modules/content/')); $rss->setAttributeNodeNS(new DomAttr('xmlns:content', 'http://purl.org/rss/1.0/modules/content/'));
$rss->setAttributeNodeNS(new \DomAttr('xmlns:atom', 'http://www.w3.org/2005/Atom')); $rss->setAttributeNodeNS(new DomAttr('xmlns:atom', 'http://www.w3.org/2005/Atom'));
$channel = $this->dom->createElement('channel'); $channel = $this->dom->createElement('channel');
@ -130,8 +157,14 @@ class Rss20 extends \PicoFeed\Writer
} }
} }
/**
public function addPubDate($xml, $value = '') * Add publication date
*
* @access public
* @param DomElement $xml XML node
* @param string $value Timestamp
*/
public function addPubDate(DomElement $xml, $value = '')
{ {
$xml->appendChild($this->dom->createElement( $xml->appendChild($this->dom->createElement(
'pubDate', 'pubDate',
@ -139,8 +172,15 @@ class Rss20 extends \PicoFeed\Writer
)); ));
} }
/**
public function addAuthor($xml, $tag, array $values) * Add author
*
* @access public
* @param DomElement $xml XML node
* @param string $tag Tag name
* @param array $values Author name and email
*/
public function addAuthor(DomElement $xml, $tag, array $values)
{ {
$value = ''; $value = '';

136
vendor/PicoFeed/XmlParser.php vendored Normal file
View File

@ -0,0 +1,136 @@
<?php
namespace PicoFeed;
use DomDocument;
use SimpleXmlElement;
/**
 * XML parser class
 *
 * Builds DomDocument / SimpleXmlElement instances from untrusted markup and
 * checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks.
 *
 * Parser messages are collected by libxml instead of being emitted as PHP
 * warnings; read them with getErrors().
 *
 * @author  Frederic Guillot
 * @package picofeed
 */
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false
     *
     * The input goes through getDomDocument() first, so the same safety
     * checks apply.
     *
     * @static
     * @access public
     * @param  string   $input   XML content
     * @return mixed             SimpleXmlElement on success, false otherwise
     */
    public static function getSimpleXml($input)
    {
        $dom = self::getDomDocument($input);

        if ($dom === false) {
            return false;
        }

        $simplexml = simplexml_import_dom($dom);

        return $simplexml instanceof SimpleXmlElement ? $simplexml : false;
    }

    /**
     * Get a DomDocument instance or return false
     *
     * False is returned when the input is empty, when nothing could be parsed,
     * or when the document declares entities.
     *
     * @static
     * @access public
     * @param  string   $input   XML content
     * @return mixed             DomDocument on success, false otherwise
     */
    public static function getDomDocument($input)
    {
        // loadXml() warns on an empty source and throws a ValueError on PHP >= 8.0
        if (self::isEmptyInput($input)) {
            return false;
        }

        $previous_loader_state = null;

        if (substr(php_sapi_name(), 0, 3) === 'fpm') {

            // If running with PHP-FPM and an entity is detected we refuse to parse the feed
            // @see https://bugs.php.net/bug.php?id=64938
            if (strpos($input, '<!ENTITY') !== false) {
                return false;
            }
        }
        elseif (PHP_VERSION_ID < 80000) {

            // Deprecated since PHP 8.0, so it is only called on older versions.
            // The previous state is kept to avoid leaving the process-wide
            // switch disabled for unrelated code.
            $previous_loader_state = libxml_disable_entity_loader(true);
        }

        libxml_use_internal_errors(true);

        // Drop messages left by earlier documents: getErrors() must describe this
        // parse only, and the libxml buffer would otherwise grow with every call
        libxml_clear_errors();

        $dom = new DomDocument;
        $dom->loadXml($input, LIBXML_NONET);

        if ($previous_loader_state !== null) {
            libxml_disable_entity_loader($previous_loader_state);
        }

        // The document is empty, there is probably some parsing errors
        if ($dom->childNodes->length === 0) {
            return false;
        }

        // Scan for potential XEE attacks using ENTITY
        foreach ($dom->childNodes as $child) {
            if ($child->nodeType === XML_DOCUMENT_TYPE_NODE && $child->entities->length > 0) {
                return false;
            }
        }

        return $dom;
    }

    /**
     * Load an HTML document by using a DomDocument instance
     *
     * A DomDocument is always returned: an empty input or markup that libxml
     * cannot use gives an empty document. Parser messages are appended to the
     * libxml buffer read by getErrors().
     *
     * @static
     * @access public
     * @param  string   $input   HTML content
     * @return DomDocument
     */
    public static function getHtmlDocument($input)
    {
        libxml_use_internal_errors(true);

        $dom = new DomDocument;

        // loadHTML() warns on an empty source and throws a ValueError on PHP >= 8.0
        if (self::isEmptyInput($input)) {
            return $dom;
        }

        // The options argument of loadHTML() is only available since PHP 5.4
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        }
        else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Get XML parser errors
     *
     * Formats every message currently stored in the libxml error buffer and
     * joins them with a comma. Returns an empty string when there is none.
     *
     * @static
     * @access public
     * @return string
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {

            // libxml messages end with a line break, remove it to keep one line per error
            $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)',
                trim($error->message),
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }

    /**
     * Tell if there is nothing to hand over to the parser
     *
     * @static
     * @access private
     * @param  mixed    $input   Document content
     * @return boolean
     */
    private static function isEmptyInput($input)
    {
        return $input === null || $input === false || $input === '';
    }
}