Update to the last version of PicoFeed

This commit is contained in:
Frédéric Guillot 2014-05-20 14:20:27 -04:00
parent 58cb6979a8
commit 3840a87128
33 changed files with 2920 additions and 1123 deletions

View File

@ -3,8 +3,19 @@
require __DIR__.'/check_setup.php';
require __DIR__.'/lib/Translator.php';
require __DIR__.'/vendor/PicoDb/Database.php';
require __DIR__.'/vendor/PicoFeed/Client.php';
require __DIR__.'/vendor/PicoFeed/Parser.php';
require __DIR__.'/vendor/PicoFeed/PicoFeed.php';
require __DIR__.'/vendor/Readability/Readability.php';
require __DIR__.'/vendor/SimpleValidator/Validator.php';
require __DIR__.'/vendor/SimpleValidator/Base.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Required.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Unique.php';
require __DIR__.'/vendor/SimpleValidator/Validators/MaxLength.php';
require __DIR__.'/vendor/SimpleValidator/Validators/MinLength.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Integer.php';
require __DIR__.'/vendor/SimpleValidator/Validators/Equals.php';
require __DIR__.'/vendor/SimpleValidator/Validators/AlphaNumeric.php';
require __DIR__.'/models/config.php';
require __DIR__.'/models/user.php';
require __DIR__.'/models/feed.php';
@ -40,8 +51,6 @@ defined('AUTO_UPDATE_DOWNLOAD_DIRECTORY') or define('AUTO_UPDATE_DOWNLOAD_DIRECT
defined('AUTO_UPDATE_ARCHIVE_DIRECTORY') or define('AUTO_UPDATE_ARCHIVE_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'archive');
defined('AUTO_UPDATE_BACKUP_DIRECTORY') or define('AUTO_UPDATE_BACKUP_DIRECTORY', DATA_DIRECTORY.DIRECTORY_SEPARATOR.'backup');
PicoFeed\Client::proxy(PROXY_HOSTNAME, PROXY_PORT, PROXY_USERNAME, PROXY_PASSWORD);
PicoDb\Database::bootstrap('db', function() {
$db = new PicoDb\Database(array(

View File

@ -1,7 +1,5 @@
<?php
require __DIR__.'/../vendor/PicoFeed/Writers/Atom.php';
use PicoFarad\Router;
use PicoFarad\Response;
use PicoFarad\Request;

View File

@ -31,7 +31,7 @@ Router\before(function($action) {
date_default_timezone_set(Model\Config\get('timezone') ?: 'UTC');
// HTTP secure headers
$frame_src = \PicoFeed\Filter::$iframe_whitelist;
$frame_src = Model\Config\get_iframe_whitelist();;
$frame_src[] = 'https://login.persona.org';
Response\csp(array(

View File

@ -17,7 +17,7 @@ else {
}
if (! empty($options['database'])) {
\Model\Database\select($options['database']);
Model\Database\select($options['database']);
}
$limit = ! empty($options['limit']) && ctype_digit($options['limit']) ? (int) $options['limit'] : Model\Feed\LIMIT_ALL;

View File

@ -2,29 +2,52 @@
namespace Model\Config;
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Unique.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MinLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Integer.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Equals.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Integer.php';
use SimpleValidator\Validator;
use SimpleValidator\Validators;
use PicoDb\Database;
use PicoFeed\Config as ReaderConfig;
use PicoFeed\Logging;
const DB_VERSION = 24;
const HTTP_USERAGENT = 'Miniflux - http://miniflux.net';
const HTTP_FAKE_USERAGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36';
const DB_VERSION = 24;
const HTTP_USER_AGENT = 'Miniflux (http://miniflux.net)';
// Build the PicoFeed configuration object handed to readers, grabbers and filters.
// It carries the timezone stored in the settings, the HTTP timeout and user agent,
// the proxy constants and the iframe whitelist.
function get_reader_config()
{
    $reader_config = new ReaderConfig;

    // Date handling
    $reader_config->setTimezone(get('timezone'));

    // HTTP client and content grabber
    $reader_config->setClientTimeout(HTTP_TIMEOUT);
    $reader_config->setClientUserAgent(HTTP_USER_AGENT);
    $reader_config->setGrabberUserAgent(HTTP_USER_AGENT);

    // Proxy
    $reader_config->setProxyHostname(PROXY_HOSTNAME);
    $reader_config->setProxyPort(PROXY_PORT);
    $reader_config->setProxyUsername(PROXY_USERNAME);
    $reader_config->setProxyPassword(PROXY_PASSWORD);

    // Content filtering
    $reader_config->setFilterIframeWhitelist(get_iframe_whitelist());

    return $reader_config;
}
// List of URL prefixes accepted as iframe sources.
// Passed to the PicoFeed filter configuration and reused as the starting
// frame source list when the HTTP security headers are built.
function get_iframe_whitelist()
{
    $whitelist = array();

    // YouTube (protocol-relative, http and https)
    $whitelist[] = '//www.youtube.com';
    $whitelist[] = 'http://www.youtube.com';
    $whitelist[] = 'https://www.youtube.com';

    // Vimeo
    $whitelist[] = 'http://player.vimeo.com';
    $whitelist[] = 'https://player.vimeo.com';

    // Dailymotion
    $whitelist[] = 'http://www.dailymotion.com';
    $whitelist[] = 'https://www.dailymotion.com';

    return $whitelist;
}
// Send a debug message to the console
function debug($line)
{
\PicoFeed\Logging::log($line);
Logging::setMessage($line);
write_debug();
}
@ -32,14 +55,7 @@ function debug($line)
function write_debug()
{
if (DEBUG) {
$data = '';
foreach (\PicoFeed\Logging::$messages as $line) {
$data .= $line.PHP_EOL;
}
file_put_contents(DEBUG_FILENAME, $data);
file_put_contents(DEBUG_FILENAME, implode(PHP_EOL, Logging::getMessages()));
}
}

View File

@ -2,14 +2,6 @@
namespace Model\Database;
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MinLength.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Equals.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/AlphaNumeric.php';
use SimpleValidator\Validator;
use SimpleValidator\Validators;

View File

@ -2,17 +2,15 @@
namespace Model\Feed;
require_once __DIR__.'/../vendor/PicoFeed/Filter.php';
require_once __DIR__.'/../vendor/PicoFeed/Export.php';
require_once __DIR__.'/../vendor/PicoFeed/Import.php';
require_once __DIR__.'/../vendor/PicoFeed/Reader.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
use SimpleValidator\Validator;
use SimpleValidator\Validators;
use PicoDb\Database;
use PicoFeed\Export;
use PicoFeed\Import;
use PicoFeed\Reader;
use PicoFeed\Logging;
use Model\Config;
use Model\Item;
const LIMIT_ALL = -1;
@ -32,14 +30,15 @@ function update(array $values)
// Export all feeds
function export_opml()
{
$opml = new \PicoFeed\Export(get_all());
$opml = new Export(get_all());
return $opml->execute();
}
// Import OPML file
function import_opml($content)
{
$import = new \PicoFeed\Import($content);
Logging::setTimezone(Config\get('timezone'));
$import = new Import($content);
$feeds = $import->execute();
if ($feeds) {
@ -61,65 +60,71 @@ function import_opml($content)
$db->closeTransaction();
\Model\Config\write_debug();
Config\write_debug();
return true;
}
\Model\Config\write_debug();
Config\write_debug();
return false;
}
// Add a new feed from an URL
function create($url, $grabber = false)
function create($url, $enable_grabber = false)
{
$reader = new \PicoFeed\Reader;
$resource = $reader->download($url, '', '', HTTP_TIMEOUT, \Model\Config\HTTP_USERAGENT);
$reader = new Reader(Config\get_reader_config());
$resource = $reader->download($url);
$parser = $reader->getParser();
if ($parser !== false) {
$parser->grabber = $grabber;
if ($enable_grabber) {
$parser->enableContentGrabber();
}
$feed = $parser->execute();
if ($feed === false) {
\Model\Config\write_debug();
Config\write_debug();
return false;
}
if (! $feed->url) $feed->url = $reader->getUrl();
if (! $feed->getUrl()) {
$feed->url = $reader->getUrl();
}
if (! $feed->title) {
\Model\Config\write_debug();
if (! $feed->getTitle()) {
Config\write_debug();
return false;
}
$db = Database::get('db');
// Check if the feed is already there
if (! $db->table('feeds')->eq('feed_url', $reader->getUrl())->count()) {
// Etag and LastModified are added the next update
$rs = $db->table('feeds')->save(array(
'title' => $feed->title,
'site_url' => $feed->url,
'title' => $feed->getTitle(),
'site_url' => $feed->getUrl(),
'feed_url' => $reader->getUrl(),
'download_content' => $grabber ? 1 : 0
'download_content' => $enable_grabber ? 1 : 0
));
if ($rs) {
$feed_id = $db->getConnection()->getLastId();
\Model\Item\update_all($feed_id, $feed->items, $grabber);
\Model\Config\write_debug();
Item\update_all($feed_id, $feed->getItems(), $enable_grabber);
Config\write_debug();
return (int) $feed_id;
}
}
}
\Model\Config\write_debug();
Config\write_debug();
return false;
}
@ -143,16 +148,17 @@ function refresh_all($limit = LIMIT_ALL)
function refresh($feed_id)
{
$feed = get($feed_id);
if (empty($feed)) return false;
$reader = new \PicoFeed\Reader;
if (empty($feed)) {
return false;
}
$reader = new Reader(Config\get_reader_config());
$resource = $reader->download(
$feed['feed_url'],
$feed['last_modified'],
$feed['etag'],
HTTP_TIMEOUT,
\Model\Config\HTTP_USERAGENT
$feed['etag']
);
// Update the `last_checked` column each time, HTTP cache or not
@ -160,7 +166,7 @@ function refresh($feed_id)
if (! $resource->isModified()) {
update_parsing_error($feed_id, 0);
\Model\Config\write_debug();
Config\write_debug();
return true;
}
@ -171,14 +177,8 @@ function refresh($feed_id)
if ($feed['download_content']) {
// Don't fetch previous items, only new one
$parser->grabber_ignore_urls = Database::get('db')
->table('items')
->eq('feed_id', $feed_id)
->findAllByColumn('url');
$parser->grabber = true;
$parser->grabber_timeout = HTTP_TIMEOUT;
$parser->grabber_user_agent = \Model\Config\HTTP_FAKE_USERAGENT;
$parser->enableContentGrabber();
$parser->setGrabberIgnoreUrls(Database::get('db')->table('items')->eq('feed_id', $feed_id)->findAllByColumn('url'));
}
$result = $parser->execute();
@ -187,15 +187,16 @@ function refresh($feed_id)
update_parsing_error($feed_id, 0);
update_cache($feed_id, $resource->getLastModified(), $resource->getEtag());
\Model\Item\update_all($feed_id, $result->items, $parser->grabber);
\Model\Config\write_debug();
Item\update_all($feed_id, $result->getItems(), $feed['download_content']);
Config\write_debug();
return true;
}
}
update_parsing_error($feed_id, 1);
\Model\Config\write_debug();
Config\write_debug();
return false;
}

View File

@ -2,11 +2,13 @@
namespace Model\Item;
require_once __DIR__.'/../vendor/Readability/Readability.php';
require_once __DIR__.'/../vendor/PicoFeed/Grabber.php';
require_once __DIR__.'/../vendor/PicoFeed/Filter.php';
use Model\Config;
use PicoDb\Database;
use PicoFeed\Logging;
use PicoFeed\Grabber;
use PicoFeed\Client;
use PicoFeed\Filter;
use Readability;
// Get all items without filtering
function get_everything()
@ -141,7 +143,7 @@ function get_bookmarks($offset = null, $limit = null)
->join('feeds', 'id', 'feed_id')
->in('status', array('read', 'unread'))
->eq('bookmark', 1)
->orderBy('updated', \Model\Config\get('items_sorting_direction'))
->orderBy('updated', Config\get('items_sorting_direction'))
->offset($offset)
->limit($limit)
->findAll();
@ -201,7 +203,7 @@ function get_nav($item, $status = array('unread'), $bookmark = array(1, 0), $fee
->table('items')
->columns('id', 'status', 'title', 'bookmark')
->neq('status', 'removed')
->orderBy('updated', \Model\Config\get('items_sorting_direction'));
->orderBy('updated', Config\get('items_sorting_direction'));
if ($feed_id) $query->eq('feed_id', $feed_id);
@ -377,7 +379,7 @@ function mark_feed_as_read($feed_id)
// Mark all read items to removed after X days
function autoflush()
{
$autoflush = (int) \Model\Config\get('autoflush');
$autoflush = (int) Config\get('autoflush');
if ($autoflush > 0) {
@ -401,9 +403,9 @@ function autoflush()
}
// Update all items
function update_all($feed_id, array $items, $grabber = false)
function update_all($feed_id, array $items, $enable_grabber = false)
{
$nocontent = (bool) \Model\Config\get('nocontent');
$nocontent = (bool) Config\get('nocontent');
$items_in_feed = array();
@ -412,54 +414,55 @@ function update_all($feed_id, array $items, $grabber = false)
foreach ($items as $item) {
\PicoFeed\Logging::log('Item => '.$item->id.' '.$item->url);
Logging::setMessage('Item => '.$item->getId().' '.$item->getUrl());
// Item parsed correctly?
if ($item->id && $item->url) {
if ($item->getId() && $item->getUrl()) {
\PicoFeed\Logging::log('Item parsed correctly');
Logging::setMessage('Item parsed correctly');
// Get item record in database, if any
$itemrec = $db
->table('items')
->columns('enclosure')
->eq('id', $item->id)->findOne();
->eq('id', $item->getId())
->findOne();
// Insert a new item
if ($itemrec === null) {
\PicoFeed\Logging::log('Item added to the database');
Logging::setMessage('Item added to the database');
if (! $item->content && ! $nocontent && $grabber) {
$item->content = download_content_url($item->url);
if ($enable_grabber && ! $nocontent && ! $item->getContent()) {
$item->content = download_content_url($item->getUrl());
}
$db->table('items')->save(array(
'id' => $item->id,
'title' => $item->title,
'url' => $item->url,
'updated' => $item->updated,
'author' => $item->author,
'content' => $nocontent ? '' : $item->content,
'id' => $item->getId(),
'title' => $item->getTitle(),
'url' => $item->getUrl(),
'updated' => $item->getDate(),
'author' => $item->getAuthor(),
'content' => $nocontent ? '' : $item->getContent(),
'status' => 'unread',
'feed_id' => $feed_id,
'enclosure' => isset($item->enclosure) ? $item->enclosure : null,
'enclosure_type' => isset($item->enclosure_type) ? $item->enclosure_type : null,
'language' => $item->language,
'enclosure' => $item->getEnclosureUrl(),
'enclosure_type' => $item->getEnclosureType(),
'language' => $item->getLanguage(),
));
}
else if (isset($item->enclosure) && $item->enclosure && !$itemrec['enclosure']) {
else if (! $itemrec['enclosure'] && $item->getEnclosureUrl()) {
\PicoFeed\Logging::log('Update item enclosure');
Logging::setMessage('Update item enclosure');
$db->table('items')->eq('id', $item->id)->save(array(
$db->table('items')->eq('id', $item->getId())->save(array(
'status' => 'unread',
'enclosure' => $item->enclosure,
'enclosure_type' => isset($item->enclosure_type) ? $item->enclosure_type : null,
'enclosure' => $item->getEnclosureUrl(),
'enclosure_type' => $item->getEnclosureType(),
));
}
else {
\PicoFeed\Logging::log('Item already in the database');
Logging::setMessage('Item already in the database');
}
// Items inside this feed
@ -467,10 +470,20 @@ function update_all($feed_id, array $items, $grabber = false)
}
}
// Remove from the database items marked as "removed"
// and not present inside the feed
// Cleanup old items
cleanup($feed_id, $items_in_feed);
$db->closeTransaction();
}
// Remove from the database items marked as "removed"
// and not present inside the feed
function cleanup($feed_id, array $items_in_feed)
{
if (! empty($items_in_feed)) {
$db = Database::get('db');
$removed_items = $db
->table('items')
->columns('id')
@ -489,7 +502,7 @@ function update_all($feed_id, array $items, $grabber = false)
if (! empty($items_to_remove)) {
$nb_items = count($items_to_remove);
\PicoFeed\Logging::log('There is '.$nb_items.' items to remove');
Logging::setMessage('There is '.$nb_items.' items to remove');
// Handle the case when there is a huge number of items to remove
// Sqlite have a limit of 1000 sql variables by default
@ -508,43 +521,31 @@ function update_all($feed_id, array $items, $grabber = false)
}
}
}
\PicoFeed\Logging::log('Db transaction => '.($db->getConnection()->inTransaction() ? 'ok' : 'rollback'));
$db->closeTransaction();
}
// Download content from an URL
function download_content_url($url)
{
$client = \PicoFeed\Client::create();
$client->url = $url;
$client->timeout = HTTP_TIMEOUT;
$client->user_agent = \Model\Config\HTTP_FAKE_USERAGENT;
$client->execute();
$content = '';
$html = $client->getContent();
$grabber = new Grabber($url);
$grabber->setConfig(Config\get_reader_config());
$grabber->download();
if (! empty($html)) {
// Try first with PicoFeed grabber and with Readability after
$grabber = new \PicoFeed\Grabber($url, $html, $client->getEncoding());
$content = '';
if ($grabber->parse()) {
$content = $grabber->content;
}
if (empty($content)) {
$content = download_content_readability($grabber->html, $url);
}
// Filter content
$filter = new \PicoFeed\Filter($content, $url);
return $filter->execute();
if ($grabber->parse()) {
$content = $grabber->getcontent();
}
else {
$content = download_content_readability($grabber->getRawContent(), $url);
}
return '';
if (! empty($content)) {
$filter = new Filter($content, $url);
$filter->setConfig(Config\get_reader_config());
$content = $filter->execute();
}
return $content;
}
// Download content from item ID
@ -555,7 +556,7 @@ function download_content_id($item_id)
if (! empty($content)) {
if (! \Model\Config\get('nocontent')) {
if (! Config\get('nocontent')) {
// Save content
Database::get('db')
@ -564,7 +565,7 @@ function download_content_id($item_id)
->save(array('content' => $content));
}
\Model\Config\write_debug();
Config\write_debug();
return array(
'result' => true,
@ -572,7 +573,7 @@ function download_content_id($item_id)
);
}
\Model\Config\write_debug();
Config\write_debug();
return array(
'result' => false,
@ -585,7 +586,7 @@ function download_content_readability($content, $url)
{
if (! empty($content)) {
$readability = new \Readability($content, $url);
$readability = new Readability($content, $url);
if ($readability->init()) {
return $readability->getContent()->innerHTML;

View File

@ -2,11 +2,6 @@
namespace Model\User;
require_once __DIR__.'/../vendor/SimpleValidator/Validator.php';
require_once __DIR__.'/../vendor/SimpleValidator/Base.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/Required.php';
require_once __DIR__.'/../vendor/SimpleValidator/Validators/MaxLength.php';
use SimpleValidator\Validator;
use SimpleValidator\Validators;
use PicoDb\Database;

View File

@ -2,59 +2,170 @@
namespace PicoFeed;
require_once __DIR__.'/Logging.php';
use LogicException;
use Clients\Curl;
use Clients\Stream;
use PicoFeed\Logging;
/**
* Client class
*
* @author Frederic Guillot
* @package client
*/
abstract class Client
{
protected static $proxy_hostname = null;
protected static $proxy_port = null;
protected static $proxy_username = null;
protected static $proxy_password = null;
/**
* Flag that say if the resource have been modified
*
* @access private
* @var bool
*/
private $is_modified = true;
public $encoding = '';
public $etag = '';
public $last_modified = '';
public $is_modified = true;
public $content = '';
public $url = '';
public $timeout = 10;
public $max_redirects = 5;
public $max_body_size = 2097152; // 2MB
public $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
/**
* HTTP encoding
*
* @access private
* @var string
*/
private $encoding = '';
/**
* HTTP Etag header
*
* @access protected
* @var string
*/
protected $etag = '';
public static function create($adapter = null)
{
return $adapter ?: self::chooseAdapter();
}
/**
* HTTP Last-Modified header
*
* @access protected
* @var string
*/
protected $last_modified = '';
/**
* Proxy hostname
*
* @access protected
* @var string
*/
protected $proxy_hostname = '';
public static function chooseAdapter()
/**
* Proxy port
*
* @access protected
* @var integer
*/
protected $proxy_port = 3128;
/**
* Proxy username
*
* @access protected
* @var string
*/
protected $proxy_username = '';
/**
* Proxy password
*
* @access protected
* @var string
*/
protected $proxy_password = '';
/**
* Client connection timeout
*
* @access protected
* @var integer
*/
protected $timeout = 10;
/**
* User-agent
*
* @access protected
* @var string
*/
protected $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
/**
* Real URL used (can be changed after a HTTP redirect)
*
* @access protected
* @var string
*/
protected $url = '';
/**
* Page/Feed content
*
* @access protected
* @var string
*/
protected $content = '';
/**
* Number maximum of HTTP redirections to avoid infinite loops
*
* @access protected
* @var integer
*/
protected $max_redirects = 5;
/**
* Maximum size of the HTTP body response
*
* @access protected
* @var integer
*/
protected $max_body_size = 2097152; // 2MB
/**
* Get client instance: curl or stream driver
*
* @static
* @access public
* @return \PicoFeed\Client
*/
public static function getInstance()
{
if (function_exists('curl_init')) {
require_once __DIR__.'/Clients/Curl.php';
return new Clients\Curl;
} else if (ini_get('allow_url_fopen')) {
}
else if (ini_get('allow_url_fopen')) {
require_once __DIR__.'/Clients/Stream.php';
return new Clients\Stream;
}
throw new \LogicException('You must have "allow_url_fopen=1" or curl extension installed');
throw new LogicException('You must have "allow_url_fopen=1" or curl extension installed');
}
public function execute()
/**
* Perform the HTTP request
*
* @access public
* @param string $url URL
* @return bool
*/
public function execute($url = '')
{
if ($this->url === '') {
throw new \LogicException('The URL is missing');
if ($url !== '') {
$this->url = $url;
}
Logging::log(\get_called_class().' Fetch URL: '.$this->url);
Logging::log(\get_called_class().' Etag provided: '.$this->etag);
Logging::log(\get_called_class().' Last-Modified provided: '.$this->last_modified);
Logging::setMessage(get_called_class().' Fetch URL: '.$this->url);
Logging::setMessage(get_called_class().' Etag provided: '.$this->etag);
Logging::setMessage(get_called_class().' Last-Modified provided: '.$this->last_modified);
$response = $this->doRequest();
@ -62,25 +173,42 @@ abstract class Client
if ($response['status'] == 304) {
$this->is_modified = false;
Logging::log(\get_called_class().' Resource not modified');
Logging::setMessage(get_called_class().' Resource not modified');
}
else if ($response['status'] == 404) {
Logging::log(\get_called_class().' Resource not found');
Logging::setMessage(get_called_class().' Resource not found');
}
else {
$this->etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : '';
$this->last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : '';
$etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : '';
$last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : '';
$this->content = $response['body'];
if (isset($response['headers']['Content-Type'])) {
$result = explode('charset=', strtolower($response['headers']['Content-Type']));
$this->encoding = isset($result[1]) ? $result[1] : '';
}
if (($this->etag && $this->etag === $etag) || ($this->last_modified && $last_modified === $this->last_modified)) {
$this->is_modified = false;
}
$this->etag = $etag;
$this->last_modified = $last_modified;
}
return true;
}
return false;
}
/**
* Parse HTTP headers
*
* @access public
* @param array $lines List of headers
* @return array
*/
public function parseHeaders(array $lines)
{
$status = 200;
@ -88,7 +216,7 @@ abstract class Client
foreach ($lines as $line) {
if (strpos($line, 'HTTP') === 0/* && strpos($line, '301') === false && strpos($line, '302') === false*/) {
if (strpos($line, 'HTTP') === 0) {
$status = (int) substr($line, 9, 3);
}
else if (strpos($line, ':') !== false) {
@ -98,71 +226,242 @@ abstract class Client
}
}
Logging::log(\get_called_class().' HTTP status code: '.$status);
Logging::setMessage(get_called_class().' HTTP status code: '.$status);
foreach ($headers as $name => $value) {
Logging::log(\get_called_class().' HTTP header: '.$name.' => '.$value);
Logging::setMessage(get_called_class().' HTTP header: '.$name.' => '.$value);
}
return array($status, $headers);
}
public static function proxy($hostname, $port = 3128, $username = '', $password = '')
{
self::$proxy_hostname = $hostname;
self::$proxy_port = $port;
self::$proxy_username = $username;
self::$proxy_password = $password;
}
/**
* Set the Last-Modified HTTP header
*
* Usually the value obtained from a previous fetch of the same resource.
* When it is non-empty the stream client sends it back to the server as an
* If-Modified-Since request header.
*
* @access public
* @param string $last_modified Header value
* @return \PicoFeed\Client
*/
public function setLastModified($last_modified)
{
$this->last_modified = $last_modified;
return $this;
}
/**
* Get the value of the Last-Modified HTTP header
*
* After execute() this is the Last-Modified header of the response (empty
* string when the server sent none). For a 304 or 404 response execute()
* leaves the previously set value untouched.
*
* @access public
* @return string
*/
public function getLastModified()
{
return $this->last_modified;
}
/**
* Set the value of the Etag HTTP header
*
* Usually the value obtained from a previous fetch of the same resource.
* When it is non-empty the stream client sends it back to the server as an
* If-None-Match request header.
*
* @access public
* @param string $etag Etag HTTP header value
* @return \PicoFeed\Client
*/
public function setEtag($etag)
{
$this->etag = $etag;
return $this;
}
/**
* Get the Etag HTTP header value
*
* After execute() this is the ETag header of the response (empty string
* when the server sent none). For a 304 or 404 response execute() leaves
* the previously set value untouched.
*
* @access public
* @return string
*/
public function getEtag()
{
return $this->etag;
}
/**
* Get the final url value
*
* This is the URL stored on the client: the one given to setUrl() or
* execute(). Per the property description it can be changed after an HTTP
* redirect, so it may differ from the URL originally requested.
*
* @access public
* @return string
*/
public function getUrl()
{
return $this->url;
}
/**
* Set the url
*
* The URL is used by the next call to execute() unless a non-empty URL is
* passed to execute() directly, which overrides it.
*
* @access public
* @param string $url URL
* @return \PicoFeed\Client
*/
public function setUrl($url)
{
$this->url = $url;
return $this;
}
/**
* Get the body of the HTTP response
*
* Filled by execute() with the body of the response; it stays an empty
* string (the initial value) for a 304 or 404 response.
*
* @access public
* @return string
*/
public function getContent()
{
return $this->content;
}
/**
* Get the encoding value from HTTP headers
*
* execute() lowercases the Content-Type response header and keeps what
* follows "charset=". The result is an empty string when the header is
* missing or carries no charset.
*
* @access public
* @return string
*/
public function getEncoding()
{
return $this->encoding;
}
/**
* Return true if the remote resource has changed
*
* The flag starts at true. execute() turns it to false when the server
* answers 304, or when the ETag or Last-Modified header of the response is
* identical to the non-empty value that was set before the request.
*
* @access public
* @return bool
*/
public function isModified()
{
return $this->is_modified;
}
}
/**
 * Define the connection timeout used for the HTTP request
 *
 * A falsy value (0, null, empty string) is ignored and the current timeout
 * is kept.
 *
 * @access public
 * @param  integer  $timeout  Connection timeout
 * @return \PicoFeed\Client
 */
public function setTimeout($timeout)
{
    if ($timeout) {
        $this->timeout = $timeout;
    }

    return $this;
}
/**
 * Define the User-Agent sent with the HTTP request
 *
 * A falsy value (null, empty string) is ignored and the current user agent
 * is kept.
 *
 * @access public
 * @param  string  $user_agent  User Agent
 * @return \PicoFeed\Client
 */
public function setUserAgent($user_agent)
{
    if ($user_agent) {
        $this->user_agent = $user_agent;
    }

    return $this;
}
/**
 * Define the maximum number of HTTP redirections
 *
 * A falsy value (0, null) is ignored and the current limit is kept.
 *
 * @access public
 * @param  integer  $max  Maximum
 * @return \PicoFeed\Client
 */
public function setMaxRedirections($max)
{
    if ($max) {
        $this->max_redirects = $max;
    }

    return $this;
}
/**
 * Define the maximum accepted size of the HTTP response body
 *
 * A falsy value (0, null) is ignored and the current limit is kept.
 *
 * @access public
 * @param  integer  $max  Maximum
 * @return \PicoFeed\Client
 */
public function setMaxBodySize($max)
{
    if ($max) {
        $this->max_body_size = $max;
    }

    return $this;
}
/**
 * Define the hostname of the proxy
 *
 * A falsy value (null, empty string) is ignored and the current hostname
 * is kept.
 *
 * @access public
 * @param  string  $hostname  Proxy hostname
 * @return \PicoFeed\Client
 */
public function setProxyHostname($hostname)
{
    if ($hostname) {
        $this->proxy_hostname = $hostname;
    }

    return $this;
}
/**
 * Define the port of the proxy
 *
 * A falsy value (0, null, empty string) is ignored and the current port is
 * kept.
 *
 * @access public
 * @param  integer  $port  Proxy port
 * @return \PicoFeed\Client
 */
public function setProxyPort($port)
{
    if ($port) {
        $this->proxy_port = $port;
    }

    return $this;
}
/**
 * Define the username used to authenticate against the proxy
 *
 * A falsy value (null, empty string) is ignored and the current username
 * is kept.
 *
 * @access public
 * @param  string  $username  Proxy username
 * @return \PicoFeed\Client
 */
public function setProxyUsername($username)
{
    if ($username) {
        $this->proxy_username = $username;
    }

    return $this;
}
/**
 * Define the password used to authenticate against the proxy
 *
 * A falsy value (null, empty string) is ignored and the current password
 * is kept.
 *
 * @access public
 * @param  string  $password  Password
 * @return \PicoFeed\Client
 */
public function setProxyPassword($password)
{
    if ($password) {
        $this->proxy_password = $password;
    }

    return $this;
}
/**
 * Apply the settings of a config object to this client
 *
 * Copies the grabber timeout, the grabber user agent, the redirection and
 * body size limits and the proxy settings. Each value goes through the
 * matching setter, so a falsy config value leaves the current client value
 * unchanged.
 *
 * @access public
 * @param  \PicoFeed\Config  $config  Config instance
 * @return \PicoFeed\Client
 */
public function setConfig($config)
{
    // Every setter returns $this, which allows the calls to be chained.
    $this
        ->setTimeout($config->getGrabberTimeout())
        ->setUserAgent($config->getGrabberUserAgent())
        ->setMaxRedirections($config->getMaxRedirections())
        ->setMaxBodySize($config->getMaxBodySize())
        ->setProxyHostname($config->getProxyHostname())
        ->setProxyPort($config->getProxyPort())
        ->setProxyUsername($config->getProxyUsername())
        ->setProxyPassword($config->getProxyPassword());

    return $this;
}
}

View File

@ -3,27 +3,80 @@
namespace PicoFeed\Clients;
use \PicoFeed\Logging;
use \PicoFeed\Client;
class Curl extends \PicoFeed\Client
/**
* cURL HTTP client
*
* @author Frederic Guillot
* @package client
*/
class Curl extends Client
{
/**
* HTTP response body
*
* @access private
* @var string
*/
private $body = '';
/**
* Body size
*
* @access private
* @var integer
*/
private $body_length = 0;
/**
* HTTP response headers
*
* @access private
* @var array
*/
private $headers = array();
/**
* Counter on the number of header received
*
* @access private
* @var integer
*/
private $headers_counter = 0;
/**
* cURL callback to read the HTTP body
*
* If the function return -1, curl stop to read the HTTP response
*
* @access public
* @param resource $ch cURL handler
* @param string $buffer Chunk of data
* @return integer Length of the buffer
*/
public function readBody($ch, $buffer)
{
$length = strlen($buffer);
$this->body_length += $length;
if ($this->body_length > $this->max_body_size) return -1;
if ($this->body_length > $this->max_body_size) {
return -1;
}
$this->body .= $buffer;
return $length;
}
/**
* cURL callback to read HTTP headers
*
* @access public
* @param resource $ch cURL handler
* @param string $buffer Header line
* @return integer Length of the buffer
*/
public function readHeaders($ch, $buffer)
{
$length = strlen($buffer);
@ -43,7 +96,13 @@ class Curl extends \PicoFeed\Client
return $length;
}
/**
* Do the HTTP request
*
* @access public
* @param bool $follow_location Flag used when there is an open_basedir restriction
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
*/
public function doRequest($follow_location = true)
{
$request_headers = array('Connection: close');
@ -54,6 +113,7 @@ class Curl extends \PicoFeed\Client
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $this->url);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
@ -67,28 +127,34 @@ class Curl extends \PicoFeed\Client
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
if (parent::$proxy_hostname) {
if ($this->proxy_hostname) {
curl_setopt($ch, CURLOPT_PROXYPORT, parent::$proxy_port);
Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);
curl_setopt($ch, CURLOPT_PROXYPORT, $this->proxy_port);
curl_setopt($ch, CURLOPT_PROXYTYPE, 'HTTP');
curl_setopt($ch, CURLOPT_PROXY, parent::$proxy_hostname);
curl_setopt($ch, CURLOPT_PROXY, $this->proxy_hostname);
if (parent::$proxy_username) {
curl_setopt($ch, CURLOPT_PROXYUSERPWD, parent::$proxy_username.':'.parent::$proxy_password);
if ($this->proxy_username) {
Logging::setMessage(get_called_class().' Proxy credentials: Yes');
curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->proxy_username.':'.$this->proxy_password);
}
else {
Logging::setMessage(get_called_class().' Proxy credentials: No');
}
}
curl_exec($ch);
Logging::log(\get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME));
Logging::log(\get_called_class().' cURL dns lookup time: '.curl_getinfo($ch, CURLINFO_NAMELOOKUP_TIME));
Logging::log(\get_called_class().' cURL connect time: '.curl_getinfo($ch, CURLINFO_CONNECT_TIME));
Logging::log(\get_called_class().' cURL speed download: '.curl_getinfo($ch, CURLINFO_SPEED_DOWNLOAD));
Logging::log(\get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
Logging::setMessage(get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME));
Logging::setMessage(get_called_class().' cURL dns lookup time: '.curl_getinfo($ch, CURLINFO_NAMELOOKUP_TIME));
Logging::setMessage(get_called_class().' cURL connect time: '.curl_getinfo($ch, CURLINFO_CONNECT_TIME));
Logging::setMessage(get_called_class().' cURL speed download: '.curl_getinfo($ch, CURLINFO_SPEED_DOWNLOAD));
Logging::setMessage(get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
if (curl_errno($ch)) {
Logging::log(\get_called_class().' cURL error: '.curl_error($ch));
Logging::setMessage(get_called_class().' cURL error: '.curl_error($ch));
curl_close($ch);
return false;
@ -133,4 +199,4 @@ class Curl extends \PicoFeed\Client
'headers' => $headers
);
}
}
}

View File

@ -3,6 +3,7 @@
namespace PicoFeed\Clients;
use \PicoFeed\Logging;
use \PicoFeed\Client;
/**
* Stream context HTTP client
@ -10,7 +11,7 @@ use \PicoFeed\Logging;
* @author Frederic Guillot
* @package client
*/
class Stream extends \PicoFeed\Client
class Stream extends Client
{
/**
* Do the HTTP request
@ -24,11 +25,19 @@ class Stream extends \PicoFeed\Client
$headers = array(
'Connection: close',
'User-Agent: '.$this->user_agent,
'Accept-Encoding: gzip',
);
if ($this->etag) $headers[] = 'If-None-Match: '.$this->etag;
if ($this->last_modified) $headers[] = 'If-Modified-Since: '.$this->last_modified;
if (function_exists('gzdecode')) {
$headers[] = 'Accept-Encoding: gzip';
}
if ($this->etag) {
$headers[] = 'If-None-Match: '.$this->etag;
}
if ($this->last_modified) {
$headers[] = 'If-Modified-Since: '.$this->last_modified;
}
// Create context
$context_options = array(
@ -41,14 +50,22 @@ class Stream extends \PicoFeed\Client
)
);
if (parent::$proxy_hostname) {
$context_options['http']['proxy'] = 'tcp://'.parent::$proxy_hostname.':'.parent::$proxy_port;
if ($this->proxy_hostname) {
Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);
$context_options['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port;
$context_options['http']['request_fulluri'] = true;
if (parent::$proxy_username) {
$headers[] = 'Proxy-Authorization: Basic '.base64_encode(parent::$proxy_username.':'.parent::$proxy_password);
if ($this->proxy_username) {
Logging::setMessage(get_called_class().' Proxy credentials: Yes');
$headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password);
$context_options['http']['header'] = implode("\r\n", $headers);
}
else {
Logging::setMessage(get_called_class().' Proxy credentials: No');
}
}
$context = stream_context_create($context_options);

View File

@ -1,32 +1,6 @@
<?php
/*
Copyright (c) 2008 Sebastián Grignoli
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of copyright holders nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
namespace PicoFeed;
/**
* @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
@ -35,15 +9,11 @@ POSSIBILITY OF SUCH DAMAGE.
* @link https://github.com/neitanod/forceutf8
* @example https://github.com/neitanod/forceutf8
* @license Revised BSD
*/
namespace PicoFeed;
class Encoding {
protected static $win1252ToUtf8 = array(
*/
class Encoding
{
protected static $win1252ToUtf8 = array(
128 => "\xe2\x82\xac",
130 => "\xe2\x80\x9a",
131 => "\xc6\x92",
132 => "\xe2\x80\x9e",
@ -55,10 +25,7 @@ class Encoding {
138 => "\xc5\xa0",
139 => "\xe2\x80\xb9",
140 => "\xc5\x92",
142 => "\xc5\xbd",
145 => "\xe2\x80\x98",
146 => "\xe2\x80\x99",
147 => "\xe2\x80\x9c",
@ -71,260 +38,155 @@ class Encoding {
154 => "\xc5\xa1",
155 => "\xe2\x80\xba",
156 => "\xc5\x93",
158 => "\xc5\xbe",
159 => "\xc5\xb8"
);
protected static $brokenUtf8ToUtf8 = array(
"\xc2\x80" => "\xe2\x82\xac",
"\xc2\x82" => "\xe2\x80\x9a",
"\xc2\x83" => "\xc6\x92",
"\xc2\x84" => "\xe2\x80\x9e",
"\xc2\x85" => "\xe2\x80\xa6",
"\xc2\x86" => "\xe2\x80\xa0",
"\xc2\x87" => "\xe2\x80\xa1",
"\xc2\x88" => "\xcb\x86",
"\xc2\x89" => "\xe2\x80\xb0",
"\xc2\x8a" => "\xc5\xa0",
"\xc2\x8b" => "\xe2\x80\xb9",
"\xc2\x8c" => "\xc5\x92",
"\xc2\x8e" => "\xc5\xbd",
"\xc2\x91" => "\xe2\x80\x98",
"\xc2\x92" => "\xe2\x80\x99",
"\xc2\x93" => "\xe2\x80\x9c",
"\xc2\x94" => "\xe2\x80\x9d",
"\xc2\x95" => "\xe2\x80\xa2",
"\xc2\x96" => "\xe2\x80\x93",
"\xc2\x97" => "\xe2\x80\x94",
"\xc2\x98" => "\xcb\x9c",
"\xc2\x99" => "\xe2\x84\xa2",
"\xc2\x9a" => "\xc5\xa1",
"\xc2\x9b" => "\xe2\x80\xba",
"\xc2\x9c" => "\xc5\x93",
"\xc2\x9e" => "\xc5\xbe",
"\xc2\x9f" => "\xc5\xb8"
);
protected static $utf8ToWin1252 = array(
"\xe2\x82\xac" => "\x80",
"\xe2\x80\x9a" => "\x82",
"\xc6\x92" => "\x83",
"\xe2\x80\x9e" => "\x84",
"\xe2\x80\xa6" => "\x85",
"\xe2\x80\xa0" => "\x86",
"\xe2\x80\xa1" => "\x87",
"\xcb\x86" => "\x88",
"\xe2\x80\xb0" => "\x89",
"\xc5\xa0" => "\x8a",
"\xe2\x80\xb9" => "\x8b",
"\xc5\x92" => "\x8c",
"\xc5\xbd" => "\x8e",
"\xe2\x80\x98" => "\x91",
"\xe2\x80\x99" => "\x92",
"\xe2\x80\x9c" => "\x93",
"\xe2\x80\x9d" => "\x94",
"\xe2\x80\xa2" => "\x95",
"\xe2\x80\x93" => "\x96",
"\xe2\x80\x94" => "\x97",
"\xcb\x9c" => "\x98",
"\xe2\x84\xa2" => "\x99",
"\xc5\xa1" => "\x9a",
"\xe2\x80\xba" => "\x9b",
"\xc5\x93" => "\x9c",
"\xc5\xbe" => "\x9e",
"\xc5\xb8" => "\x9f"
);
static function toUTF8($text){
/**
* Function Encoding::toUTF8
*
* This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
*
* It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
*
* It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
*
* 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
* are followed by any of these: ("group B")
* ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
* For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
* The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
* is also a valid unicode character, and will be left unchanged.
*
* 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
* 3) when any of these: ðñòó are followed by THREE chars from group B.
*
* @name toUTF8
* @param string $text Any string.
* @return string The same string, UTF8 encoded
*
*/
protected static $utf8ToWin1252 = array(
"\xe2\x82\xac" => "\x80",
"\xe2\x80\x9a" => "\x82",
"\xc6\x92" => "\x83",
"\xe2\x80\x9e" => "\x84",
"\xe2\x80\xa6" => "\x85",
"\xe2\x80\xa0" => "\x86",
"\xe2\x80\xa1" => "\x87",
"\xcb\x86" => "\x88",
"\xe2\x80\xb0" => "\x89",
"\xc5\xa0" => "\x8a",
"\xe2\x80\xb9" => "\x8b",
"\xc5\x92" => "\x8c",
"\xc5\xbd" => "\x8e",
"\xe2\x80\x98" => "\x91",
"\xe2\x80\x99" => "\x92",
"\xe2\x80\x9c" => "\x93",
"\xe2\x80\x9d" => "\x94",
"\xe2\x80\xa2" => "\x95",
"\xe2\x80\x93" => "\x96",
"\xe2\x80\x94" => "\x97",
"\xcb\x9c" => "\x98",
"\xe2\x84\xa2" => "\x99",
"\xc5\xa1" => "\x9a",
"\xe2\x80\xba" => "\x9b",
"\xc5\x93" => "\x9c",
"\xc5\xbe" => "\x9e",
"\xc5\xb8" => "\x9f"
);
if(is_array($text))
/**
* Function Encoding::toUTF8
*
* This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
*
* It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
*
* It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
*
* 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
* are followed by any of these: ("group B")
* ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
* For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
* The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
* is also a valid unicode character, and will be left unchanged.
*
* 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
* 3) when any of these: ðñòó are followed by THREE chars from group B.
*
* @name toUTF8
* @param string $text Any string.
* @return string The same string, UTF8 encoded
*
*/
public static function toUTF8($text)
{
foreach($text as $k => $v)
{
$text[$k] = self::toUTF8($v);
}
return $text;
} elseif(is_string($text)) {
if (is_array($text)) {
foreach ($text as $k => $v) {
$text[$k] = self::toUTF8($v);
}
$max = strlen($text);
$buf = "";
for($i = 0; $i < $max; $i++){
$c1 = $text{$i};
if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already
$c2 = $i+1 >= $max? "\x00" : $text{$i+1};
$c3 = $i+2 >= $max? "\x00" : $text{$i+2};
$c4 = $i+3 >= $max? "\x00" : $text{$i+3};
if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8
if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2;
$i++;
} else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
}
} elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8
if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2 . $c3;
$i = $i + 2;
} else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
}
} elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8
if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2 . $c3;
$i = $i + 2;
} else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
}
} else { //doesn't look like UTF8, but should be converted
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = (($c1 & "\x3f") | "\x80");
$buf .= $cc1 . $cc2;
}
} elseif(($c1 & "\xc0") == "\x80"){ // needs conversion
if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases
$buf .= self::$win1252ToUtf8[ord($c1)];
} else {
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = (($c1 & "\x3f") | "\x80");
$buf .= $cc1 . $cc2;
return $text;
}
elseif (is_string($text)) {
$max = strlen($text);
$buf = "";
for ($i = 0; $i < $max; $i++) {
$c1 = $text{$i};
if ($c1>="\xc0") { //Should be converted to UTF8, if it's not UTF8 already
$c2 = $i+1 >= $max? "\x00" : $text{$i+1};
$c3 = $i+2 >= $max? "\x00" : $text{$i+2};
$c4 = $i+3 >= $max? "\x00" : $text{$i+3};
if ($c1 >= "\xc0" & $c1 <= "\xdf") { //looks like 2 bytes UTF8
if ($c2 >= "\x80" && $c2 <= "\xbf") { //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2;
$i++;
}
else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
}
}
else if ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2 . $c3;
$i = $i + 2;
}
else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
}
}
else if ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8
if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { //yeah, almost sure it's UTF8 already
$buf .= $c1 . $c2 . $c3;
$i = $i + 2;
}
else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
}
}
else { //doesn't look like UTF8, but should be converted
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = (($c1 & "\x3f") | "\x80");
$buf .= $cc1 . $cc2;
}
}
} else { // it doesn't need convesion
$buf .= $c1;
}
}
return $buf;
} else {
return $text;
}
}
elseif (($c1 & "\xc0") == "\x80") { // needs conversion
static function toWin1252($text) {
if(is_array($text)) {
foreach($text as $k => $v) {
$text[$k] = self::toWin1252($v);
}
return $text;
} elseif(is_string($text)) {
return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)));
} else {
return $text;
}
}
if (isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases
$buf .= self::$win1252ToUtf8[ord($c1)];
}
else {
$cc1 = (chr(ord($c1) / 64) | "\xc0");
$cc2 = (($c1 & "\x3f") | "\x80");
$buf .= $cc1 . $cc2;
}
}
else { // it doesn't need convesion
$buf .= $c1;
}
}
static function toISO8859($text) {
return self::toWin1252($text);
}
static function toLatin1($text) {
return self::toWin1252($text);
}
static function fixUTF8($text){
if(is_array($text)) {
foreach($text as $k => $v) {
$text[$k] = self::fixUTF8($v);
}
return $text;
return $buf;
}
else {
return $text;
}
}
$last = "";
while($last <> $text){
$last = $text;
$text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
public static function cp1251ToUtf8($input)
{
return iconv('CP1251', 'UTF-8//TRANSLIT', $input);
}
$text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
return $text;
}
/**
 * Repair UTF-8 text that was converted from Windows-1252 as if it were ISO-8859-1.
 *
 * Such a conversion ignores the Windows-1252 characters in the 0x80-0x9F range;
 * every sequence listed in self::$brokenUtf8ToUtf8 is swapped for its replacement.
 *
 * See: http://en.wikipedia.org/wiki/Windows-1252
 *
 * @param  string|array $text Text to repair (passed straight to str_replace)
 * @return string|array
 */
static function UTF8FixWin1252Chars($text)
{
    $brokenSequences = array_keys(self::$brokenUtf8ToUtf8);
    $replacements = array_values(self::$brokenUtf8ToUtf8);

    return str_replace($brokenSequences, $replacements, $text);
}
/**
 * Drop a leading UTF-8 byte order mark (EF BB BF) from a string.
 *
 * @param  string $str Input string
 * @return string The input without its BOM, or the input unchanged when it has none
 */
static function removeBOM($str = "")
{
    $bom = "\xef\xbb\xbf";

    return substr($str, 0, 3) == $bom ? substr($str, 3) : $str;
}
/**
 * Map a loosely written charset label to one of the two encodings this class produces.
 *
 * The label is upper-cased and stripped of everything except ASCII letters,
 * digits and whitespace before the lookup, so "iso-8859-1", "ISO_8859-1" and
 * "Latin1" all resolve to the same table entry.
 *
 * @param  string $encodingLabel Charset label
 * @return string 'ISO-8859-1' for the Latin-1 / Windows-1252 aliases listed below, 'UTF-8' otherwise
 */
public static function normalizeEncoding($encodingLabel)
{
    // The table keys contain no punctuation, so the lookup has to use the
    // sanitized label; looking up the raw label made "ISO-8859-1" fall back to UTF-8.
    $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', strtoupper($encodingLabel));

    $equivalences = array(
        'ISO88591' => 'ISO-8859-1',
        'ISO8859' => 'ISO-8859-1',
        'ISO' => 'ISO-8859-1',
        'LATIN1' => 'ISO-8859-1',
        'LATIN' => 'ISO-8859-1',
        'UTF8' => 'UTF-8',
        'UTF' => 'UTF-8',
        'WIN1252' => 'ISO-8859-1',
        'WINDOWS1252' => 'ISO-8859-1'
    );

    if (empty($equivalences[$encoding])) {
        return 'UTF-8';
    }

    return $equivalences[$encoding];
}
/**
 * Convert text to the encoding designated by a charset label.
 *
 * The label is first resolved through normalizeEncoding(); the work is then
 * delegated to toUTF8() or toLatin1().
 *
 * @param  string $encodingLabel Charset label
 * @param  mixed  $text          Value handed to the converter
 * @return mixed  Converted value, or null when the resolved encoding is neither 'UTF-8' nor 'ISO-8859-1'
 */
public static function encode($encodingLabel, $text)
{
    switch (self::normalizeEncoding($encodingLabel)) {
        case 'UTF-8':
            return self::toUTF8($text);
        case 'ISO-8859-1':
            return self::toLatin1($text);
    }

    return null;
}
public static function cp1251ToUtf8($input)
{
return iconv('CP1251', 'UTF-8//TRANSLIT', $input);
}
}

View File

@ -2,26 +2,56 @@
namespace PicoFeed;
use SimpleXMLElement;
/**
* OPML export class
*
* @author Frederic Guillot
* @package picofeed
*/
class Export
{
/**
* List of feeds to exports
*
* @access private
* @var array
*/
private $content = array();
public $required_fields = array(
/**
* List of required properties for each feed
*
* @access private
* @var array
*/
private $required_fields = array(
'title',
'site_url',
'feed_url'
'feed_url',
);
/**
* Constructor
*
* Stores the list of feeds to export.
*
* @access public
* @param array $content List of feeds (each presumably an array carrying at least the 'title', 'site_url' and 'feed_url' keys named in $required_fields)
*/
public function __construct(array $content)
{
$this->content = $content;
}
/**
* Get the OPML document
*
* @access public
* @return string
*/
public function execute()
{
$xml = new \SimpleXMLElement('<?xml version="1.0" encoding="utf-8"?><opml/>');
$xml = new SimpleXMLElement('<?xml version="1.0" encoding="utf-8"?><opml/>');
$head = $xml->addChild('head');
$head->addChild('title', 'OPML Export');
@ -35,13 +65,14 @@ class Export
foreach ($this->required_fields as $field) {
if (! isset($feed[$field])) {
$valid = false;
break;
}
}
if (! $valid) continue;
if (! $valid) {
continue;
}
$outline = $body->addChild('outline');
$outline->addAttribute('xmlUrl', $feed['feed_url']);
@ -55,4 +86,4 @@ class Export
return $xml->asXML();
}
}
}

150
vendor/PicoFeed/Feed.php vendored Normal file
View File

@ -0,0 +1,150 @@
<?php
namespace PicoFeed;
/**
 * Feed
 *
 * Plain container for the properties of a feed and its items.
 *
 * @author  Frederic Guillot
 * @package picofeed
 */
class Feed
{
    /**
     * Feed items (each one must be convertible to a string for __toString())
     *
     * @access public
     * @var array
     */
    public $items = array();

    /**
     * Feed id
     *
     * @access public
     * @var string
     */
    public $id = '';

    /**
     * Feed title
     *
     * @access public
     * @var string
     */
    public $title = '';

    /**
     * Feed url
     *
     * @access public
     * @var string
     */
    public $url = '';

    /**
     * Feed date
     *
     * @access public
     * @var integer
     */
    public $date = 0;

    /**
     * Feed language
     *
     * @access public
     * @var string
     */
    public $language = '';

    /**
     * Return feed information
     *
     * One "Feed::<property> = <value>" line per scalar property, then the item
     * count, then every item preceded by a "----" separator line.
     *
     * @access public
     * @return string
     */
    public function __toString()
    {
        $lines = array();

        foreach (array('id', 'title', 'url', 'date', 'language') as $name) {
            $lines[] = 'Feed::'.$name.' = '.$this->{$name};
        }

        $lines[] = 'Feed::items = '.count($this->items).' items';

        $text = implode(PHP_EOL, $lines).PHP_EOL;

        foreach ($this->items as $entry) {
            $text .= '----'.PHP_EOL.$entry;
        }

        return $text;
    }

    /**
     * Get title
     *
     * @access public
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get date
     *
     * @access public
     * @return integer
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get language
     *
     * @access public
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get id
     *
     * @access public
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get feed items
     *
     * @access public
     * @return array
     */
    public function getItems()
    {
        return $this->items;
    }
}

View File

@ -2,14 +2,24 @@
namespace PicoFeed;
use DOMDocument;
/**
* Filter class
*
* @author Frederic Guillot
* @package parser
* @package picofeed
*/
class Filter
{
/**
* Config object
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/**
* Filtered XML data
*
@ -61,11 +71,10 @@ class Filter
/**
* Tags and attribute whitelist
*
* @static
* @access public
* @access private
* @var array
*/
public static $whitelist_tags = array(
private $whitelist_tags = array(
'audio' => array('controls', 'src'),
'video' => array('poster', 'controls', 'height', 'width', 'src'),
'source' => array('src', 'type'),
@ -109,11 +118,10 @@ class Filter
/**
* Tags blacklist, strip the content of those tags
*
* @static
* @access public
* @access private
* @var array
*/
public static $blacklist_tags = array(
private $blacklisted_tags = array(
'script'
);
@ -121,11 +129,10 @@ class Filter
* Scheme whitelist
* For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
*
* @static
* @access public
* @access private
* @var array
*/
public static $scheme_whitelist = array(
private $scheme_whitelist = array(
'//',
'data:image/png;base64,',
'data:image/gif;base64,',
@ -164,11 +171,10 @@ class Filter
/**
* Attributes used for external resources
*
* @static
* @access public
* @access private
* @var array
*/
public static $media_attributes = array(
private $media_attributes = array(
'src',
'href',
'poster',
@ -177,11 +183,10 @@ class Filter
/**
* Blacklisted resources
*
* @static
* @access public
* @access private
* @var array
*/
public static $media_blacklist = array(
private $media_blacklist = array(
'feeds.feedburner.com',
'share.feedsportal.com',
'da.feedsportal.com',
@ -209,11 +214,10 @@ class Filter
/**
* Mandatory attributes for specified tags
*
* @static
* @access public
* @access private
* @var array
*/
public static $required_attributes = array(
private $required_attributes = array(
'a' => array('href'),
'img' => array('src'),
'iframe' => array('src'),
@ -224,22 +228,20 @@ class Filter
/**
* Add attributes to specified tags
*
* @static
* @access public
* @access private
* @var array
*/
public static $add_attributes = array(
private $add_attributes = array(
'a' => 'rel="noreferrer" target="_blank"'
);
/**
* Attributes that must be integer
*
* @static
* @access public
* @access private
* @var array
*/
public static $integer_attributes = array(
private $integer_attributes = array(
'width',
'height',
'frameborder',
@ -248,11 +250,10 @@ class Filter
/**
* Iframe source whitelist, everything else is ignored
*
* @static
* @access public
* @access private
* @var array
*/
public static $iframe_whitelist = array(
private $iframe_whitelist = array(
'//www.youtube.com',
'http://www.youtube.com',
'https://www.youtube.com',
@ -273,10 +274,10 @@ class Filter
{
$this->url = $site_url;
\libxml_use_internal_errors(true);
libxml_use_internal_errors(true);
// Convert bad formatted documents to XML
$dom = new \DOMDocument;
$dom = new DOMDocument;
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$data);
$this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
}
@ -300,7 +301,7 @@ class Filter
$this->data = $this->removeEmptyTags($this->data);
$this->data = $this->removeMultipleTags($this->data);
return $this->data;
return trim($this->data);
}
/**
@ -372,9 +373,9 @@ class Filter
}
// Check for required attributes
if (isset(self::$required_attributes[$name])) {
if (isset($this->required_attributes[$name])) {
foreach (self::$required_attributes[$name] as $required_attribute) {
foreach ($this->required_attributes[$name] as $required_attribute) {
if (! in_array($required_attribute, $used_attributes)) {
@ -389,9 +390,9 @@ class Filter
$this->data .= '<'.$name.$attr_data;
// Add custom attributes
if (isset(self::$add_attributes[$name])) {
if (isset($this->add_attributes[$name])) {
$this->data .= ' '.self::$add_attributes[$name].' ';
$this->data .= ' '.$this->add_attributes[$name].' ';
}
// If img or br, we don't close it here
@ -399,7 +400,7 @@ class Filter
}
}
if (in_array($name, self::$blacklist_tags)) {
if (in_array($name, $this->blacklisted_tags)) {
$this->strip_content = true;
}
@ -530,7 +531,7 @@ class Filter
*/
public function isAllowedTag($name)
{
return isset(self::$whitelist_tags[$name]);
return isset($this->whitelist_tags[$name]);
}
/**
@ -543,7 +544,7 @@ class Filter
*/
public function isAllowedAttribute($tag, $attribute)
{
return in_array($attribute, self::$whitelist_tags[$tag]);
return in_array($attribute, $this->whitelist_tags[$tag]);
}
/**
@ -555,7 +556,7 @@ class Filter
*/
public function isResource($attribute)
{
return in_array($attribute, self::$media_attributes);
return in_array($attribute, $this->media_attributes);
}
/**
@ -567,7 +568,7 @@ class Filter
*/
public function isAllowedIframeResource($value)
{
foreach (self::$iframe_whitelist as $url) {
foreach ($this->iframe_whitelist as $url) {
if (strpos($value, $url) === 0) {
return true;
@ -586,7 +587,7 @@ class Filter
*/
public function isAllowedProtocol($value)
{
foreach (self::$scheme_whitelist as $protocol) {
foreach ($this->scheme_whitelist as $protocol) {
if (strpos($value, $protocol) === 0) {
return true;
@ -605,7 +606,7 @@ class Filter
*/
public function isBlacklistedMedia($resource)
{
foreach (self::$media_blacklist as $name) {
foreach ($this->media_blacklist as $name) {
if (strpos($resource, $name) !== false) {
return true;
@ -640,7 +641,7 @@ class Filter
*/
public function validateAttributeValue($attribute, $value)
{
if (in_array($attribute, self::$integer_attributes)) {
if (in_array($attribute, $this->integer_attributes)) {
return ctype_digit($value);
}
@ -758,4 +759,147 @@ class Filter
return $encoding;
}
/**
 * Set whitelisted tags and the attributes allowed for each tag
 *
 * An empty array leaves the current whitelist untouched.
 *
 * @access public
 * @param  array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']]
 * @return \PicoFeed\Filter
 */
public function setWhitelistedTags(array $values)
{
    if (! empty($values)) {
        $this->whitelist_tags = $values;
    }

    return $this;
}
/**
 * Set blacklisted tags
 *
 * An empty array leaves the current blacklist untouched.
 *
 * @access public
 * @param  array $values List of tags: ['video', 'img']
 * @return \PicoFeed\Filter
 */
public function setBlacklistedTags(array $values)
{
    if (! empty($values)) {
        $this->blacklisted_tags = $values;
    }

    return $this;
}
/**
 * Set scheme whitelist
 *
 * An empty array leaves the current whitelist untouched.
 *
 * @access public
 * @param  array $values List of schemes: ['http://', 'ftp://']
 * @return \PicoFeed\Filter
 */
public function setSchemeWhitelist(array $values)
{
    if (! empty($values)) {
        $this->scheme_whitelist = $values;
    }

    return $this;
}
/**
 * Set media attributes (used to load external resources)
 *
 * An empty array leaves the current list untouched.
 *
 * @access public
 * @param  array $values List of values: ['src', 'href']
 * @return \PicoFeed\Filter
 */
public function setMediaAttributes(array $values)
{
    if (! empty($values)) {
        $this->media_attributes = $values;
    }

    return $this;
}
/**
 * Set blacklisted external resources
 *
 * An empty array leaves the current blacklist untouched.
 *
 * @access public
 * @param  array $values List of resources: ['http://google.com/', '...']
 * @return \PicoFeed\Filter
 */
public function setMediaBlacklist(array $values)
{
    if (! empty($values)) {
        $this->media_blacklist = $values;
    }

    return $this;
}
/**
 * Set mandatory attributes for whitelisted tags
 *
 * An empty array leaves the current list untouched.
 *
 * @access public
 * @param  array $values List of attributes per tag: ['img' => ['src'], 'a' => ['href']]
 * @return \PicoFeed\Filter
 */
public function setRequiredAttributes(array $values)
{
    if (! empty($values)) {
        $this->required_attributes = $values;
    }

    return $this;
}
/**
 * Set the attributes automatically added to specific tags
 *
 * An empty array leaves the current overrides untouched.
 *
 * @access public
 * @param  array $values Attribute string per tag: ['a' => 'target="_blank"']
 * @return \PicoFeed\Filter
 */
public function setAttributeOverrides(array $values)
{
    if (! empty($values)) {
        $this->add_attributes = $values;
    }

    return $this;
}
/**
 * Set attributes that must be an integer
 *
 * An empty array leaves the current list untouched.
 *
 * @access public
 * @param  array $values List of attributes: ['width', 'height']
 * @return \PicoFeed\Filter
 */
public function setIntegerAttributes(array $values)
{
    if (! empty($values)) {
        $this->integer_attributes = $values;
    }

    return $this;
}
/**
 * Set allowed iframe resources
 *
 * An empty array leaves the current whitelist untouched.
 *
 * @access public
 * @param  array $values List of url prefixes: ['http://www.youtube.com']
 * @return \PicoFeed\Filter
 */
public function setIframeWhitelist(array $values)
{
    if (! empty($values)) {
        $this->iframe_whitelist = $values;
    }

    return $this;
}
/**
 * Set config object
 *
 * When a config object is given, every filter list is refreshed from it.
 * Each config getter is called with an empty array as argument, and the
 * setters ignore empty arrays, so a list the config returns as empty keeps
 * its current value.
 *
 * @access public
 * @param  \PicoFeed\Config|null $config Config instance
 * @return \PicoFeed\Filter
 */
public function setConfig($config)
{
    $this->config = $config;

    if ($config === null) {
        return $this;
    }

    // Every setter returns $this, so the calls can be chained in the original order
    $this->setIframeWhitelist($config->getFilterIframeWhitelist(array()))
         ->setIntegerAttributes($config->getFilterIntegerAttributes(array()))
         ->setAttributeOverrides($config->getFilterAttributeOverrides(array()))
         ->setRequiredAttributes($config->getFilterRequiredAttributes(array()))
         ->setMediaBlacklist($config->getFilterMediaBlacklist(array()))
         ->setMediaAttributes($config->getFilterMediaAttributes(array()))
         ->setSchemeWhitelist($config->getFilterSchemeWhitelist(array()))
         ->setBlacklistedTags($config->getFilterBlacklistedTags(array()))
         ->setWhitelistedTags($config->getFilterWhitelistedTags(array()));

    return $this;
}
}

View File

@ -2,19 +2,59 @@
namespace PicoFeed;
require_once __DIR__.'/Client.php';
require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Filter.php';
use DOMXPath;
use PicoFeed\Logging;
use PicoFeed\Client;
use PicoFeed\Encoding;
use PicoFeed\Filter;
/**
* Grabber class
*
* @author Frederic Guillot
* @package picofeed
*/
class Grabber
{
public $content = '';
public $html = '';
public $encoding = '';
/**
* URL
*
* @access private
* @var string
*/
private $url = '';
// Order is important, generic terms at the end
public $candidatesAttributes = array(
/**
* Relevant content
*
* @access private
* @var string
*/
private $content = '';
/**
* HTML content
*
* @access private
* @var string
*/
private $html = '';
/**
* HTML content encoding
*
* @access private
* @var string
*/
private $encoding = '';
/**
* List of attributes to try to get the content, order is important, generic terms at the end
*
* @access private
* @var array
*/
private $candidatesAttributes = array(
'articleBody',
'articlebody',
'article-body',
@ -37,7 +77,13 @@ class Grabber
'main',
);
public $stripAttributes = array(
/**
* List of attributes to strip
*
* @access private
* @var array
*/
private $stripAttributes = array(
'comment',
'share',
'links',
@ -57,7 +103,13 @@ class Grabber
'categories',
);
public $stripTags = array(
/**
* Tags to remove
*
* @access private
* @var array
*/
private $stripTags = array(
'script',
'style',
'nav',
@ -67,7 +119,22 @@ class Grabber
'form',
);
/**
* Config object
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/**
* Constructor
*
* @access public
* @param string $url Url
* @param string $html HTML content
* @param string $encoding Charset
*/
public function __construct($url, $html = '', $encoding = 'utf-8')
{
$this->url = $url;
@ -75,13 +142,53 @@ class Grabber
$this->encoding = $encoding;
}
/**
* Set config object
*
* When a config object is set, download() uses it to configure the HTTP client
* (timeout, user agent, max redirections, max body size and proxy settings).
*
* @access public
* @param \PicoFeed\Config $config Config instance
* @return \PicoFeed\Grabber
*/
public function setConfig($config)
{
$this->config = $config;
return $this;
}
/**
* Get relevant content
*
* Holds the content extracted by parse(); defaults to an empty string.
*
* @access public
* @return string
*/
public function getContent()
{
return $this->content;
}
/**
* Get raw content (unfiltered)
*
* Returns the HTML buffer held by the grabber, e.g. the result of the last download().
* Note that parse() rewrites this buffer (head tags stripped, encoding converted),
* so it is only the untouched markup before parse() has run.
*
* @access public
* @return string
*/
public function getRawContent()
{
return $this->html;
}
/**
* Parse the HTML content
*
* @access public
* @return bool
*/
public function parse()
{
if ($this->html) {
Logging::log(\get_called_class().' Fix encoding');
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');
Logging::setMessage(get_called_class().' Fix encoding');
Logging::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Filter::stripHeadTags($this->html);
@ -92,42 +199,63 @@ class Grabber
$this->html = Encoding::toUTF8($this->html);
}
Logging::log(\get_called_class().' Content length: '.strlen($this->html).' bytes');
Logging::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');
$rules = $this->getRules();
if (is_array($rules)) {
Logging::log(\get_called_class().' Parse content with rules');
Logging::setMessage(get_called_class().' Parse content with rules');
$this->parseContentWithRules($rules);
}
else {
Logging::log(\get_called_class().' Parse content with candidates');
Logging::setMessage(get_called_class().' Parse content with candidates');
$this->parseContentWithCandidates();
}
}
else {
Logging::log(\get_called_class().' No content fetched');
Logging::setMessage(get_called_class().' No content fetched');
}
Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes');
Logging::log(\get_called_class().' Grabber done');
Logging::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');
Logging::setMessage(get_called_class().' Grabber done');
return $this->content !== '';
}
public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')
/**
* Download the HTML content
*
* @access public
* @return HTML content
*/
public function download()
{
$client = Client::create();
$client->url = $this->url;
$client->timeout = $timeout;
$client->user_agent = $user_agent;
$client->execute();
$client = Client::getInstance();
if ($this->config !== null) {
$client->setTimeout($this->config->getGrabberTimeout())
->setUserAgent($this->config->getGrabberUserAgent())
->setMaxRedirections($this->config->getMaxRedirections())
->setMaxBodySize($this->config->getMaxBodySize())
->setProxyHostname($this->config->getProxyHostname())
->setProxyPort($this->config->getProxyPort())
->setProxyUsername($this->config->getProxyUsername())
->setProxyPassword($this->config->getProxyPassword());
}
$client->execute($this->url);
$this->html = $client->getContent();
$this->encoding = $client->getEncoding();
return $this->html;
}
/**
* Try to find a predefined rule
*
* @access public
* @return mixed
*/
public function getRules()
{
$hostname = parse_url($this->url, PHP_URL_HOST);
@ -147,7 +275,7 @@ class Grabber
$filename = __DIR__.'/Rules/'.$file.'.php';
if (file_exists($filename)) {
Logging::log(\get_called_class().' Load rule: '.$file);
Logging::setMessage(get_called_class().' Load rule: '.$file);
return include $filename;
}
}
@ -155,13 +283,16 @@ class Grabber
return false;
}
/**
* Get the relevant content with predefined rules
*
* @access public
* @param array $rules Rules
*/
public function parseContentWithRules(array $rules)
{
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom);
$dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new DOMXPath($dom);
if (isset($rules['strip']) && is_array($rules['strip'])) {
@ -192,24 +323,26 @@ class Grabber
}
}
/**
* Get the relevant content with the list of potential attributes
*
* @access public
*/
public function parseContentWithCandidates()
{
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom);
$dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new DOMXPath($dom);
// Try to lookup in each tag
foreach ($this->candidatesAttributes as $candidate) {
Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"');
Logging::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
Logging::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
break;
}
}
@ -221,51 +354,57 @@ class Grabber
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
Logging::log(\get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
Logging::setMessage(get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
}
}
if (strlen($this->content) < 50) {
Logging::log(\get_called_class().' No enought content fetched, get the full body');
Logging::setMessage(get_called_class().' No enought content fetched, get the full body');
$this->content = $dom->saveXML($dom->firstChild);
}
Logging::log(\get_called_class().' Strip garbage');
Logging::setMessage(get_called_class().' Strip garbage');
$this->stripGarbage();
}
/**
* Strip useless tags
*
* @access public
*/
public function stripGarbage()
{
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadXML($this->content);
$xpath = new \DOMXPath($dom);
$dom = XmlParser::getDomDocument($this->content);
foreach ($this->stripTags as $tag) {
if ($dom !== false) {
$nodes = $xpath->query('//'.$tag);
$xpath = new DOMXPath($dom);
if ($nodes !== false && $nodes->length > 0) {
Logging::log(\get_called_class().' Strip tag: "'.$tag.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
foreach ($this->stripTags as $tag) {
$nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) {
Logging::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
}
}
}
}
foreach ($this->stripAttributes as $attribute) {
foreach ($this->stripAttributes as $attribute) {
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) {
Logging::log(\get_called_class().' Strip attribute: "'.$tag.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
if ($nodes !== false && $nodes->length > 0) {
Logging::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
}
}
}
}
$this->content = $dom->saveXML($dom->documentElement);
$this->content = $dom->saveXML($dom->documentElement);
}
}
}

View File

@ -3,47 +3,75 @@
namespace PicoFeed;
require_once __DIR__.'/Logging.php';
require_once __DIR__.'/XmlParser.php';
use PicoFeed\Logging;
use PicoFeed\XmlParser;
/**
* OPML Import
*
* @author Frederic Guillot
* @package picofeed
*/
class Import
{
/**
* OPML file content
*
* @access private
* @var string
*/
private $content = '';
/**
* Subscriptions
*
* @access private
* @var array
*/
private $items = array();
/**
* Constructor
*
* Only stores the given OPML document on the instance; no parsing happens here.
*
* @access public
* @param string $content OPML file content
*/
public function __construct($content)
{
$this->content = $content;
}
/**
* Parse the OPML file
*
* @access public
* @return array|false
*/
public function execute()
{
\PicoFeed\Logging::log(\get_called_class().': start importation');
Logging::setMessage(get_called_class().': start importation');
try {
$xml = XmlParser::getSimpleXml(trim($this->content));
\libxml_use_internal_errors(true);
$xml = new \SimpleXMLElement(trim($this->content));
if ($xml->getName() !== 'opml' || ! isset($xml->body)) {
\PicoFeed\Logging::log(\get_called_class().': OPML tag not found');
return false;
}
$this->parseEntries($xml->body);
\PicoFeed\Logging::log(\get_called_class().': '.count($this->items).' subscriptions found');
}
catch (\Exception $e) {
\PicoFeed\Logging::log(\get_called_class().': '.$e->getMessage());
if ($xml === false || $xml->getName() !== 'opml' || ! isset($xml->body)) {
Logging::setMessage(get_called_class().': OPML tag not found or malformed XML document');
return false;
}
$this->parseEntries($xml->body);
Logging::setMessage(get_called_class().': '.count($this->items).' subscriptions found');
return $this->items;
}
/**
* Parse each entries of the subscription list
*
* @access public
* @param SimpleXMLElement $tree XML node
*/
public function parseEntries($tree)
{
if (isset($tree->outline)) {
@ -68,4 +96,4 @@ class Import
}
}
}
}
}

202
vendor/PicoFeed/Item.php vendored Normal file
View File

@ -0,0 +1,202 @@
<?php
namespace PicoFeed;
/**
 * Feed item
 *
 * Holds the values of a single feed entry. Each field is a public
 * property; most of them also have a getter.
 *
 * @author  Frederic Guillot
 * @package picofeed
 */
class Item
{
    /**
     * Item id
     *
     * @access public
     * @var string
     */
    public $id = '';

    /**
     * Item title
     *
     * @access public
     * @var string
     */
    public $title = '';

    /**
     * Item url
     *
     * @access public
     * @var string
     */
    public $url = '';

    /**
     * Item author
     *
     * @access public
     * @var string
     */
    public $author = '';

    /**
     * Item date
     *
     * @access public
     * @var integer
     */
    public $date = 0;

    /**
     * Item content
     *
     * @access public
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url
     *
     * @access public
     * @var string
     */
    public $enclosure_url = '';

    /**
     * Item enclosure type
     *
     * @access public
     * @var string
     */
    public $enclosure_type = '';

    /**
     * Item language
     *
     * @access public
     * @var string
     */
    public $language = '';

    /**
     * Return item information
     *
     * Produces one "Item::<property> = <value>" line per listed property,
     * followed by a line giving the content size in bytes (the content
     * itself is not included). Every line ends with PHP_EOL.
     *
     * @access public
     * @return string
     */
    public function __toString()
    {
        $lines = array();

        foreach (array('id', 'title', 'url', 'date', 'language', 'author', 'enclosure_url', 'enclosure_type') as $name) {
            $lines[] = 'Item::'.$name.' = '.$this->{$name};
        }

        $lines[] = 'Item::content = '.strlen($this->content).' bytes';

        // Trailing PHP_EOL keeps the last line terminated like all the others
        return implode(PHP_EOL, $lines).PHP_EOL;
    }

    /**
     * Get title
     *
     * @access public
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get id
     *
     * @access public
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get date
     *
     * @access public
     * @return integer
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get enclosure url
     *
     * @access public
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosure_url;
    }

    /**
     * Get enclosure type
     *
     * @access public
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosure_type;
    }

    /**
     * Get language
     *
     * @access public
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get author
     *
     * @access public
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }
}

View File

@ -2,12 +2,82 @@
namespace PicoFeed;
use DateTime;
use DateTimeZone;
/**
* Logging class
*
* @author Frederic Guillot
* @package picofeed
*/
class Logging
{
public static $messages = array();
/**
* List of messages
*
* @static
* @access private
* @var array
*/
private static $messages = array();
public static function log($message)
/**
* Default timezone
*
* @static
* @access private
* @var string
*/
private static $timezone = 'UTC';
/**
* Add a new message
*
* @static
* @access public
* @param string $message Message
*/
public static function setMessage($message)
{
self::$messages[] = '['.date('Y-m-d H:i:s').'] '.$message;
$date = new DateTime('now', new DateTimeZone(self::$timezone));
self::$messages[] = '['.$date->format('Y-m-d H:i:s').'] '.$message;
}
}
/**
* Get all logged messages
*
* Returns the content of the static message list as it is at call time.
*
* @static
* @access public
* @return array
*/
public static function getMessages()
{
return self::$messages;
}
/**
* Remove all logged messages
*
* Resets the static message list to an empty array.
*
* @static
* @access public
*/
public static function deleteMessages()
{
self::$messages = array();
}
/**
 * Set a different timezone
 *
 * A falsy value (empty string, null, ...) is ignored and the timezone
 * currently in use is kept.
 *
 * @static
 * @see    http://php.net/manual/en/timezones.php
 * @access public
 * @param  string  $timezone   Timezone
 */
public static function setTimeZone($timezone)
{
    if ($timezone) {
        self::$timezone = $timezone;
    }
}
}

View File

@ -2,10 +2,16 @@
namespace PicoFeed;
require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Filter.php';
require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Grabber.php';
use DateTime;
use DateTimeZone;
use DOMXPath;
use SimpleXMLElement;
use PicoFeed\Config;
use PicoFeed\Encoding;
use PicoFeed\Filter;
use PicoFeed\Grabber;
use PicoFeed\Logging;
use PicoFeed\XmlParser;
/**
* Base parser class
@ -15,14 +21,29 @@ require_once __DIR__.'/Grabber.php';
*/
abstract class Parser
{
/**
* Config object
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/**
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos()
*
* @access public
* @static
* @access private
* @var string
*/
public static $hashAlgo = 'crc32b'; // crc32b seems to be faster and shorter than other hash algorithms
private $hash_algo = 'crc32b'; // crc32b seems to be faster and shorter than other hash algorithms
/**
* Timezone used to parse feed dates
*
* @access private
* @var string
*/
private $timezone = 'UTC';
/**
* Feed content (XML data)
@ -33,35 +54,28 @@ abstract class Parser
protected $content = '';
/**
* Feed properties (values parsed)
* XML namespaces
*
* @access public
* @access protected
* @var array
*/
public $id = '';
public $url = '';
public $title = '';
public $updated = '';
public $language = '';
public $items = array();
protected $namespaces = array();
/**
* Content grabber parameters
* Enable the content grabber
*
* @access public
* @access private
* @var bool
*/
public $grabber = false;
public $grabber_ignore_urls = array();
public $grabber_timeout = null;
public $grabber_user_agent = null;
public $enable_grabber = false;
/**
* Parse feed content
* Ignore those urls for the content scraper
*
* @abstract
* @access public
* @return mixed
* @access private
* @var array
*/
abstract public function execute();
private $grabber_ignore_urls = array();
/**
* Constructor
@ -73,7 +87,7 @@ abstract class Parser
public function __construct($content, $http_encoding = '')
{
$xml_encoding = Filter::getEncodingFromXmlTag($content);
Logging::log(\get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
Logging::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
// Strip XML tag to avoid multiple encoding/decoding in the next XML processing
$this->content = Filter::stripXmlTag($content);
@ -90,6 +104,52 @@ abstract class Parser
$this->content = $this->normalizeData($this->content);
}
/**
 * Parse the document
 *
 * Loads the feed content with XmlParser, records the document namespaces,
 * then fills a Feed object and one Item per entry by delegating to the
 * find*() methods and to getItemsTree().
 *
 * @access public
 * @return mixed   \PicoFeed\Feed instance or false when the XML cannot be parsed
 */
public function execute()
{
    $parser = get_called_class();

    Logging::setMessage($parser.': begin parsing');

    $document = XmlParser::getSimpleXml($this->content);

    if ($document === false) {
        Logging::setMessage($parser.': XML parsing error');
        Logging::setMessage(XmlParser::getErrors());
        return false;
    }

    $this->namespaces = $document->getNamespaces(true);

    // Feed level properties; the url is resolved first
    $feed = new Feed;
    $this->findFeedUrl($document, $feed);
    $this->findFeedTitle($document, $feed);
    $this->findFeedLanguage($document, $feed);
    $this->findFeedId($document, $feed);
    $this->findFeedDate($document, $feed);

    foreach ($this->getItemsTree($document) as $node) {

        // The call order matters: the item url is set before the title,
        // the id and the content are looked up
        $item = new Item;
        $this->findItemAuthor($document, $node, $item);
        $this->findItemUrl($node, $item);
        $this->findItemTitle($node, $item);
        $this->findItemId($node, $item, $feed);
        $this->findItemDate($node, $item);
        $this->findItemContent($node, $item);
        $this->findItemEnclosure($node, $item, $feed);
        $this->findItemLanguage($node, $item, $feed);

        $feed->items[] = $item;
    }

    // The feed is concatenated as a string to log it
    Logging::setMessage($parser.PHP_EOL.$feed);

    return $feed;
}
/**
* Filter HTML for entry content
*
@ -102,43 +162,40 @@ abstract class Parser
{
$content = '';
if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
// Setup the content scraper
if ($this->enable_grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
$grabber = new Grabber($item_url);
$grabber->download($this->grabber_timeout, $this->grabber_user_agent);
if ($grabber->parse()) $item_content = $grabber->content;
$grabber->setConfig($this->config);
$grabber->download();
if ($grabber->parse()) {
$item_content = $grabber->getContent();
}
}
// Content filtering
if ($item_content) {
$filter = new Filter($item_content, $item_url);
$content = $filter->execute();
if ($this->config !== null) {
$callback = $this->config->getContentFilteringCallback();
if (is_callable($callback)) {
$content = $callback($item_content, $item_url);
}
}
if (! $content) {
$filter = new Filter($item_content, $item_url);
$filter->setConfig($this->config);
$content = $filter->execute();
}
}
return $content;
}
/**
 * Get XML parser errors
 *
 * Formats every error currently reported by libxml_get_errors() and joins
 * them with ", ". An empty string is returned when there is no error.
 *
 * @access public
 * @return string
 */
public function getXmlErrors()
{
    $messages = array();

    foreach (\libxml_get_errors() as $xml_error) {
        $messages[] = sprintf(
            'XML error: %s (Line: %d - Column: %d - Code: %d)',
            $xml_error->message,
            $xml_error->line,
            $xml_error->column,
            $xml_error->code
        );
    }

    return implode(', ', $messages);
}
/**
* Dirty quickfixes before XML parsing
*
@ -148,6 +205,7 @@ abstract class Parser
*/
public function normalizeData($data)
{
$data = str_replace("\x10", '', $data);
$data = str_replace("\xc3\x20", '', $data);
$data = str_replace("&#x1F;", '', $data);
$data = $this->replaceEntityAttribute($data);
@ -194,7 +252,7 @@ abstract class Parser
*/
public function generateId()
{
return hash(self::$hashAlgo, implode(func_get_args()));
return hash($this->hash_algo, implode(func_get_args()));
}
/**
@ -249,7 +307,8 @@ abstract class Parser
}
}
return time();
$date = new DateTime('now', new DateTimeZone($this->timezone));
return $date->getTimestamp();
}
/**
@ -262,11 +321,15 @@ abstract class Parser
*/
public function getValidDate($format, $value)
{
$date = \DateTime::createFromFormat($format, $value);
$date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));
if ($date !== false) {
$errors = \DateTime::getLastErrors();
if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) return $date->getTimestamp();
$errors = DateTime::getLastErrors();
if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) {
return $date->getTimestamp();
}
}
return 0;
@ -299,10 +362,13 @@ abstract class Parser
*/
public function getXmlLang($xml)
{
$dom = new \DOMDocument;
$dom->loadXML($this->content);
$dom = XmlParser::getDomDocument($this->content);
$xpath = new \DOMXPath($dom);
if ($dom === false) {
return '';
}
$xpath = new DOMXPath($dom);
return $xpath->evaluate('string(//@xml:lang[1])') ?: '';
}
@ -318,30 +384,108 @@ abstract class Parser
{
$language = strtolower($language);
// Arabic (ar-**)
if (strpos($language, 'ar') === 0) return true;
$rtl_languages = array(
'ar', // Arabic (ar-**)
'fa', // Farsi (fa-**)
'ur', // Urdu (ur-**)
'ps', // Pashtu (ps-**)
'syr', // Syriac (syr-**)
'dv', // Divehi (dv-**)
'he', // Hebrew (he-**)
'yi', // Yiddish (yi-**)
);
// Farsi (fa-**)
if (strpos($language, 'fa') === 0) return true;
// Urdu (ur-**)
if (strpos($language, 'ur') === 0) return true;
// Pashtu (ps-**)
if (strpos($language, 'ps') === 0) return true;
// Syriac (syr-**)
if (strpos($language, 'syr') === 0) return true;
// Divehi (dv-**)
if (strpos($language, 'dv') === 0) return true;
// Hebrew (he-**)
if (strpos($language, 'he') === 0) return true;
// Yiddish (yi-**)
if (strpos($language, 'yi') === 0) return true;
foreach ($rtl_languages as $prefix) {
if (strpos($language, $prefix) === 0) {
return true;
}
}
return false;
}
/**
 * Set Hash algorithm used for id generation
 *
 * A falsy value is ignored and the current algorithm is kept.
 *
 * @access public
 * @param  string   $algo   Algorithm name
 * @return \PicoFeed\Parser
 */
public function setHashAlgo($algo)
{
    if ($algo) {
        $this->hash_algo = $algo;
    }

    return $this;
}
/**
 * Set a different timezone
 *
 * A falsy value is ignored and the current timezone is kept.
 *
 * @see    http://php.net/manual/en/timezones.php
 * @access public
 * @param  string   $timezone   Timezone
 * @return \PicoFeed\Parser
 */
public function setTimezone($timezone)
{
    if ($timezone) {
        $this->timezone = $timezone;
    }

    return $this;
}
/**
* Set config object
*
* The value is stored as given, without any check, and the parser itself
* is returned so that calls can be chained.
*
* @access public
* @param \PicoFeed\Config $config Config instance
* @return \PicoFeed\Parser
*/
public function setConfig($config)
{
$this->config = $config;
return $this;
}
/**
 * Enable the content grabber
 *
 * Returns the parser itself so the call can be chained, as documented
 * and as setHashAlgo(), setTimezone() and setConfig() already do.
 *
 * @access public
 * @return \PicoFeed\Parser
 */
public function enableContentGrabber()
{
    $this->enable_grabber = true;

    // Was missing: without it the method returned null and broke chaining
    return $this;
}
/**
 * Set ignored URLs for the content grabber
 *
 * The list replaces the one previously set. Returns the parser itself so
 * the call can be chained, as documented and as the other setters do.
 *
 * @access public
 * @param  array   $urls   URLs
 * @return \PicoFeed\Parser
 */
public function setGrabberIgnoreUrls(array $urls)
{
    $this->grabber_ignore_urls = $urls;

    // Was missing: without it the method returned null and broke chaining
    return $this;
}
/**
 * Get a value from a XML namespace
 *
 * Walks the namespaces in the order given and returns the value of the
 * first one under which $xml has at least one child named $property.
 *
 * @access public
 * @param  SimpleXMLElement   $xml           XML element
 * @param  array              $namespaces    XML namespaces (the array values are passed to children())
 * @param  string             $property      XML tag name
 * @return string                            Tag value, or an empty string when nothing matches
 */
public function getNamespaceValue(SimpleXMLElement $xml, array $namespaces, $property)
{
    foreach ($namespaces as $uri) {

        $matches = $xml->children($uri)->$property;

        if ($matches->count() > 0) {
            return (string) $matches;
        }
    }

    return '';
}
}

View File

@ -2,91 +2,257 @@
namespace PicoFeed\Parsers;
use SimpleXMLElement;
use PicoFeed\Parser;
use PicoFeed\XmlParser;
use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Feed;
use PicoFeed\Item;
/**
* Atom parser
*
* @author Frederic Guillot
* @package parser
*/
class Atom extends \PicoFeed\Parser
class Atom extends Parser
{
/**
* Parse the document
* Get the path to the items XML tree
*
* @access public
* @return mixed Atom instance or false
* @param SimpleXMLElement $xml Feed xml
* @return SimpleXMLElement
*/
public function execute()
public function getItemsTree(SimpleXMLElement $xml)
{
\PicoFeed\Logging::log(\get_called_class().': begin parsing');
return $xml->entry;
}
\libxml_use_internal_errors(true);
$xml = \simplexml_load_string($this->content);
/**
 * Find the feed url
 *
 * The url is the one getLink() returns for the feed element.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
{
    $link = $this->getLink($xml);
    $feed->url = $link;
}
if ($xml === false) {
\PicoFeed\Logging::log(\get_called_class().': XML parsing error');
\PicoFeed\Logging::log($this->getXmlErrors());
return false;
/**
 * Find the feed title
 *
 * Takes the feed <title> passed through stripWhiteSpace(); when the result
 * is falsy, the value of $feed->url is used as title instead.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{
    $title = $this->stripWhiteSpace((string) $xml->title);

    if (! $title) {
        $title = $feed->url;
    }

    $feed->title = $title;
}
/**
 * Find the feed language
 *
 * The $xml argument is not read here: the language comes from
 * getXmlLang(), which is given the feed content stored on the parser.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    $language = $this->getXmlLang($this->content);
    $feed->language = $language;
}
/**
 * Find the feed id
 *
 * Copies the feed <id> element, cast to string, to the feed object.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedId(SimpleXMLElement $xml, Feed $feed)
{
    $id = (string) $xml->id;
    $feed->id = $id;
}
/**
 * Find the feed date
 *
 * The feed <updated> element is handed to parseDate() and the result is
 * stored as the feed date.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    $updated = (string) $xml->updated;
    $feed->date = $this->parseDate($updated);
}
/**
 * Find the item date
 *
 * The entry <updated> element is handed to parseDate() and the result is
 * stored as the item date.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemDate(SimpleXMLElement $entry, Item $item)
{
    $updated = (string) $entry->updated;
    $item->date = $this->parseDate($updated);
}
/**
 * Find the item title
 *
 * Takes the entry <title> passed through stripWhiteSpace(); when the
 * result is empty, the value of $item->url is used as title instead.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    $title = $this->stripWhiteSpace((string) $entry->title);

    $item->title = empty($title) ? $item->url : $title;
}
/**
 * Find the item author
 *
 * Uses the author name of the entry when it is set, otherwise the author
 * name found at the feed level.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{
    $source = isset($entry->author->name) ? $entry : $xml;

    $item->author = (string) $source->author->name;
}
/**
 * Find the item content
 *
 * The value returned by getContent() for the entry is passed to
 * filterHtml() together with $item->url, and the result is stored as the
 * item content.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemContent(SimpleXMLElement $entry, Item $item)
{
    $raw_content = $this->getContent($entry);

    $item->content = $this->filterHtml($raw_content, $item->url);
}
/**
 * Find the item URL
 *
 * The url is the one getLink() returns for the entry.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemUrl(SimpleXMLElement $entry, Item $item)
{
    $link = $this->getLink($entry);
    $item->url = $link;
}
/**
* Generate the item id
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$id = (string) $entry->id;
if ($id !== $item->url) {
$item_permalink = $id;
}
else {
$item_permalink = $item->url;
}
$this->language = $this->getXmlLang($this->content);
$this->url = $this->getUrl($xml);
$this->title = $this->stripWhiteSpace((string) $xml->title) ?: $this->url;
$this->id = (string) $xml->id;
$this->updated = $this->parseDate((string) $xml->updated);
$author = (string) $xml->author->name;
if ($this->isExcludedFromId($feed->url)) {
$feed_permalink = '';
}
else {
$feed_permalink = $feed->url;
}
\PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title);
\PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url);
$item->id = $this->generateId($item_permalink, $feed_permalink);
}
foreach ($xml->entry as $entry) {
/**
* Find the item enclosure
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
foreach ($entry->link as $link) {
if ((string) $link['rel'] === 'enclosure') {
if (isset($entry->author->name)) {
$author = (string) $entry->author->name;
}
$item->enclosure_url = (string) $link['href'];
$item->enclosure_type = (string) $link['type'];
$id = (string) $entry->id;
$item = new \StdClass;
$item->url = $this->getUrl($entry);
$item->id = $this->generateId($id !== $item->url ? $id : $item->url, $this->isExcludedFromId($this->url) ? '' : $this->url);
$item->title = $this->stripWhiteSpace((string) $entry->title);
$item->updated = $this->parseDate((string) $entry->updated);
$item->author = $author;
$item->content = $this->filterHtml($this->getContent($entry), $item->url);
$item->language = $this->language;
if (empty($item->title)) $item->title = $item->url;
// Try to find an enclosure
foreach ($entry->link as $link) {
if ((string) $link['rel'] === 'enclosure') {
$item->enclosure = (string) $link['href'];
$item->enclosure_type = (string) $link['type'];
if (\PicoFeed\Filter::isRelativePath($item->enclosure)) {
$item->enclosure = \PicoFeed\Filter::getAbsoluteUrl($item->enclosure, $this->url);
}
break;
if (Filter::isRelativePath($item->enclosure_url)) {
$item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url);
}
}
$this->items[] = $item;
break;
}
}
}
/**
 * Find the item language
 *
 * The entry itself is not inspected: the item gets the language already
 * stored on the feed object.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $language = $feed->language;
    $item->language = $language;
}
/**
* Get the URL from a link tag
*
* @access public
* @param SimpleXMLElement $xml XML tag
* @return string
*/
public function getLink(SimpleXMLElement $xml)
{
foreach ($xml->link as $link) {
if ((string) $link['type'] === 'text/html' || (string) $link['type'] === 'application/xhtml+xml') {
return (string) $link['href'];
}
}
\PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' items)');
return $this;
return (string) $xml->link['href'];
}
/**
* Get the entry content
*
* @access public
* @param SimpleXMLElement $entry XML Entry
* @param SimpleXMLElement $entry XML Entry
* @return string
*/
public function getContent($entry)
public function getContent(SimpleXMLElement $entry)
{
if (isset($entry->content) && ! empty($entry->content)) {
@ -103,22 +269,4 @@ class Atom extends \PicoFeed\Parser
return '';
}
/**
* Get the URL from a link tag
*
* @access public
* @param SimpleXMLElement $xml XML tag
* @return string
*/
public function getUrl($xml)
{
foreach ($xml->link as $link) {
if ((string) $link['type'] === 'text/html' || (string) $link['type'] === 'application/xhtml+xml') {
return (string) $link['href'];
}
}
return (string) $xml->link['href'];
}
}
}

View File

@ -2,86 +2,86 @@
namespace PicoFeed\Parsers;
class Rss10 extends \PicoFeed\Parser
require_once __DIR__.'/Rss20.php';
use SimpleXMLElement;
use PicoFeed\Feed;
use PicoFeed\Item;
use PicoFeed\Parsers\Rss20;
/**
* RSS 1.0 parser
*
* @author Frederic Guillot
* @package parser
*/
class Rss10 extends Rss20
{
public function execute()
/**
* Get the path to the items XML tree
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @return SimpleXMLElement
*/
public function getItemsTree(SimpleXMLElement $xml)
{
\PicoFeed\Logging::log(\get_called_class().': begin parsing');
return $xml->item;
}
\libxml_use_internal_errors(true);
$xml = \simplexml_load_string($this->content);
/**
 * Find the feed date
 *
 * Asks getNamespaceValue() for a "date" tag among the namespaced children
 * of the channel, then stores the result of parseDate() on that value.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    $value = $this->getNamespaceValue($xml->channel, $this->namespaces, 'date');

    $feed->date = $this->parseDate($value);
}
if ($xml === false) {
\PicoFeed\Logging::log(\get_called_class().': XML parsing error');
\PicoFeed\Logging::log($this->getXmlErrors());
return false;
}
/**
 * Find the feed language
 *
 * Asks getNamespaceValue() for a "language" tag among the namespaced
 * children of the channel and stores the returned string.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    $language = $this->getNamespaceValue($xml->channel, $this->namespaces, 'language');

    $feed->language = $language;
}
$namespaces = $xml->getNamespaces(true);
$this->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $this->url;
$this->url = (string) $xml->channel->link;
$this->id = $this->url;
$this->language = '';
\PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title);
\PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url);
if (isset($namespaces['dc'])) {
$ns_dc = $xml->channel->children($namespaces['dc']);
$this->updated = isset($ns_dc->date) ? $this->parseDate($ns_dc->date) : time();
/**
* Generate the item id
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
if ($this->isExcludedFromId($feed->url)) {
$feed_permalink = '';
}
else {
$this->updated = time();
$feed_permalink = $feed->url;
}
foreach ($xml->item as $entry) {
$item = new \StdClass;
$item->title = $this->stripWhiteSpace((string) $entry->title);
$item->url = '';
$item->author= '';
$item->updated = '';
$item->content = '';
$item->language = '';
foreach ($namespaces as $name => $url) {
$namespace = $entry->children($namespaces[$name]);
if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink;
if (! $item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator;
if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated);
if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded;
}
if (empty($item->url)) $item->url = (string) $entry->link;
if (empty($item->updated)) $item->updated = $this->updated;
if (empty($item->content)) {
$item->content = isset($entry->description) ? (string) $entry->description : '';
}
if (empty($item->author)) {
if (isset($entry->author)) {
$item->author = (string) $entry->author;
}
else if (isset($xml->channel->webMaster)) {
$item->author = (string) $xml->channel->webMaster;
}
}
if (empty($item->title)) $item->title = $item->url;
$item->id = $this->generateId($item->url, $this->isExcludedFromId($this->url) ? '' : $this->url);
$item->content = $this->filterHtml($item->content, $item->url);
$this->items[] = $item;
}
\PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' items)');
return $this;
$item->id = $this->generateId($item->url, $feed_permalink);
}
}
/**
* Find the item enclosure
*
* Empty body: nothing is read from the entry and the enclosure fields of
* the item are left untouched.
* NOTE(review): presumably this overrides the Rss20 version on purpose to
* disable enclosure lookup for RSS 1.0 - confirm against Rss20.
*
* @access public
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
}
}

View File

@ -2,35 +2,43 @@
namespace PicoFeed\Parsers;
use SimpleXMLElement;
use PicoFeed\Parser;
use PicoFeed\XmlParser;
use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Feed;
use PicoFeed\Item;
/**
* RSS 2.0 Parser
*
* @author Frederic Guillot
* @package parser
*/
class Rss20 extends \PicoFeed\Parser
class Rss20 extends Parser
{
/**
* Parse the document
* Get the path to the items XML tree
*
* @access public
* @return mixed Rss20 instance or false
* @param SimpleXMLElement $xml Feed xml
* @return SimpleXMLElement
*/
public function execute()
public function getItemsTree(SimpleXMLElement $xml)
{
\PicoFeed\Logging::log(\get_called_class().': begin parsing');
\libxml_use_internal_errors(true);
$xml = \simplexml_load_string($this->content);
if ($xml === false) {
\PicoFeed\Logging::log(\get_called_class().': XML parsing error');
\PicoFeed\Logging::log($this->getXmlErrors());
return false;
}
$namespaces = $xml->getNamespaces(true);
return $xml->channel->item;
}
/**
* Find the feed url
*
* @access public
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
{
if ($xml->channel->link && $xml->channel->link->count() > 1) {
foreach ($xml->channel->link as $xml_link) {
@ -38,112 +46,228 @@ class Rss20 extends \PicoFeed\Parser
$link = (string) $xml_link;
if ($link !== '') {
$this->url = (string) $link;
$feed->url = $link;
break;
}
}
}
else {
$this->url = (string) $xml->channel->link;
$feed->url = (string) $xml->channel->link;
}
$this->language = isset($xml->channel->language) ? (string) $xml->channel->language : '';
$this->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $this->url;
$this->id = $this->url;
$this->updated = $this->parseDate(isset($xml->channel->pubDate) ? (string) $xml->channel->pubDate : (string) $xml->channel->lastBuildDate);
\PicoFeed\Logging::log(\get_called_class().': Title => '.$this->title);
\PicoFeed\Logging::log(\get_called_class().': Url => '.$this->url);
// RSS feed might be empty
if (! $xml->channel->item) {
\PicoFeed\Logging::log(\get_called_class().': feed empty or malformed');
return $this;
}
foreach ($xml->channel->item as $entry) {
$item = new \StdClass;
$item->title = $this->stripWhiteSpace((string) $entry->title);
$item->url = '';
$item->author= '';
$item->updated = '';
$item->content = '';
$item->enclosure = '';
$item->enclosure_type = '';
$item->language = $this->language;
foreach ($namespaces as $name => $url) {
$namespace = $entry->children($namespaces[$name]);
if (! $item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator;
if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated);
if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded;
// Get FeedBurner original links
if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink;
if (! $item->enclosure && ! empty($namespace->origEnclosureLink)) $item->enclosure = (string) $namespace->origEnclosureLink;
}
if (empty($item->url)) {
if (isset($entry->link)) {
$item->url = (string) $entry->link;
}
else if (isset($entry->guid)) {
$item->url = (string) $entry->guid;
}
}
if (empty($item->updated)) $item->updated = $this->parseDate((string) $entry->pubDate) ?: $this->updated;
if (empty($item->content)) {
$item->content = isset($entry->description) ? (string) $entry->description : '';
}
if (empty($item->author)) {
if (isset($entry->author)) {
$item->author = (string) $entry->author;
}
else if (isset($xml->channel->webMaster)) {
$item->author = (string) $xml->channel->webMaster;
}
}
if (isset($entry->guid) && isset($entry->guid['isPermaLink']) && (string) $entry->guid['isPermaLink'] != 'false') {
$id = (string) $entry->guid;
$item->id = $this->generateId($id !== '' && $id !== $item->url ? $id : $item->url, $this->isExcludedFromId($this->url) ? '' : $this->url);
}
else {
$item->id = $this->generateId($item->url, $this->isExcludedFromId($this->url) ? '' : $this->url);
}
if (empty($item->title)) $item->title = $item->url;
// if optional enclosure tag with multimedia provided, capture here
if (isset($entry->enclosure)) {
if (! $item->enclosure) {
$item->enclosure = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : '';
}
$item->enclosure_type = isset($entry->enclosure['type']) ? (string) $entry->enclosure['type'] : '';
if (\PicoFeed\Filter::isRelativePath($item->enclosure)) {
$item->enclosure = \PicoFeed\Filter::getAbsoluteUrl($item->enclosure, $this->url);
}
}
$item->content = $this->filterHtml($item->content, $item->url);
$this->items[] = $item;
}
\PicoFeed\Logging::log(\get_called_class().': parsing finished ('.count($this->items).' items)');
return $this;
}
}
/**
 * Find the feed title
 *
 * The channel title is passed through stripWhiteSpace(); when the result is
 * falsy the feed url is used as the title instead.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{
    $title = $this->stripWhiteSpace((string) $xml->channel->title);

    if (! $title) {
        $title = $feed->url;
    }

    $feed->title = $title;
}
/**
 * Find the feed language
 *
 * Reads the <language> element of the channel. The language is an empty
 * string when the element is not present.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    $language = '';

    if (isset($xml->channel->language)) {
        $language = (string) $xml->channel->language;
    }

    $feed->language = $language;
}
/**
* Find the feed id
*
* The feed id is a copy of the feed url; the XML document is not read.
* NOTE(review): relies on $feed->url being populated before this call - confirm the call order in the base parser.
*
* @access public
* @param SimpleXMLElement $xml Feed xml (unused)
* @param \PicoFeed\Feed $feed Feed object
*/
public function findFeedId(SimpleXMLElement $xml, Feed $feed)
{
$feed->id = $feed->url;
}
/**
 * Find the feed date
 *
 * Uses the channel <pubDate> when it exists, otherwise <lastBuildDate>.
 * The raw value is converted by parseDate().
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    if (isset($xml->channel->pubDate)) {
        $node = $xml->channel->pubDate;
    }
    else {
        $node = $xml->channel->lastBuildDate;
    }

    $feed->date = $this->parseDate((string) $node);
}
/**
 * Find the item date
 *
 * Candidates are tried in this order: namespaced "date" element,
 * namespaced "updated" element, then the plain <pubDate> of the item.
 * The first non-empty value is converted by parseDate().
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemDate(SimpleXMLElement $entry, Item $item)
{
    $raw_date = '';

    foreach (array('date', 'updated') as $tag) {

        $raw_date = $this->getNamespaceValue($entry, $this->namespaces, $tag);

        if (! empty($raw_date)) {
            break;
        }
    }

    if (empty($raw_date)) {
        $raw_date = (string) $entry->pubDate;
    }

    $item->date = $this->parseDate($raw_date);
}
/**
 * Find the item title
 *
 * The <title> of the item is passed through stripWhiteSpace(). When no
 * title is left, the item url is used as the title.
 * NOTE(review): the fallback reads $item->url, so the item url is expected
 * to be resolved before this method runs - confirm the call order in the base parser.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    $title = $this->stripWhiteSpace((string) $entry->title);

    // Compare against the empty string instead of using empty():
    // empty('0') is true, which replaced a legitimate title "0" by the url
    if ((string) $title === '') {
        $title = $item->url;
    }

    $item->title = $title;
}
/**
 * Find the item author
 *
 * Order of precedence: namespaced "creator" element of the item, the
 * <author> element of the item, then the channel <webMaster>.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{
    $author = $this->getNamespaceValue($entry, $this->namespaces, 'creator');

    if (empty($author)) {

        if (isset($entry->author)) {
            $author = (string) $entry->author;
        }
        else if (isset($xml->channel->webMaster)) {
            $author = (string) $xml->channel->webMaster;
        }
    }

    $item->author = $author;
}
/**
 * Find the item content
 *
 * The namespaced "encoded" element is preferred; the <description> of the
 * item is used when that value is empty. The result goes through
 * filterHtml() together with the item url.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemContent(SimpleXMLElement $entry, Item $item)
{
    $html = $this->getNamespaceValue($entry, $this->namespaces, 'encoded');

    if (empty($html)) {

        if ($entry->description->count() > 0) {
            $html = (string) $entry->description;
        }
    }

    $item->content = $this->filterHtml($html, $item->url);
}
/**
 * Find the item URL
 *
 * The namespaced "origLink" element wins when it is not empty. Otherwise
 * the first existing element among <link> and <guid> is used.
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 */
public function findItemUrl(SimpleXMLElement $entry, Item $item)
{
    $url = $this->getNamespaceValue($entry, $this->namespaces, 'origLink');

    if (empty($url)) {

        foreach (array('link', 'guid') as $tag) {

            if (isset($entry->$tag)) {
                $url = (string) $entry->$tag;
                break;
            }
        }
    }

    $item->url = $url;
}
/**
 * Generate the item id
 *
 * The id is built by generateId() from an item permalink and a feed permalink:
 *
 * - item permalink: the <guid> value unless its isPermaLink attribute is
 *   "false" or the guid is blank, in which case the item url is used
 * - feed permalink: the feed url, or an empty string when
 *   isExcludedFromId() returns true for that url
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $item_permalink = '';

    if ($entry->guid->count() > 0 && (string) $entry->guid['isPermaLink'] !== 'false') {
        $item_permalink = (string) $entry->guid;
    }

    // A blank <guid/> would give the same id to every item of the feed: use the item url instead.
    // trim() is only used for the check, the guid itself is kept untouched to keep existing ids stable.
    if (trim($item_permalink) === '') {
        $item_permalink = $item->url;
    }

    $feed_permalink = $this->isExcludedFromId($feed->url) ? '' : $feed->url;

    $item->id = $this->generateId($item_permalink, $feed_permalink);
}
/**
 * Find the item enclosure
 *
 * Nothing is set when the item has no <enclosure> element. Otherwise:
 *
 * - the url comes from the namespaced "origEnclosureLink" element of the
 *   item, or from the "url" attribute of <enclosure> when that is empty
 * - the type comes from the "type" attribute of <enclosure>
 * - a relative url is made absolute against the feed url
 *
 * @access public
 * @param  SimpleXMLElement   $entry   Feed item
 * @param  \PicoFeed\Item     $item    Item object
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    if (! isset($entry->enclosure)) {
        return;
    }

    // origEnclosureLink is a child of the item, not of <enclosure>:
    // looking it up under $entry->enclosure never finds the original link
    $enclosure_url = $this->getNamespaceValue($entry, $this->namespaces, 'origEnclosureLink');

    if (empty($enclosure_url)) {
        $enclosure_url = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : '';
    }

    $item->enclosure_url = $enclosure_url;
    $item->enclosure_type = isset($entry->enclosure['type']) ? (string) $entry->enclosure['type'] : '';

    if (Filter::isRelativePath($item->enclosure_url)) {
        $item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url);
    }
}
/**
* Find the item language
*
* The item takes the language of the feed; the item XML is not read.
*
* @access public
* @param SimpleXMLElement $entry Feed item (unused)
* @param \PicoFeed\Item $item Item object
* @param \PicoFeed\Feed $feed Feed object
*/
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$item->language = $feed->language;
}
}

View File

@ -4,4 +4,14 @@ namespace PicoFeed\Parsers;
require_once __DIR__.'/Rss20.php';
class Rss91 extends Rss20 {}
use PicoFeed\Parsers\Rss20;
/**
* RSS 0.91 Parser
*
* @author Frederic Guillot
* @package parser
*/
class Rss91 extends Rss20
{
}

View File

@ -4,4 +4,14 @@ namespace PicoFeed\Parsers;
require_once __DIR__.'/Rss20.php';
class Rss92 extends Rss20 {}
use PicoFeed\Parsers\Rss20;
/**
* RSS 0.92 Parser
*
* @author Frederic Guillot
* @package parser
*/
class Rss92 extends Rss20
{
}

20
vendor/PicoFeed/PicoFeed.php vendored Normal file
View File

@ -0,0 +1,20 @@
<?php
// Include this file if you don't want to use an autoloader
require __DIR__.'/Config.php';
require __DIR__.'/Logging.php';
require __DIR__.'/Item.php';
require __DIR__.'/Feed.php';
require __DIR__.'/Client.php';
require __DIR__.'/Filter.php';
require __DIR__.'/XmlParser.php';
require __DIR__.'/Encoding.php';
require __DIR__.'/Grabber.php';
require __DIR__.'/Reader.php';
require __DIR__.'/Import.php';
require __DIR__.'/Export.php';
require __DIR__.'/Writer.php';
require __DIR__.'/Writers/Rss20.php';
require __DIR__.'/Writers/Atom.php';
require __DIR__.'/Parser.php';

View File

@ -2,16 +2,19 @@
namespace PicoFeed;
require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Parser.php';
require_once __DIR__.'/Client.php';
require_once __DIR__.'/Filter.php';
use DOMXPath;
use PicoFeed\Config;
use PicoFeed\XmlParser;
use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Client;
use PicoFeed\Parser;
/**
* Reader class
*
* @author Frederic Guillot
* @package parser
* @package picofeed
*/
class Reader
{
@ -39,19 +42,24 @@ class Reader
*/
private $encoding = '';
/**
* Config class instance
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/**
* Constructor
*
* @access public
* @param string $content Feed content
* @param string $encoding Feed encoding
* @return Reader
* @param \PicoFeed\Config $config Config class instance
*/
public function __construct($content = '', $encoding = '')
public function __construct(Config $config = null)
{
$this->content = $content;
$this->encoding = '';
return $this;
$this->config = $config ?: new Config;
Logging::setTimezone($this->config->getTimezone());
}
/**
@ -61,59 +69,60 @@ class Reader
* @param string $url Feed url
* @param string $last_modified Last modified HTTP header
* @param string $etag Etag HTTP header
* @param string $timeout Client connection timeout
* @param string $user_agent HTTP user-agent
* @return Client
* @return \PicoFeed\Client
*/
public function download($url, $last_modified = '', $etag = '', $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)')
public function download($url, $last_modified = '', $etag = '')
{
if (strpos($url, 'http') !== 0) {
$url = 'http://'.$url;
}
$client = Client::create();
$client->url = $url;
$client->timeout = $timeout;
$client->user_agent = $user_agent;
$client->last_modified = $last_modified;
$client->etag = $etag;
$client->execute();
$client = Client::getInstance();
$client->setTimeout($this->config->getClientTimeout())
->setUserAgent($this->config->getClientUserAgent())
->setMaxRedirections($this->config->getMaxRedirections())
->setMaxBodySize($this->config->getMaxBodySize())
->setProxyHostname($this->config->getProxyHostname())
->setProxyPort($this->config->getProxyPort())
->setProxyUsername($this->config->getProxyUsername())
->setProxyPassword($this->config->getProxyPassword())
->setLastModified($last_modified)
->setEtag($etag);
$this->content = $client->getContent();
$this->url = $client->getUrl();
$this->encoding = $client->getEncoding();
if ($client->execute($url)) {
$this->content = $client->getContent();
$this->url = $client->getUrl();
$this->encoding = $client->getEncoding();
}
return $client;
}
/**
* Get the download content
* Get a parser instance with a custom config
*
* @access public
* @return string
* @param string $name Parser name
* @return \PicoFeed\Parser
*/
public function getContent()
public function getParserInstance($name)
{
return $this->content;
}
require_once __DIR__.'/Parsers/'.ucfirst($name).'.php';
$name = '\PicoFeed\Parsers\\'.$name;
/**
* Get finale URL
*
* @access public
* @return string
*/
public function getUrl()
{
return $this->url;
$parser = new $name($this->content, $this->encoding);
$parser->setHashAlgo($this->config->getParserHashAlgo());
$parser->setTimezone($this->config->getTimezone());
$parser->setConfig($this->config);
return $parser;
}
/**
* Get the first XML tag
*
* @access public
* @param string $data Feed content
* @param string $data Feed content
* @return string
*/
public function getFirstTag($data)
@ -138,6 +147,31 @@ class Reader
return substr($data, $open_tag, $close_tag);
}
/**
 * Detect the feed format
 *
 * The format matches when every needle is found in the haystack (an empty
 * list of needles always matches). On a match the detection is logged and
 * a parser instance is created with getParserInstance().
 *
 * @access public
 * @param  string    $parser_name   Parser name
 * @param  string    $haystack      First XML tag
 * @param  array     $needles       List of strings that need to be there
 * @return mixed                    False on failure or Parser instance
 */
public function detectFormat($parser_name, $haystack, array $needles)
{
    $missing = 0;

    foreach ($needles as $needle) {

        if (strpos($haystack, $needle) === false) {
            $missing++;
        }
    }

    if ($missing > 0) {
        return false;
    }

    Logging::setMessage(get_called_class().': Format detected => '.$parser_name);

    return $this->getParserInstance($parser_name);
}
/**
* Discover feed format and return a parser instance
*
@ -147,66 +181,44 @@ class Reader
*/
public function getParser($discover = false)
{
$formats = array(
array('parser' => 'Atom', 'needles' => array('<feed')),
array('parser' => 'Rss20', 'needles' => array('<rss', '2.0')),
array('parser' => 'Rss92', 'needles' => array('<rss', '0.92')),
array('parser' => 'Rss91', 'needles' => array('<rss', '0.91')),
array('parser' => 'Rss10', 'needles' => array('<rdf:', 'xmlns="http://purl.org/rss/1.0/"')),
);
$first_tag = $this->getFirstTag($this->content);
if (strpos($first_tag, '<feed') !== false) {
foreach ($formats as $format) {
Logging::log(\get_called_class().': discover Atom feed');
$parser = $this->detectFormat($format['parser'], $first_tag, $format['needles']);
require_once __DIR__.'/Parsers/Atom.php';
return new Parsers\Atom($this->content, $this->encoding);
if ($parser !== false) {
return $parser;
}
}
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) {
Logging::log(\get_called_class().': discover RSS 2.0 feed');
if ($discover === true) {
require_once __DIR__.'/Parsers/Rss20.php';
return new Parsers\Rss20($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) {
Logging::log(\get_called_class().': discover RSS 0.92 feed');
require_once __DIR__.'/Parsers/Rss92.php';
return new Parsers\Rss92($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) {
Logging::log(\get_called_class().': discover RSS 0.91 feed');
require_once __DIR__.'/Parsers/Rss91.php';
return new Parsers\Rss91($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rdf:') !== false && strpos($first_tag, 'xmlns="http://purl.org/rss/1.0/"') !== false) {
Logging::log(\get_called_class().': discover RSS 1.0 feed');
require_once __DIR__.'/Parsers/Rss10.php';
return new Parsers\Rss10($this->content, $this->encoding);
}
else if ($discover === true) {
Logging::log(\get_called_class().': Format not supported or malformed');
Logging::log(\get_called_class().':'.PHP_EOL.$this->content);
Logging::setMessage(get_called_class().': Format not supported or feed malformed');
Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content);
return false;
}
else if ($this->discover()) {
return $this->getParser(true);
}
Logging::log(\get_called_class().': Subscription not found');
Logging::log(\get_called_class().': Content => '.PHP_EOL.$this->content);
Logging::setMessage(get_called_class().': Subscription not found');
Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content);
return false;
}
/**
* Discover feed url inside a HTML document and download the feed
* Discover the feed url inside a HTML document and download the feed
*
* @access public
* @return boolean
@ -214,18 +226,13 @@ class Reader
public function discover()
{
if (! $this->content) {
return false;
}
Logging::log(\get_called_class().': Try to discover a subscription');
Logging::setMessage(get_called_class().': Try to discover a subscription');
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadHTML($this->content);
$xpath = new \DOMXPath($dom);
$dom = XmlParser::getHtmlDocument($this->content);
$xpath = new DOMXPath($dom);
$queries = array(
"//link[@type='application/atom+xml']",
@ -251,7 +258,7 @@ class Reader
$link = $this->url.$link;
}
Logging::log(\get_called_class().': Find subscription link: '.$link);
Logging::setMessage(get_called_class().': Find subscription link: '.$link);
$this->download($link);
return true;
@ -261,4 +268,52 @@ class Reader
return false;
}
/**
* Get the downloaded content
*
* Returns the content property as it is: set by download() after a
* successful request or by setContent().
*
* @access public
* @return string
*/
public function getContent()
{
return $this->content;
}
/**
* Set the page content
*
* Stores the content on the reader without downloading anything.
*
* @access public
* @param string $content Page content
* @return \PicoFeed\Reader Current instance, to allow method chaining
*/
public function setContent($content)
{
$this->content = $content;
return $this;
}
/**
* Get final URL
*
* Returns the url property as it is: set by download() from the HTTP
* client after a successful request or by setUrl().
*
* @access public
* @return string
*/
public function getUrl()
{
return $this->url;
}
/**
* Set the URL
*
* Stores the URL on the reader without downloading anything.
*
* @access public
* @param string $url URL
* @return \PicoFeed\Reader Current instance, to allow method chaining
*/
public function setUrl($url)
{
$this->url = $url;
return $this;
}
}

View File

@ -0,0 +1,10 @@
<?php
// Rule file made of XPath expressions for one website
// (presumably consumed by the content grabber - TODO confirm).
return array(
// Sample article URL for this rule.
// NOTE(review): the host name is missing from this URL ("http://www./..."), restore it before relying on it.
'test_url' => 'http://www./2014/05/20/le-playstation-now-arrive-en-beta-fermee-aux-etats-unis/',
// XPath queries matching the article body
'body' => array(
'//div[@class="post-content"]',
),
// XPath queries matching nodes to strip (presumably removed from the matched body - confirm)
'strip' => array(
'//style'
)
);

View File

@ -2,22 +2,55 @@
namespace PicoFeed;
use RuntimeException;
/**
* Base writer class
*
* @author Frederic Guillot
* @package picofeed
*/
abstract class Writer
{
/**
* Dom object
*
* @access protected
* @var DomDocument
*/
protected $dom;
/**
* Items
*
* @access public
* @var array
*/
public $items = array();
/**
* Generate the XML document
*
* @abstract
* @access public
* @param string $filename Optional filename
* @return string
*/
abstract public function execute($filename = '');
public function checkRequiredProperties($properties, $container)
/**
* Check required properties to generate the output
*
* @access public
* @param array $properties List of properties
* @param mixed $container Object or array container
*/
public function checkRequiredProperties(array $properties, $container)
{
foreach ($properties as $property) {
if ((is_object($container) && ! isset($container->$property)) || (is_array($container) && ! isset($container[$property]))) {
throw new \RuntimeException('Required property missing: '.$property);
throw new RuntimeException('Required property missing: '.$property);
}
}
}
}
}

View File

@ -2,32 +2,59 @@
namespace PicoFeed\Writers;
require_once __DIR__.'/../Writer.php';
use DomDocument;
use DomElement;
use DomAttr;
use PicoFeed\Writer;
class Atom extends \PicoFeed\Writer
/**
* Atom writer class
*
* @author Frederic Guillot
* @package picofeed
*/
class Atom extends Writer
{
/**
* List of required properties for each feed
*
* @access private
* @var array
*/
private $required_feed_properties = array(
'title',
'site_url',
'feed_url',
);
/**
* List of required properties for each item
*
* @access private
* @var array
*/
private $required_item_properties = array(
'title',
'url',
);
/**
* Get the Atom document
*
* @access public
* @param string $filename Optional filename
* @return string
*/
public function execute($filename = '')
{
$this->checkRequiredProperties($this->required_feed_properties, $this);
$this->dom = new \DomDocument('1.0', 'UTF-8');
$this->dom = new DomDocument('1.0', 'UTF-8');
$this->dom->formatOutput = true;
// <feed/>
$feed = $this->dom->createElement('feed');
$feed->setAttributeNodeNS(new \DomAttr('xmlns', 'http://www.w3.org/2005/Atom'));
$feed->setAttributeNodeNS(new DomAttr('xmlns', 'http://www.w3.org/2005/Atom'));
// <generator/>
$generator = $this->dom->createElement('generator', 'PicoFeed');
@ -115,8 +142,16 @@ class Atom extends \PicoFeed\Writer
}
}
public function addLink($xml, $url, $rel = 'alternate', $type = 'text/html')
/**
* Add Link
*
* @access public
* @param DomElement $xml XML node
* @param string $url URL
* @param string $rel Link rel attribute
* @param string $type Link type attribute
*/
public function addLink(DomElement $xml, $url, $rel = 'alternate', $type = 'text/html')
{
$link = $this->dom->createElement('link');
$link->setAttribute('rel', $rel);
@ -125,8 +160,14 @@ class Atom extends \PicoFeed\Writer
$xml->appendChild($link);
}
public function addUpdated($xml, $value = '')
/**
* Add publication date
*
* @access public
* @param DomElement $xml XML node
* @param string $value Timestamp
*/
public function addUpdated(DomElement $xml, $value = '')
{
$xml->appendChild($this->dom->createElement(
'updated',
@ -134,8 +175,14 @@ class Atom extends \PicoFeed\Writer
));
}
public function addAuthor($xml, array $values)
/**
* Add author
*
* @access public
* @param DomElement $xml XML node
* @param array $values Author name and email
*/
public function addAuthor(DomElement $xml, array $values)
{
$author = $this->dom->createElement('author');
@ -159,4 +206,4 @@ class Atom extends \PicoFeed\Writer
$xml->appendChild($author);
}
}
}

View File

@ -2,34 +2,61 @@
namespace PicoFeed\Writers;
require_once __DIR__.'/../Writer.php';
use DomDocument;
use DomAttr;
use DomElement;
use PicoFeed\Writer;
class Rss20 extends \PicoFeed\Writer
/**
* Rss 2.0 writer class
*
* @author Frederic Guillot
* @package picofeed
*/
class Rss20 extends Writer
{
/**
* List of required properties for each feed
*
* @access private
* @var array
*/
private $required_feed_properties = array(
'title',
'site_url',
'feed_url',
);
/**
* List of required properties for each item
*
* @access private
* @var array
*/
private $required_item_properties = array(
'title',
'url',
);
/**
* Get the Rss 2.0 document
*
* @access public
* @param string $filename Optional filename
* @return string
*/
public function execute($filename = '')
{
$this->checkRequiredProperties($this->required_feed_properties, $this);
$this->dom = new \DomDocument('1.0', 'UTF-8');
$this->dom = new DomDocument('1.0', 'UTF-8');
$this->dom->formatOutput = true;
// <rss/>
$rss = $this->dom->createElement('rss');
$rss->setAttribute('version', '2.0');
$rss->setAttributeNodeNS(new \DomAttr('xmlns:content', 'http://purl.org/rss/1.0/modules/content/'));
$rss->setAttributeNodeNS(new \DomAttr('xmlns:atom', 'http://www.w3.org/2005/Atom'));
$rss->setAttributeNodeNS(new DomAttr('xmlns:content', 'http://purl.org/rss/1.0/modules/content/'));
$rss->setAttributeNodeNS(new DomAttr('xmlns:atom', 'http://www.w3.org/2005/Atom'));
$channel = $this->dom->createElement('channel');
@ -130,8 +157,14 @@ class Rss20 extends \PicoFeed\Writer
}
}
public function addPubDate($xml, $value = '')
/**
* Add publication date
*
* @access public
* @param DomElement $xml XML node
* @param string $value Timestamp
*/
public function addPubDate(DomElement $xml, $value = '')
{
$xml->appendChild($this->dom->createElement(
'pubDate',
@ -139,8 +172,15 @@ class Rss20 extends \PicoFeed\Writer
));
}
public function addAuthor($xml, $tag, array $values)
/**
* Add author
*
* @access public
* @param DomElement $xml XML node
* @param string $tag Tag name
* @param array $values Author name and email
*/
public function addAuthor(DomElement $xml, $tag, array $values)
{
$value = '';
@ -153,4 +193,4 @@ class Rss20 extends \PicoFeed\Writer
$xml->appendChild($author);
}
}
}
}

136
vendor/PicoFeed/XmlParser.php vendored Normal file
View File

@ -0,0 +1,136 @@
<?php
namespace PicoFeed;
use DomDocument;
use SimpleXmlElement;
/**
* XML parser class
*
* Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
*
* @author Frederic Guillot
* @package picofeed
*/
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false
     *
     * The input is loaded with getDomDocument() first, so the same checks
     * apply (empty input, parsing errors, entity declarations).
     *
     * @static
     * @access public
     * @param  string   $input   XML content
     * @return mixed             SimpleXmlElement instance or false
     */
    public static function getSimpleXml($input)
    {
        $dom = self::getDomDocument($input);

        if ($dom === false) {
            return false;
        }

        $simplexml = simplexml_import_dom($dom);

        if (! $simplexml instanceof SimpleXmlElement) {
            return false;
        }

        return $simplexml;
    }

    /**
     * Get a DomDocument instance or return false
     *
     * False is returned when the input is empty, when nothing could be
     * parsed, or when the document type declares at least one entity.
     * The libxml error buffer is reset before parsing, so getErrors()
     * only reports the errors of the last parsed document.
     *
     * @static
     * @access public
     * @param  string   $input   XML content
     * @return mixed             DomDocument instance or false
     */
    public static function getDomDocument($input)
    {
        $input = (string) $input;

        // loadXml() raises a warning (a ValueError since PHP 8) for an empty string
        if ($input === '') {
            return false;
        }

        $restore_entity_loader = false;
        $previous_entity_loader = false;

        if (substr(php_sapi_name(), 0, 3) === 'fpm') {

            // If running with PHP-FPM and an entity is detected we refuse to parse the feed
            // @see https://bugs.php.net/bug.php?id=64938
            if (strpos($input, '<!ENTITY') !== false) {
                return false;
            }
        }
        else if (PHP_VERSION_ID < 80000) {

            // The function is deprecated since PHP 8.0, where external entity loading is already off by default.
            // The previous state is kept because the setting is global and would otherwise
            // break unrelated code loading XML files later in the same process.
            $previous_entity_loader = libxml_disable_entity_loader(true);
            $restore_entity_loader = true;
        }

        libxml_use_internal_errors(true);

        // Drop the errors buffered by previous documents
        libxml_clear_errors();

        $dom = new DomDocument;
        $dom->loadXml($input, LIBXML_NONET);

        if ($restore_entity_loader) {
            libxml_disable_entity_loader($previous_entity_loader);
        }

        // The document is empty, there is probably some parsing errors
        if ($dom->childNodes->length === 0) {
            return false;
        }

        // Scan for potential XEE attacks using ENTITY
        foreach ($dom->childNodes as $child) {

            if ($child->nodeType === XML_DOCUMENT_TYPE_NODE && $child->entities->length > 0) {
                return false;
            }
        }

        return $dom;
    }

    /**
     * Load a HTML document by using a DomDocument instance
     *
     * A DomDocument is always returned: it is left empty when the input is
     * empty. The libxml error buffer is reset before parsing.
     *
     * @static
     * @access public
     * @param  string   $input   HTML content
     * @return DomDocument
     */
    public static function getHtmlDocument($input)
    {
        libxml_use_internal_errors(true);

        // Drop the errors buffered by previous documents
        libxml_clear_errors();

        $dom = new DomDocument;
        $input = (string) $input;

        // loadHTML() raises a warning (a ValueError since PHP 8) for an empty string
        if ($input === '') {
            return $dom;
        }

        // The options argument of loadHTML() is only available since PHP 5.4
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        }
        else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Get XML parser errors
     *
     * Formats every error currently stored in the libxml error buffer and
     * joins them with a comma. The buffer itself is not modified.
     *
     * @static
     * @access public
     * @return string
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {

            $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)',
                $error->message,
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }
}