Update PicoFeed and PicoDb

This commit is contained in:
Frédéric Guillot 2014-10-19 14:42:31 -04:00
parent e1b22f2d35
commit e2280f1b7b
34 changed files with 2364 additions and 2485 deletions

View File

@ -3,7 +3,6 @@
require __DIR__.'/lib/Translator.php'; require __DIR__.'/lib/Translator.php';
require __DIR__.'/vendor/PicoDb/Database.php'; require __DIR__.'/vendor/PicoDb/Database.php';
require __DIR__.'/vendor/PicoFeed/PicoFeed.php'; require __DIR__.'/vendor/PicoFeed/PicoFeed.php';
require __DIR__.'/vendor/Readability/Readability.php';
require __DIR__.'/vendor/SimpleValidator/Validator.php'; require __DIR__.'/vendor/SimpleValidator/Validator.php';
require __DIR__.'/vendor/SimpleValidator/Base.php'; require __DIR__.'/vendor/SimpleValidator/Base.php';

View File

@ -8,8 +8,7 @@ How the content grabber works?
1. Try with rules first (xpath patterns) for the domain name (see `PicoFeed\Rules\`) 1. Try with rules first (xpath patterns) for the domain name (see `PicoFeed\Rules\`)
2. Try to find the text content by using common attributes for class and id 2. Try to find the text content by using common attributes for class and id
3. Fallback to Readability if no content is found 3. Finally, if nothing is found, the feed content is displayed
4. Finally, if nothing is found, the feed content is displayed
The content downloader use a fake user agent, actually Google Chrome under Mac Os X. The content downloader use a fake user agent, actually Google Chrome under Mac Os X.

View File

@ -8,7 +8,6 @@ use PicoFeed\Logging;
use PicoFeed\Grabber; use PicoFeed\Grabber;
use PicoFeed\Client; use PicoFeed\Client;
use PicoFeed\Filter; use PicoFeed\Filter;
use Readability;
// Get all items without filtering // Get all items without filtering
function get_everything() function get_everything()
@ -535,12 +534,9 @@ function download_content_url($url)
if ($grabber->parse()) { if ($grabber->parse()) {
$content = $grabber->getcontent(); $content = $grabber->getcontent();
} }
else {
$content = download_content_readability($grabber->getRawContent(), $url);
}
if (! empty($content)) { if (! empty($content)) {
$filter = new Filter($content, $url); $filter = Filter::html($content, $url);
$filter->setConfig(Config\get_reader_config()); $filter->setConfig(Config\get_reader_config());
$content = $filter->execute(); $content = $filter->execute();
} }
@ -580,18 +576,3 @@ function download_content_id($item_id)
'content' => '' 'content' => ''
); );
} }
// Extract the main content of a page with the Readability PHP port.
// Returns the extracted HTML, or an empty string when there is nothing
// to parse or when Readability fails to initialize.
function download_content_readability($content, $url)
{
    // Nothing to parse
    if (empty($content)) {
        return '';
    }

    $readability = new Readability($content, $url);

    return $readability->init() ? $readability->getContent()->innerHTML : '';
}

View File

@ -86,6 +86,11 @@ class Database
public function escapeIdentifier($value) public function escapeIdentifier($value)
{ {
// Do not escape custom query
if (strpos($value, '.') !== false || strpos($value, ' ') !== false) {
return $value;
}
return $this->pdo->escapeIdentifier($value); return $this->pdo->escapeIdentifier($value);
} }

View File

@ -70,7 +70,6 @@ class Mysql extends \PDO {
public function escapeIdentifier($value) public function escapeIdentifier($value)
{ {
if (strpos($value, '.') !== false) return $value;
return '`'.$value.'`'; return '`'.$value.'`';
} }
} }

View File

@ -51,7 +51,6 @@ class Sqlite extends \PDO {
public function escapeIdentifier($value) public function escapeIdentifier($value)
{ {
if (strpos($value, '.') !== false) return $value;
return '"'.$value.'"'; return '"'.$value.'"';
} }
} }

View File

@ -173,6 +173,10 @@ class Table
public function buildSelectQuery() public function buildSelectQuery()
{ {
foreach ($this->columns as $key => $value) {
$this->columns[$key] = $this->db->escapeIdentifier($value);
}
return sprintf( return sprintf(
'SELECT %s %s FROM %s %s %s %s %s %s %s', 'SELECT %s %s FROM %s %s %s %s %s %s %s',
$this->distinct ? 'DISTINCT' : '', $this->distinct ? 'DISTINCT' : '',
@ -350,7 +354,7 @@ class Table
switch (strtolower($name)) { switch (strtolower($name)) {
case 'in': case 'in':
if (isset($arguments[1]) && is_array($arguments[1])) { if (isset($arguments[1]) && is_array($arguments[1]) && ! empty($arguments[1])) {
$sql = sprintf( $sql = sprintf(
'%s IN (%s)', '%s IN (%s)',
@ -361,7 +365,7 @@ class Table
break; break;
case 'notin': case 'notin':
if (isset($arguments[1]) && is_array($arguments[1])) { if (isset($arguments[1]) && is_array($arguments[1]) && ! empty($arguments[1])) {
$sql = sprintf( $sql = sprintf(
'%s NOT IN (%s)', '%s NOT IN (%s)',

View File

@ -5,7 +5,6 @@ namespace PicoFeed;
use LogicException; use LogicException;
use Clients\Curl; use Clients\Curl;
use Clients\Stream; use Clients\Stream;
use PicoFeed\Logging;
/** /**
* Client class * Client class
@ -23,6 +22,14 @@ abstract class Client
*/ */
private $is_modified = true; private $is_modified = true;
/**
* Flag that say if the resource is a 404
*
* @access private
* @var bool
*/
private $is_not_found = false;
/** /**
* HTTP encoding * HTTP encoding
* *
@ -170,38 +177,110 @@ abstract class Client
$response = $this->doRequest(); $response = $this->doRequest();
if (is_array($response)) { if (is_array($response)) {
$this->handleNotModifiedResponse($response);
if ($response['status'] == 304) { $this->handleNotFoundResponse($response);
$this->is_modified = false; $this->handleNormalResponse($response);
Logging::setMessage(get_called_class().' Resource not modified');
}
else if ($response['status'] == 404) {
Logging::setMessage(get_called_class().' Resource not found');
}
else {
$etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : '';
$last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : '';
$this->content = $response['body'];
if (isset($response['headers']['Content-Type'])) {
$result = explode('charset=', strtolower($response['headers']['Content-Type']));
$this->encoding = isset($result[1]) ? $result[1] : '';
}
if (($this->etag && $this->etag === $etag) || ($this->last_modified && $last_modified === $this->last_modified)) {
$this->is_modified = false;
}
$this->etag = $etag;
$this->last_modified = $last_modified;
}
return true; return true;
} }
return false; return false;
} }
/**
 * Detect an unchanged resource and remember the cache validators
 *
 * A 304 status flags the resource as not modified. For a 200 status, the
 * ETag and Last-Modified response headers are compared with the values
 * currently stored on the client (a match also flags the resource as not
 * modified), then the stored values are replaced by the received ones.
 *
 * @access public
 * @param  array   $response   Client response
 */
public function handleNotModifiedResponse(array $response)
{
    switch ($response['status']) {
        case 304:
            $this->is_modified = false;
            break;

        case 200:
            $received_etag = $this->getHeader($response, 'ETag');
            $received_date = $this->getHeader($response, 'Last-Modified');

            $unchanged = $this->isPropertyEquals('etag', $received_etag)
                || $this->isPropertyEquals('last_modified', $received_date);

            if ($unchanged) {
                $this->is_modified = false;
            }

            $this->etag = $received_etag;
            $this->last_modified = $received_date;
            break;
    }

    if ($this->is_modified === false) {
        Logging::setMessage(get_called_class().' Resource not modified');
    }
}
/**
 * Flag the resource as missing when the server answers with a 404
 *
 * Any other status leaves the client state untouched.
 *
 * @access public
 * @param  array   $response   Client response
 */
public function handleNotFoundResponse(array $response)
{
    if ($response['status'] != 404) {
        return;
    }

    $this->is_not_found = true;
    Logging::setMessage(get_called_class().' Resource not found');
}
/**
 * Store the body and the charset of a successful (200) response
 *
 * Any other status leaves the content and the encoding untouched.
 *
 * @access public
 * @param  array   $response   Client response
 */
public function handleNormalResponse(array $response)
{
    if ($response['status'] != 200) {
        return;
    }

    $this->content = $response['body'];
    $this->encoding = $this->findCharset($response);
}
/**
 * Check if a class property is set (truthy) and strictly equal to a value
 *
 * @access private
 * @param  string   $property   Class property name
 * @param  string   $value      Value to compare with
 * @return boolean
 */
private function isPropertyEquals($property, $value)
{
    $current = $this->$property;

    // An empty property never matches, even when the value is empty too
    return $current && $current === $value;
}
/**
 * Find charset from response headers
 *
 * The charset is read from the "Content-Type" header and returned in
 * lowercase. Surrounding quotes, whitespace and any parameter that follows
 * the charset (for example "; boundary=...") are stripped, so that
 * `text/html; charset="UTF-8"` gives `utf-8`.
 *
 * @access public
 * @param  array   $response   Client response
 * @return string              Charset, or an empty string when none is declared
 */
public function findCharset(array $response)
{
    $result = explode('charset=', strtolower($this->getHeader($response, 'Content-Type')));

    if (! isset($result[1])) {
        return '';
    }

    // Keep only the charset token: drop the parameters that may follow it
    $parts = explode(';', $result[1], 2);

    return trim($parts[0], " \t\"'");
}
/**
 * Get header value from a client response
 *
 * The exact header name is tried first, then a case-insensitive match is
 * attempted because HTTP field names are case-insensitive (a server may
 * send "content-type" instead of "Content-Type").
 *
 * @access public
 * @param  array   $response   Client response
 * @param  string  $header     Header name
 * @return string              Header value, or an empty string when the header is missing
 */
public function getHeader(array $response, $header)
{
    if (! isset($response['headers']) || ! is_array($response['headers'])) {
        return '';
    }

    if (isset($response['headers'][$header])) {
        return $response['headers'][$header];
    }

    foreach ($response['headers'] as $name => $value) {
        // Null values are ignored to behave like the isset() check above
        if ($value !== null && strcasecmp((string) $name, $header) === 0) {
            return $value;
        }
    }

    return '';
}
/** /**
* Parse HTTP headers * Parse HTTP headers
* *
@ -340,6 +419,17 @@ abstract class Client
return $this->is_modified; return $this->is_modified;
} }
/**
* Return true if the remote resource is not found
*
* The flag is false by default and is set to true by
* handleNotFoundResponse() when the response status is 404.
*
* @access public
* @return bool
*/
public function isNotFound()
{
return $this->is_not_found;
}
/** /**
* Set connection timeout * Set connection timeout
* *
@ -453,14 +543,16 @@ abstract class Client
*/ */
public function setConfig($config) public function setConfig($config)
{ {
$this->setTimeout($config->getGrabberTimeout()); if ($config !== null) {
$this->setUserAgent($config->getGrabberUserAgent()); $this->setTimeout($config->getGrabberTimeout());
$this->setMaxRedirections($config->getMaxRedirections()); $this->setUserAgent($config->getGrabberUserAgent());
$this->setMaxBodySize($config->getMaxBodySize()); $this->setMaxRedirections($config->getMaxRedirections());
$this->setProxyHostname($config->getProxyHostname()); $this->setMaxBodySize($config->getMaxBodySize());
$this->setProxyPort($config->getProxyPort()); $this->setProxyHostname($config->getProxyHostname());
$this->setProxyUsername($config->getProxyUsername()); $this->setProxyPort($config->getProxyPort());
$this->setProxyPassword($config->getProxyPassword()); $this->setProxyUsername($config->getProxyUsername());
$this->setProxyPassword($config->getProxyPassword());
}
return $this; return $this;
} }

View File

@ -97,36 +97,37 @@ class Curl extends Client
} }
/** /**
* Do the HTTP request * Prepare HTTP headers
* *
* @access public * @access private
* @param bool $follow_location Flag used when there is an open_basedir restriction * @return array
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
*/ */
public function doRequest($follow_location = true) private function prepareHeaders()
{ {
$request_headers = array('Connection: close'); $headers = array(
'Connection: close',
'User-Agent: '.$this->user_agent,
);
if ($this->etag) $request_headers[] = 'If-None-Match: '.$this->etag; if ($this->etag) {
if ($this->last_modified) $request_headers[] = 'If-Modified-Since: '.$this->last_modified; $headers[] = 'If-None-Match: '.$this->etag;
}
$ch = curl_init(); if ($this->last_modified) {
$headers[] = 'If-Modified-Since: '.$this->last_modified;
}
curl_setopt($ch, CURLOPT_URL, $this->url); return $headers;
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); }
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, ini_get('open_basedir') === '');
curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
curl_setopt($ch, CURLOPT_ENCODING, '');
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates...
curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody'));
curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders'));
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
/**
* Prepare curl proxy context
*
* @access private
* @return resource
*/
private function prepareProxyContext($ch)
{
if ($this->proxy_hostname) { if ($this->proxy_hostname) {
Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);
@ -144,6 +145,47 @@ class Curl extends Client
} }
} }
return $ch;
}
/**
* Prepare curl context
*
* Create a cURL handle for $this->url and configure the timeouts, the
* request headers (see prepareHeaders()), the redirection policy, the
* body/header callbacks, the cookie storage and finally the proxy
* options (see prepareProxyContext()).
*
* @access private
* @return resource cURL handle ready to be executed
*/
private function prepareContext()
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $this->url);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
// The same timeout value is used for the connection and for the whole transfer
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->prepareHeaders());
// Automatic redirections are enabled only when no open_basedir restriction is configured
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, ini_get('open_basedir') === '');
curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
// An empty string asks cURL to advertise every encoding it supports
curl_setopt($ch, CURLOPT_ENCODING, '');
// NOTE(review): peer certificate verification is disabled, any certificate is accepted
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates...
// The response body and headers are collected by the readBody() and readHeaders() callbacks
curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody'));
curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders'));
// NOTE(review): presumably enables cookie handling without a cookie file on disk - confirm
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
$ch = $this->prepareProxyContext($ch);
return $ch;
}
/**
* Execute curl context
*
* @access private
* @return resource
*/
private function executeContext()
{
$ch = $this->prepareContext();
curl_exec($ch); curl_exec($ch);
Logging::setMessage(get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME)); Logging::setMessage(get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME));
@ -153,44 +195,34 @@ class Curl extends Client
Logging::setMessage(get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)); Logging::setMessage(get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
if (curl_errno($ch)) { if (curl_errno($ch)) {
Logging::setMessage(get_called_class().' cURL error: '.curl_error($ch)); Logging::setMessage(get_called_class().' cURL error: '.curl_error($ch));
curl_close($ch); curl_close($ch);
return false; return false;
} }
curl_close($ch); curl_close($ch);
return true;
}
/**
* Do the HTTP request
*
* @access public
* @param bool $follow_location Flag used when there is an open_basedir restriction
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
*/
public function doRequest($follow_location = true)
{
if (! $this->executeContext()) {
return false;
}
list($status, $headers) = $this->parseHeaders(explode("\r\n", $this->headers[$this->headers_counter - 1])); list($status, $headers) = $this->parseHeaders(explode("\r\n", $this->headers[$this->headers_counter - 1]));
if ($follow_location && ini_get('open_basedir') !== '' && ($status == 301 || $status == 302)) { // When resticted with open_basedir
if ($this->needToHandleRedirection($follow_location, $status)) {
$nb_redirects = 0; return $this->handleRedirection($headers['Location']);
$this->url = $headers['Location'];
$this->body = '';
$this->body_length = 0;
$this->headers = array();
$this->headers_counter = 0;
while (true) {
$nb_redirects++;
if ($nb_redirects >= $this->max_redirects) return false;
$result = $this->doRequest(false);
if ($result['status'] == 301 || $result['status'] == 302) {
$this->url = $result['headers']['Location'];
$this->body = '';
$this->body_length = 0;
$this->headers = array();
$this->headers_counter = 0;
}
else {
return $result;
}
}
} }
return array( return array(
@ -199,4 +231,58 @@ class Curl extends Client
'headers' => $headers 'headers' => $headers
); );
} }
/**
 * Tell if a redirection must be followed manually
 *
 * This is the case only when the caller allows redirections, when an
 * open_basedir restriction is configured and when the status code is
 * 301 or 302.
 *
 * @access private
 * @param  boolean  $follow_location   Flag
 * @param  integer  $status            HTTP status code
 * @return boolean
 */
private function needToHandleRedirection($follow_location, $status)
{
    if (! $follow_location || ini_get('open_basedir') === '') {
        return false;
    }

    return $status == 301 || $status == 302;
}
/**
 * Handle manually redirections when there is an open base dir restriction
 *
 * Each redirection resets the response state of the client (body and
 * headers) before doing a new request on the redirected URL. The loop
 * stops when a non-redirect response is received, when a request fails
 * or when the maximum number of redirections is reached.
 *
 * @access private
 * @param  string  $location   Redirected URL
 * @return boolean|array       Final HTTP response, or false on failure or too many redirections
 */
private function handleRedirection($location)
{
    $nb_redirects = 0;

    while (true) {

        // Point the client to the new location and drop the previous response state
        // NOTE(review): the Location value is used as-is, confirm if relative URLs must be resolved
        $this->url = $location;
        $this->body = '';
        $this->body_length = 0;
        $this->headers = array();
        $this->headers_counter = 0;

        $nb_redirects++;

        if ($nb_redirects >= $this->max_redirects) {
            return false;
        }

        $result = $this->doRequest(false);

        // doRequest() returns false when the request fails: there is no status to read
        if (! is_array($result)) {
            return false;
        }

        $is_redirection = $result['status'] == 301 || $result['status'] == 302;

        // A redirection without Location header cannot be followed, return it as-is
        if (! $is_redirection || empty($result['headers']['Location'])) {
            return $result;
        }

        $location = $result['headers']['Location'];
    }
}
} }

View File

@ -14,14 +14,13 @@ use \PicoFeed\Client;
class Stream extends Client class Stream extends Client
{ {
/** /**
* Do the HTTP request * Prepare HTTP headers
* *
* @access public * @access private
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] * @return array
*/ */
public function doRequest() private function prepareHeaders()
{ {
// Prepare HTTP headers for the request
$headers = array( $headers = array(
'Connection: close', 'Connection: close',
'User-Agent: '.$this->user_agent, 'User-Agent: '.$this->user_agent,
@ -39,14 +38,27 @@ class Stream extends Client
$headers[] = 'If-Modified-Since: '.$this->last_modified; $headers[] = 'If-Modified-Since: '.$this->last_modified;
} }
// Create context if ($this->proxy_username) {
$context_options = array( $headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password);
}
return $headers;
}
/**
* Prepare stream context
*
* @access private
* @return array
*/
private function prepareContext()
{
$context = array(
'http' => array( 'http' => array(
'method' => 'GET', 'method' => 'GET',
'protocol_version' => 1.1, 'protocol_version' => 1.1,
'timeout' => $this->timeout, 'timeout' => $this->timeout,
'max_redirects' => $this->max_redirects, 'max_redirects' => $this->max_redirects,
'header' => implode("\r\n", $headers)
) )
); );
@ -54,31 +66,46 @@ class Stream extends Client
Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);
$context_options['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port; $context['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port;
$context_options['http']['request_fulluri'] = true; $context['http']['request_fulluri'] = true;
if ($this->proxy_username) { if ($this->proxy_username) {
Logging::setMessage(get_called_class().' Proxy credentials: Yes'); Logging::setMessage(get_called_class().' Proxy credentials: Yes');
$headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password);
$context_options['http']['header'] = implode("\r\n", $headers);
} }
else { else {
Logging::setMessage(get_called_class().' Proxy credentials: No'); Logging::setMessage(get_called_class().' Proxy credentials: No');
} }
} }
$context = stream_context_create($context_options); $context['http']['header'] = implode("\r\n", $this->prepareHeaders());
return $context;
}
/**
* Do the HTTP request
*
* @access public
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
*/
public function doRequest()
{
// Create context
$context = stream_context_create($this->prepareContext());
// Make HTTP request // Make HTTP request
$stream = @fopen($this->url, 'r', false, $context); $stream = @fopen($this->url, 'r', false, $context);
if (! is_resource($stream)) return false; if (! is_resource($stream)) {
return false;
}
// Get the entire body until the max size // Get the entire body until the max size
$body = stream_get_contents($stream, $this->max_body_size + 1); $body = stream_get_contents($stream, $this->max_body_size + 1);
// If the body size is too large abort everything // If the body size is too large abort everything
if (strlen($body) > $this->max_body_size) return false; if (strlen($body) > $this->max_body_size) {
return false;
}
// Get HTTP headers response // Get HTTP headers response
$metadata = stream_get_meta_data($stream); $metadata = stream_get_meta_data($stream);
@ -87,6 +114,23 @@ class Stream extends Client
fclose($stream); fclose($stream);
return array(
'status' => $status,
'body' => $this->decodeBody($body, $headers),
'headers' => $headers
);
}
/**
* Decode body response according to the HTTP headers
*
* @access public
* @param string $body Raw body
* @param array $headers HTTP headers
* @return string
*/
public function decodeBody($body, array $headers)
{
if (isset($headers['Transfer-Encoding']) && $headers['Transfer-Encoding'] === 'chunked') { if (isset($headers['Transfer-Encoding']) && $headers['Transfer-Encoding'] === 'chunked') {
$body = $this->decodeChunked($body); $body = $this->decodeChunked($body);
} }
@ -95,11 +139,7 @@ class Stream extends Client
$body = @gzdecode($body); $body = @gzdecode($body);
} }
return array( return $body;
'status' => $status,
'body' => $body,
'headers' => $headers
);
} }
/** /**

View File

@ -7,6 +7,52 @@ namespace PicoFeed;
* *
* @author Frederic Guillot * @author Frederic Guillot
* @package picofeed * @package picofeed
*
* @method \PicoFeed\Config setClientTimeout(integer $value)
* @method \PicoFeed\Config setClientUserAgent(string $value)
* @method \PicoFeed\Config setMaxRedirections(integer $value)
* @method \PicoFeed\Config setMaxBodySize(integer $value)
* @method \PicoFeed\Config setProxyHostname(string $value)
* @method \PicoFeed\Config setProxyPort(integer $value)
* @method \PicoFeed\Config setProxyUsername(string $value)
* @method \PicoFeed\Config setProxyPassword(string $value)
* @method \PicoFeed\Config setGrabberTimeout(integer $value)
* @method \PicoFeed\Config setGrabberUserAgent(string $value)
* @method \PicoFeed\Config setParserHashAlgo(string $value)
* @method \PicoFeed\Config setContentFiltering(boolean $value)
* @method \PicoFeed\Config setTimezone(string $value)
* @method \PicoFeed\Config setFilterIframeWhitelist(array $value)
* @method \PicoFeed\Config setFilterIntegerAttributes(array $value)
* @method \PicoFeed\Config setFilterAttributeOverrides(array $value)
* @method \PicoFeed\Config setFilterRequiredAttributes(array $value)
* @method \PicoFeed\Config setFilterMediaBlacklist(array $value)
* @method \PicoFeed\Config setFilterMediaAttributes(array $value)
* @method \PicoFeed\Config setFilterSchemeWhitelist(array $value)
* @method \PicoFeed\Config setFilterWhitelistedTags(array $value)
* @method \PicoFeed\Config setFilterBlacklistedTags(array $value)
*
* @method integer getClientTimeout()
* @method string getClientUserAgent()
* @method integer getMaxRedirections()
* @method integer getMaxBodySize()
* @method string getProxyHostname()
* @method integer getProxyPort()
* @method string getProxyUsername()
* @method string getProxyPassword()
* @method integer getGrabberTimeout()
* @method string getGrabberUserAgent()
* @method string getParserHashAlgo()
* @method boolean getContentFiltering(bool $default_value)
* @method string getTimezone()
* @method array getFilterIframeWhitelist(array $default_value)
* @method array getFilterIntegerAttributes(array $default_value)
* @method array getFilterAttributeOverrides(array $default_value)
* @method array getFilterRequiredAttributes(array $default_value)
* @method array getFilterMediaBlacklist(array $default_value)
* @method array getFilterMediaAttributes(array $default_value)
* @method array getFilterSchemeWhitelist(array $default_value)
* @method array getFilterWhitelistedTags(array $default_value)
* @method array getFilterBlacklistedTags(array $default_value)
*/ */
class Config class Config
{ {

View File

@ -58,23 +58,39 @@ class Export
$body = $xml->addChild('body'); $body = $xml->addChild('body');
foreach ($this->content as $feed) { foreach ($this->content as $category => $values) {
$valid = true; if (is_string($category)) {
$this->createCategory($body, $category, $values);
foreach ($this->required_fields as $field) {
if (! isset($feed[$field])) {
$valid = false;
break;
}
} }
else {
if (! $valid) { $this->createEntry($body, $values);
continue;
} }
}
$outline = $body->addChild('outline'); return $xml->asXML();
}
/**
* Create a feed entry
*
* @access public
* @param SimpleXMLElement $parent Parent Element
* @param array $feed Feed properties
*/
public function createEntry(SimpleXMLElement $parent, array $feed)
{
$valid = true;
foreach ($this->required_fields as $field) {
if (! isset($feed[$field])) {
$valid = false;
break;
}
}
if ($valid) {
$outline = $parent->addChild('outline');
$outline->addAttribute('xmlUrl', $feed['feed_url']); $outline->addAttribute('xmlUrl', $feed['feed_url']);
$outline->addAttribute('htmlUrl', $feed['site_url']); $outline->addAttribute('htmlUrl', $feed['site_url']);
$outline->addAttribute('title', $feed['title']); $outline->addAttribute('title', $feed['title']);
@ -83,7 +99,34 @@ class Export
$outline->addAttribute('type', 'rss'); $outline->addAttribute('type', 'rss');
$outline->addAttribute('version', 'RSS'); $outline->addAttribute('version', 'RSS');
} }
}
return $xml->asXML(); /**
* Create entries for a feed list
*
* @access public
* @param SimpleXMLElement $parent Parent Element
* @param array $feeds Feed list
*/
public function createEntries(SimpleXMLElement $parent, array $feeds)
{
    // Keys are irrelevant here: each feed is handed to createEntry() in order
    $feeds = array_values($feeds);
    $nb_feeds = count($feeds);

    for ($i = 0; $i < $nb_feeds; $i++) {
        $this->createEntry($parent, $feeds[$i]);
    }
}
/**
 * Create a category entry
 *
 * Add an outline element named after the category (attribute "text")
 * under the parent, then create the feed entries inside this outline.
 *
 * @access public
 * @param  SimpleXMLElement  $parent     Parent Element
 * @param  string            $category   Category
 * @param  array             $feeds      Feed list of the category
 */
public function createCategory(SimpleXMLElement $parent, $category, array $feeds)
{
    $category_outline = $parent->addChild('outline');
    $category_outline->addAttribute('text', $category);

    $this->createEntries($category_outline, $feeds);
}
} }

163
vendor/PicoFeed/Favicon.php vendored Normal file
View File

@ -0,0 +1,163 @@
<?php
namespace PicoFeed;
use DOMXpath;
/**
 * Favicon class
 *
 * Find the icon of a website: the icons declared in the HTML of the home
 * page are tried first, then the conventional /favicon.ico location.
 *
 * https://en.wikipedia.org/wiki/Favicon
 *
 * @author  Frederic Guillot
 * @package picofeed
 */
class Favicon
{
    /**
     * Config class instance
     *
     * @access private
     * @var \PicoFeed\Config
     */
    private $config = null;

    /**
     * Icon content
     *
     * @access private
     * @var string
     */
    private $content = '';

    /**
     * Constructor
     *
     * @access public
     * @param  \PicoFeed\Config   $config   Config class instance (a default instance is created when omitted)
     */
    public function __construct(Config $config = null)
    {
        $this->config = $config ?: new Config;
    }

    /**
     * Get the icon file content (available only after the download)
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Download a resource
     *
     * @access public
     * @param  string   $url   URL
     * @return string          Resource content, or an empty string when the request fails or the resource is not found
     */
    public function download($url)
    {
        Logging::setMessage(get_called_class().' Download => '.$url);

        $client = Client::getInstance();
        $client->setConfig($this->config);

        if ($client->execute($url) && ! $client->isNotFound()) {
            return $client->getContent();
        }

        return '';
    }

    /**
     * Check if a remote file exists
     *
     * The resource is downloaded to perform the check.
     *
     * @access public
     * @param  string   $url   URL
     * @return boolean
     */
    public function exists($url)
    {
        return $this->download($url) !== '';
    }

    /**
     * Get the icon link for a website
     *
     * The content of the icon found is available with getContent().
     *
     * @access public
     * @param  string   $website_link   URL
     * @return string                   Absolute icon url, or an empty string when no icon can be downloaded
     */
    public function find($website_link)
    {
        $website = new Url($website_link);

        // Icons declared in the page first, the conventional location as last resort
        $icons = $this->extract($this->download($website->getBaseUrl('/')));
        $icons[] = $website->getBaseUrl('/favicon.ico');

        $visited = array();

        foreach ($icons as $icon_link) {

            $icon_link = $this->convertLink($website, new Url($icon_link));

            // Do not download again a link that already failed
            if (isset($visited[$icon_link])) {
                continue;
            }

            $visited[$icon_link] = true;
            $this->content = $this->download($icon_link);

            if ($this->content !== '') {
                return $icon_link;
            }
        }

        return '';
    }

    /**
     * Convert icon links to absolute url
     *
     * @access public
     * @param  \PicoFeed\Url   $website   Website url
     * @param  \PicoFeed\Url   $icon      Icon url
     * @return string
     */
    public function convertLink(Url $website, Url $icon)
    {
        $base_url = '';

        if ($icon->isRelativeUrl()) {
            $base_url = $website->getBaseUrl();
        }
        else if ($icon->isProtocolRelative()) {
            $icon->setScheme($website->getScheme());
        }

        return $icon->getAbsoluteUrl($base_url);
    }

    /**
     * Extract the icon links from the HTML
     *
     * Links whose "rel" attribute contains "icon" are kept, except the ones
     * that contain "apple". The match is case-insensitive (rel="Shortcut Icon"
     * is accepted) and links without a usable "href" are ignored.
     *
     * @access public
     * @param  string   $html   HTML
     * @return array            List of icon links as written in the document
     */
    public function extract($html)
    {
        $icons = array();

        if (empty($html)) {
            return $icons;
        }

        $dom = XmlParser::getHtmlDocument($html);
        $xpath = new DOMXpath($dom);

        // XPath 1.0 has no lower-case function: translate() is the portable way
        $rel = "translate(@rel, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')";
        $elements = $xpath->query("//link[contains(".$rel.", 'icon') and not(contains(".$rel.", 'apple'))]");

        if ($elements === false) {
            return $icons;
        }

        for ($i = 0; $i < $elements->length; $i++) {

            $href = trim($elements->item($i)->getAttribute('href'));

            // An empty href would resolve to the website url itself
            if ($href !== '') {
                $icons[] = $href;
            }
        }

        return $icons;
    }
}

View File

@ -35,7 +35,15 @@ class Feed
public $title = ''; public $title = '';
/** /**
* Item url * Feed description
*
* @access public
* @var string
*/
public $description = '';
/**
* Feed url
* *
* @access public * @access public
* @var string * @var string
@ -43,7 +51,7 @@ class Feed
public $url = ''; public $url = '';
/** /**
* Item date * Feed date
* *
* @access public * @access public
* @var integer * @var integer
@ -51,13 +59,21 @@ class Feed
public $date = 0; public $date = 0;
/** /**
* Item language * Feed language
* *
* @access public * @access public
* @var string * @var string
*/ */
public $language = ''; public $language = '';
/**
* Feed logo URL (not the same as icon)
*
* @access public
* @var string
*/
public $logo = '';
/** /**
* Return feed information * Return feed information
* *
@ -68,7 +84,7 @@ class Feed
{ {
$output = ''; $output = '';
foreach (array('id', 'title', 'url', 'date', 'language') as $property) { foreach (array('id', 'title', 'url', 'date', 'language', 'description', 'logo') as $property) {
$output .= 'Feed::'.$property.' = '.$this->$property.PHP_EOL; $output .= 'Feed::'.$property.' = '.$this->$property.PHP_EOL;
} }
@ -93,6 +109,28 @@ class Feed
return $this->title; return $this->title;
} }
/**
* Get the feed description
*
* @access public
* @return string
*/
public function getDescription()
{
return $this->description;
}
/**
* Get the feed logo url (not the same as the icon)
*
* @access public
* @return string
*/
public function getLogo()
{
return $this->logo;
}
/** /**
* Get url * Get url
* *

View File

@ -2,7 +2,7 @@
namespace PicoFeed; namespace PicoFeed;
use DOMDocument; use PicoFeed\Filter\Html;
/** /**
* Filter class * Filter class
@ -13,436 +13,18 @@ use DOMDocument;
class Filter class Filter
{ {
/** /**
* Config object * Get the Html filter instance
*
* @access private
* @var \PicoFeed\Config
*/
private $config = null;
/**
* Filtered XML data
*
* @access private
* @var string
*/
private $data = '';
/**
* Site URL (used to build absolute URL)
*
* @access private
* @var string
*/
private $url = '';
/**
* Unfiltered XML data
*
* @access private
* @var string
*/
private $input = '';
/**
* List of empty tags
*
* @access private
* @var array
*/
private $empty_tags = array();
/**
* Flag to remove the content of a tag
*
* @access private
* @var boolean
*/
private $strip_content = false;
/**
* Flag to remember if the current payload is a source code <pre/>
*
* @access private
* @var boolean
*/
private $is_code = false;
/**
* Tags and attribute whitelist
*
* @access private
* @var array
*/
private $whitelist_tags = array(
'audio' => array('controls', 'src'),
'video' => array('poster', 'controls', 'height', 'width', 'src'),
'source' => array('src', 'type'),
'dt' => array(),
'dd' => array(),
'dl' => array(),
'table' => array(),
'caption' => array(),
'tr' => array(),
'th' => array(),
'td' => array(),
'tbody' => array(),
'thead' => array(),
'h2' => array(),
'h3' => array(),
'h4' => array(),
'h5' => array(),
'h6' => array(),
'strong' => array(),
'em' => array(),
'code' => array(),
'pre' => array(),
'blockquote' => array(),
'p' => array(),
'ul' => array(),
'li' => array(),
'ol' => array(),
'br' => array(),
'del' => array(),
'a' => array('href'),
'img' => array('src', 'title', 'alt'),
'figure' => array(),
'figcaption' => array(),
'cite' => array(),
'time' => array('datetime'),
'abbr' => array('title'),
'iframe' => array('width', 'height', 'frameborder', 'src'),
'q' => array('cite')
);
/**
* Tags blacklist, strip the content of those tags
*
* @access private
* @var array
*/
private $blacklisted_tags = array(
'script'
);
/**
* Scheme whitelist
* For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
*
* @access private
* @var array
*/
private $scheme_whitelist = array(
'//',
'data:image/png;base64,',
'data:image/gif;base64,',
'data:image/jpg;base64,',
'bitcoin:',
'callto:',
'ed2k://',
'facetime://',
'feed:',
'ftp://',
'geo:',
'git://',
'http://',
'https://',
'irc://',
'irc6://',
'ircs://',
'jabber:',
'magnet:',
'mailto:',
'nntp://',
'rtmp://',
'sftp://',
'sip:',
'sips:',
'skype:',
'smb://',
'sms:',
'spotify:',
'ssh:',
'steam:',
'svn://',
'tel:',
);
/**
* Attributes used for external resources
*
* @access private
* @var array
*/
private $media_attributes = array(
'src',
'href',
'poster',
);
/**
* Blacklisted resources
*
* @access private
* @var array
*/
private $media_blacklist = array(
'feeds.feedburner.com',
'share.feedsportal.com',
'da.feedsportal.com',
'rss.feedsportal.com',
'res.feedsportal.com',
'res1.feedsportal.com',
'res2.feedsportal.com',
'res3.feedsportal.com',
'pi.feedsportal.com',
'rss.nytimes.com',
'feeds.wordpress.com',
'stats.wordpress.com',
'rss.cnn.com',
'twitter.com/home?status=',
'twitter.com/share',
'twitter_icon_large.png',
'www.facebook.com/sharer.php',
'facebook_icon_large.png',
'plus.google.com/share',
'www.gstatic.com/images/icons/gplus-16.png',
'www.gstatic.com/images/icons/gplus-32.png',
'www.gstatic.com/images/icons/gplus-64.png',
);
/**
* Mandatory attributes for specified tags
*
* @access private
* @var array
*/
private $required_attributes = array(
'a' => array('href'),
'img' => array('src'),
'iframe' => array('src'),
'audio' => array('src'),
'source' => array('src'),
);
/**
* Add attributes to specified tags
*
* @access private
* @var array
*/
private $add_attributes = array(
'a' => 'rel="noreferrer" target="_blank"'
);
/**
* Attributes that must be integer
*
* @access private
* @var array
*/
private $integer_attributes = array(
'width',
'height',
'frameborder',
);
/**
* Iframe source whitelist, everything else is ignored
*
* @access private
* @var array
*/
private $iframe_whitelist = array(
'//www.youtube.com',
'http://www.youtube.com',
'https://www.youtube.com',
'http://player.vimeo.com',
'https://player.vimeo.com',
'http://www.dailymotion.com',
'https://www.dailymotion.com',
);
/**
* Initialize the filter, all inputs data must be encoded in UTF-8 before
* *
* @static
* @access public * @access public
* @param string $data XML content * @param string $html HTML content
* @param string $site_url Site URL (used to build absolute URL) * @param string $website Site URL (used to build absolute URL)
* @return PicoFeed\Filter\Html
*/ */
public function __construct($data, $site_url) public static function html($html, $website)
{ {
$this->url = $site_url; $filter = new Html($html, $website);
return $filter;
libxml_use_internal_errors(true);
// Convert bad formatted documents to XML
$dom = new DOMDocument;
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$data);
$this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
}
/**
* Run tags/attributes filtering
*
* @access public
* @return string
*/
public function execute()
{
$parser = xml_parser_create();
xml_set_object($parser, $this);
xml_set_element_handler($parser, 'startTag', 'endTag');
xml_set_character_data_handler($parser, 'dataTag');
xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
xml_parse($parser, $this->input, true); // We ignore parsing error (for old libxml)
xml_parser_free($parser);
$this->data = $this->removeEmptyTags($this->data);
$this->data = $this->removeMultipleTags($this->data);
return trim($this->data);
}
/**
* Parse opening tag
*
* @access public
* @param resource $parser XML parser
* @param string $name Tag name
* @param array $attributes Tag attributes
*/
public function startTag($parser, $name, $attributes)
{
$empty_tag = false;
$this->strip_content = false;
if ($this->is_code === false && $name === 'pre') $this->is_code = true;
if ($this->isPixelTracker($name, $attributes)) {
$empty_tag = true;
}
else if ($this->isAllowedTag($name)) {
$attr_data = '';
$used_attributes = array();
foreach ($attributes as $attribute => $value) {
if ($value != '' && $this->isAllowedAttribute($name, $attribute)) {
if ($this->isResource($attribute)) {
if ($name === 'iframe') {
if ($this->isAllowedIframeResource($value)) {
$attr_data .= ' '.$attribute.'="'.$this->escape($value).'"';
$used_attributes[] = $attribute;
}
}
else if ($this->isRelativePath($value)) {
$attr_data .= ' '.$attribute.'="'.$this->escape($this->getAbsoluteUrl($value, $this->url)).'"';
$used_attributes[] = $attribute;
}
else if ($this->isAllowedProtocol($value) && ! $this->isBlacklistedMedia($value)) {
if ($attribute == 'src' &&
isset($attributes['data-src']) &&
$this->isAllowedProtocol($attributes['data-src']) &&
! $this->isBlacklistedMedia($attributes['data-src'])) {
$value = $attributes['data-src'];
}
// Replace protocol-relative url // by http://
if (substr($value, 0, 2) === '//') $value = 'http:'.$value;
$attr_data .= ' '.$attribute.'="'.$this->escape($value).'"';
$used_attributes[] = $attribute;
}
}
else if ($this->validateAttributeValue($attribute, $value)) {
$attr_data .= ' '.$attribute.'="'.$this->escape($value).'"';
$used_attributes[] = $attribute;
}
}
}
// Check for required attributes
if (isset($this->required_attributes[$name])) {
foreach ($this->required_attributes[$name] as $required_attribute) {
if (! in_array($required_attribute, $used_attributes)) {
$empty_tag = true;
break;
}
}
}
if (! $empty_tag) {
$this->data .= '<'.$name.$attr_data;
// Add custom attributes
if (isset($this->add_attributes[$name])) {
$this->data .= ' '.$this->add_attributes[$name].' ';
}
// If img or br, we don't close it here
if ($name !== 'img' && $name !== 'br') $this->data .= '>';
}
}
if (in_array($name, $this->blacklisted_tags)) {
$this->strip_content = true;
}
$this->empty_tags[] = $empty_tag;
}
/**
* Parse closing tag
*
* @access public
* @param resource $parser XML parser
* @param string $name Tag name
*/
public function endTag($parser, $name)
{
if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) {
$this->data .= $name !== 'img' && $name !== 'br' ? '</'.$name.'>' : '/>';
}
if ($this->is_code && $name === 'pre') $this->is_code = false;
}
/**
* Parse tag content
*
* @access public
* @param resource $parser XML parser
* @param string $content Tag content
*/
public function dataTag($parser, $content)
{
$content = str_replace("\xc2\xa0", ' ', $content); // Replace &nbsp; with normal space
// Issue with Cyrillic characters
// Replace mutliple space by a single one
// if (! $this->is_code) {
// $content = preg_replace('!\s+!', ' ', $content);
// }
if (! $this->strip_content) {
$this->data .= $this->escape($content);
}
} }
/** /**
@ -454,222 +36,7 @@ class Filter
*/ */
public static function escape($content) public static function escape($content)
{ {
return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false); return @htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
}
/**
* Get the absolute url for a relative link
*
* @access public
* @param string $path Relative path
* @param string $url Site base url
* @return string
*/
public static function getAbsoluteUrl($path, $url)
{
$components = parse_url($url);
if (! isset($components['scheme'])) $components['scheme'] = 'http';
if (! isset($components['host'])) {
if ($url) {
$components['host'] = $url;
$components['path'] = '/';
}
else {
return '';
}
}
if (! strlen($path)) return $url;
if ($path{0} === '/') {
// Absolute path
return $components['scheme'].'://'.$components['host'].$path;
}
else {
// Relative path
$url_path = isset($components['path']) && ! empty($components['path']) ? $components['path'] : '/';
$length = strlen($url_path);
if ($length > 1 && $url_path{$length - 1} !== '/') {
$url_path = dirname($url_path).'/';
}
if (substr($path, 0, 2) === './') {
$path = substr($path, 2);
}
return $components['scheme'].'://'.$components['host'].$url_path.$path;
}
}
/**
* Check if an url is relative
*
* @access public
* @param string $value Attribute value
* @return boolean
*/
public static function isRelativePath($value)
{
if (strpos($value, 'data:') === 0) return false;
return strpos($value, '://') === false && strpos($value, '//') !== 0;
}
/**
* Check if a tag is on the whitelist
*
* @access public
* @param string $name Tag name
* @return boolean
*/
public function isAllowedTag($name)
{
return isset($this->whitelist_tags[$name]);
}
/**
* Check if an attribute is allowed for a given tag
*
* @access public
* @param string $tag Tag name
* @param array $attribute Attribute name
* @return boolean
*/
public function isAllowedAttribute($tag, $attribute)
{
return in_array($attribute, $this->whitelist_tags[$tag]);
}
/**
* Check if an attribute name is an external resource
*
* @access public
* @param string $data Attribute name
* @return boolean
*/
public function isResource($attribute)
{
return in_array($attribute, $this->media_attributes);
}
/**
* Check if an iframe url is allowed
*
* @access public
* @param string $value Attribute value
* @return boolean
*/
public function isAllowedIframeResource($value)
{
foreach ($this->iframe_whitelist as $url) {
if (strpos($value, $url) === 0) {
return true;
}
}
return false;
}
/**
* Detect if the protocol is allowed or not
*
* @access public
* @param string $value Attribute value
* @return boolean
*/
public function isAllowedProtocol($value)
{
foreach ($this->scheme_whitelist as $protocol) {
if (strpos($value, $protocol) === 0) {
return true;
}
}
return false;
}
/**
* Detect if an url is blacklisted
*
* @access public
* @param string $resouce Attribute value (URL)
* @return boolean
*/
public function isBlacklistedMedia($resource)
{
foreach ($this->media_blacklist as $name) {
if (strpos($resource, $name) !== false) {
return true;
}
}
return false;
}
/**
* Detect if an image tag is a pixel tracker
*
* @access public
* @param string $tag Tag name
* @param array $attributes Tag attributes
* @return boolean
*/
public function isPixelTracker($tag, array $attributes)
{
return $tag === 'img' &&
isset($attributes['height']) && isset($attributes['width']) &&
$attributes['height'] == 1 && $attributes['width'] == 1;
}
/**
* Check if an attribute value is integer
*
* @access public
* @param string $attribute Attribute name
* @param string $value Attribute value
* @return boolean
*/
public function validateAttributeValue($attribute, $value)
{
if (in_array($attribute, $this->integer_attributes)) {
return ctype_digit($value);
}
return true;
}
/**
* Replace <br/><br/> by only one
*
* @access public
* @param string $data Input data
* @return string
*/
public function removeMultipleTags($data)
{
return preg_replace("/(<br\s*\/?>\s*)+/", "<br/>", $data);
}
/**
* Remove empty tags
*
* @access public
* @param string $data Input data
* @return string
*/
public function removeEmptyTags($data)
{
return preg_replace('/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', '', $data);
} }
/** /**
@ -734,145 +101,41 @@ class Filter
} }
/** /**
* Set whitelisted tags adn attributes for each tag * Trim whitespace from the begining, the end and inside a string and don't break utf-8 string
* *
* @static
* @access public * @access public
* @param array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']] * @param string $value Raw data
* @return \PicoFeed\Filter * @return string Normalized data
*/ */
public function setWhitelistedTags(array $values) public static function stripWhiteSpace($value)
{ {
$this->whitelist_tags = $values ?: $this->whitelist_tags; $value = str_replace("\r", "", $value);
return $this; $value = str_replace("\t", "", $value);
$value = str_replace("\n", "", $value);
return trim($value);
} }
/** /**
* Set blacklisted tags * Dirty quickfixes before XML parsing
* *
* @static
* @access public * @access public
* @param array $values List of tags: ['video', 'img'] * @param string $data Raw data
* @return \PicoFeed\Filter * @return string Normalized data
*/ */
public function setBlacklistedTags(array $values) public static function normalizeData($data)
{ {
$this->blacklisted_tags = $values ?: $this->blacklisted_tags; $invalid_chars = array(
return $this; "\x10",
} "\xc3\x20",
"&#x1F;",
);
/** foreach ($invalid_chars as $needle) {
* Set scheme whitelist $data = str_replace($needle, '', $data);
*
* @access public
* @param array $values List of scheme: ['http://', 'ftp://']
* @return \PicoFeed\Filter
*/
public function setSchemeWhitelist(array $values)
{
$this->scheme_whitelist = $values ?: $this->scheme_whitelist;
return $this;
}
/**
* Set media attributes (used to load external resources)
*
* @access public
* @param array $values List of values: ['src', 'href']
* @return \PicoFeed\Filter
*/
public function setMediaAttributes(array $values)
{
$this->media_attributes = $values ?: $this->media_attributes;
return $this;
}
/**
* Set blacklisted external resources
*
* @access public
* @param array $values List of tags: ['http://google.com/', '...']
* @return \PicoFeed\Filter
*/
public function setMediaBlacklist(array $values)
{
$this->media_blacklist = $values ?: $this->media_blacklist;
return $this;
}
/**
* Set mandatory attributes for whitelisted tags
*
* @access public
* @param array $values List of tags: ['img' => 'src']
* @return \PicoFeed\Filter
*/
public function setRequiredAttributes(array $values)
{
$this->required_attributes = $values ?: $this->required_attributes;
return $this;
}
/**
* Set attributes to automatically to specific tags
*
* @access public
* @param array $values List of tags: ['a' => 'target="_blank"']
* @return \PicoFeed\Filter
*/
public function setAttributeOverrides(array $values)
{
$this->add_attributes = $values ?: $this->add_attributes;
return $this;
}
/**
* Set attributes that must be an integer
*
* @access public
* @param array $values List of tags: ['width', 'height']
* @return \PicoFeed\Filter
*/
public function setIntegerAttributes(array $values)
{
$this->integer_attributes = $values ?: $this->integer_attributes;
return $this;
}
/**
* Set allowed iframe resources
*
* @access public
* @param array $values List of tags: ['http://www.youtube.com']
* @return \PicoFeed\Filter
*/
public function setIframeWhitelist(array $values)
{
$this->iframe_whitelist = $values ?: $this->iframe_whitelist;
return $this;
}
/**
* Set config object
*
* @access public
* @param \PicoFeed\Config $config Config instance
* @return \PicoFeed\Parse
*/
public function setConfig($config)
{
$this->config = $config;
if ($this->config !== null) {
$this->setIframeWhitelist($this->config->getFilterIframeWhitelist(array()));
$this->setIntegerAttributes($this->config->getFilterIntegerAttributes(array()));
$this->setAttributeOverrides($this->config->getFilterAttributeOverrides(array()));
$this->setRequiredAttributes($this->config->getFilterRequiredAttributes(array()));
$this->setMediaBlacklist($this->config->getFilterMediaBlacklist(array()));
$this->setMediaAttributes($this->config->getFilterMediaAttributes(array()));
$this->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array()));
$this->setBlacklistedTags($this->config->getFilterBlacklistedTags(array()));
$this->setWhitelistedTags($this->config->getFilterWhitelistedTags(array()));
} }
return $this; return $data;
} }
} }

590
vendor/PicoFeed/Filter/Attribute.php vendored Normal file
View File

@ -0,0 +1,590 @@
<?php
namespace PicoFeed\Filter;
use \PicoFeed\Url;
use \PicoFeed\Filter;
/**
* Attribute Filter class
*
* @author Frederic Guillot
* @package filter
*/
class Attribute
{
    /**
     * Attribute whitelist, indexed by tag name
     *
     * Any attribute that is not listed for its tag is dropped.
     *
     * @access private
     * @var array
     */
    private $attribute_whitelist = array(
        'audio' => array('controls', 'src'),
        'video' => array('poster', 'controls', 'height', 'width', 'src'),
        'source' => array('src', 'type'),
        'dt' => array(),
        'dd' => array(),
        'dl' => array(),
        'table' => array(),
        'caption' => array(),
        'tr' => array(),
        'th' => array(),
        'td' => array(),
        'tbody' => array(),
        'thead' => array(),
        'h2' => array(),
        'h3' => array(),
        'h4' => array(),
        'h5' => array(),
        'h6' => array(),
        'strong' => array(),
        'em' => array(),
        'code' => array(),
        'pre' => array(),
        'blockquote' => array(),
        'p' => array(),
        'ul' => array(),
        'li' => array(),
        'ol' => array(),
        'br' => array(),
        'del' => array(),
        'a' => array('href'),
        'img' => array('src', 'title', 'alt'),
        'figure' => array(),
        'figcaption' => array(),
        'cite' => array(),
        'time' => array('datetime'),
        'abbr' => array('title'),
        'iframe' => array('width', 'height', 'frameborder', 'src'),
        'q' => array('cite')
    );

    /**
     * Scheme whitelist
     *
     * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
     *
     * @access private
     * @var array
     */
    private $scheme_whitelist = array(
        'bitcoin:',
        'callto:',
        'ed2k://',
        'facetime://',
        'feed:',
        'ftp://',
        'geo:',
        'git://',
        'http://',
        'https://',
        'irc://',
        'irc6://',
        'ircs://',
        'jabber:',
        'magnet:',
        'mailto:',
        'nntp://',
        'rtmp://',
        'sftp://',
        'sip:',
        'sips:',
        'skype:',
        'smb://',
        'sms:',
        'spotify:',
        'ssh:',
        'steam:',
        'svn://',
        'tel:',
    );

    /**
     * Iframe source whitelist, everything else is ignored
     *
     * @access private
     * @var array
     */
    private $iframe_whitelist = array(
        'http://www.youtube.com',
        'https://www.youtube.com',
        'http://player.vimeo.com',
        'https://player.vimeo.com',
        'http://www.dailymotion.com',
        'https://www.dailymotion.com',
    );

    /**
     * Blacklisted resources (matched as substrings of the attribute value)
     *
     * @access private
     * @var array
     */
    private $media_blacklist = array(
        'api.flattr.com',
        'feeds.feedburner.com',
        'share.feedsportal.com',
        'da.feedsportal.com',
        'rss.feedsportal.com',
        'res.feedsportal.com',
        'res1.feedsportal.com',
        'res2.feedsportal.com',
        'res3.feedsportal.com',
        'pi.feedsportal.com',
        'rss.nytimes.com',
        'feeds.wordpress.com',
        'stats.wordpress.com',
        'rss.cnn.com',
        'twitter.com/home?status=',
        'twitter.com/share',
        'twitter_icon_large.png',
        'www.facebook.com/sharer.php',
        'facebook_icon_large.png',
        'plus.google.com/share',
        'www.gstatic.com/images/icons/gplus-16.png',
        'www.gstatic.com/images/icons/gplus-32.png',
        'www.gstatic.com/images/icons/gplus-64.png',
    );

    /**
     * Attributes used for external resources
     *
     * @access private
     * @var array
     */
    private $media_attributes = array(
        'src',
        'href',
        'poster',
    );

    /**
     * Attributes that must be integer
     *
     * @access private
     * @var array
     */
    private $integer_attributes = array(
        'width',
        'height',
        'frameborder',
    );

    /**
     * Mandatory attributes for specified tags
     *
     * @access private
     * @var array
     */
    private $required_attributes = array(
        'a' => array('href'),
        'img' => array('src'),
        'iframe' => array('src'),
        'audio' => array('src'),
        'source' => array('src'),
    );

    /**
     * Attributes forced on specified tags: tag name => (attribute name => value)
     *
     * @access private
     * @var array
     */
    private $add_attributes = array(
        'a' => array('rel' => 'noreferrer', 'target' => '_blank')
    );

    /**
     * Names of the filter methods to apply, in this order, to every attribute
     *
     * The URL resolution step runs before the iframe, blacklist and protocol
     * checks, so those three see the value returned by Url::resolve().
     *
     * @access private
     * @var array
     */
    private $filters = array(
        'filterEmptyAttribute',
        'filterAllowedAttribute',
        'filterIntegerAttribute',
        'filterAbsoluteUrlAttribute',
        'filterIframeAttribute',
        'filterBlacklistResourceAttribute',
        'filterProtocolUrlAttribute',
    );

    /**
     * Website url, handed to Url::resolve() for every resource attribute
     *
     * @access private
     * @var \PicoFeed\Url
     */
    private $website = null;

    /**
     * Constructor
     *
     * @access public
     * @param  \PicoFeed\Url    $website    Website url instance
     */
    public function __construct(Url $website)
    {
        $this->website = $website;
    }

    /**
     * Apply filters to the attributes list
     *
     * Each attribute goes through the filter chain; the first filter that
     * returns false drops the attribute. A filter may rewrite the value
     * (it is passed by reference to filters that declare it that way).
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  array     $attributes    Attributes dictionary
     * @return array                    Filtered attributes, original order preserved
     */
    public function filter($tag, array $attributes)
    {
        $filtered = array();

        // A plain (non-reference) loop variable avoids leaving a dangling
        // reference into the returned array
        foreach ($attributes as $attribute => $value) {

            $keep = true;

            foreach ($this->filters as $filter) {
                if (! $this->$filter($tag, $attribute, $value)) {
                    $keep = false;
                    break;
                }
            }

            if ($keep) {
                $filtered[$attribute] = $value;
            }
        }

        return $filtered;
    }

    /**
     * Return true if the value is not empty (remove empty attributes)
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attribute     Attribute name
     * @param  string    $value         Attribute value
     * @return boolean
     */
    public function filterEmptyAttribute($tag, $attribute, $value)
    {
        return $value !== '';
    }

    /**
     * Return true if the attribute is whitelisted for the tag (remove not allowed attributes)
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attribute     Attribute name
     * @param  string    $value         Attribute value
     * @return boolean
     */
    public function filterAllowedAttribute($tag, $attribute, $value)
    {
        return isset($this->attribute_whitelist[$tag]) && in_array($attribute, $this->attribute_whitelist[$tag], true);
    }

    /**
     * Return true unless an integer-only attribute holds a non-digit value
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attribute     Attribute name
     * @param  string    $value         Attribute value
     * @return boolean
     */
    public function filterIntegerAttribute($tag, $attribute, $value)
    {
        if (in_array($attribute, $this->integer_attributes, true)) {
            return ctype_digit($value);
        }

        return true;
    }

    /**
     * Return true if the iframe source is allowed (remove not allowed iframe)
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attribute     Attribute name
     * @param  string    $value         Attribute value
     * @return boolean
     */
    public function filterIframeAttribute($tag, $attribute, $value)
    {
        if ($tag !== 'iframe' || $attribute !== 'src') {
            return true;
        }

        foreach ($this->iframe_whitelist as $url) {
            if ($this->hasUrlPrefix($value, $url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Return true if the resource is not blacklisted (remove blacklisted resource attributes)
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attribute     Attribute name
     * @param  string    $value         Attribute value
     * @return boolean
     */
    public function filterBlacklistResourceAttribute($tag, $attribute, $value)
    {
        if ($this->isResource($attribute) && $this->isBlacklistedMedia($value)) {
            return false;
        }

        return true;
    }

    /**
     * Rewrite resource attributes with the value returned by Url::resolve()
     *
     * This filter never rejects an attribute, it only rewrites the value.
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attribute     Attribute name
     * @param  string    $value         Attribute value (modified in place)
     * @return boolean                  Always true
     */
    public function filterAbsoluteUrlAttribute($tag, $attribute, &$value)
    {
        if ($this->isResource($attribute)) {
            $value = Url::resolve($value, $this->website);
        }

        return true;
    }

    /**
     * Return true if the scheme is authorized
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attribute     Attribute name
     * @param  string    $value         Attribute value
     * @return boolean
     */
    public function filterProtocolUrlAttribute($tag, $attribute, $value)
    {
        if ($this->isResource($attribute) && ! $this->isAllowedProtocol($value)) {
            return false;
        }

        return true;
    }

    /**
     * Automatically add/override some attributes for specific tags
     *
     * Forced values replace a value already present for the same attribute,
     * new attributes are appended after the existing ones.
     *
     * @access public
     * @param  string    $tag            Tag name
     * @param  array     $attributes     Attributes list
     * @return array
     */
    public function addAttributes($tag, array $attributes)
    {
        // Skip anything that is not a dictionary (a non-array value here
        // would trigger a fatal error)
        if (isset($this->add_attributes[$tag]) && is_array($this->add_attributes[$tag])) {
            // array_merge() lets the forced values win on key collision,
            // the array union operator would keep the incoming value
            $attributes = array_merge($attributes, $this->add_attributes[$tag]);
        }

        return $attributes;
    }

    /**
     * Return true if all required attributes are present
     *
     * @access public
     * @param  string    $tag            Tag name
     * @param  array     $attributes     Attributes list
     * @return boolean
     */
    public function hasRequiredAttributes($tag, array $attributes)
    {
        if (isset($this->required_attributes[$tag])) {

            foreach ($this->required_attributes[$tag] as $attribute) {
                if (! isset($attributes[$attribute])) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Check if an attribute name is an external resource
     *
     * @access public
     * @param  string    $attribute    Attribute name
     * @return boolean
     */
    public function isResource($attribute)
    {
        return in_array($attribute, $this->media_attributes, true);
    }

    /**
     * Detect if the protocol is allowed or not
     *
     * @access public
     * @param  string    $value    Attribute value
     * @return boolean
     */
    public function isAllowedProtocol($value)
    {
        foreach ($this->scheme_whitelist as $protocol) {
            if (strpos($value, $protocol) === 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Detect if an url is blacklisted
     *
     * @access public
     * @param  string    $resource    Attribute value (URL)
     * @return boolean
     */
    public function isBlacklistedMedia($resource)
    {
        foreach ($this->media_blacklist as $name) {
            if (strpos($resource, $name) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Convert the attribute list to html
     *
     * Only the values are escaped; attribute names are emitted as given.
     *
     * @access public
     * @param  array    $attributes    Attributes
     * @return string
     */
    public function toHtml(array $attributes)
    {
        $html = array();

        foreach ($attributes as $attribute => $value) {
            $html[] = sprintf('%s="%s"', $attribute, Filter::escape($value));
        }

        return implode(' ', $html);
    }

    /**
     * Set whitelisted tags and attributes for each tag
     *
     * An empty array keeps the current whitelist.
     *
     * @access public
     * @param  array   $values   List of tags: ['video' => ['src', 'cover'], 'img' => ['src']]
     * @return \PicoFeed\Filter\Attribute
     */
    public function setWhitelistedAttributes(array $values)
    {
        $this->attribute_whitelist = $values ?: $this->attribute_whitelist;
        return $this;
    }

    /**
     * Set scheme whitelist
     *
     * An empty array keeps the current whitelist.
     *
     * @access public
     * @param  array   $values   List of scheme: ['http://', 'ftp://']
     * @return \PicoFeed\Filter\Attribute
     */
    public function setSchemeWhitelist(array $values)
    {
        $this->scheme_whitelist = $values ?: $this->scheme_whitelist;
        return $this;
    }

    /**
     * Set media attributes (used to load external resources)
     *
     * An empty array keeps the current list.
     *
     * @access public
     * @param  array   $values   List of values: ['src', 'href']
     * @return \PicoFeed\Filter\Attribute
     */
    public function setMediaAttributes(array $values)
    {
        $this->media_attributes = $values ?: $this->media_attributes;
        return $this;
    }

    /**
     * Set blacklisted external resources
     *
     * An empty array keeps the current blacklist.
     *
     * @access public
     * @param  array   $values   List of URL fragments: ['http://google.com/', '...']
     * @return \PicoFeed\Filter\Attribute
     */
    public function setMediaBlacklist(array $values)
    {
        $this->media_blacklist = $values ?: $this->media_blacklist;
        return $this;
    }

    /**
     * Set mandatory attributes for whitelisted tags
     *
     * An empty array keeps the current list.
     *
     * @access public
     * @param  array   $values   List of tags: ['img' => ['src']]
     * @return \PicoFeed\Filter\Attribute
     */
    public function setRequiredAttributes(array $values)
    {
        $this->required_attributes = $values ?: $this->required_attributes;
        return $this;
    }

    /**
     * Set attributes to add automatically to specific tags
     *
     * An empty array keeps the current list.
     *
     * @access public
     * @param  array   $values   List of tags: ['a' => ['target' => '_blank']]
     * @return \PicoFeed\Filter\Attribute
     */
    public function setAttributeOverrides(array $values)
    {
        $this->add_attributes = $values ?: $this->add_attributes;
        return $this;
    }

    /**
     * Set attributes that must be an integer
     *
     * An empty array keeps the current list.
     *
     * @access public
     * @param  array   $values   List of attributes: ['width', 'height']
     * @return \PicoFeed\Filter\Attribute
     */
    public function setIntegerAttributes(array $values)
    {
        $this->integer_attributes = $values ?: $this->integer_attributes;
        return $this;
    }

    /**
     * Set allowed iframe resources
     *
     * An empty array keeps the current whitelist.
     *
     * @access public
     * @param  array   $values   List of URL prefixes: ['http://www.youtube.com']
     * @return \PicoFeed\Filter\Attribute
     */
    public function setIframeWhitelist(array $values)
    {
        $this->iframe_whitelist = $values ?: $this->iframe_whitelist;
        return $this;
    }

    /**
     * Return true if $value starts with $prefix and the prefix ends on a URL boundary
     *
     * A bare string prefix test would accept "https://www.youtube.com.evil.example/"
     * or "https://www.youtube.com@evil.example/" for the prefix "https://www.youtube.com".
     * The prefix must therefore be the whole value, end with "/" itself, or be
     * followed by "/", "?" or "#".
     *
     * @access private
     * @param  string    $value     URL to check
     * @param  string    $prefix    Whitelisted URL prefix
     * @return boolean
     */
    private function hasUrlPrefix($value, $prefix)
    {
        if ($prefix === '' || strpos($value, $prefix) !== 0) {
            return false;
        }

        $length = strlen($prefix);

        if (strlen($value) === $length || substr($prefix, -1) === '/') {
            return true;
        }

        return in_array($value[$length], array('/', '?', '#'), true);
    }
}

197
vendor/PicoFeed/Filter/Html.php vendored Normal file
View File

@ -0,0 +1,197 @@
<?php
namespace PicoFeed\Filter;
use \PicoFeed\Url;
use \PicoFeed\Filter;
use \PicoFeed\XmlParser;
/**
* HTML Filter class
*
* @author Frederic Guillot
* @package filter
*/
class Html
{
    /**
     * Config object
     *
     * @access private
     * @var \PicoFeed\Config
     */
    private $config = null;

    /**
     * Unfiltered XML data (result of XmlParser::HtmlToXml() on the input)
     *
     * @access private
     * @var string
     */
    private $input = '';

    /**
     * Filtered XML data
     *
     * @access private
     * @var string
     */
    private $output = '';

    /**
     * Stack of "skipped" flags, one entry per element currently open
     *
     * startTag() pushes a flag and endTag() pops it, so the closing tag of
     * an element is written only when its opening tag was written.
     *
     * @access private
     * @var array
     */
    private $empty_tags = array();

    /**
     * Flag of the last opening tag: true when it was not written to the output
     *
     * @access private
     * @var boolean
     */
    private $empty = true;

    /**
     * Tag instance
     *
     * @access public
     * @var \PicoFeed\Filter\Tag
     */
    public $tag = null;

    /**
     * Attribute instance
     *
     * @access public
     * @var \PicoFeed\Filter\Attribute
     */
    public $attribute = null;

    /**
     * Initialize the filter, all inputs data must be encoded in UTF-8 before
     *
     * @access public
     * @param  string  $html      HTML content
     * @param  string  $website   Site URL (used to build absolute URL)
     */
    public function __construct($html, $website)
    {
        $this->input = XmlParser::HtmlToXml($html);
        $this->output = '';
        $this->tag = new Tag;
        $this->attribute = new Attribute(new Url($website));
    }

    /**
     * Set config object
     *
     * Copies every filter setting of the config to the tag and attribute
     * filters. Passing null only stores the value and changes nothing else.
     *
     * @access public
     * @param  \PicoFeed\Config  $config   Config instance
     * @return \PicoFeed\Filter\Html
     */
    public function setConfig($config)
    {
        $this->config = $config;

        if ($this->config !== null) {
            $this->attribute->setIframeWhitelist($this->config->getFilterIframeWhitelist(array()));
            $this->attribute->setIntegerAttributes($this->config->getFilterIntegerAttributes(array()));
            $this->attribute->setAttributeOverrides($this->config->getFilterAttributeOverrides(array()));
            $this->attribute->setRequiredAttributes($this->config->getFilterRequiredAttributes(array()));
            $this->attribute->setMediaBlacklist($this->config->getFilterMediaBlacklist(array()));
            $this->attribute->setMediaAttributes($this->config->getFilterMediaAttributes(array()));
            $this->attribute->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array()));

            // The same setting feeds both filters: the full map goes to the
            // attribute filter, its keys (the tag names) go to the tag filter
            $this->attribute->setWhitelistedAttributes($this->config->getFilterWhitelistedTags(array()));
            $this->tag->setWhitelistedTags(array_keys($this->config->getFilterWhitelistedTags(array())));
        }

        return $this;
    }

    /**
     * Run tags/attributes filtering
     *
     * @access public
     * @return string    Filtered markup
     */
    public function execute()
    {
        // Start from a clean state, otherwise a second call would append
        // another copy of the filtered markup to the previous result
        $this->output = '';
        $this->empty_tags = array();
        $this->empty = true;

        $parser = xml_parser_create();

        // Handlers are given as callables bound to this instance, which
        // replaces xml_set_object() (deprecated as of PHP 8.4)
        xml_set_element_handler($parser, array($this, 'startTag'), array($this, 'endTag'));
        xml_set_character_data_handler($parser, array($this, 'dataTag'));
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);

        // The return value is not checked: whatever was parsed before an
        // error is kept in the output
        xml_parse($parser, $this->input, true);
        xml_parser_free($parser);

        $this->postFilter();

        return $this->output;
    }

    /**
     * Clean up the generated markup: remove empty tags and trim whitespace
     *
     * @access public
     */
    public function postFilter()
    {
        $this->output = $this->tag->removeEmptyTags($this->output);
        $this->output = trim($this->output);
    }

    /**
     * Parse opening tag
     *
     * The tag is written only when it is allowed and still owns all of its
     * required attributes once the attributes have been filtered.
     *
     * @access public
     * @param  resource  $parser       XML parser
     * @param  string    $tag          Tag name
     * @param  array     $attributes   Tag attributes
     */
    public function startTag($parser, $tag, array $attributes)
    {
        $this->empty = true;

        if ($this->tag->isAllowed($tag, $attributes)) {

            $attributes = $this->attribute->filter($tag, $attributes);

            if ($this->attribute->hasRequiredAttributes($tag, $attributes)) {

                $attributes = $this->attribute->addAttributes($tag, $attributes);

                $this->output .= $this->tag->openHtmlTag($tag, $this->attribute->toHtml($attributes));
                $this->empty = false;
            }
        }

        // Remembered for the matching endTag() call
        $this->empty_tags[] = $this->empty;
    }

    /**
     * Parse closing tag
     *
     * @access public
     * @param  resource  $parser    XML parser
     * @param  string    $tag       Tag name
     */
    public function endTag($parser, $tag)
    {
        if (! array_pop($this->empty_tags) && $this->tag->isAllowedTag($tag)) {
            $this->output .= $this->tag->closeHtmlTag($tag);
        }
    }

    /**
     * Parse tag content
     *
     * @access public
     * @param  resource  $parser    XML parser
     * @param  string    $content   Tag content
     */
    public function dataTag($parser, $content)
    {
        // Replace &nbsp; with normal space
        $content = str_replace("\xc2\xa0", ' ', $content);
        $this->output .= Filter::escape($content);
    }
}

173
vendor/PicoFeed/Filter/Tag.php vendored Normal file
View File

@ -0,0 +1,173 @@
<?php
namespace PicoFeed\Filter;
/**
* Tag Filter class
*
* @author Frederic Guillot
* @package filter
*/
class Tag
{
    /**
     * Names of the tags kept in the output
     *
     * @access private
     * @var array
     */
    private $tag_whitelist = array(
        'audio',
        'video',
        'source',
        'dt',
        'dd',
        'dl',
        'table',
        'caption',
        'tr',
        'th',
        'td',
        'tbody',
        'thead',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'strong',
        'em',
        'code',
        'pre',
        'blockquote',
        'p',
        'ul',
        'li',
        'ol',
        'br',
        'del',
        'a',
        'img',
        'figure',
        'figcaption',
        'cite',
        'time',
        'abbr',
        'iframe',
        'q',
    );

    /**
     * Tell whether a tag can be kept: whitelisted and not a pixel tracker
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  array     $attributes    Attributes dictionary
     * @return boolean
     */
    public function isAllowed($tag, array $attributes)
    {
        if (! $this->isAllowedTag($tag)) {
            return false;
        }

        return ! $this->isPixelTracker($tag, $attributes);
    }

    /**
     * Build the opening tag markup
     *
     * Self-closing tags are terminated with "/>", all others with ">".
     *
     * @access public
     * @param  string    $tag           Tag name
     * @param  string    $attributes    Attributes already converted to html
     * @return string
     */
    public function openHtmlTag($tag, $attributes = '')
    {
        $markup = '<'.$tag;

        if (! empty($attributes)) {
            $markup .= ' '.$attributes;
        }

        if ($this->isSelfClosingTag($tag)) {
            return $markup.'/>';
        }

        return $markup.'>';
    }

    /**
     * Build the closing tag markup
     *
     * Self-closing tags have no closing tag, an empty string is returned.
     *
     * @access public
     * @param  string    $tag           Tag name
     * @return string
     */
    public function closeHtmlTag($tag)
    {
        if ($this->isSelfClosingTag($tag)) {
            return '';
        }

        return '</'.$tag.'>';
    }

    /**
     * Tell whether the tag is written as a self-closing element (br, img)
     *
     * @access public
     * @param  string    $tag      Tag name
     * @return boolean
     */
    public function isSelfClosingTag($tag)
    {
        $self_closing = array('br', 'img');

        return in_array($tag, $self_closing);
    }

    /**
     * Tell whether the tag name is part of the whitelist
     *
     * @access public
     * @param  string    $tag      Tag name
     * @return boolean
     */
    public function isAllowedTag($tag)
    {
        return in_array($tag, $this->tag_whitelist);
    }

    /**
     * Tell whether an image is a pixel tracker (img with height and width both equal to 1)
     *
     * @access public
     * @param  string  $tag         Tag name
     * @param  array   $attributes  Tag attributes
     * @return boolean
     */
    public function isPixelTracker($tag, array $attributes)
    {
        if ($tag !== 'img') {
            return false;
        }

        if (! isset($attributes['height'], $attributes['width'])) {
            return false;
        }

        return $attributes['height'] == 1 && $attributes['width'] == 1;
    }

    /**
     * Strip elements that contain nothing but whitespace
     *
     * @access public
     * @param  string  $data  Input data
     * @return string
     */
    public function removeEmptyTags($data)
    {
        $pattern = '/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU';

        return preg_replace($pattern, '', $data);
    }

    /**
     * Collapse a run of consecutive <br> tags into a single <br/>
     *
     * @access public
     * @param  string  $data  Input data
     * @return string
     */
    public function removeMultipleTags($data)
    {
        $pattern = "/(<br\s*\/?>\s*)+/";

        return preg_replace($pattern, "<br/>", $data);
    }

    /**
     * Replace the tag whitelist
     *
     * An empty array keeps the current whitelist.
     *
     * @access public
     * @param  array   $values   List of tag names: ['video', 'img']
     * @return \PicoFeed\Filter\Tag
     */
    public function setWhitelistedTags(array $values)
    {
        if (! empty($values)) {
            $this->tag_whitelist = $values;
        }

        return $this;
    }
}

View File

@ -3,10 +3,6 @@
namespace PicoFeed; namespace PicoFeed;
use DOMXPath; use DOMXPath;
use PicoFeed\Logging;
use PicoFeed\Client;
use PicoFeed\Encoding;
use PicoFeed\Filter;
/** /**
* Grabber class * Grabber class
@ -224,20 +220,9 @@ class Grabber
public function download() public function download()
{ {
$client = Client::getInstance(); $client = Client::getInstance();
$client->setConfig($this->config);
if ($this->config !== null) {
$client->setTimeout($this->config->getGrabberTimeout())
->setUserAgent($this->config->getGrabberUserAgent())
->setMaxRedirections($this->config->getMaxRedirections())
->setMaxBodySize($this->config->getMaxBodySize())
->setProxyHostname($this->config->getProxyHostname())
->setProxyPort($this->config->getProxyPort())
->setProxyUsername($this->config->getProxyUsername())
->setProxyPassword($this->config->getProxyPassword());
}
$client->execute($this->url); $client->execute($this->url);
$this->html = $client->getContent(); $this->html = $client->getContent();
$this->encoding = $client->getEncoding(); $this->encoding = $client->getEncoding();
@ -253,6 +238,11 @@ class Grabber
public function getRules() public function getRules()
{ {
$hostname = parse_url($this->url, PHP_URL_HOST); $hostname = parse_url($this->url, PHP_URL_HOST);
if ($hostname === false) {
return false;
}
$files = array($hostname); $files = array($hostname);
if (substr($hostname, 0, 4) == 'www.') { if (substr($hostname, 0, 4) == 'www.') {

View File

@ -2,11 +2,8 @@
namespace PicoFeed; namespace PicoFeed;
require_once __DIR__.'/Logging.php'; use SimpleXmlElement;
require_once __DIR__.'/XmlParser.php'; use StdClass;
use PicoFeed\Logging;
use PicoFeed\XmlParser;
/** /**
* OPML Import * OPML Import
@ -79,21 +76,94 @@ class Import
foreach ($tree->outline as $item) { foreach ($tree->outline as $item) {
if (isset($item->outline)) { if (isset($item->outline)) {
$this->parseEntries($item); $this->parseEntries($item);
} }
else if ((isset($item['text']) || isset($item['title'])) && isset($item['xmlUrl'])) { else if ((isset($item['text']) || isset($item['title'])) && isset($item['xmlUrl'])) {
$entry = new \StdClass; $entry = new StdClass;
$entry->category = isset($tree['title']) ? (string) $tree['title'] : (string) $tree['text']; $entry->category = $this->findCategory($tree);
$entry->title = isset($item['title']) ? (string) $item['title'] : (string) $item['text']; $entry->title = $this->findTitle($item);
$entry->feed_url = (string) $item['xmlUrl']; $entry->feed_url = $this->findFeedUrl($item);
$entry->site_url = isset($item['htmlUrl']) ? (string) $item['htmlUrl'] : $entry->feed_url; $entry->site_url = $this->findSiteUrl($item, $entry);
$entry->type = isset($item['version']) ? (string) $item['version'] : isset($item['type']) ? (string) $item['type'] : 'rss'; $entry->type = $this->findType($item);
$entry->description = isset($item['description']) ? (string) $item['description'] : $entry->title; $entry->description = $this->findDescription($item, $entry);
$this->items[] = $entry; $this->items[] = $entry;
} }
} }
} }
} }
/**
 * Find category
 *
 * The "title" attribute of the parent outline wins; the "text"
 * attribute is the fallback.
 *
 * @access public
 * @param  SimpleXmlElement    $tree   XML tree
 * @return string
 */
public function findCategory(SimpleXmlElement $tree)
{
    if (isset($tree['title'])) {
        return (string) $tree['title'];
    }

    return (string) $tree['text'];
}
/**
 * Find title
 *
 * The "title" attribute of the outline wins; the "text" attribute
 * is the fallback.
 *
 * @access public
 * @param  SimpleXmlElement    $item   XML tree
 * @return string
 */
public function findTitle(SimpleXmlElement $item)
{
    if (isset($item['title'])) {
        return (string) $item['title'];
    }

    return (string) $item['text'];
}
/**
 * Find feed url
 *
 * Reads the "xmlUrl" attribute of the outline.
 *
 * @access public
 * @param  SimpleXmlElement    $item   XML tree
 * @return string
 */
public function findFeedUrl(SimpleXmlElement $item)
{
    $feed_url = $item['xmlUrl'];

    return (string) $feed_url;
}
/**
 * Find site url
 *
 * Uses the "htmlUrl" attribute of the outline, or the feed url already
 * stored on the entry when that attribute is missing.
 *
 * @access public
 * @param  SimpleXmlElement    $item   XML tree
 * @param  StdClass            $entry  Feed entry
 * @return string
 */
public function findSiteUrl(SimpleXmlElement $item, StdClass $entry)
{
    if (! isset($item['htmlUrl'])) {
        return $entry->feed_url;
    }

    return (string) $item['htmlUrl'];
}
/**
 * Find type
 *
 * Precedence: the "version" attribute, then the "type" attribute,
 * then the default value "rss".
 *
 * @access public
 * @param  SimpleXmlElement    $item   XML tree
 * @return string
 */
public function findType(SimpleXmlElement $item)
{
    // Written as explicit branches: an unparenthesized nested ternary is
    // left-associative in PHP (and rejected by PHP 8), which made a present
    // "version" attribute return the "type" attribute instead.
    if (isset($item['version'])) {
        return (string) $item['version'];
    }

    if (isset($item['type'])) {
        return (string) $item['type'];
    }

    return 'rss';
}
/**
 * Find description
 *
 * Uses the "description" attribute of the outline, or the title already
 * stored on the entry when that attribute is missing.
 *
 * @access public
 * @param  SimpleXmlElement    $item   XML tree
 * @param  StdClass            $entry  Feed entry
 * @return string
 */
public function findDescription(SimpleXmlElement $item, StdClass $entry)
{
    if (! isset($item['description'])) {
        return $entry->title;
    }

    return (string) $item['description'];
}
} }

View File

@ -27,7 +27,7 @@ class Logging
* *
* @static * @static
* @access private * @access private
* @var array * @var string
*/ */
private static $timezone = 'UTC'; private static $timezone = 'UTC';

View File

@ -4,14 +4,6 @@ namespace PicoFeed;
use DateTime; use DateTime;
use DateTimeZone; use DateTimeZone;
use DOMXPath;
use SimpleXMLElement;
use PicoFeed\Config;
use PicoFeed\Encoding;
use PicoFeed\Filter;
use PicoFeed\Grabber;
use PicoFeed\Logging;
use PicoFeed\XmlParser;
/** /**
* Base parser class * Base parser class
@ -61,13 +53,21 @@ abstract class Parser
*/ */
protected $namespaces = array(); protected $namespaces = array();
/**
* Enable the content filtering
*
* @access private
* @var bool
*/
private $enable_filter = true;
/** /**
* Enable the content grabber * Enable the content grabber
* *
* @access private * @access private
* @var bool * @var bool
*/ */
public $enable_grabber = false; private $enable_grabber = false;
/** /**
* Ignore those urls for the content scraper * Ignore those urls for the content scraper
@ -96,7 +96,7 @@ abstract class Parser
$this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding); $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
// Workarounds // Workarounds
$this->content = $this->normalizeData($this->content); $this->content = Filter::normalizeData($this->content);
} }
/** /**
@ -122,9 +122,11 @@ abstract class Parser
$feed = new Feed; $feed = new Feed;
$this->findFeedUrl($xml, $feed); $this->findFeedUrl($xml, $feed);
$this->findFeedTitle($xml, $feed); $this->findFeedTitle($xml, $feed);
$this->findFeedDescription($xml, $feed);
$this->findFeedLanguage($xml, $feed); $this->findFeedLanguage($xml, $feed);
$this->findFeedId($xml, $feed); $this->findFeedId($xml, $feed);
$this->findFeedDate($xml, $feed); $this->findFeedDate($xml, $feed);
$this->findFeedLogo($xml, $feed);
foreach ($this->getItemsTree($xml) as $entry) { foreach ($this->getItemsTree($xml) as $entry) {
@ -137,6 +139,10 @@ abstract class Parser
$this->findItemContent($entry, $item); $this->findItemContent($entry, $item);
$this->findItemEnclosure($entry, $item, $feed); $this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed); $this->findItemLanguage($entry, $item, $feed);
$this->scrapWebsite($item);
$this->filterItemContent($feed, $item);
$feed->items[] = $item; $feed->items[] = $item;
} }
@ -146,103 +152,42 @@ abstract class Parser
} }
/** /**
* Filter HTML for entry content * Fetch item content with the content grabber
* *
* @access public * @access public
* @param string $item_content Item content * @param Item $item Item object
* @param string $item_url Item URL
* @return string Filtered content
*/ */
public function filterHtml($item_content, $item_url) public function scrapWebsite(Item $item)
{ {
$content = ''; if ($this->enable_grabber && ! in_array($item->getUrl(), $this->grabber_ignore_urls)) {
// Setup the content scraper $grabber = new Grabber($item->getUrl());
if ($this->enable_grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
$grabber = new Grabber($item_url);
$grabber->setConfig($this->config); $grabber->setConfig($this->config);
$grabber->download(); $grabber->download();
if ($grabber->parse()) { if ($grabber->parse()) {
$item_content = $grabber->getContent(); $item->content = $grabber->getContent() ?: $item->content;
} }
} }
// Content filtering
if ($item_content) {
if ($this->config !== null) {
$callback = $this->config->getContentFilteringCallback();
if (is_callable($callback)) {
$content = $callback($item_content, $item_url);
}
}
if (! $content) {
$filter = new Filter($item_content, $item_url);
$filter->setConfig($this->config);
$content = $filter->execute();
}
}
return $content;
} }
/** /**
* Dirty quickfixes before XML parsing * Filter HTML for entry content
* *
* @access public * @access public
* @param string $data Raw data * @param Feed $feed Feed object
* @return string Normalized data * @param Item $item Item object
*/ */
public function normalizeData($data) public function filterItemContent(Feed $feed, Item $item)
{ {
$invalid_chars = array( if ($this->isFilteringEnabled()) {
"\x10", $filter = Filter::html($item->getContent(), $feed->getUrl());
"\xc3\x20", $filter->setConfig($this->config);
"&#x1F;", $item->content = $filter->execute();
); }
else {
foreach ($invalid_chars as $needle) { Logging::setMessage(get_called_class().': Content filtering disabled');
$data = str_replace($needle, '', $data);
} }
$data = $this->replaceEntityAttribute($data);
return $data;
}
/**
* Replace & by &amp; for each href attribute (Fix broken feeds)
*
* @access public
* @param string $content Raw data
* @return string Normalized data
*/
public function replaceEntityAttribute($content)
{
$content = preg_replace_callback('/href="[^"]+"/', function(array $matches) {
return htmlspecialchars($matches[0], ENT_NOQUOTES, 'UTF-8', false);
}, $content);
return $content;
}
/**
* Trim whitespace from the begining, the end and inside a string and don't break utf-8 string
*
* @access public
* @param string $value Raw data
* @return string Normalized data
*/
public function stripWhiteSpace($value)
{
$value = str_replace("\r", "", $value);
$value = str_replace("\t", "", $value);
$value = str_replace("\n", "", $value);
return trim($value);
} }
/** /**
@ -355,25 +300,6 @@ abstract class Parser
return false; return false;
} }
/**
* Get xml:lang value
*
* @access public
* @param string $xml XML string
* @return string Language
*/
public function getXmlLang($xml)
{
$dom = XmlParser::getDomDocument($this->content);
if ($dom === false) {
return '';
}
$xpath = new DOMXPath($dom);
return $xpath->evaluate('string(//@xml:lang[1])') ?: '';
}
/** /**
* Return true if the given language is "Right to Left" * Return true if the given language is "Right to Left"
* *
@ -446,6 +372,32 @@ abstract class Parser
return $this; return $this;
} }
/**
 * Disable the content filtering
 *
 * Clears the local flag read by isFilteringEnabled().
 *
 * @access public
 * @return \PicoFeed\Parser
 */
public function disableContentFiltering()
{
    $this->enable_filter = false;

    // Return the instance so the call can be chained, as the documented return type states
    return $this;
}
/**
 * Return true if the content filtering is enabled
 *
 * Without a config object the local flag decides. Otherwise the decision
 * is delegated to the config, which receives the local flag as argument
 * (presumably used as the default value - confirm against Config).
 *
 * @access public
 * @return boolean
 */
public function isFilteringEnabled()
{
    if ($this->config !== null) {
        return $this->config->getContentFiltering($this->enable_filter);
    }

    return $this->enable_filter;
}
/** /**
* Enable the content grabber * Enable the content grabber
* *
@ -468,37 +420,4 @@ abstract class Parser
{ {
$this->grabber_ignore_urls = $urls; $this->grabber_ignore_urls = $urls;
} }
/**
* Get a value from a XML namespace
*
* @access public
* @param SimpleXMLElement $xml XML element
* @param array $namespaces XML namespaces
* @param string $property XML tag name
* @param string $attribute XML attribute name
* @return string
*/
public function getNamespaceValue(SimpleXMLElement $xml, array $namespaces, $property, $attribute = '')
{
foreach ($namespaces as $name => $url) {
$namespace = $xml->children($namespaces[$name]);
if ($namespace->$property->count() > 0) {
if ($attribute) {
foreach ($namespace->$property->attributes() as $xml_attribute => $xml_value) {
if ($xml_attribute === $attribute && $xml_value) {
return (string) $xml_value;
}
}
}
return (string) $namespace->$property;
}
}
return '';
}
} }

View File

@ -6,9 +6,10 @@ use SimpleXMLElement;
use PicoFeed\Parser; use PicoFeed\Parser;
use PicoFeed\XmlParser; use PicoFeed\XmlParser;
use PicoFeed\Logging; use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Feed; use PicoFeed\Feed;
use PicoFeed\Filter;
use PicoFeed\Item; use PicoFeed\Item;
use PicoFeed\Url;
/** /**
* Atom parser * Atom parser
@ -42,6 +43,30 @@ class Atom extends Parser
$feed->url = $this->getLink($xml); $feed->url = $this->getLink($xml);
} }
/**
 * Find the feed description
 *
 * Copies the text of the <subtitle> element to the feed object.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
{
    $subtitle = $xml->subtitle;

    $feed->description = (string) $subtitle;
}
/**
 * Find the feed logo url
 *
 * Copies the text of the <logo> element to the feed object.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
{
    $logo = $xml->logo;

    $feed->logo = (string) $logo;
}
/** /**
* Find the feed title * Find the feed title
* *
@ -51,7 +76,7 @@ class Atom extends Parser
*/ */
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed) public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{ {
$feed->title = $this->stripWhiteSpace((string) $xml->title) ?: $feed->url; $feed->title = Filter::stripWhiteSpace((string) $xml->title) ?: $feed->url;
} }
/** /**
@ -63,7 +88,7 @@ class Atom extends Parser
*/ */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed) public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{ {
$feed->language = $this->getXmlLang($this->content); $feed->language = XmlParser::getXmlLang($this->content);
} }
/** /**
@ -107,11 +132,11 @@ class Atom extends Parser
* *
* @access public * @access public
* @param SimpleXMLElement $entry Feed item * @param SimpleXMLElement $entry Feed item
* @param Item $item Item object * @param Item $item Item object
*/ */
public function findItemTitle(SimpleXMLElement $entry, Item $item) public function findItemTitle(SimpleXMLElement $entry, Item $item)
{ {
$item->title = $this->stripWhiteSpace((string) $entry->title); $item->title = Filter::stripWhiteSpace((string) $entry->title);
if (empty($item->title)) { if (empty($item->title)) {
$item->title = $item->url; $item->title = $item->url;
@ -145,7 +170,7 @@ class Atom extends Parser
*/ */
public function findItemContent(SimpleXMLElement $entry, Item $item) public function findItemContent(SimpleXMLElement $entry, Item $item)
{ {
$item->content = $this->filterHtml($this->getContent($entry), $item->url); $item->content = $this->getContent($entry);
} }
/** /**
@ -202,13 +227,8 @@ class Atom extends Parser
foreach ($entry->link as $link) { foreach ($entry->link as $link) {
if ((string) $link['rel'] === 'enclosure') { if ((string) $link['rel'] === 'enclosure') {
$item->enclosure_url = (string) $link['href']; $item->enclosure_url = Url::resolve((string) $link['href'], $feed->url);
$item->enclosure_type = (string) $link['type']; $item->enclosure_type = (string) $link['type'];
if (Filter::isRelativePath($item->enclosure_url)) {
$item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url);
}
break; break;
} }
} }

View File

@ -7,6 +7,7 @@ require_once __DIR__.'/Rss20.php';
use SimpleXMLElement; use SimpleXMLElement;
use PicoFeed\Feed; use PicoFeed\Feed;
use PicoFeed\Item; use PicoFeed\Item;
use PicoFeed\XmlParser;
use PicoFeed\Parsers\Rss20; use PicoFeed\Parsers\Rss20;
/** /**
@ -38,7 +39,7 @@ class Rss10 extends Rss20
*/ */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed) public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{ {
$feed->date = $this->parseDate($this->getNamespaceValue($xml->channel, $this->namespaces, 'date')); $feed->date = $this->parseDate(XmlParser::getNamespaceValue($xml->channel, $this->namespaces, 'date'));
} }
/** /**
@ -50,7 +51,7 @@ class Rss10 extends Rss20
*/ */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed) public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{ {
$feed->language = $this->getNamespaceValue($xml->channel, $this->namespaces, 'language'); $feed->language = XmlParser::getNamespaceValue($xml->channel, $this->namespaces, 'language');
} }
/** /**

View File

@ -6,9 +6,10 @@ use SimpleXMLElement;
use PicoFeed\Parser; use PicoFeed\Parser;
use PicoFeed\XmlParser; use PicoFeed\XmlParser;
use PicoFeed\Logging; use PicoFeed\Logging;
use PicoFeed\Filter;
use PicoFeed\Feed; use PicoFeed\Feed;
use PicoFeed\Filter;
use PicoFeed\Item; use PicoFeed\Item;
use PicoFeed\Url;
/** /**
* RSS 2.0 Parser * RSS 2.0 Parser
@ -57,6 +58,32 @@ class Rss20 extends Parser
} }
} }
/**
 * Find the feed description
 *
 * Copies the text of the channel <description> element to the feed object.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
{
    $description = $xml->channel->description;

    $feed->description = (string) $description;
}
/**
 * Find the feed logo url
 *
 * Copies the text of the channel <image><url> element to the feed object;
 * the feed is left untouched when that element is missing.
 *
 * @access public
 * @param  SimpleXMLElement   $xml     Feed xml
 * @param  \PicoFeed\Feed     $feed    Feed object
 */
public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
{
    if (! isset($xml->channel->image->url)) {
        return;
    }

    $feed->logo = (string) $xml->channel->image->url;
}
/** /**
* Find the feed title * Find the feed title
* *
@ -66,7 +93,7 @@ class Rss20 extends Parser
*/ */
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed) public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{ {
$feed->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $feed->url; $feed->title = Filter::stripWhiteSpace((string) $xml->channel->title) ?: $feed->url;
} }
/** /**
@ -115,10 +142,10 @@ class Rss20 extends Parser
*/ */
public function findItemDate(SimpleXMLElement $entry, Item $item) public function findItemDate(SimpleXMLElement $entry, Item $item)
{ {
$date = $this->getNamespaceValue($entry, $this->namespaces, 'date'); $date = XmlParser::getNamespaceValue($entry, $this->namespaces, 'date');
if (empty($date)) { if (empty($date)) {
$date = $this->getNamespaceValue($entry, $this->namespaces, 'updated'); $date = XmlParser::getNamespaceValue($entry, $this->namespaces, 'updated');
} }
if (empty($date)) { if (empty($date)) {
@ -137,7 +164,7 @@ class Rss20 extends Parser
*/ */
public function findItemTitle(SimpleXMLElement $entry, Item $item) public function findItemTitle(SimpleXMLElement $entry, Item $item)
{ {
$item->title = $this->stripWhiteSpace((string) $entry->title); $item->title = Filter::stripWhiteSpace((string) $entry->title);
if (empty($item->title)) { if (empty($item->title)) {
$item->title = $item->url; $item->title = $item->url;
@ -154,7 +181,7 @@ class Rss20 extends Parser
*/ */
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item) public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{ {
$item->author = $this->getNamespaceValue($entry, $this->namespaces, 'creator'); $item->author = XmlParser::getNamespaceValue($entry, $this->namespaces, 'creator');
if (empty($item->author)) { if (empty($item->author)) {
if (isset($entry->author)) { if (isset($entry->author)) {
@ -175,13 +202,13 @@ class Rss20 extends Parser
*/ */
public function findItemContent(SimpleXMLElement $entry, Item $item) public function findItemContent(SimpleXMLElement $entry, Item $item)
{ {
$content = $this->getNamespaceValue($entry, $this->namespaces, 'encoded'); $content = XmlParser::getNamespaceValue($entry, $this->namespaces, 'encoded');
if (empty($content) && $entry->description->count() > 0) { if (empty($content) && $entry->description->count() > 0) {
$content = (string) $entry->description; $content = (string) $entry->description;
} }
$item->content = $this->filterHtml($content, $item->url); $item->content = $content;
} }
/** /**
@ -194,9 +221,9 @@ class Rss20 extends Parser
public function findItemUrl(SimpleXMLElement $entry, Item $item) public function findItemUrl(SimpleXMLElement $entry, Item $item)
{ {
$links = array( $links = array(
$this->getNamespaceValue($entry, $this->namespaces, 'origLink'), XmlParser::getNamespaceValue($entry, $this->namespaces, 'origLink'),
isset($entry->link) ? (string) $entry->link : '', isset($entry->link) ? (string) $entry->link : '',
$this->getNamespaceValue($entry, $this->namespaces, 'link', 'href'), XmlParser::getNamespaceValue($entry, $this->namespaces, 'link', 'href'),
isset($entry->guid) ? (string) $entry->guid : '', isset($entry->guid) ? (string) $entry->guid : '',
); );
@ -247,17 +274,14 @@ class Rss20 extends Parser
{ {
if (isset($entry->enclosure)) { if (isset($entry->enclosure)) {
$item->enclosure_url = $this->getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink'); $item->enclosure_url = XmlParser::getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink');
if (empty($item->enclosure_url)) { if (empty($item->enclosure_url)) {
$item->enclosure_url = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : ''; $item->enclosure_url = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : '';
} }
$item->enclosure_type = isset($entry->enclosure['type']) ? (string) $entry->enclosure['type'] : ''; $item->enclosure_type = isset($entry->enclosure['type']) ? (string) $entry->enclosure['type'] : '';
$item->enclosure_url = Url::resolve($item->enclosure_url, $feed->url);
if (Filter::isRelativePath($item->enclosure_url)) {
$item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url);
}
} }
} }

View File

@ -4,10 +4,14 @@
require __DIR__.'/Config.php'; require __DIR__.'/Config.php';
require __DIR__.'/Logging.php'; require __DIR__.'/Logging.php';
require __DIR__.'/Url.php';
require __DIR__.'/Item.php'; require __DIR__.'/Item.php';
require __DIR__.'/Feed.php'; require __DIR__.'/Feed.php';
require __DIR__.'/Client.php'; require __DIR__.'/Client.php';
require __DIR__.'/Filter.php'; require __DIR__.'/Filter.php';
require __DIR__.'/Filter/Attribute.php';
require __DIR__.'/Filter/Tag.php';
require __DIR__.'/Filter/Html.php';
require __DIR__.'/XmlParser.php'; require __DIR__.'/XmlParser.php';
require __DIR__.'/Encoding.php'; require __DIR__.'/Encoding.php';
require __DIR__.'/Grabber.php'; require __DIR__.'/Grabber.php';
@ -18,3 +22,4 @@ require __DIR__.'/Writer.php';
require __DIR__.'/Writers/Rss20.php'; require __DIR__.'/Writers/Rss20.php';
require __DIR__.'/Writers/Atom.php'; require __DIR__.'/Writers/Atom.php';
require __DIR__.'/Parser.php'; require __DIR__.'/Parser.php';
require __DIR__.'/Favicon.php';

View File

@ -9,6 +9,7 @@ use PicoFeed\Logging;
use PicoFeed\Filter; use PicoFeed\Filter;
use PicoFeed\Client; use PicoFeed\Client;
use PicoFeed\Parser; use PicoFeed\Parser;
use PicoFeed\Url;
/** /**
* Reader class * Reader class
@ -78,14 +79,7 @@ class Reader
} }
$client = Client::getInstance(); $client = Client::getInstance();
$client->setTimeout($this->config->getClientTimeout()) $client->setConfig($this->config)
->setUserAgent($this->config->getClientUserAgent())
->setMaxRedirections($this->config->getMaxRedirections())
->setMaxBodySize($this->config->getMaxBodySize())
->setProxyHostname($this->config->getProxyHostname())
->setProxyPort($this->config->getProxyPort())
->setProxyUsername($this->config->getProxyUsername())
->setProxyPassword($this->config->getProxyPassword())
->setLastModified($last_modified) ->setLastModified($last_modified)
->setEtag($etag); ->setEtag($etag);
@ -249,16 +243,13 @@ class Reader
if (! empty($link)) { if (! empty($link)) {
// Relative links $feedUrl = new Url($link);
if (strpos($link, 'http') !== 0) { $siteUrl = new Url($this->url);
if ($link{0} === '/') $link = substr($link, 1); $link = $feedUrl->getAbsoluteUrl($feedUrl->isRelativeUrl() ? $siteUrl->getBaseUrl() : '');
if ($this->url{strlen($this->url) - 1} !== '/') $this->url .= '/';
$link = $this->url.$link;
}
Logging::setMessage(get_called_class().': Find subscription link: '.$link); Logging::setMessage(get_called_class().': Find subscription link: '.$link);
$this->download($link); $this->download($link);
return true; return true;

254
vendor/PicoFeed/Url.php vendored Normal file
View File

@ -0,0 +1,254 @@
<?php
namespace PicoFeed;
/**
* URL class
*
* @author Frederic Guillot
* @package picofeed
*/
class Url
{
    /**
     * URL
     *
     * @access private
     * @var string
     */
    private $url = '';

    /**
     * URL components
     *
     * Result of parse_url(), or an empty array when the url cannot be parsed.
     *
     * @access private
     * @var array
     */
    private $components = array();

    /**
     * Constructor
     *
     * @access public
     * @param  string   $url    URL
     */
    public function __construct($url)
    {
        $this->url = $url;
        $this->components = parse_url($url) ?: array();

        // Issue with PHP < 5.4.7 and protocol relative url: the host is reported
        // as part of the path, so it is split out by hand. The path check avoids
        // reading a missing component when parse_url() rejected the url.
        if (version_compare(PHP_VERSION, '5.4.7', '<') && $this->isProtocolRelative() && isset($this->components['path'])) {

            $pos = strpos($this->components['path'], '/', 2);

            if ($pos === false) {
                $pos = strlen($this->components['path']);
            }

            $this->components['host'] = substr($this->components['path'], 2, $pos - 2);
            $this->components['path'] = substr($this->components['path'], $pos);
        }
    }

    /**
     * Shortcut method to get an absolute url from relative url
     *
     * - Absolute urls are returned as they are
     * - Protocol relative urls (//host/path) inherit the scheme of the website
     * - Urls with an absolute path (/path) are appended to the website base url
     * - Urls with a relative path (path) are appended to the directory of the
     *   website url: "b.html" resolved against "http://host/dir/a.xml"
     *   gives "http://host/dir/b.html"
     *
     * @static
     * @access public
     * @param  string    $item_url      Unknown url (can be relative or not)
     * @param  mixed     $website_url   Website url (string or Url instance)
     * @return string
     */
    public static function resolve($item_url, $website_url)
    {
        $link = new self($item_url);
        $website = is_string($website_url) ? new self($website_url) : $website_url;

        if ($link->isRelativeUrl()) {

            if ($link->isRelativePath()) {

                // A reference without path (empty, query or fragment only) is
                // appended to the website url itself; any other relative path
                // replaces the last segment of the website path.
                $base = $link->getPath() === ''
                    ? $website->getAbsoluteUrl()
                    : $website->getBaseUrl($website->getDirectoryPath());

                return $link->getAbsoluteUrl($base);
            }

            return $link->getAbsoluteUrl($website->getBaseUrl());
        }
        else if ($link->isProtocolRelative()) {
            $link->setScheme($website->getScheme());
        }

        return $link->getAbsoluteUrl();
    }

    /**
     * Get the base URL
     *
     * Scheme, host and port; an empty string is returned when the url has no host.
     *
     * @access public
     * @param  string   $suffix    Add a suffix to the url
     * @return string
     */
    public function getBaseUrl($suffix = '')
    {
        return $this->hasHost() ? $this->getScheme('://').$this->getHost().$this->getPort(':').$suffix : '';
    }

    /**
     * Get the absolute URL
     *
     * With a base url, the full path of the current url (without its leading
     * slash) is appended to the absolute form of the base url. Without base
     * url, an empty string is returned when the current url has no host.
     *
     * @access public
     * @param  string   $base_url    Use this url as base url
     * @return string
     */
    public function getAbsoluteUrl($base_url = '')
    {
        if ($base_url) {
            $base = new self($base_url);
            $url = $base->getAbsoluteUrl().substr($this->getFullPath(), 1);
        }
        else {
            $url = $this->hasHost() ? $this->getBaseUrl().$this->getFullPath() : '';
        }

        return $url;
    }

    /**
     * Return true if the url is relative
     *
     * @access public
     * @return boolean
     */
    public function isRelativeUrl()
    {
        return ! $this->hasScheme() && ! $this->isProtocolRelative();
    }

    /**
     * Return true if the path is relative
     *
     * An empty path is considered relative.
     *
     * @access public
     * @return boolean
     */
    public function isRelativePath()
    {
        $path = $this->getPath();

        // Square brackets: the curly brace offset syntax is not available in PHP 8
        return empty($path) || $path[0] !== '/';
    }

    /**
     * Get the path
     *
     * @access public
     * @return string
     */
    public function getPath()
    {
        return empty($this->components['path']) ? '' : $this->components['path'];
    }

    /**
     * Get the full path (path + querystring + fragment)
     *
     * The result always starts with a slash.
     *
     * @access public
     * @return string
     */
    public function getFullPath()
    {
        $path = $this->isRelativePath() ? '/' : '';
        $path .= $this->getPath();
        $path .= empty($this->components['query']) ? '' : '?'.$this->components['query'];
        $path .= empty($this->components['fragment']) ? '' : '#'.$this->components['fragment'];

        return $path;
    }

    /**
     * Get the hostname
     *
     * @access public
     * @return string
     */
    public function getHost()
    {
        return empty($this->components['host']) ? '' : $this->components['host'];
    }

    /**
     * Return true if the url has a hostname
     *
     * @access public
     * @return boolean
     */
    public function hasHost()
    {
        return ! empty($this->components['host']);
    }

    /**
     * Get the scheme
     *
     * "http" is used when the url has no scheme.
     *
     * @access public
     * @param  string   $suffix   Suffix appended to the scheme
     * @return string
     */
    public function getScheme($suffix = '')
    {
        return ($this->hasScheme() ? $this->components['scheme'] : 'http').$suffix;
    }

    /**
     * Set the scheme
     *
     * @access public
     * @param  string   $scheme    Set a scheme
     * @return void
     */
    public function setScheme($scheme)
    {
        $this->components['scheme'] = $scheme;
    }

    /**
     * Return true if the url has a scheme
     *
     * @access public
     * @return boolean
     */
    public function hasScheme()
    {
        return ! empty($this->components['scheme']);
    }

    /**
     * Get the port
     *
     * @access public
     * @param  string   $prefix   Prefix to add when there is a port
     * @return string
     */
    public function getPort($prefix = '')
    {
        return $this->hasPort() ? $prefix.$this->components['port'] : '';
    }

    /**
     * Return true if the url has a port
     *
     * @access public
     * @return boolean
     */
    public function hasPort()
    {
        return ! empty($this->components['port']);
    }

    /**
     * Return true if the url is protocol relative (start with //)
     *
     * @access public
     * @return boolean
     */
    public function isProtocolRelative()
    {
        return strpos($this->url, '//') === 0;
    }

    /**
     * Get the directory part of the path, with leading and trailing slash
     *
     * "/dir/feed.xml" gives "/dir/", "/dir/" gives "/dir/", an empty path gives "/".
     *
     * @access private
     * @return string
     */
    private function getDirectoryPath()
    {
        $path = $this->getPath();

        if ($path === '' || $path[0] !== '/') {
            $path = '/'.$path;
        }

        // Keep everything up to (and including) the last slash
        return substr($path, 0, strrpos($path, '/') + 1);
    }
}

View File

@ -7,8 +7,9 @@ use RuntimeException;
/** /**
* Base writer class * Base writer class
* *
* @author Frederic Guillot * @author Frederic Guillot
* @package picofeed * @package picofeed
* @property string $description Feed description
*/ */
abstract class Writer abstract class Writer
{ {
@ -16,7 +17,7 @@ abstract class Writer
* Dom object * Dom object
* *
* @access protected * @access protected
* @var DomDocument * @var \DomDocument
*/ */
protected $dom; protected $dom;
@ -28,6 +29,46 @@ abstract class Writer
*/ */
public $items = array(); public $items = array();
/**
* Author
*
* @access public
* @var array
*/
public $author = array();
/**
* Feed URL
*
* @access public
* @var string
*/
public $feed_url = '';
/**
* Website URL
*
* @access public
* @var string
*/
public $site_url = '';
/**
* Feed title
*
* @access public
* @var string
*/
public $title = '';
/**
* Feed modification date (timestamp)
*
* @access public
* @var integer
*/
public $updated = 0;
/** /**
* Generate the XML document * Generate the XML document
* *

View File

@ -72,7 +72,7 @@ class Atom extends Writer
$feed->appendChild($id); $feed->appendChild($id);
// <updated/> // <updated/>
$this->addUpdated($feed, isset($this->updated) ? $this->updated : ''); $this->addUpdated($feed, $this->updated);
// <link rel="alternate" type="text/html" href="http://example.org/"/> // <link rel="alternate" type="text/html" href="http://example.org/"/>
$this->addLink($feed, $this->site_url); $this->addLink($feed, $this->site_url);
@ -85,51 +85,8 @@ class Atom extends Writer
// <entry/> // <entry/>
foreach ($this->items as $item) { foreach ($this->items as $item) {
$this->checkRequiredProperties($this->required_item_properties, $item); $this->checkRequiredProperties($this->required_item_properties, $item);
$feed->appendChild($this->createEntry($item));
$entry = $this->dom->createElement('entry');
// <title/>
$title = $this->dom->createElement('title');
$title->appendChild($this->dom->createTextNode($item['title']));
$entry->appendChild($title);
// <id/>
$id = $this->dom->createElement('id');
$id->appendChild($this->dom->createTextNode(isset($item['id']) ? $item['id'] : $item['url']));
$entry->appendChild($id);
// <updated/>
$this->addUpdated($entry, isset($item['updated']) ? $item['updated'] : '');
// <published/>
if (isset($item['published'])) {
$entry->appendChild($this->dom->createElement('published', date(DATE_ATOM, $item['published'])));
}
// <link rel="alternate" type="text/html" href="http://example.org/"/>
$this->addLink($entry, $item['url']);
// <summary/>
if (isset($item['summary'])) {
$summary = $this->dom->createElement('summary');
$summary->appendChild($this->dom->createTextNode($item['summary']));
$entry->appendChild($summary);
}
// <content/>
if (isset($item['content'])) {
$content = $this->dom->createElement('content');
$content->setAttribute('type', 'html');
$content->appendChild($this->dom->createCDATASection($item['content']));
$entry->appendChild($content);
}
// <author/>
if (isset($item['author'])) $this->addAuthor($entry, $item['author']);
$feed->appendChild($entry);
} }
$this->dom->appendChild($feed); $this->dom->appendChild($feed);
@ -142,6 +99,61 @@ class Atom extends Writer
} }
} }
/**
 * Create item entry
 *
 * Builds one Atom <entry/> element from the item properties.
 * The keys "title" and "url" are always read; "id", "updated", "published",
 * "summary", "content" and "author" are only used when they are set.
 *
 * @access public
 * @param  array       $item   Item properties
 * @return DomElement
 */
public function createEntry(array $item)
{
    $entry = $this->dom->createElement('entry');

    // <title/>
    $title = $this->dom->createElement('title');
    $title->appendChild($this->dom->createTextNode($item['title']));
    $entry->appendChild($title);

    // <id/> (the item URL is used when no explicit id is provided)
    $id = $this->dom->createElement('id');
    $id->appendChild($this->dom->createTextNode(isset($item['id']) ? $item['id'] : $item['url']));
    $entry->appendChild($id);

    // <updated/>
    // Fall back to 0 (not an empty string): addUpdated() takes an integer
    // timestamp and uses 0 as its own default for "no value".
    $this->addUpdated($entry, isset($item['updated']) ? $item['updated'] : 0);

    // <published/>
    if (isset($item['published'])) {
        $entry->appendChild($this->dom->createElement('published', date(DATE_ATOM, $item['published'])));
    }

    // <link rel="alternate" type="text/html" href="http://example.org/"/>
    $this->addLink($entry, $item['url']);

    // <summary/>
    if (isset($item['summary'])) {
        $summary = $this->dom->createElement('summary');
        $summary->appendChild($this->dom->createTextNode($item['summary']));
        $entry->appendChild($summary);
    }

    // <content/> (HTML payload wrapped in a CDATA section)
    if (isset($item['content'])) {
        $content = $this->dom->createElement('content');
        $content->setAttribute('type', 'html');
        $content->appendChild($this->dom->createCDATASection($item['content']));
        $entry->appendChild($content);
    }

    // <author/>
    if (isset($item['author'])) {
        $this->addAuthor($entry, $item['author']);
    }

    return $entry;
}
/** /**
* Add Link * Add Link
* *
@ -165,9 +177,9 @@ class Atom extends Writer
* *
* @access public * @access public
* @param DomElement $xml XML node * @param DomElement $xml XML node
* @param string $value Timestamp * @param integer $value Timestamp
*/ */
public function addUpdated(DomElement $xml, $value = '') public function addUpdated(DomElement $xml, $value = 0)
{ {
$xml->appendChild($this->dom->createElement( $xml->appendChild($this->dom->createElement(
'updated', 'updated',

View File

@ -75,7 +75,7 @@ class Rss20 extends Writer
$channel->appendChild($description); $channel->appendChild($description);
// <pubDate/> // <pubDate/>
$this->addPubDate($channel, isset($this->updated) ? $this->updated : ''); $this->addPubDate($channel, $this->updated);
// <atom:link/> // <atom:link/>
$link = $this->dom->createElement('atom:link'); $link = $this->dom->createElement('atom:link');
@ -94,56 +94,8 @@ class Rss20 extends Writer
// <item/> // <item/>
foreach ($this->items as $item) { foreach ($this->items as $item) {
$this->checkRequiredProperties($this->required_item_properties, $item); $this->checkRequiredProperties($this->required_item_properties, $item);
$channel->appendChild($this->createEntry($item));
$entry = $this->dom->createElement('item');
// <title/>
$title = $this->dom->createElement('title');
$title->appendChild($this->dom->createTextNode($item['title']));
$entry->appendChild($title);
// <link/>
$link = $this->dom->createElement('link');
$link->appendChild($this->dom->createTextNode($item['url']));
$entry->appendChild($link);
// <guid/>
if (isset($item['id'])) {
$guid = $this->dom->createElement('guid');
$guid->setAttribute('isPermaLink', 'false');
$guid->appendChild($this->dom->createTextNode($item['id']));
$entry->appendChild($guid);
}
else {
$guid = $this->dom->createElement('guid');
$guid->setAttribute('isPermaLink', 'true');
$guid->appendChild($this->dom->createTextNode($item['url']));
$entry->appendChild($guid);
}
// <pubDate/>
$this->addPubDate($entry, isset($item['updated']) ? $item['updated'] : '');
// <description/>
if (isset($item['summary'])) {
$description = $this->dom->createElement('description');
$description->appendChild($this->dom->createTextNode($item['summary']));
$entry->appendChild($description);
}
// <content/>
if (isset($item['content'])) {
$content = $this->dom->createElement('content:encoded');
$content->appendChild($this->dom->createCDATASection($item['content']));
$entry->appendChild($content);
}
// <author/>
if (isset($item['author'])) $this->addAuthor($entry, 'author', $item['author']);
$channel->appendChild($entry);
} }
$rss->appendChild($channel); $rss->appendChild($channel);
@ -157,14 +109,74 @@ class Rss20 extends Writer
} }
} }
/**
 * Create item entry
 *
 * Builds one RSS 2.0 <item/> element from the item properties.
 * The keys "title" and "url" are always read; "id", "updated", "summary",
 * "content" and "author" are only used when they are set.
 *
 * @access public
 * @param  array       $item   Item properties
 * @return DomElement
 */
public function createEntry(array $item)
{
    $entry = $this->dom->createElement('item');

    // <title/>
    $title = $this->dom->createElement('title');
    $title->appendChild($this->dom->createTextNode($item['title']));
    $entry->appendChild($title);

    // <link/>
    $link = $this->dom->createElement('link');
    $link->appendChild($this->dom->createTextNode($item['url']));
    $entry->appendChild($link);

    // <guid/>
    // An explicit id is emitted as a non-permalink guid; otherwise the item
    // URL is used and flagged as a permalink.
    $has_id = isset($item['id']);
    $guid = $this->dom->createElement('guid');
    $guid->setAttribute('isPermaLink', $has_id ? 'false' : 'true');
    $guid->appendChild($this->dom->createTextNode($has_id ? $item['id'] : $item['url']));
    $entry->appendChild($guid);

    // <pubDate/>
    // Fall back to 0 (not an empty string): addPubDate() takes an integer
    // timestamp and uses 0 as its own default for "no value".
    $this->addPubDate($entry, isset($item['updated']) ? $item['updated'] : 0);

    // <description/>
    if (isset($item['summary'])) {
        $description = $this->dom->createElement('description');
        $description->appendChild($this->dom->createTextNode($item['summary']));
        $entry->appendChild($description);
    }

    // <content:encoded/> (HTML payload wrapped in a CDATA section)
    if (isset($item['content'])) {
        $content = $this->dom->createElement('content:encoded');
        $content->appendChild($this->dom->createCDATASection($item['content']));
        $entry->appendChild($content);
    }

    // <author/>
    if (isset($item['author'])) {
        $this->addAuthor($entry, 'author', $item['author']);
    }

    return $entry;
}
/** /**
* Add publication date * Add publication date
* *
* @access public * @access public
* @param DomElement $xml XML node * @param DomElement $xml XML node
* @param string $value Timestamp * @param integer $value Timestamp
*/ */
public function addPubDate(DomElement $xml, $value = '') public function addPubDate(DomElement $xml, $value = 0)
{ {
$xml->appendChild($this->dom->createElement( $xml->appendChild($this->dom->createElement(
'pubDate', 'pubDate',

View File

@ -3,6 +3,7 @@
namespace PicoFeed; namespace PicoFeed;
use DomDocument; use DomDocument;
use DOMXPath;
use SimpleXmlElement; use SimpleXmlElement;
/** /**
@ -110,6 +111,20 @@ class XmlParser
return $dom; return $dom;
} }
/**
 * Convert a HTML document to XML
 *
 * The markup is prefixed with a UTF-8 XML declaration, loaded through
 * getHtmlDocument(), and the first <body> element of the resulting
 * document is serialized as XML.
 *
 * @static
 * @access public
 * @param  string   $html   HTML document
 * @return string
 */
public static function HtmlToXml($html)
{
    // NOTE(review): the declaration is presumably there to hint the encoding to the HTML parser - confirm
    $document = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
    $body = $document->getElementsByTagName('body')->item(0);

    return $document->saveXML($body);
}
/** /**
* Get XML parser errors * Get XML parser errors
* *
@ -160,4 +175,58 @@ class XmlParser
return $encoding; return $encoding;
} }
/**
 * Get xml:lang value
 *
 * Parses the XML string and returns the string value of the xml:lang
 * attribute matched by the XPath query, or an empty string when the
 * document cannot be loaded or no non-empty value is found.
 *
 * @static
 * @access public
 * @param  string  $xml  XML string
 * @return string        Language
 */
public static function getXmlLang($xml)
{
    $dom = self::getDomDocument($xml);

    if ($dom !== false) {
        $xpath = new DOMXPath($dom);
        $language = $xpath->evaluate('string(//@xml:lang[1])');

        if ($language) {
            return $language;
        }
    }

    return '';
}
/**
 * Get a value from a XML namespace
 *
 * Walks the given namespaces in order and stops at the first one in which
 * the element has a child named $property. When $attribute is given and the
 * child carries a non-empty attribute of that name, the attribute value is
 * returned; otherwise the text value of the child is returned.
 *
 * @static
 * @access public
 * @param  SimpleXMLElement   $xml          XML element
 * @param  array              $namespaces   XML namespaces (prefix => URI)
 * @param  string             $property     XML tag name
 * @param  string             $attribute    XML attribute name
 * @return string                           Found value, or an empty string
 */
public static function getNamespaceValue(SimpleXMLElement $xml, array $namespaces, $property, $attribute = '')
{
    foreach ($namespaces as $url) {

        $children = $xml->children($url);

        // Tag not present in this namespace, try the next one
        if ($children->$property->count() <= 0) {
            continue;
        }

        if ($attribute) {
            foreach ($children->$property->attributes() as $attribute_name => $attribute_value) {
                if ($attribute_name === $attribute && $attribute_value) {
                    return (string) $attribute_value;
                }
            }
        }

        // No attribute requested (or no usable match): use the tag content
        return (string) $children->$property;
    }

    return '';
}
} }

View File

@ -1,109 +0,0 @@
<?php
/**
* JavaScript-like HTML DOM Element
*
* This class extends PHP's DOMElement to allow
* users to get and set the innerHTML property of
* HTML elements in the same way it's done in
* JavaScript.
*
* Example usage:
* @code
* require_once 'JSLikeHTMLElement.php';
* header('Content-Type: text/plain');
* $doc = new DOMDocument();
* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
* $elem = $doc->getElementsByTagName('div')->item(0);
*
* // print innerHTML
* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
* echo "\n\n";
*
* // set innerHTML
* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>';
* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>'
* echo "\n\n";
*
* // print document (with our changes)
* echo $doc->saveXML();
* @endcode
*
* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
* @see http://fivefilters.org (the project this was written for)
*/
class JSLikeHTMLElement extends DOMElement
{
    /**
     * Magic setter emulating JavaScript's innerHTML assignment:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * Any other property name triggers an E_USER_NOTICE and is ignored.
     */
    public function __set($name, $value)
    {
        if ($name != 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return;
        }

        // Remove the current children, last one first
        while ($this->childNodes->length > 0) {
            $this->removeChild($this->childNodes->item($this->childNodes->length - 1));
        }

        // Nothing to insert: the element simply stays empty
        if ($value == '') {
            return;
        }

        // First attempt: treat the value as well-formed markup (XHTML);
        // "@" hides the PHP warnings raised by appendXML() on bad input
        $fragment = $this->ownerDocument->createDocumentFragment();

        if (@$fragment->appendXML($value)) {
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }

            return;
        }

        // Second attempt: the value is probably ill-formed, so let the HTML parser handle it.
        // The <htmlfragment> wrapper produces a warning (suppressed), but it gives a single
        // container to read the parsed nodes from instead of the <html><body> wrappers
        // added by loadHTML(). Despite the warning, loadHTML() returns true on success.
        $html = new DOMDocument();
        $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');

        if (! @$html->loadHTML('<htmlfragment>'.$value.'</htmlfragment>')) {
            // Parsing failed as well: leave this element empty
            return;
        }

        $wrapper = $html->getElementsByTagName('htmlfragment')->item(0);

        foreach ($wrapper->childNodes as $child) {
            $this->appendChild($this->ownerDocument->importNode($child, true));
        }
    }

    /**
     * Magic getter emulating JavaScript's innerHTML read access:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * Any other property name triggers an E_USER_NOTICE and yields null.
     */
    public function __get($name)
    {
        if ($name != 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return null;
        }

        // Concatenate the XML serialization of each direct child
        $markup = '';

        foreach ($this->childNodes as $node) {
            $markup .= $this->ownerDocument->saveXML($node);
        }

        return $markup;
    }

    /**
     * String form of the element: its tag name between square brackets
     */
    public function __toString()
    {
        return '['.$this->tagName.']';
    }
}

File diff suppressed because it is too large Load Diff