From e2280f1b7ba2a312f4c9ef4866342520ae6cb093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Sun, 19 Oct 2014 14:42:31 -0400 Subject: [PATCH] Update PicoFeed and PicoDb --- common.php | 1 - docs/full-article-download.markdown | 3 +- models/item.php | 21 +- vendor/PicoDb/Database.php | 5 + vendor/PicoDb/Drivers/Mysql.php | 1 - vendor/PicoDb/Drivers/Sqlite.php | 1 - vendor/PicoDb/Table.php | 8 +- vendor/PicoFeed/Client.php | 162 ++- vendor/PicoFeed/Clients/Curl.php | 190 +++- vendor/PicoFeed/Clients/Stream.php | 82 +- vendor/PicoFeed/Config.php | 46 + vendor/PicoFeed/Export.php | 71 +- vendor/PicoFeed/Favicon.php | 163 ++++ vendor/PicoFeed/Feed.php | 46 +- vendor/PicoFeed/Filter.php | 801 +-------------- vendor/PicoFeed/Filter/Attribute.php | 590 +++++++++++ vendor/PicoFeed/Filter/Html.php | 197 ++++ vendor/PicoFeed/Filter/Tag.php | 173 ++++ vendor/PicoFeed/Grabber.php | 24 +- vendor/PicoFeed/Import.php | 96 +- vendor/PicoFeed/Logging.php | 2 +- vendor/PicoFeed/Parser.php | 199 ++-- vendor/PicoFeed/Parsers/Atom.php | 44 +- vendor/PicoFeed/Parsers/Rss10.php | 5 +- vendor/PicoFeed/Parsers/Rss20.php | 54 +- vendor/PicoFeed/PicoFeed.php | 5 + vendor/PicoFeed/Reader.php | 21 +- vendor/PicoFeed/Url.php | 254 +++++ vendor/PicoFeed/Writer.php | 47 +- vendor/PicoFeed/Writers/Atom.php | 106 +- vendor/PicoFeed/Writers/Rss20.php | 116 ++- vendor/PicoFeed/XmlParser.php | 69 ++ vendor/Readability/JSLikeHTMLElement.php | 109 --- vendor/Readability/Readability.php | 1137 ---------------------- 34 files changed, 2364 insertions(+), 2485 deletions(-) create mode 100644 vendor/PicoFeed/Favicon.php create mode 100644 vendor/PicoFeed/Filter/Attribute.php create mode 100644 vendor/PicoFeed/Filter/Html.php create mode 100644 vendor/PicoFeed/Filter/Tag.php create mode 100644 vendor/PicoFeed/Url.php delete mode 100755 vendor/Readability/JSLikeHTMLElement.php delete mode 100755 vendor/Readability/Readability.php diff --git a/common.php b/common.php index 
9ee3f11..00662f9 100644 --- a/common.php +++ b/common.php @@ -3,7 +3,6 @@ require __DIR__.'/lib/Translator.php'; require __DIR__.'/vendor/PicoDb/Database.php'; require __DIR__.'/vendor/PicoFeed/PicoFeed.php'; -require __DIR__.'/vendor/Readability/Readability.php'; require __DIR__.'/vendor/SimpleValidator/Validator.php'; require __DIR__.'/vendor/SimpleValidator/Base.php'; diff --git a/docs/full-article-download.markdown b/docs/full-article-download.markdown index 9e9d29c..229cac7 100644 --- a/docs/full-article-download.markdown +++ b/docs/full-article-download.markdown @@ -8,8 +8,7 @@ How the content grabber works? 1. Try with rules first (xpath patterns) for the domain name (see `PicoFeed\Rules\`) 2. Try to find the text content by using common attributes for class and id -3. Fallback to Readability if no content is found -4. Finally, if nothing is found, the feed content is displayed +3. Finally, if nothing is found, the feed content is displayed The content downloader use a fake user agent, actually Google Chrome under Mac Os X. diff --git a/models/item.php b/models/item.php index 6cdc889..8e1b57c 100644 --- a/models/item.php +++ b/models/item.php @@ -8,7 +8,6 @@ use PicoFeed\Logging; use PicoFeed\Grabber; use PicoFeed\Client; use PicoFeed\Filter; -use Readability; // Get all items without filtering function get_everything() @@ -535,12 +534,9 @@ function download_content_url($url) if ($grabber->parse()) { $content = $grabber->getcontent(); } - else { - $content = download_content_readability($grabber->getRawContent(), $url); - } if (! empty($content)) { - $filter = new Filter($content, $url); + $filter = Filter::html($content, $url); $filter->setConfig(Config\get_reader_config()); $content = $filter->execute(); } @@ -580,18 +576,3 @@ function download_content_id($item_id) 'content' => '' ); } - -// Download content with Readability PHP port -function download_content_readability($content, $url) -{ - if (! 
empty($content)) { - - $readability = new Readability($content, $url); - - if ($readability->init()) { - return $readability->getContent()->innerHTML; - } - } - - return ''; -} diff --git a/vendor/PicoDb/Database.php b/vendor/PicoDb/Database.php index 5d0beb8..c09d8a9 100644 --- a/vendor/PicoDb/Database.php +++ b/vendor/PicoDb/Database.php @@ -86,6 +86,11 @@ class Database public function escapeIdentifier($value) { + // Do not escape custom query + if (strpos($value, '.') !== false || strpos($value, ' ') !== false) { + return $value; + } + return $this->pdo->escapeIdentifier($value); } diff --git a/vendor/PicoDb/Drivers/Mysql.php b/vendor/PicoDb/Drivers/Mysql.php index 22277a0..96148a1 100644 --- a/vendor/PicoDb/Drivers/Mysql.php +++ b/vendor/PicoDb/Drivers/Mysql.php @@ -70,7 +70,6 @@ class Mysql extends \PDO { public function escapeIdentifier($value) { - if (strpos($value, '.') !== false) return $value; return '`'.$value.'`'; } } \ No newline at end of file diff --git a/vendor/PicoDb/Drivers/Sqlite.php b/vendor/PicoDb/Drivers/Sqlite.php index 83b61c4..38c823a 100644 --- a/vendor/PicoDb/Drivers/Sqlite.php +++ b/vendor/PicoDb/Drivers/Sqlite.php @@ -51,7 +51,6 @@ class Sqlite extends \PDO { public function escapeIdentifier($value) { - if (strpos($value, '.') !== false) return $value; return '"'.$value.'"'; } } \ No newline at end of file diff --git a/vendor/PicoDb/Table.php b/vendor/PicoDb/Table.php index cc63743..9c6bf4f 100644 --- a/vendor/PicoDb/Table.php +++ b/vendor/PicoDb/Table.php @@ -173,6 +173,10 @@ class Table public function buildSelectQuery() { + foreach ($this->columns as $key => $value) { + $this->columns[$key] = $this->db->escapeIdentifier($value); + } + return sprintf( 'SELECT %s %s FROM %s %s %s %s %s %s %s', $this->distinct ? 'DISTINCT' : '', @@ -350,7 +354,7 @@ class Table switch (strtolower($name)) { case 'in': - if (isset($arguments[1]) && is_array($arguments[1])) { + if (isset($arguments[1]) && is_array($arguments[1]) && ! 
empty($arguments[1])) { $sql = sprintf( '%s IN (%s)', @@ -361,7 +365,7 @@ class Table break; case 'notin': - if (isset($arguments[1]) && is_array($arguments[1])) { + if (isset($arguments[1]) && is_array($arguments[1]) && ! empty($arguments[1])) { $sql = sprintf( '%s NOT IN (%s)', diff --git a/vendor/PicoFeed/Client.php b/vendor/PicoFeed/Client.php index a79840c..59e9aa9 100644 --- a/vendor/PicoFeed/Client.php +++ b/vendor/PicoFeed/Client.php @@ -5,7 +5,6 @@ namespace PicoFeed; use LogicException; use Clients\Curl; use Clients\Stream; -use PicoFeed\Logging; /** * Client class @@ -23,6 +22,14 @@ abstract class Client */ private $is_modified = true; + /** + * Flag that say if the resource is a 404 + * + * @access private + * @var bool + */ + private $is_not_found = false; + /** * HTTP encoding * @@ -170,38 +177,110 @@ abstract class Client $response = $this->doRequest(); if (is_array($response)) { - - if ($response['status'] == 304) { - $this->is_modified = false; - Logging::setMessage(get_called_class().' Resource not modified'); - } - else if ($response['status'] == 404) { - Logging::setMessage(get_called_class().' Resource not found'); - } - else { - $etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : ''; - $last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : ''; - $this->content = $response['body']; - - if (isset($response['headers']['Content-Type'])) { - $result = explode('charset=', strtolower($response['headers']['Content-Type'])); - $this->encoding = isset($result[1]) ? 
$result[1] : ''; - } - - if (($this->etag && $this->etag === $etag) || ($this->last_modified && $last_modified === $this->last_modified)) { - $this->is_modified = false; - } - - $this->etag = $etag; - $this->last_modified = $last_modified; - } - + $this->handleNotModifiedResponse($response); + $this->handleNotFoundResponse($response); + $this->handleNormalResponse($response); return true; } return false; } + /** + * Handle not modified response + * + * @access public + * @param array $response Client response + */ + public function handleNotModifiedResponse(array $response) + { + if ($response['status'] == 304) { + $this->is_modified = false; + } + else if ($response['status'] == 200) { + + $etag = $this->getHeader($response, 'ETag'); + $last_modified = $this->getHeader($response, 'Last-Modified'); + + if ($this->isPropertyEquals('etag', $etag) || $this->isPropertyEquals('last_modified', $last_modified)) { + $this->is_modified = false; + } + + $this->etag = $etag; + $this->last_modified = $last_modified; + } + + if ($this->is_modified === false) { + Logging::setMessage(get_called_class().' Resource not modified'); + } + } + + /** + * Handle not found response + * + * @access public + * @param array $response Client response + */ + public function handleNotFoundResponse(array $response) + { + if ($response['status'] == 404) { + $this->is_not_found = true; + Logging::setMessage(get_called_class().' 
Resource not found'); + } + } + + /** + * Handle normal response + * + * @access public + * @param array $response Client response + */ + public function handleNormalResponse(array $response) + { + if ($response['status'] == 200) { + $this->content = $response['body']; + $this->encoding = $this->findCharset($response); + } + } + + /** + * Check if a class property equals to a value + * + * @access public + * @param string $property Class property + * @param string $value Value + * @return boolean + */ + private function isPropertyEquals($property, $value) + { + return $this->$property && $this->$property === $value; + } + + /** + * Find charset from response headers + * + * @access public + * @param array $response Client response + */ + public function findCharset(array $response) + { + $result = explode('charset=', strtolower($this->getHeader($response, 'Content-Type'))); + return isset($result[1]) ? $result[1] : ''; + } + + /** + * Get header value from a client response + * + * @access public + * @param array $response Client response + * @param string $header Header name + * @return string + */ + public function getHeader(array $response, $header) + { + return isset($response['headers'][$header]) ? 
$response['headers'][$header] : ''; + } + /** * Parse HTTP headers * @@ -340,6 +419,17 @@ abstract class Client return $this->is_modified; } + /** + * Return true if the remote resource is not found + * + * @access public + * @return bool + */ + public function isNotFound() + { + return $this->is_not_found; + } + /** * Set connection timeout * @@ -453,14 +543,16 @@ abstract class Client */ public function setConfig($config) { - $this->setTimeout($config->getGrabberTimeout()); - $this->setUserAgent($config->getGrabberUserAgent()); - $this->setMaxRedirections($config->getMaxRedirections()); - $this->setMaxBodySize($config->getMaxBodySize()); - $this->setProxyHostname($config->getProxyHostname()); - $this->setProxyPort($config->getProxyPort()); - $this->setProxyUsername($config->getProxyUsername()); - $this->setProxyPassword($config->getProxyPassword()); + if ($config !== null) { + $this->setTimeout($config->getGrabberTimeout()); + $this->setUserAgent($config->getGrabberUserAgent()); + $this->setMaxRedirections($config->getMaxRedirections()); + $this->setMaxBodySize($config->getMaxBodySize()); + $this->setProxyHostname($config->getProxyHostname()); + $this->setProxyPort($config->getProxyPort()); + $this->setProxyUsername($config->getProxyUsername()); + $this->setProxyPassword($config->getProxyPassword()); + } return $this; } diff --git a/vendor/PicoFeed/Clients/Curl.php b/vendor/PicoFeed/Clients/Curl.php index 66a4773..a1ee96d 100644 --- a/vendor/PicoFeed/Clients/Curl.php +++ b/vendor/PicoFeed/Clients/Curl.php @@ -97,36 +97,37 @@ class Curl extends Client } /** - * Do the HTTP request + * Prepare HTTP headers * - * @access public - * @param bool $follow_location Flag used when there is an open_basedir restriction - * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] 
+ * @access private + * @return array */ - public function doRequest($follow_location = true) + private function prepareHeaders() { - $request_headers = array('Connection: close'); + $headers = array( + 'Connection: close', + 'User-Agent: '.$this->user_agent, + ); - if ($this->etag) $request_headers[] = 'If-None-Match: '.$this->etag; - if ($this->last_modified) $request_headers[] = 'If-Modified-Since: '.$this->last_modified; + if ($this->etag) { + $headers[] = 'If-None-Match: '.$this->etag; + } - $ch = curl_init(); + if ($this->last_modified) { + $headers[] = 'If-Modified-Since: '.$this->last_modified; + } - curl_setopt($ch, CURLOPT_URL, $this->url); - curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); - curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout); - curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); - curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent); - curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, ini_get('open_basedir') === ''); - curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects); - curl_setopt($ch, CURLOPT_ENCODING, ''); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates... - curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody')); - curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders')); - curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory'); - curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory'); + return $headers; + } + /** + * Prepare curl proxy context + * + * @access private + * @return resource + */ + private function prepareProxyContext($ch) + { if ($this->proxy_hostname) { Logging::setMessage(get_called_class().' 
Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); @@ -144,6 +145,47 @@ class Curl extends Client } } + return $ch; + } + + /** + * Prepare curl context + * + * @access private + * @return resource + */ + private function prepareContext() + { + $ch = curl_init(); + + curl_setopt($ch, CURLOPT_URL, $this->url); + curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_HTTPHEADER, $this->prepareHeaders()); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, ini_get('open_basedir') === ''); + curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects); + curl_setopt($ch, CURLOPT_ENCODING, ''); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates... + curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody')); + curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders')); + curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory'); + curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory'); + + $ch = $this->prepareProxyContext($ch); + + return $ch; + } + + /** + * Execute curl context + * + * @access private + * @return resource + */ + private function executeContext() + { + $ch = $this->prepareContext(); curl_exec($ch); Logging::setMessage(get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME)); @@ -153,44 +195,34 @@ class Curl extends Client Logging::setMessage(get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)); if (curl_errno($ch)) { - Logging::setMessage(get_called_class().' cURL error: '.curl_error($ch)); - curl_close($ch); return false; } curl_close($ch); + return true; + } + + /** + * Do the HTTP request + * + * @access public + * @param bool $follow_location Flag used when there is an open_basedir restriction + * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] 
+ */ + public function doRequest($follow_location = true) + { + if (! $this->executeContext()) { + return false; + } + list($status, $headers) = $this->parseHeaders(explode("\r\n", $this->headers[$this->headers_counter - 1])); - if ($follow_location && ini_get('open_basedir') !== '' && ($status == 301 || $status == 302)) { - - $nb_redirects = 0; - $this->url = $headers['Location']; - $this->body = ''; - $this->body_length = 0; - $this->headers = array(); - $this->headers_counter = 0; - - while (true) { - - $nb_redirects++; - if ($nb_redirects >= $this->max_redirects) return false; - - $result = $this->doRequest(false); - - if ($result['status'] == 301 || $result['status'] == 302) { - $this->url = $result['headers']['Location']; - $this->body = ''; - $this->body_length = 0; - $this->headers = array(); - $this->headers_counter = 0; - } - else { - return $result; - } - } + // When resticted with open_basedir + if ($this->needToHandleRedirection($follow_location, $status)) { + return $this->handleRedirection($headers['Location']); } return array( @@ -199,4 +231,58 @@ class Curl extends Client 'headers' => $headers ); } + + /** + * Check if the redirection have to be handled manually + * + * @access private + * @param boolean $follow_location Flag + * @param integer $status HTTP status code + * @return boolean + */ + private function needToHandleRedirection($follow_location, $status) + { + return $follow_location && ini_get('open_basedir') !== '' && ($status == 301 || $status == 302); + } + + /** + * Handle manually redirections when there is an open base dir restriction + * + * @access private + * @param string $location Redirected URL + * @return boolean|array + */ + private function handleRedirection($location) + { + $nb_redirects = 0; + $this->url = $location; + $this->body = ''; + $this->body_length = 0; + $this->headers = array(); + $this->headers_counter = 0; + + while (true) { + + $nb_redirects++; + + if ($nb_redirects >= $this->max_redirects) { + return false; 
+ } + + $result = $this->doRequest(false); + + if ($result['status'] == 301 || $result['status'] == 302) { + $this->url = $result['headers']['Location']; + $this->body = ''; + $this->body_length = 0; + $this->headers = array(); + $this->headers_counter = 0; + } + else { + return $result; + } + } + + return false; + } } diff --git a/vendor/PicoFeed/Clients/Stream.php b/vendor/PicoFeed/Clients/Stream.php index af5ae7e..f16952f 100644 --- a/vendor/PicoFeed/Clients/Stream.php +++ b/vendor/PicoFeed/Clients/Stream.php @@ -14,14 +14,13 @@ use \PicoFeed\Client; class Stream extends Client { /** - * Do the HTTP request + * Prepare HTTP headers * - * @access public - * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] + * @access private + * @return array */ - public function doRequest() + private function prepareHeaders() { - // Prepare HTTP headers for the request $headers = array( 'Connection: close', 'User-Agent: '.$this->user_agent, @@ -39,14 +38,27 @@ class Stream extends Client $headers[] = 'If-Modified-Since: '.$this->last_modified; } - // Create context - $context_options = array( + if ($this->proxy_username) { + $headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password); + } + + return $headers; + } + + /** + * Prepare stream context + * + * @access private + * @return array + */ + private function prepareContext() + { + $context = array( 'http' => array( 'method' => 'GET', 'protocol_version' => 1.1, 'timeout' => $this->timeout, 'max_redirects' => $this->max_redirects, - 'header' => implode("\r\n", $headers) ) ); @@ -54,31 +66,46 @@ class Stream extends Client Logging::setMessage(get_called_class().' 
Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); - $context_options['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port; - $context_options['http']['request_fulluri'] = true; + $context['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port; + $context['http']['request_fulluri'] = true; if ($this->proxy_username) { Logging::setMessage(get_called_class().' Proxy credentials: Yes'); - - $headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password); - $context_options['http']['header'] = implode("\r\n", $headers); } else { Logging::setMessage(get_called_class().' Proxy credentials: No'); } } - $context = stream_context_create($context_options); + $context['http']['header'] = implode("\r\n", $this->prepareHeaders()); + + return $context; + } + + /** + * Do the HTTP request + * + * @access public + * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] + */ + public function doRequest() + { + // Create context + $context = stream_context_create($this->prepareContext()); // Make HTTP request $stream = @fopen($this->url, 'r', false, $context); - if (! is_resource($stream)) return false; + if (! 
is_resource($stream)) { + return false; + } // Get the entire body until the max size $body = stream_get_contents($stream, $this->max_body_size + 1); // If the body size is too large abort everything - if (strlen($body) > $this->max_body_size) return false; + if (strlen($body) > $this->max_body_size) { + return false; + } // Get HTTP headers response $metadata = stream_get_meta_data($stream); @@ -87,6 +114,23 @@ class Stream extends Client fclose($stream); + return array( + 'status' => $status, + 'body' => $this->decodeBody($body, $headers), + 'headers' => $headers + ); + } + + /** + * Decode body response according to the HTTP headers + * + * @access public + * @param string $body Raw body + * @param array $headers HTTP headers + * @return string + */ + public function decodeBody($body, array $headers) + { if (isset($headers['Transfer-Encoding']) && $headers['Transfer-Encoding'] === 'chunked') { $body = $this->decodeChunked($body); } @@ -95,11 +139,7 @@ class Stream extends Client $body = @gzdecode($body); } - return array( - 'status' => $status, - 'body' => $body, - 'headers' => $headers - ); + return $body; } /** diff --git a/vendor/PicoFeed/Config.php b/vendor/PicoFeed/Config.php index 935e019..283ce23 100644 --- a/vendor/PicoFeed/Config.php +++ b/vendor/PicoFeed/Config.php @@ -7,6 +7,52 @@ namespace PicoFeed; * * @author Frederic Guillot * @package picofeed + * + * @method \PicoFeed\Config setClientTimeout(integer $value) + * @method \PicoFeed\Config setClientUserAgent(string $value) + * @method \PicoFeed\Config setMaxRedirections(integer $value) + * @method \PicoFeed\Config setMaxBodySize(integer $value) + * @method \PicoFeed\Config setProxyHostname(string $value) + * @method \PicoFeed\Config setProxyPort(integer $value) + * @method \PicoFeed\Config setProxyUsername(string $value) + * @method \PicoFeed\Config setProxyPassword(string $value) + * @method \PicoFeed\Config setGrabberTimeout(integer $value) + * @method \PicoFeed\Config setGrabberUserAgent(string 
$value) + * @method \PicoFeed\Config setParserHashAlgo(string $value) + * @method \PicoFeed\Config setContentFiltering(boolean $value) + * @method \PicoFeed\Config setTimezone(string $value) + * @method \PicoFeed\Config setFilterIframeWhitelist(array $value) + * @method \PicoFeed\Config setFilterIntegerAttributes(array $value) + * @method \PicoFeed\Config setFilterAttributeOverrides(array $value) + * @method \PicoFeed\Config setFilterRequiredAttributes(array $value) + * @method \PicoFeed\Config setFilterMediaBlacklist(array $value) + * @method \PicoFeed\Config setFilterMediaAttributes(array $value) + * @method \PicoFeed\Config setFilterSchemeWhitelist(array $value) + * @method \PicoFeed\Config setFilterWhitelistedTags(array $value) + * @method \PicoFeed\Config setFilterBlacklistedTags(array $value) + * + * @method integer getClientTimeout() + * @method string getClientUserAgent() + * @method integer getMaxRedirections() + * @method integer getMaxBodySize() + * @method string getProxyHostname() + * @method integer getProxyPort() + * @method string getProxyUsername() + * @method string getProxyPassword() + * @method integer getGrabberTimeout() + * @method string getGrabberUserAgent() + * @method string getParserHashAlgo() + * @method boolean getContentFiltering(bool $default_value) + * @method string getTimezone() + * @method array getFilterIframeWhitelist(array $default_value) + * @method array getFilterIntegerAttributes(array $default_value) + * @method array getFilterAttributeOverrides(array $default_value) + * @method array getFilterRequiredAttributes(array $default_value) + * @method array getFilterMediaBlacklist(array $default_value) + * @method array getFilterMediaAttributes(array $default_value) + * @method array getFilterSchemeWhitelist(array $default_value) + * @method array getFilterWhitelistedTags(array $default_value) + * @method array getFilterBlacklistedTags(array $default_value) */ class Config { diff --git a/vendor/PicoFeed/Export.php 
b/vendor/PicoFeed/Export.php index df03f98..5fa0c4b 100644 --- a/vendor/PicoFeed/Export.php +++ b/vendor/PicoFeed/Export.php @@ -58,23 +58,39 @@ class Export $body = $xml->addChild('body'); - foreach ($this->content as $feed) { + foreach ($this->content as $category => $values) { - $valid = true; - - foreach ($this->required_fields as $field) { - - if (! isset($feed[$field])) { - $valid = false; - break; - } + if (is_string($category)) { + $this->createCategory($body, $category, $values); } - - if (! $valid) { - continue; + else { + $this->createEntry($body, $values); } + } - $outline = $body->addChild('outline'); + return $xml->asXML(); + } + + /** + * Create a feed entry + * + * @access public + * @param SimpleXMLElement $parent Parent Element + * @param array $feed Feed properties + */ + public function createEntry(SimpleXMLElement $parent, array $feed) + { + $valid = true; + + foreach ($this->required_fields as $field) { + if (! isset($feed[$field])) { + $valid = false; + break; + } + } + + if ($valid) { + $outline = $parent->addChild('outline'); $outline->addAttribute('xmlUrl', $feed['feed_url']); $outline->addAttribute('htmlUrl', $feed['site_url']); $outline->addAttribute('title', $feed['title']); @@ -83,7 +99,34 @@ class Export $outline->addAttribute('type', 'rss'); $outline->addAttribute('version', 'RSS'); } + } - return $xml->asXML(); + /** + * Create entries for a feed list + * + * @access public + * @param SimpleXMLElement $parent Parent Element + * @param array $feeds Feed list + */ + public function createEntries(SimpleXMLElement $parent, array $feeds) + { + foreach ($feeds as $feed) { + $this->createEntry($parent, $feed); + } + } + + /** + * Create a category entry + * + * @access public + * @param SimpleXMLElement $parent Parent Element + * @param string $category Category + * @param array $feed Feed properties + */ + public function createCategory(SimpleXMLElement $parent, $category, array $feeds) + { + $outline = $parent->addChild('outline'); + 
$outline->addAttribute('text', $category); + $this->createEntries($outline, $feeds); } } diff --git a/vendor/PicoFeed/Favicon.php b/vendor/PicoFeed/Favicon.php new file mode 100644 index 0000000..ec87531 --- /dev/null +++ b/vendor/PicoFeed/Favicon.php @@ -0,0 +1,163 @@ +config = $config ?: new Config; + } + + /** + * Get the icon file content (available only after the download) + * + * @access public + * @return string + */ + public function getContent() + { + return $this->content; + } + + /** + * Download and check if a resource exists + * + * @access public + * @param string $url URL + * @return string Resource content + */ + public function download($url) + { + Logging::setMessage(get_called_class().' Download => '.$url); + + $client = Client::getInstance(); + $client->setConfig($this->config); + + if ($client->execute($url) && ! $client->isNotFound()) { + return $client->getContent(); + } + + return ''; + } + + /** + * Check if a remote file exists + * + * @access public + * @param string $url URL + * @return boolean + */ + public function exists($url) + { + return $this->download($url) !== ''; + } + + /** + * Get the icon link for a website + * + * @access public + * @param string $website_link URL + * @return string + */ + public function find($website_link) + { + $website = new Url($website_link); + + $icons = $this->extract($this->download($website->getBaseUrl('/'))); + $icons[] = $website->getBaseUrl('/favicon.ico'); + + foreach ($icons as $icon_link) { + + $icon_link = $this->convertLink($website, new Url($icon_link)); + $this->content = $this->download($icon_link); + + if ($this->content !== '') { + return $icon_link; + } + } + + return ''; + } + + /** + * Convert icon links to absolute url + * + * @access public + * @param \PicoFeed\Url $website Website url + * @param \PicoFeed\Url $icon Icon url + * @return string + */ + public function convertLink(Url $website, Url $icon) + { + $base_url = ''; + + if ($icon->isRelativeUrl()) { + $base_url = 
$website->getBaseUrl(); + } + else if ($icon->isProtocolRelative()) { + $icon->setScheme($website->getScheme()); + } + + return $icon->getAbsoluteUrl($base_url); + } + + /** + * Extract the icon links from the HTML + * + * @access public + * @param string $html HTML + * @return array + */ + public function extract($html) + { + $icons = array(); + + if (empty($html)) { + return $icons; + } + + $dom = XmlParser::getHtmlDocument($html); + + $xpath = new DOMXpath($dom); + $elements = $xpath->query("//link[contains(@rel, 'icon') and not(contains(@rel, 'apple'))]"); + + for ($i = 0; $i < $elements->length; $i++) { + $icons[] = $elements->item($i)->getAttribute('href'); + } + + return $icons; + } +} diff --git a/vendor/PicoFeed/Feed.php b/vendor/PicoFeed/Feed.php index 90ce0d8..6bd6392 100644 --- a/vendor/PicoFeed/Feed.php +++ b/vendor/PicoFeed/Feed.php @@ -35,7 +35,15 @@ class Feed public $title = ''; /** - * Item url + * Feed description + * + * @access public + * @var string + */ + public $description = ''; + + /** + * Feed url * * @access public * @var string @@ -43,7 +51,7 @@ class Feed public $url = ''; /** - * Item date + * Feed date * * @access public * @var integer @@ -51,13 +59,21 @@ class Feed public $date = 0; /** - * Item language + * Feed language * * @access public * @var string */ public $language = ''; + /** + * Feed logo URL (not the same as icon) + * + * @access public + * @var string + */ + public $logo = ''; + /** * Return feed information * @@ -68,7 +84,7 @@ class Feed { $output = ''; - foreach (array('id', 'title', 'url', 'date', 'language') as $property) { + foreach (array('id', 'title', 'url', 'date', 'language', 'description', 'logo') as $property) { $output .= 'Feed::'.$property.' 
= '.$this->$property.PHP_EOL; } @@ -93,6 +109,28 @@ class Feed return $this->title; } + /** + * Get description + * + * @access public + * $return string + */ + public function getDescription() + { + return $this->description; + } + + /** + * Get the logo url + * + * @access public + * $return string + */ + public function getLogo() + { + return $this->logo; + } + /** * Get url * diff --git a/vendor/PicoFeed/Filter.php b/vendor/PicoFeed/Filter.php index bbfd97a..fab3926 100644 --- a/vendor/PicoFeed/Filter.php +++ b/vendor/PicoFeed/Filter.php @@ -2,7 +2,7 @@ namespace PicoFeed; -use DOMDocument; +use PicoFeed\Filter\Html; /** * Filter class @@ -13,436 +13,18 @@ use DOMDocument; class Filter { /** - * Config object - * - * @access private - * @var \PicoFeed\Config - */ - private $config = null; - - /** - * Filtered XML data - * - * @access private - * @var string - */ - private $data = ''; - - /** - * Site URL (used to build absolute URL) - * - * @access private - * @var string - */ - private $url = ''; - - /** - * Unfiltered XML data - * - * @access private - * @var string - */ - private $input = ''; - - /** - * List of empty tags - * - * @access private - * @var array - */ - private $empty_tags = array(); - - /** - * Flag to remove the content of a tag - * - * @access private - * @var boolean - */ - private $strip_content = false; - - /** - * Flag to remember if the current payload is a source code
-     *
-     * @access private
-     * @var boolean
-     */
-    private $is_code = false;
-
-    /**
-     * Tags and attribute whitelist
-     *
-     * @access private
-     * @var array
-     */
-    private $whitelist_tags = array(
-        'audio' => array('controls', 'src'),
-        'video' => array('poster', 'controls', 'height', 'width', 'src'),
-        'source' => array('src', 'type'),
-        'dt' => array(),
-        'dd' => array(),
-        'dl' => array(),
-        'table' => array(),
-        'caption' => array(),
-        'tr' => array(),
-        'th' => array(),
-        'td' => array(),
-        'tbody' => array(),
-        'thead' => array(),
-        'h2' => array(),
-        'h3' => array(),
-        'h4' => array(),
-        'h5' => array(),
-        'h6' => array(),
-        'strong' => array(),
-        'em' => array(),
-        'code' => array(),
-        'pre' => array(),
-        'blockquote' => array(),
-        'p' => array(),
-        'ul' => array(),
-        'li' => array(),
-        'ol' => array(),
-        'br' => array(),
-        'del' => array(),
-        'a' => array('href'),
-        'img' => array('src', 'title', 'alt'),
-        'figure' => array(),
-        'figcaption' => array(),
-        'cite' => array(),
-        'time' => array('datetime'),
-        'abbr' => array('title'),
-        'iframe' => array('width', 'height', 'frameborder', 'src'),
-        'q' => array('cite')
-    );
-
-    /**
-     * Tags blacklist, strip the content of those tags
-     *
-     * @access private
-     * @var array
-     */
-    private $blacklisted_tags = array(
-        'script'
-    );
-
-    /**
-     * Scheme whitelist
-     * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
-     *
-     * @access private
-     * @var array
-     */
-    private $scheme_whitelist = array(
-        '//',
-        'data:image/png;base64,',
-        'data:image/gif;base64,',
-        'data:image/jpg;base64,',
-        'bitcoin:',
-        'callto:',
-        'ed2k://',
-        'facetime://',
-        'feed:',
-        'ftp://',
-        'geo:',
-        'git://',
-        'http://',
-        'https://',
-        'irc://',
-        'irc6://',
-        'ircs://',
-        'jabber:',
-        'magnet:',
-        'mailto:',
-        'nntp://',
-        'rtmp://',
-        'sftp://',
-        'sip:',
-        'sips:',
-        'skype:',
-        'smb://',
-        'sms:',
-        'spotify:',
-        'ssh:',
-        'steam:',
-        'svn://',
-        'tel:',
-    );
-
-    /**
-     * Attributes used for external resources
-     *
-     * @access private
-     * @var array
-     */
-    private $media_attributes = array(
-        'src',
-        'href',
-        'poster',
-    );
-
-    /**
-     * Blacklisted resources
-     *
-     * @access private
-     * @var array
-     */
-    private $media_blacklist = array(
-        'feeds.feedburner.com',
-        'share.feedsportal.com',
-        'da.feedsportal.com',
-        'rss.feedsportal.com',
-        'res.feedsportal.com',
-        'res1.feedsportal.com',
-        'res2.feedsportal.com',
-        'res3.feedsportal.com',
-        'pi.feedsportal.com',
-        'rss.nytimes.com',
-        'feeds.wordpress.com',
-        'stats.wordpress.com',
-        'rss.cnn.com',
-        'twitter.com/home?status=',
-        'twitter.com/share',
-        'twitter_icon_large.png',
-        'www.facebook.com/sharer.php',
-        'facebook_icon_large.png',
-        'plus.google.com/share',
-        'www.gstatic.com/images/icons/gplus-16.png',
-        'www.gstatic.com/images/icons/gplus-32.png',
-        'www.gstatic.com/images/icons/gplus-64.png',
-    );
-
-    /**
-     * Mandatory attributes for specified tags
-     *
-     * @access private
-     * @var array
-     */
-    private $required_attributes = array(
-        'a' => array('href'),
-        'img' => array('src'),
-        'iframe' => array('src'),
-        'audio' => array('src'),
-        'source' => array('src'),
-    );
-
-    /**
-     * Add attributes to specified tags
-     *
-     * @access private
-     * @var array
-     */
-    private $add_attributes = array(
-        'a' => 'rel="noreferrer" target="_blank"'
-    );
-
-    /**
-     * Attributes that must be integer
-     *
-     * @access private
-     * @var array
-     */
-    private $integer_attributes = array(
-        'width',
-        'height',
-        'frameborder',
-    );
-
-    /**
-     * Iframe source whitelist, everything else is ignored
-     *
-     * @access private
-     * @var array
-     */
-    private $iframe_whitelist = array(
-        '//www.youtube.com',
-        'http://www.youtube.com',
-        'https://www.youtube.com',
-        'http://player.vimeo.com',
-        'https://player.vimeo.com',
-        'http://www.dailymotion.com',
-        'https://www.dailymotion.com',
-    );
-
-    /**
-     * Initialize the filter, all inputs data must be encoded in UTF-8 before
+     * Get the Html filter instance
      *
+     * @static
      * @access public
-     * @param  string  $data      XML content
-     * @param  string  $site_url  Site URL (used to build absolute URL)
+     * @param  string  $html      HTML content
+     * @param  string  $website   Site URL (used to build absolute URL)
+     * @return \PicoFeed\Filter\Html
      */
-    public function __construct($data, $site_url)
+    public static function html($html, $website)
     {
-        $this->url = $site_url;
-
-        libxml_use_internal_errors(true);
-
-        // Convert bad formatted documents to XML
-        $dom = new DOMDocument;
-        $dom->loadHTML(''.$data);
-        $this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
-    }
-
-    /**
-     * Run tags/attributes filtering
-     *
-     * @access public
-     * @return string
-     */
-    public function execute()
-    {
-        $parser = xml_parser_create();
-        xml_set_object($parser, $this);
-        xml_set_element_handler($parser, 'startTag', 'endTag');
-        xml_set_character_data_handler($parser, 'dataTag');
-        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
-        xml_parse($parser, $this->input, true); // We ignore parsing error (for old libxml)
-        xml_parser_free($parser);
-
-        $this->data = $this->removeEmptyTags($this->data);
-        $this->data = $this->removeMultipleTags($this->data);
-
-        return trim($this->data);
-    }
-
-    /**
-     * Parse opening tag
-     *
-     * @access public
-     * @param  resource  $parser       XML parser
-     * @param  string    $name         Tag name
-     * @param  array     $attributes   Tag attributes
-     */
-    public function startTag($parser, $name, $attributes)
-    {
-        $empty_tag = false;
-        $this->strip_content = false;
-
-        if ($this->is_code === false && $name === 'pre') $this->is_code = true;
-
-        if ($this->isPixelTracker($name, $attributes)) {
-
-            $empty_tag = true;
-        }
-        else if ($this->isAllowedTag($name)) {
-
-            $attr_data = '';
-            $used_attributes = array();
-
-            foreach ($attributes as $attribute => $value) {
-
-                if ($value != '' && $this->isAllowedAttribute($name, $attribute)) {
-
-                    if ($this->isResource($attribute)) {
-
-                        if ($name === 'iframe') {
-
-                            if ($this->isAllowedIframeResource($value)) {
-
-                                $attr_data .= ' '.$attribute.'="'.$this->escape($value).'"';
-                                $used_attributes[] = $attribute;
-                            }
-                        }
-                        else if ($this->isRelativePath($value)) {
-
-                            $attr_data .= ' '.$attribute.'="'.$this->escape($this->getAbsoluteUrl($value, $this->url)).'"';
-                            $used_attributes[] = $attribute;
-                        }
-                        else if ($this->isAllowedProtocol($value) && ! $this->isBlacklistedMedia($value)) {
-
-                            if ($attribute == 'src' &&
-                                isset($attributes['data-src']) &&
-                                $this->isAllowedProtocol($attributes['data-src']) &&
-                                ! $this->isBlacklistedMedia($attributes['data-src'])) {
-
-                                $value = $attributes['data-src'];
-                            }
-
-                            // Replace protocol-relative url // by http://
-                            if (substr($value, 0, 2) === '//') $value = 'http:'.$value;
-
-                            $attr_data .= ' '.$attribute.'="'.$this->escape($value).'"';
-                            $used_attributes[] = $attribute;
-                        }
-                    }
-                    else if ($this->validateAttributeValue($attribute, $value)) {
-
-                        $attr_data .= ' '.$attribute.'="'.$this->escape($value).'"';
-                        $used_attributes[] = $attribute;
-                    }
-                }
-            }
-
-            // Check for required attributes
-            if (isset($this->required_attributes[$name])) {
-
-                foreach ($this->required_attributes[$name] as $required_attribute) {
-
-                    if (! in_array($required_attribute, $used_attributes)) {
-
-                        $empty_tag = true;
-                        break;
-                    }
-                }
-            }
-
-            if (! $empty_tag) {
-
-                $this->data .= '<'.$name.$attr_data;
-
-                // Add custom attributes
-                if (isset($this->add_attributes[$name])) {
-
-                    $this->data .= ' '.$this->add_attributes[$name].' ';
-                }
-
-                // If img or br, we don't close it here
-                if ($name !== 'img' && $name !== 'br') $this->data .= '>';
-            }
-        }
-
-        if (in_array($name, $this->blacklisted_tags)) {
-            $this->strip_content = true;
-        }
-
-        $this->empty_tags[] = $empty_tag;
-    }
-
-    /**
-     * Parse closing tag
-     *
-     * @access public
-     * @param  resource  $parser    XML parser
-     * @param  string    $name      Tag name
-     */
-    public function endTag($parser, $name)
-    {
-        if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) {
-            $this->data .= $name !== 'img' && $name !== 'br' ? '' : '/>';
-        }
-
-        if ($this->is_code && $name === 'pre') $this->is_code = false;
-    }
-
-    /**
-     * Parse tag content
-     *
-     * @access public
-     * @param  resource  $parser    XML parser
-     * @param  string    $content   Tag content
-     */
-    public function dataTag($parser, $content)
-    {
-        $content = str_replace("\xc2\xa0", ' ', $content); // Replace   with normal space
-
-        // Issue with Cyrillic characters
-        // Replace mutliple space by a single one
-        // if (! $this->is_code) {
-        //     $content = preg_replace('!\s+!', ' ', $content);
-        // }
-
-        if (! $this->strip_content) {
-            $this->data .= $this->escape($content);
-        }
+        $filter = new Html($html, $website);
+        return $filter;
     }
 
     /**
@@ -454,222 +36,7 @@ class Filter
      */
     public static function escape($content)
     {
-        return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
-    }
-
-    /**
-     * Get the absolute url for a relative link
-     *
-     * @access public
-     * @param  string  $path   Relative path
-     * @param  string  $url    Site base url
-     * @return string
-     */
-    public static function getAbsoluteUrl($path, $url)
-    {
-        $components = parse_url($url);
-
-        if (! isset($components['scheme'])) $components['scheme'] = 'http';
-
-        if (! isset($components['host'])) {
-
-            if ($url) {
-
-                $components['host'] = $url;
-                $components['path'] = '/';
-            }
-            else {
-
-                return '';
-            }
-        }
-
-        if (! strlen($path)) return $url;
-
-        if ($path{0} === '/') {
-
-            // Absolute path
-            return $components['scheme'].'://'.$components['host'].$path;
-        }
-        else {
-
-            // Relative path
-            $url_path = isset($components['path']) && ! empty($components['path']) ? $components['path'] : '/';
-            $length = strlen($url_path);
-
-            if ($length > 1 && $url_path{$length - 1} !== '/') {
-                $url_path = dirname($url_path).'/';
-            }
-
-            if (substr($path, 0, 2) === './') {
-                $path = substr($path, 2);
-            }
-
-            return $components['scheme'].'://'.$components['host'].$url_path.$path;
-        }
-    }
-
-    /**
-     * Check if an url is relative
-     *
-     * @access public
-     * @param  string  $value   Attribute value
-     * @return boolean
-     */
-    public static function isRelativePath($value)
-    {
-        if (strpos($value, 'data:') === 0) return false;
-        return strpos($value, '://') === false && strpos($value, '//') !== 0;
-    }
-
-    /**
-     * Check if a tag is on the whitelist
-     *
-     * @access public
-     * @param  string  $name   Tag name
-     * @return boolean
-     */
-    public function isAllowedTag($name)
-    {
-        return isset($this->whitelist_tags[$name]);
-    }
-
-    /**
-     * Check if an attribute is allowed for a given tag
-     *
-     * @access public
-     * @param  string  $tag        Tag name
-     * @param  array   $attribute  Attribute name
-     * @return boolean
-     */
-    public function isAllowedAttribute($tag, $attribute)
-    {
-        return in_array($attribute, $this->whitelist_tags[$tag]);
-    }
-
-    /**
-     * Check if an attribute name is an external resource
-     *
-     * @access public
-     * @param  string  $data  Attribute name
-     * @return boolean
-     */
-    public function isResource($attribute)
-    {
-        return in_array($attribute, $this->media_attributes);
-    }
-
-    /**
-     * Check if an iframe url is allowed
-     *
-     * @access public
-     * @param  string  $value  Attribute value
-     * @return boolean
-     */
-    public function isAllowedIframeResource($value)
-    {
-        foreach ($this->iframe_whitelist as $url) {
-
-            if (strpos($value, $url) === 0) {
-                return true;
-            }
-        }
-
-        return false;
-    }
-
-    /**
-     * Detect if the protocol is allowed or not
-     *
-     * @access public
-     * @param  string  $value  Attribute value
-     * @return boolean
-     */
-    public function isAllowedProtocol($value)
-    {
-        foreach ($this->scheme_whitelist as $protocol) {
-
-            if (strpos($value, $protocol) === 0) {
-                return true;
-            }
-        }
-
-        return false;
-    }
-
-    /**
-     * Detect if an url is blacklisted
-     *
-     * @access public
-     * @param  string  $resouce  Attribute value (URL)
-     * @return boolean
-     */
-    public function isBlacklistedMedia($resource)
-    {
-        foreach ($this->media_blacklist as $name) {
-
-            if (strpos($resource, $name) !== false) {
-                return true;
-            }
-        }
-
-        return false;
-    }
-
-    /**
-     * Detect if an image tag is a pixel tracker
-     *
-     * @access public
-     * @param  string  $tag         Tag name
-     * @param  array   $attributes  Tag attributes
-     * @return boolean
-     */
-    public function isPixelTracker($tag, array $attributes)
-    {
-        return $tag === 'img' &&
-                isset($attributes['height']) && isset($attributes['width']) &&
-                $attributes['height'] == 1 && $attributes['width'] == 1;
-    }
-
-    /**
-     * Check if an attribute value is integer
-     *
-     * @access public
-     * @param  string  $attribute   Attribute name
-     * @param  string  $value       Attribute value
-     * @return boolean
-     */
-    public function validateAttributeValue($attribute, $value)
-    {
-        if (in_array($attribute, $this->integer_attributes)) {
-            return ctype_digit($value);
-        }
-
-        return true;
-    }
-
-    /**
-     * Replace 

by only one - * - * @access public - * @param string $data Input data - * @return string - */ - public function removeMultipleTags($data) - { - return preg_replace("/(\s*)+/", "
", $data); - } - - /** - * Remove empty tags - * - * @access public - * @param string $data Input data - * @return string - */ - public function removeEmptyTags($data) - { - return preg_replace('/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', '', $data); + return @htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false); } /** @@ -734,145 +101,41 @@ class Filter } /** - * Set whitelisted tags adn attributes for each tag + * Trim whitespace from the begining, the end and inside a string and don't break utf-8 string * + * @static * @access public - * @param array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']] - * @return \PicoFeed\Filter + * @param string $value Raw data + * @return string Normalized data */ - public function setWhitelistedTags(array $values) + public static function stripWhiteSpace($value) { - $this->whitelist_tags = $values ?: $this->whitelist_tags; - return $this; + $value = str_replace("\r", "", $value); + $value = str_replace("\t", "", $value); + $value = str_replace("\n", "", $value); + return trim($value); } /** - * Set blacklisted tags + * Dirty quickfixes before XML parsing * + * @static * @access public - * @param array $values List of tags: ['video', 'img'] - * @return \PicoFeed\Filter + * @param string $data Raw data + * @return string Normalized data */ - public function setBlacklistedTags(array $values) + public static function normalizeData($data) { - $this->blacklisted_tags = $values ?: $this->blacklisted_tags; - return $this; - } + $invalid_chars = array( + "\x10", + "\xc3\x20", + "", + ); - /** - * Set scheme whitelist - * - * @access public - * @param array $values List of scheme: ['http://', 'ftp://'] - * @return \PicoFeed\Filter - */ - public function setSchemeWhitelist(array $values) - { - $this->scheme_whitelist = $values ?: $this->scheme_whitelist; - return $this; - } - - /** - * Set media attributes (used to load external resources) - * - * @access public - * @param array $values List of values: ['src', 'href'] - 
* @return \PicoFeed\Filter - */ - public function setMediaAttributes(array $values) - { - $this->media_attributes = $values ?: $this->media_attributes; - return $this; - } - - /** - * Set blacklisted external resources - * - * @access public - * @param array $values List of tags: ['http://google.com/', '...'] - * @return \PicoFeed\Filter - */ - public function setMediaBlacklist(array $values) - { - $this->media_blacklist = $values ?: $this->media_blacklist; - return $this; - } - - /** - * Set mandatory attributes for whitelisted tags - * - * @access public - * @param array $values List of tags: ['img' => 'src'] - * @return \PicoFeed\Filter - */ - public function setRequiredAttributes(array $values) - { - $this->required_attributes = $values ?: $this->required_attributes; - return $this; - } - - /** - * Set attributes to automatically to specific tags - * - * @access public - * @param array $values List of tags: ['a' => 'target="_blank"'] - * @return \PicoFeed\Filter - */ - public function setAttributeOverrides(array $values) - { - $this->add_attributes = $values ?: $this->add_attributes; - return $this; - } - - /** - * Set attributes that must be an integer - * - * @access public - * @param array $values List of tags: ['width', 'height'] - * @return \PicoFeed\Filter - */ - public function setIntegerAttributes(array $values) - { - $this->integer_attributes = $values ?: $this->integer_attributes; - return $this; - } - - /** - * Set allowed iframe resources - * - * @access public - * @param array $values List of tags: ['http://www.youtube.com'] - * @return \PicoFeed\Filter - */ - public function setIframeWhitelist(array $values) - { - $this->iframe_whitelist = $values ?: $this->iframe_whitelist; - return $this; - } - - /** - * Set config object - * - * @access public - * @param \PicoFeed\Config $config Config instance - * @return \PicoFeed\Parse - */ - public function setConfig($config) - { - $this->config = $config; - - if ($this->config !== null) { - 
$this->setIframeWhitelist($this->config->getFilterIframeWhitelist(array())); - $this->setIntegerAttributes($this->config->getFilterIntegerAttributes(array())); - $this->setAttributeOverrides($this->config->getFilterAttributeOverrides(array())); - $this->setRequiredAttributes($this->config->getFilterRequiredAttributes(array())); - $this->setMediaBlacklist($this->config->getFilterMediaBlacklist(array())); - $this->setMediaAttributes($this->config->getFilterMediaAttributes(array())); - $this->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array())); - $this->setBlacklistedTags($this->config->getFilterBlacklistedTags(array())); - $this->setWhitelistedTags($this->config->getFilterWhitelistedTags(array())); + foreach ($invalid_chars as $needle) { + $data = str_replace($needle, '', $data); } - return $this; + return $data; } } diff --git a/vendor/PicoFeed/Filter/Attribute.php b/vendor/PicoFeed/Filter/Attribute.php new file mode 100644 index 0000000..8fe4b71 --- /dev/null +++ b/vendor/PicoFeed/Filter/Attribute.php @@ -0,0 +1,590 @@ + array('controls', 'src'), + 'video' => array('poster', 'controls', 'height', 'width', 'src'), + 'source' => array('src', 'type'), + 'dt' => array(), + 'dd' => array(), + 'dl' => array(), + 'table' => array(), + 'caption' => array(), + 'tr' => array(), + 'th' => array(), + 'td' => array(), + 'tbody' => array(), + 'thead' => array(), + 'h2' => array(), + 'h3' => array(), + 'h4' => array(), + 'h5' => array(), + 'h6' => array(), + 'strong' => array(), + 'em' => array(), + 'code' => array(), + 'pre' => array(), + 'blockquote' => array(), + 'p' => array(), + 'ul' => array(), + 'li' => array(), + 'ol' => array(), + 'br' => array(), + 'del' => array(), + 'a' => array('href'), + 'img' => array('src', 'title', 'alt'), + 'figure' => array(), + 'figcaption' => array(), + 'cite' => array(), + 'time' => array('datetime'), + 'abbr' => array('title'), + 'iframe' => array('width', 'height', 'frameborder', 'src'), + 'q' => array('cite') + ); + + /** 
+ * Scheme whitelist + * + * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme + * + * @access private + * @var array + */ + private $scheme_whitelist = array( + 'bitcoin:', + 'callto:', + 'ed2k://', + 'facetime://', + 'feed:', + 'ftp://', + 'geo:', + 'git://', + 'http://', + 'https://', + 'irc://', + 'irc6://', + 'ircs://', + 'jabber:', + 'magnet:', + 'mailto:', + 'nntp://', + 'rtmp://', + 'sftp://', + 'sip:', + 'sips:', + 'skype:', + 'smb://', + 'sms:', + 'spotify:', + 'ssh:', + 'steam:', + 'svn://', + 'tel:', + ); + + /** + * Iframe source whitelist, everything else is ignored + * + * @access private + * @var array + */ + private $iframe_whitelist = array( + 'http://www.youtube.com', + 'https://www.youtube.com', + 'http://player.vimeo.com', + 'https://player.vimeo.com', + 'http://www.dailymotion.com', + 'https://www.dailymotion.com', + ); + + /** + * Blacklisted resources + * + * @access private + * @var array + */ + private $media_blacklist = array( + 'api.flattr.com', + 'feeds.feedburner.com', + 'share.feedsportal.com', + 'da.feedsportal.com', + 'rss.feedsportal.com', + 'res.feedsportal.com', + 'res1.feedsportal.com', + 'res2.feedsportal.com', + 'res3.feedsportal.com', + 'pi.feedsportal.com', + 'rss.nytimes.com', + 'feeds.wordpress.com', + 'stats.wordpress.com', + 'rss.cnn.com', + 'twitter.com/home?status=', + 'twitter.com/share', + 'twitter_icon_large.png', + 'www.facebook.com/sharer.php', + 'facebook_icon_large.png', + 'plus.google.com/share', + 'www.gstatic.com/images/icons/gplus-16.png', + 'www.gstatic.com/images/icons/gplus-32.png', + 'www.gstatic.com/images/icons/gplus-64.png', + ); + + /** + * Attributes used for external resources + * + * @access private + * @var array + */ + private $media_attributes = array( + 'src', + 'href', + 'poster', + ); + + /** + * Attributes that must be integer + * + * @access private + * @var array + */ + private $integer_attributes = array( + 'width', + 'height', + 'frameborder', + ); + + /** + * Mandatory 
attributes for specified tags + * + * @access private + * @var array + */ + private $required_attributes = array( + 'a' => array('href'), + 'img' => array('src'), + 'iframe' => array('src'), + 'audio' => array('src'), + 'source' => array('src'), + ); + + /** + * Add attributes to specified tags + * + * @access private + * @var array + */ + private $add_attributes = array( + 'a' => array('rel' => 'noreferrer', 'target' => '_blank') + ); + + /** + * List of filters to apply + * + * @access private + * @var array + */ + private $filters = array( + 'filterEmptyAttribute', + 'filterAllowedAttribute', + 'filterIntegerAttribute', + 'filterAbsoluteUrlAttribute', + 'filterIframeAttribute', + 'filterBlacklistResourceAttribute', + 'filterProtocolUrlAttribute', + ); + + /** + * Add attributes to specified tags + * + * @access private + * @var \PicoFeed\Url + */ + private $website = null; + + /** + * Constructor + * + * @access public + * @param \PicoFeed\Url $website Website url instance + */ + public function __construct(Url $website) + { + $this->website = $website; + } + + /** + * Apply filters to the attributes list + * + * @access public + * @param string $tag Tag name + * @param array $attributes Attributes dictionary + * @return array Filtered attributes + */ + public function filter($tag, array $attributes) + { + foreach ($attributes as $attribute => &$value) { + foreach ($this->filters as $filter) { + if (! 
$this->$filter($tag, $attribute, $value)) { + unset($attributes[$attribute]); + break; + } + } + } + + return $attributes; + } + + /** + * Return true if the value is not empty (remove empty attributes) + * + * @access public + * @param string $tag Tag name + * @param string $attribute Atttribute name + * @param string $value Atttribute value + * @return boolean + */ + public function filterEmptyAttribute($tag, $attribute, $value) + { + return $value !== ''; + } + + /** + * Return true if the value is allowed (remove not allowed attributes) + * + * @access public + * @param string $tag Tag name + * @param string $attribute Atttribute name + * @param string $value Atttribute value + * @return boolean + */ + public function filterAllowedAttribute($tag, $attribute, $value) + { + return isset($this->attribute_whitelist[$tag]) && in_array($attribute, $this->attribute_whitelist[$tag]); + } + + /** + * Return true if the value is not integer (remove attributes that should have an integer value) + * + * @access public + * @param string $tag Tag name + * @param string $attribute Atttribute name + * @param string $value Atttribute value + * @return boolean + */ + public function filterIntegerAttribute($tag, $attribute, $value) + { + if (in_array($attribute, $this->integer_attributes)) { + return ctype_digit($value); + } + + return true; + } + + /** + * Return true if the iframe source is allowed (remove not allowed iframe) + * + * @access public + * @param string $tag Tag name + * @param string $attribute Atttribute name + * @param string $value Atttribute value + * @return boolean + */ + public function filterIframeAttribute($tag, $attribute, $value) + { + if ($tag === 'iframe' && $attribute === 'src') { + + foreach ($this->iframe_whitelist as $url) { + if (strpos($value, $url) === 0) { + return true; + } + } + + return false; + } + + return true; + } + + /** + * Return true if the resource is not blacklisted (remove blacklisted resource attributes) + * + * @access public + 
* @param string $tag Tag name + * @param string $attribute Atttribute name + * @param string $value Atttribute value + * @return boolean + */ + public function filterBlacklistResourceAttribute($tag, $attribute, $value) + { + if ($this->isResource($attribute) && $this->isBlacklistedMedia($value)) { + return false; + } + + return true; + } + + /** + * Convert all relative links to absolute url + * + * @access public + * @param string $tag Tag name + * @param string $attribute Atttribute name + * @param string $value Atttribute value + * @return boolean + */ + public function filterAbsoluteUrlAttribute($tag, $attribute, &$value) + { + if ($this->isResource($attribute)) { + $value = Url::resolve($value, $this->website); + } + + return true; + } + + /** + * Return true if the scheme is authorized + * + * @access public + * @param string $tag Tag name + * @param string $attribute Atttribute name + * @param string $value Atttribute value + * @return boolean + */ + public function filterProtocolUrlAttribute($tag, $attribute, $value) + { + if ($this->isResource($attribute) && ! $this->isAllowedProtocol($value)) { + return false; + } + + return true; + } + + /** + * Automatically add/override some attributes for specific tags + * + * @access public + * @param string $tag Tag name + * @param array $attributes Atttributes list + * @return array + */ + public function addAttributes($tag, array $attributes) + { + if (isset($this->add_attributes[$tag])) { + $attributes += $this->add_attributes[$tag]; + } + + return $attributes; + } + + /** + * Return true if all required attributes are present + * + * @access public + * @param string $tag Tag name + * @param array $attributes Atttributes list + * @return boolean + */ + public function hasRequiredAttributes($tag, array $attributes) + { + if (isset($this->required_attributes[$tag])) { + + foreach ($this->required_attributes[$tag] as $attribute) { + if (! 
isset($attributes[$attribute])) { + return false; + } + } + } + + return true; + } + + /** + * Check if an attribute name is an external resource + * + * @access public + * @param string $data Attribute name + * @return boolean + */ + public function isResource($attribute) + { + return in_array($attribute, $this->media_attributes); + } + + /** + * Detect if the protocol is allowed or not + * + * @access public + * @param string $value Attribute value + * @return boolean + */ + public function isAllowedProtocol($value) + { + foreach ($this->scheme_whitelist as $protocol) { + + if (strpos($value, $protocol) === 0) { + return true; + } + } + + return false; + } + + /** + * Detect if an url is blacklisted + * + * @access public + * @param string $resouce Attribute value (URL) + * @return boolean + */ + public function isBlacklistedMedia($resource) + { + foreach ($this->media_blacklist as $name) { + + if (strpos($resource, $name) !== false) { + return true; + } + } + + return false; + } + + /** + * Convert the attribute list to html + * + * @access public + * @param array $attributes Attributes + * @return string + */ + public function toHtml(array $attributes) + { + $html = array(); + + foreach ($attributes as $attribute => $value) { + $html[] = sprintf('%s="%s"', $attribute, Filter::escape($value)); + } + + return implode(' ', $html); + } + + /** + * Set whitelisted tags adn attributes for each tag + * + * @access public + * @param array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']] + * @return \PicoFeed\Filter + */ + public function setWhitelistedAttributes(array $values) + { + $this->attribute_whitelist = $values ?: $this->attribute_whitelist; + return $this; + } + + /** + * Set scheme whitelist + * + * @access public + * @param array $values List of scheme: ['http://', 'ftp://'] + * @return \PicoFeed\Filter + */ + public function setSchemeWhitelist(array $values) + { + $this->scheme_whitelist = $values ?: $this->scheme_whitelist; + return 
$this; + } + + /** + * Set media attributes (used to load external resources) + * + * @access public + * @param array $values List of values: ['src', 'href'] + * @return \PicoFeed\Filter + */ + public function setMediaAttributes(array $values) + { + $this->media_attributes = $values ?: $this->media_attributes; + return $this; + } + + /** + * Set blacklisted external resources + * + * @access public + * @param array $values List of tags: ['http://google.com/', '...'] + * @return \PicoFeed\Filter + */ + public function setMediaBlacklist(array $values) + { + $this->media_blacklist = $values ?: $this->media_blacklist; + return $this; + } + + /** + * Set mandatory attributes for whitelisted tags + * + * @access public + * @param array $values List of tags: ['img' => 'src'] + * @return \PicoFeed\Filter + */ + public function setRequiredAttributes(array $values) + { + $this->required_attributes = $values ?: $this->required_attributes; + return $this; + } + + /** + * Set attributes to automatically to specific tags + * + * @access public + * @param array $values List of tags: ['a' => 'target="_blank"'] + * @return \PicoFeed\Filter + */ + public function setAttributeOverrides(array $values) + { + $this->add_attributes = $values ?: $this->add_attributes; + return $this; + } + + /** + * Set attributes that must be an integer + * + * @access public + * @param array $values List of tags: ['width', 'height'] + * @return \PicoFeed\Filter + */ + public function setIntegerAttributes(array $values) + { + $this->integer_attributes = $values ?: $this->integer_attributes; + return $this; + } + + /** + * Set allowed iframe resources + * + * @access public + * @param array $values List of tags: ['http://www.youtube.com'] + * @return \PicoFeed\Filter + */ + public function setIframeWhitelist(array $values) + { + $this->iframe_whitelist = $values ?: $this->iframe_whitelist; + return $this; + } +} diff --git a/vendor/PicoFeed/Filter/Html.php b/vendor/PicoFeed/Filter/Html.php new file mode 
100644 index 0000000..4a76ca4 --- /dev/null +++ b/vendor/PicoFeed/Filter/Html.php @@ -0,0 +1,197 @@ +input = XmlParser::HtmlToXml($html); + $this->output = ''; + $this->tag = new Tag; + $this->attribute = new Attribute(new Url($website)); + } + + /** + * Set config object + * + * @access public + * @param \PicoFeed\Config $config Config instance + * @return \PicoFeed\Html + */ + public function setConfig($config) + { + $this->config = $config; + + if ($this->config !== null) { + $this->attribute->setIframeWhitelist($this->config->getFilterIframeWhitelist(array())); + $this->attribute->setIntegerAttributes($this->config->getFilterIntegerAttributes(array())); + $this->attribute->setAttributeOverrides($this->config->getFilterAttributeOverrides(array())); + $this->attribute->setRequiredAttributes($this->config->getFilterRequiredAttributes(array())); + $this->attribute->setMediaBlacklist($this->config->getFilterMediaBlacklist(array())); + $this->attribute->setMediaAttributes($this->config->getFilterMediaAttributes(array())); + $this->attribute->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array())); + $this->attribute->setWhitelistedAttributes($this->config->getFilterWhitelistedTags(array())); + $this->tag->setWhitelistedTags(array_keys($this->config->getFilterWhitelistedTags(array()))); + } + + return $this; + } + + /** + * Run tags/attributes filtering + * + * @access public + * @return string + */ + public function execute() + { + $parser = xml_parser_create(); + + xml_set_object($parser, $this); + xml_set_element_handler($parser, 'startTag', 'endTag'); + xml_set_character_data_handler($parser, 'dataTag'); + xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false); + xml_parse($parser, $this->input, true); + xml_parser_free($parser); + + $this->postFilter(); + + return $this->output; + } + + public function postFilter() + { + $this->output = $this->tag->removeEmptyTags($this->output); + $this->output = trim($this->output); + } + + /** + * Parse 
opening tag + * + * @access public + * @param resource $parser XML parser + * @param string $name Tag name + * @param array $attributes Tag attributes + */ + public function startTag($parser, $tag, array $attributes) + { + $this->empty = true; + + if ($this->tag->isAllowed($tag, $attributes)) { + + $attributes = $this->attribute->filter($tag, $attributes); + + if ($this->attribute->hasRequiredAttributes($tag, $attributes)) { + + $attributes = $this->attribute->addAttributes($tag, $attributes); + + $this->output .= $this->tag->openHtmlTag($tag, $this->attribute->toHtml($attributes)); + $this->empty = false; + } + } + + $this->empty_tags[] = $this->empty; + } + + /** + * Parse closing tag + * + * @access public + * @param resource $parser XML parser + * @param string $name Tag name + */ + public function endTag($parser, $tag) + { + if (! array_pop($this->empty_tags) && $this->tag->isAllowedTag($tag)) { + $this->output .= $this->tag->closeHtmlTag($tag); + } + } + + /** + * Parse tag content + * + * @access public + * @param resource $parser XML parser + * @param string $content Tag content + */ + public function dataTag($parser, $content) + { + // Replace   with normal space + $content = str_replace("\xc2\xa0", ' ', $content); + $this->output .= Filter::escape($content); + } +} diff --git a/vendor/PicoFeed/Filter/Tag.php b/vendor/PicoFeed/Filter/Tag.php new file mode 100644 index 0000000..83bd1b9 --- /dev/null +++ b/vendor/PicoFeed/Filter/Tag.php @@ -0,0 +1,173 @@ +isAllowedTag($tag) && ! $this->isPixelTracker($tag, $attributes); + } + + /** + * Return the HTML opening tag + * + * @access public + * @param string $tag Tag name + * @param string $attributes Attributes converted in html + * @return string + */ + public function openHtmlTag($tag, $attributes = '') + { + return '<'.$tag.(empty($attributes) ? '' : ' '.$attributes).($this->isSelfClosingTag($tag) ? 
'/>' : '>'); + } + + /** + * Return the HTML closing tag + * + * @access public + * @param string $tag Tag name + * @return string + */ + public function closeHtmlTag($tag) + { + return $this->isSelfClosingTag($tag) ? '' : ''; + } + + /** + * Return true is the tag is self-closing + * + * @access public + * @param string $tag Tag name + * @return boolean + */ + public function isSelfClosingTag($tag) + { + return in_array($tag, array('br', 'img')); + } + + /** + * Check if a tag is on the whitelist + * + * @access public + * @param string $tag Tag name + * @return boolean + */ + public function isAllowedTag($tag) + { + return in_array($tag, $this->tag_whitelist); + } + + /** + * Detect if an image tag is a pixel tracker + * + * @access public + * @param string $tag Tag name + * @param array $attributes Tag attributes + * @return boolean + */ + public function isPixelTracker($tag, array $attributes) + { + return $tag === 'img' && + isset($attributes['height']) && isset($attributes['width']) && + $attributes['height'] == 1 && $attributes['width'] == 1; + } + + /** + * Remove empty tags + * + * @access public + * @param string $data Input data + * @return string + */ + public function removeEmptyTags($data) + { + return preg_replace('/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', '', $data); + } + + /** + * Replace

by only one + * + * @access public + * @param string $data Input data + * @return string + */ + public function removeMultipleTags($data) + { + return preg_replace("/(\s*)+/", "
", $data); + } + + /** + * Set whitelisted tags adn attributes for each tag + * + * @access public + * @param array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']] + * @return \PicoFeed\Filter + */ + public function setWhitelistedTags(array $values) + { + $this->tag_whitelist = $values ?: $this->tag_whitelist; + return $this; + } +} diff --git a/vendor/PicoFeed/Grabber.php b/vendor/PicoFeed/Grabber.php index 840d11d..97f1e05 100644 --- a/vendor/PicoFeed/Grabber.php +++ b/vendor/PicoFeed/Grabber.php @@ -3,10 +3,6 @@ namespace PicoFeed; use DOMXPath; -use PicoFeed\Logging; -use PicoFeed\Client; -use PicoFeed\Encoding; -use PicoFeed\Filter; /** * Grabber class @@ -224,20 +220,9 @@ class Grabber public function download() { $client = Client::getInstance(); - - if ($this->config !== null) { - - $client->setTimeout($this->config->getGrabberTimeout()) - ->setUserAgent($this->config->getGrabberUserAgent()) - ->setMaxRedirections($this->config->getMaxRedirections()) - ->setMaxBodySize($this->config->getMaxBodySize()) - ->setProxyHostname($this->config->getProxyHostname()) - ->setProxyPort($this->config->getProxyPort()) - ->setProxyUsername($this->config->getProxyUsername()) - ->setProxyPassword($this->config->getProxyPassword()); - } - + $client->setConfig($this->config); $client->execute($this->url); + $this->html = $client->getContent(); $this->encoding = $client->getEncoding(); @@ -253,6 +238,11 @@ class Grabber public function getRules() { $hostname = parse_url($this->url, PHP_URL_HOST); + + if ($hostname === false) { + return false; + } + $files = array($hostname); if (substr($hostname, 0, 4) == 'www.') { diff --git a/vendor/PicoFeed/Import.php b/vendor/PicoFeed/Import.php index 7992b18..1d246c0 100644 --- a/vendor/PicoFeed/Import.php +++ b/vendor/PicoFeed/Import.php @@ -2,11 +2,8 @@ namespace PicoFeed; -require_once __DIR__.'/Logging.php'; -require_once __DIR__.'/XmlParser.php'; - -use PicoFeed\Logging; -use PicoFeed\XmlParser; +use 
SimpleXmlElement; +use StdClass; /** * OPML Import @@ -79,21 +76,94 @@ class Import foreach ($tree->outline as $item) { if (isset($item->outline)) { - $this->parseEntries($item); } else if ((isset($item['text']) || isset($item['title'])) && isset($item['xmlUrl'])) { - $entry = new \StdClass; - $entry->category = isset($tree['title']) ? (string) $tree['title'] : (string) $tree['text']; - $entry->title = isset($item['title']) ? (string) $item['title'] : (string) $item['text']; - $entry->feed_url = (string) $item['xmlUrl']; - $entry->site_url = isset($item['htmlUrl']) ? (string) $item['htmlUrl'] : $entry->feed_url; - $entry->type = isset($item['version']) ? (string) $item['version'] : isset($item['type']) ? (string) $item['type'] : 'rss'; - $entry->description = isset($item['description']) ? (string) $item['description'] : $entry->title; + $entry = new StdClass; + $entry->category = $this->findCategory($tree); + $entry->title = $this->findTitle($item); + $entry->feed_url = $this->findFeedUrl($item); + $entry->site_url = $this->findSiteUrl($item, $entry); + $entry->type = $this->findType($item); + $entry->description = $this->findDescription($item, $entry); $this->items[] = $entry; } } } } + + /** + * Find category + * + * @access public + * @param SimpleXmlElement $tree XML tree + * @return string + */ + public function findCategory(SimpleXmlElement $tree) + { + return isset($tree['title']) ? (string) $tree['title'] : (string) $tree['text']; + } + + /** + * Find title + * + * @access public + * @param SimpleXmlElement $item XML tree + * @return string + */ + public function findTitle(SimpleXmlElement $item) + { + return isset($item['title']) ? 
(string) $item['title'] : (string) $item['text']; + } + + /** + * Find feed url + * + * @access public + * @param SimpleXmlElement $item XML tree + * @return string + */ + public function findFeedUrl(SimpleXmlElement $item) + { + return (string) $item['xmlUrl']; + } + + /** + * Find site url + * + * @access public + * @param SimpleXmlElement $item XML tree + * @param StdClass $entry Feed entry + * @return string + */ + public function findSiteUrl(SimpleXmlElement $item, StdClass $entry) + { + return isset($item['htmlUrl']) ? (string) $item['htmlUrl'] : $entry->feed_url; + } + + /** + * Find type + * + * @access public + * @param SimpleXmlElement $item XML tree + * @return string + */ + public function findType(SimpleXmlElement $item) + { + return isset($item['version']) ? (string) $item['version'] : isset($item['type']) ? (string) $item['type'] : 'rss'; + } + + /** + * Find description + * + * @access public + * @param SimpleXmlElement $item XML tree + * @param StdClass $entry Feed entry + * @return string + */ + public function findDescription(SimpleXmlElement $item, StdClass $entry) + { + return isset($item['description']) ? 
(string) $item['description'] : $entry->title; + } } diff --git a/vendor/PicoFeed/Logging.php b/vendor/PicoFeed/Logging.php index f7d6c96..86c88c9 100644 --- a/vendor/PicoFeed/Logging.php +++ b/vendor/PicoFeed/Logging.php @@ -27,7 +27,7 @@ class Logging * * @static * @access private - * @var array + * @var string */ private static $timezone = 'UTC'; diff --git a/vendor/PicoFeed/Parser.php b/vendor/PicoFeed/Parser.php index 00977b0..d494528 100644 --- a/vendor/PicoFeed/Parser.php +++ b/vendor/PicoFeed/Parser.php @@ -4,14 +4,6 @@ namespace PicoFeed; use DateTime; use DateTimeZone; -use DOMXPath; -use SimpleXMLElement; -use PicoFeed\Config; -use PicoFeed\Encoding; -use PicoFeed\Filter; -use PicoFeed\Grabber; -use PicoFeed\Logging; -use PicoFeed\XmlParser; /** * Base parser class @@ -61,13 +53,21 @@ abstract class Parser */ protected $namespaces = array(); + /** + * Enable the content filtering + * + * @access private + * @var bool + */ + private $enable_filter = true; + /** * Enable the content grabber * * @access private * @var bool */ - public $enable_grabber = false; + private $enable_grabber = false; /** * Ignore those urls for the content scraper @@ -96,7 +96,7 @@ abstract class Parser $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding); // Workarounds - $this->content = $this->normalizeData($this->content); + $this->content = Filter::normalizeData($this->content); } /** @@ -122,9 +122,11 @@ abstract class Parser $feed = new Feed; $this->findFeedUrl($xml, $feed); $this->findFeedTitle($xml, $feed); + $this->findFeedDescription($xml, $feed); $this->findFeedLanguage($xml, $feed); $this->findFeedId($xml, $feed); $this->findFeedDate($xml, $feed); + $this->findFeedLogo($xml, $feed); foreach ($this->getItemsTree($xml) as $entry) { @@ -137,6 +139,10 @@ abstract class Parser $this->findItemContent($entry, $item); $this->findItemEnclosure($entry, $item, $feed); $this->findItemLanguage($entry, $item, $feed); + + $this->scrapWebsite($item); + 
$this->filterItemContent($feed, $item); + $feed->items[] = $item; } @@ -146,103 +152,42 @@ abstract class Parser } /** - * Filter HTML for entry content + * Fetch item content with the content grabber * * @access public - * @param string $item_content Item content - * @param string $item_url Item URL - * @return string Filtered content + * @param Item $item Item object */ - public function filterHtml($item_content, $item_url) + public function scrapWebsite(Item $item) { - $content = ''; + if ($this->enable_grabber && ! in_array($item->getUrl(), $this->grabber_ignore_urls)) { - // Setup the content scraper - if ($this->enable_grabber && ! in_array($item_url, $this->grabber_ignore_urls)) { - - $grabber = new Grabber($item_url); + $grabber = new Grabber($item->getUrl()); $grabber->setConfig($this->config); $grabber->download(); if ($grabber->parse()) { - $item_content = $grabber->getContent(); + $item->content = $grabber->getContent() ?: $item->content; } } - - // Content filtering - if ($item_content) { - - if ($this->config !== null) { - - $callback = $this->config->getContentFilteringCallback(); - - if (is_callable($callback)) { - $content = $callback($item_content, $item_url); - } - } - - if (! 
$content) { - $filter = new Filter($item_content, $item_url); - $filter->setConfig($this->config); - $content = $filter->execute(); - } - } - - return $content; } /** - * Dirty quickfixes before XML parsing + * Filter HTML for entry content * * @access public - * @param string $data Raw data - * @return string Normalized data + * @param Feed $feed Feed object + * @param Item $item Item object */ - public function normalizeData($data) + public function filterItemContent(Feed $feed, Item $item) { - $invalid_chars = array( - "\x10", - "\xc3\x20", - "", - ); - - foreach ($invalid_chars as $needle) { - $data = str_replace($needle, '', $data); + if ($this->isFilteringEnabled()) { + $filter = Filter::html($item->getContent(), $feed->getUrl()); + $filter->setConfig($this->config); + $item->content = $filter->execute(); + } + else { + Logging::setMessage(get_called_class().': Content filtering disabled'); } - - $data = $this->replaceEntityAttribute($data); - return $data; - } - - /** - * Replace & by & for each href attribute (Fix broken feeds) - * - * @access public - * @param string $content Raw data - * @return string Normalized data - */ - public function replaceEntityAttribute($content) - { - $content = preg_replace_callback('/href="[^"]+"/', function(array $matches) { - return htmlspecialchars($matches[0], ENT_NOQUOTES, 'UTF-8', false); - }, $content); - - return $content; - } - - /** - * Trim whitespace from the begining, the end and inside a string and don't break utf-8 string - * - * @access public - * @param string $value Raw data - * @return string Normalized data - */ - public function stripWhiteSpace($value) - { - $value = str_replace("\r", "", $value); - $value = str_replace("\t", "", $value); - $value = str_replace("\n", "", $value); - return trim($value); } /** @@ -355,25 +300,6 @@ abstract class Parser return false; } - /** - * Get xml:lang value - * - * @access public - * @param string $xml XML string - * @return string Language - */ - public function 
getXmlLang($xml) - { - $dom = XmlParser::getDomDocument($this->content); - - if ($dom === false) { - return ''; - } - - $xpath = new DOMXPath($dom); - return $xpath->evaluate('string(//@xml:lang[1])') ?: ''; - } - /** * Return true if the given language is "Right to Left" * @@ -446,6 +372,32 @@ abstract class Parser return $this; } + /** + * Enable the content grabber + * + * @access public + * @return \PicoFeed\Parser + */ + public function disableContentFiltering() + { + $this->enable_filter = false; + } + + /** + * Return true if the content filtering is enabled + * + * @access public + * @return boolean + */ + public function isFilteringEnabled() + { + if ($this->config === null) { + return $this->enable_filter; + } + + return $this->config->getContentFiltering($this->enable_filter); + } + /** * Enable the content grabber * @@ -468,37 +420,4 @@ abstract class Parser { $this->grabber_ignore_urls = $urls; } - - /** - * Get a value from a XML namespace - * - * @access public - * @param SimpleXMLElement $xml XML element - * @param array $namespaces XML namespaces - * @param string $property XML tag name - * @param string $attribute XML attribute name - * @return string - */ - public function getNamespaceValue(SimpleXMLElement $xml, array $namespaces, $property, $attribute = '') - { - foreach ($namespaces as $name => $url) { - $namespace = $xml->children($namespaces[$name]); - - if ($namespace->$property->count() > 0) { - - if ($attribute) { - - foreach ($namespace->$property->attributes() as $xml_attribute => $xml_value) { - if ($xml_attribute === $attribute && $xml_value) { - return (string) $xml_value; - } - } - } - - return (string) $namespace->$property; - } - } - - return ''; - } } diff --git a/vendor/PicoFeed/Parsers/Atom.php b/vendor/PicoFeed/Parsers/Atom.php index 7d228e4..8a86b81 100644 --- a/vendor/PicoFeed/Parsers/Atom.php +++ b/vendor/PicoFeed/Parsers/Atom.php @@ -6,9 +6,10 @@ use SimpleXMLElement; use PicoFeed\Parser; use PicoFeed\XmlParser; use 
PicoFeed\Logging; -use PicoFeed\Filter; use PicoFeed\Feed; +use PicoFeed\Filter; use PicoFeed\Item; +use PicoFeed\Url; /** * Atom parser @@ -42,6 +43,30 @@ class Atom extends Parser $feed->url = $this->getLink($xml); } + /** + * Find the feed description + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedDescription(SimpleXMLElement $xml, Feed $feed) + { + $feed->description = (string) $xml->subtitle; + } + + /** + * Find the feed logo url + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedLogo(SimpleXMLElement $xml, Feed $feed) + { + $feed->logo = (string) $xml->logo; + } + /** * Find the feed title * @@ -51,7 +76,7 @@ class Atom extends Parser */ public function findFeedTitle(SimpleXMLElement $xml, Feed $feed) { - $feed->title = $this->stripWhiteSpace((string) $xml->title) ?: $feed->url; + $feed->title = Filter::stripWhiteSpace((string) $xml->title) ?: $feed->url; } /** @@ -63,7 +88,7 @@ class Atom extends Parser */ public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed) { - $feed->language = $this->getXmlLang($this->content); + $feed->language = XmlParser::getXmlLang($this->content); } /** @@ -107,11 +132,11 @@ class Atom extends Parser * * @access public * @param SimpleXMLElement $entry Feed item - * @param Item $item Item object + * @param Item $item Item object */ public function findItemTitle(SimpleXMLElement $entry, Item $item) { - $item->title = $this->stripWhiteSpace((string) $entry->title); + $item->title = Filter::stripWhiteSpace((string) $entry->title); if (empty($item->title)) { $item->title = $item->url; @@ -145,7 +170,7 @@ class Atom extends Parser */ public function findItemContent(SimpleXMLElement $entry, Item $item) { - $item->content = $this->filterHtml($this->getContent($entry), $item->url); + $item->content = $this->getContent($entry); } /** @@ -202,13 
+227,8 @@ class Atom extends Parser foreach ($entry->link as $link) { if ((string) $link['rel'] === 'enclosure') { - $item->enclosure_url = (string) $link['href']; + $item->enclosure_url = Url::resolve((string) $link['href'], $feed->url); $item->enclosure_type = (string) $link['type']; - - if (Filter::isRelativePath($item->enclosure_url)) { - $item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url); - } - break; } } diff --git a/vendor/PicoFeed/Parsers/Rss10.php b/vendor/PicoFeed/Parsers/Rss10.php index 748597a..728c792 100644 --- a/vendor/PicoFeed/Parsers/Rss10.php +++ b/vendor/PicoFeed/Parsers/Rss10.php @@ -7,6 +7,7 @@ require_once __DIR__.'/Rss20.php'; use SimpleXMLElement; use PicoFeed\Feed; use PicoFeed\Item; +use PicoFeed\XmlParser; use PicoFeed\Parsers\Rss20; /** @@ -38,7 +39,7 @@ class Rss10 extends Rss20 */ public function findFeedDate(SimpleXMLElement $xml, Feed $feed) { - $feed->date = $this->parseDate($this->getNamespaceValue($xml->channel, $this->namespaces, 'date')); + $feed->date = $this->parseDate(XmlParser::getNamespaceValue($xml->channel, $this->namespaces, 'date')); } /** @@ -50,7 +51,7 @@ class Rss10 extends Rss20 */ public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed) { - $feed->language = $this->getNamespaceValue($xml->channel, $this->namespaces, 'language'); + $feed->language = XmlParser::getNamespaceValue($xml->channel, $this->namespaces, 'language'); } /** diff --git a/vendor/PicoFeed/Parsers/Rss20.php b/vendor/PicoFeed/Parsers/Rss20.php index bbd8b75..255c6e5 100644 --- a/vendor/PicoFeed/Parsers/Rss20.php +++ b/vendor/PicoFeed/Parsers/Rss20.php @@ -6,9 +6,10 @@ use SimpleXMLElement; use PicoFeed\Parser; use PicoFeed\XmlParser; use PicoFeed\Logging; -use PicoFeed\Filter; use PicoFeed\Feed; +use PicoFeed\Filter; use PicoFeed\Item; +use PicoFeed\Url; /** * RSS 2.0 Parser @@ -57,6 +58,32 @@ class Rss20 extends Parser } } + /** + * Find the feed description + * + * @access public + * @param SimpleXMLElement 
$xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedDescription(SimpleXMLElement $xml, Feed $feed) + { + $feed->description = (string) $xml->channel->description; + } + + /** + * Find the feed logo url + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Feed $feed Feed object + */ + public function findFeedLogo(SimpleXMLElement $xml, Feed $feed) + { + if (isset($xml->channel->image->url)) { + $feed->logo = (string) $xml->channel->image->url; + } + } + /** * Find the feed title * @@ -66,7 +93,7 @@ class Rss20 extends Parser */ public function findFeedTitle(SimpleXMLElement $xml, Feed $feed) { - $feed->title = $this->stripWhiteSpace((string) $xml->channel->title) ?: $feed->url; + $feed->title = Filter::stripWhiteSpace((string) $xml->channel->title) ?: $feed->url; } /** @@ -115,10 +142,10 @@ class Rss20 extends Parser */ public function findItemDate(SimpleXMLElement $entry, Item $item) { - $date = $this->getNamespaceValue($entry, $this->namespaces, 'date'); + $date = XmlParser::getNamespaceValue($entry, $this->namespaces, 'date'); if (empty($date)) { - $date = $this->getNamespaceValue($entry, $this->namespaces, 'updated'); + $date = XmlParser::getNamespaceValue($entry, $this->namespaces, 'updated'); } if (empty($date)) { @@ -137,7 +164,7 @@ class Rss20 extends Parser */ public function findItemTitle(SimpleXMLElement $entry, Item $item) { - $item->title = $this->stripWhiteSpace((string) $entry->title); + $item->title = Filter::stripWhiteSpace((string) $entry->title); if (empty($item->title)) { $item->title = $item->url; @@ -154,7 +181,7 @@ class Rss20 extends Parser */ public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item) { - $item->author = $this->getNamespaceValue($entry, $this->namespaces, 'creator'); + $item->author = XmlParser::getNamespaceValue($entry, $this->namespaces, 'creator'); if (empty($item->author)) { if (isset($entry->author)) { @@ -175,13 +202,13 
@@ class Rss20 extends Parser */ public function findItemContent(SimpleXMLElement $entry, Item $item) { - $content = $this->getNamespaceValue($entry, $this->namespaces, 'encoded'); + $content = XmlParser::getNamespaceValue($entry, $this->namespaces, 'encoded'); if (empty($content) && $entry->description->count() > 0) { $content = (string) $entry->description; } - $item->content = $this->filterHtml($content, $item->url); + $item->content = $content; } /** @@ -194,9 +221,9 @@ class Rss20 extends Parser public function findItemUrl(SimpleXMLElement $entry, Item $item) { $links = array( - $this->getNamespaceValue($entry, $this->namespaces, 'origLink'), + XmlParser::getNamespaceValue($entry, $this->namespaces, 'origLink'), isset($entry->link) ? (string) $entry->link : '', - $this->getNamespaceValue($entry, $this->namespaces, 'link', 'href'), + XmlParser::getNamespaceValue($entry, $this->namespaces, 'link', 'href'), isset($entry->guid) ? (string) $entry->guid : '', ); @@ -247,17 +274,14 @@ class Rss20 extends Parser { if (isset($entry->enclosure)) { - $item->enclosure_url = $this->getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink'); + $item->enclosure_url = XmlParser::getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink'); if (empty($item->enclosure_url)) { $item->enclosure_url = isset($entry->enclosure['url']) ? (string) $entry->enclosure['url'] : ''; } $item->enclosure_type = isset($entry->enclosure['type']) ? 
(string) $entry->enclosure['type'] : ''; - - if (Filter::isRelativePath($item->enclosure_url)) { - $item->enclosure_url = Filter::getAbsoluteUrl($item->enclosure_url, $feed->url); - } + $item->enclosure_url = Url::resolve($item->enclosure_url, $feed->url); } } diff --git a/vendor/PicoFeed/PicoFeed.php b/vendor/PicoFeed/PicoFeed.php index 89be939..073c348 100644 --- a/vendor/PicoFeed/PicoFeed.php +++ b/vendor/PicoFeed/PicoFeed.php @@ -4,10 +4,14 @@ require __DIR__.'/Config.php'; require __DIR__.'/Logging.php'; +require __DIR__.'/Url.php'; require __DIR__.'/Item.php'; require __DIR__.'/Feed.php'; require __DIR__.'/Client.php'; require __DIR__.'/Filter.php'; +require __DIR__.'/Filter/Attribute.php'; +require __DIR__.'/Filter/Tag.php'; +require __DIR__.'/Filter/Html.php'; require __DIR__.'/XmlParser.php'; require __DIR__.'/Encoding.php'; require __DIR__.'/Grabber.php'; @@ -18,3 +22,4 @@ require __DIR__.'/Writer.php'; require __DIR__.'/Writers/Rss20.php'; require __DIR__.'/Writers/Atom.php'; require __DIR__.'/Parser.php'; +require __DIR__.'/Favicon.php'; diff --git a/vendor/PicoFeed/Reader.php b/vendor/PicoFeed/Reader.php index fdef231..2f5d786 100644 --- a/vendor/PicoFeed/Reader.php +++ b/vendor/PicoFeed/Reader.php @@ -9,6 +9,7 @@ use PicoFeed\Logging; use PicoFeed\Filter; use PicoFeed\Client; use PicoFeed\Parser; +use PicoFeed\Url; /** * Reader class @@ -78,14 +79,7 @@ class Reader } $client = Client::getInstance(); - $client->setTimeout($this->config->getClientTimeout()) - ->setUserAgent($this->config->getClientUserAgent()) - ->setMaxRedirections($this->config->getMaxRedirections()) - ->setMaxBodySize($this->config->getMaxBodySize()) - ->setProxyHostname($this->config->getProxyHostname()) - ->setProxyPort($this->config->getProxyPort()) - ->setProxyUsername($this->config->getProxyUsername()) - ->setProxyPassword($this->config->getProxyPassword()) + $client->setConfig($this->config) ->setLastModified($last_modified) ->setEtag($etag); @@ -249,16 +243,13 @@ class Reader 
if (! empty($link)) { - // Relative links - if (strpos($link, 'http') !== 0) { + $feedUrl = new Url($link); + $siteUrl = new Url($this->url); - if ($link{0} === '/') $link = substr($link, 1); - if ($this->url{strlen($this->url) - 1} !== '/') $this->url .= '/'; - - $link = $this->url.$link; - } + $link = $feedUrl->getAbsoluteUrl($feedUrl->isRelativeUrl() ? $siteUrl->getBaseUrl() : ''); Logging::setMessage(get_called_class().': Find subscription link: '.$link); + $this->download($link); return true; diff --git a/vendor/PicoFeed/Url.php b/vendor/PicoFeed/Url.php new file mode 100644 index 0000000..d16a447 --- /dev/null +++ b/vendor/PicoFeed/Url.php @@ -0,0 +1,254 @@ +url = $url; + $this->components = parse_url($url) ?: array(); + + // Issue with PHP < 5.4.7 and protocol relative url + if (version_compare(PHP_VERSION, '5.4.7', '<') && $this->isProtocolRelative()) { + $pos = strpos($this->components['path'], '/', 2); + + if ($pos === false) { + $pos = strlen($this->components['path']); + } + + $this->components['host'] = substr($this->components['path'], 2, $pos - 2); + $this->components['path'] = substr($this->components['path'], $pos); + } + } + + /** + * Shortcut method to get an absolute url from relative url + * + * @static + * @access public + * @param string $item_url Unknown url (can be relative or not) + * @param mixed $website_url Website url + * @return string + */ + public static function resolve($item_url, $website_url) + { + $link = new Url($item_url); + $website = is_string($website_url) ? 
new Url($website_url) : $website_url; + + if ($link->isRelativeUrl()) { + + if ($link->isRelativePath()) { + return $link->getAbsoluteUrl($website->getAbsoluteUrl()); + } + + return $link->getAbsoluteUrl($website->getBaseUrl()); + } + else if ($link->isProtocolRelative()) { + $link->setScheme($website->getScheme()); + } + + return $link->getAbsoluteUrl(); + } + + /** + * Get the base URL + * + * @access public + * @param string $suffix Add a suffix to the url + * @return string + */ + public function getBaseUrl($suffix = '') + { + return $this->hasHost() ? $this->getScheme('://').$this->getHost().$this->getPort(':').$suffix : ''; + } + + /** + * Get the absolute URL + * + * @access public + * @param string $base_url Use this url as base url + * @return string + */ + public function getAbsoluteUrl($base_url = '') + { + if ($base_url) { + $base = new Url($base_url); + $url = $base->getAbsoluteUrl().substr($this->getFullPath(), 1); + } + else { + $url = $this->hasHost() ? $this->getBaseUrl().$this->getFullPath() : ''; + } + + return $url; + } + + /** + * Return true if the url is relative + * + * @access public + * @return boolean + */ + public function isRelativeUrl() + { + return ! $this->hasScheme() && ! $this->isProtocolRelative(); + } + + /** + * Return true if the path is relative + * + * @access public + * @return boolean + */ + public function isRelativePath() + { + $path = $this->getPath(); + return empty($path) || $path{0} !== '/'; + } + + /** + * Get the path + * + * @access public + * @return string + */ + public function getPath() + { + return empty($this->components['path']) ? '' : $this->components['path']; + } + + /** + * Get the full path (path + querystring + fragment) + * + * @access public + * @return string + */ + public function getFullPath() + { + $path = $this->isRelativePath() ? '/' : ''; + $path .= $this->getPath(); + $path .= empty($this->components['query']) ? 
'' : '?'.$this->components['query']; + $path .= empty($this->components['fragment']) ? '' : '#'.$this->components['fragment']; + + return $path; + } + + /** + * Get the hostname + * + * @access public + * @return string + */ + public function getHost() + { + return empty($this->components['host']) ? '' : $this->components['host']; + } + + /** + * Return true if the url has a hostname + * + * @access public + * @return boolean + */ + public function hasHost() + { + return ! empty($this->components['host']); + } + + /** + * Get the scheme + * + * @access public + * @param string $suffix Suffix to add when there is a scheme + * @return string + */ + public function getScheme($suffix = '') + { + return ($this->hasScheme() ? $this->components['scheme'] : 'http').$suffix; + } + + /** + * Set the scheme + * + * @access public + * @param string $scheme Set a scheme + * @return string + */ + public function setScheme($scheme) + { + $this->components['scheme'] = $scheme; + } + + /** + * Return true if the url has a scheme + * + * @access public + * @return boolean + */ + public function hasScheme() + { + return ! empty($this->components['scheme']); + } + + /** + * Get the port + * + * @access public + * @param string $prefix Prefix to add when there is a port + * @return string + */ + public function getPort($prefix = '') + { + return $this->hasPort() ? $prefix.$this->components['port'] : ''; + } + + /** + * Return true if the url has a port + * + * @access public + * @return boolean + */ + public function hasPort() + { + return ! 
empty($this->components['port']); + } + + /** + * Return true if the url is protocol relative (start with //) + * + * @access public + * @return boolean + */ + public function isProtocolRelative() + { + return strpos($this->url, '//') === 0; + } +} diff --git a/vendor/PicoFeed/Writer.php b/vendor/PicoFeed/Writer.php index 9c73a92..92a1a35 100644 --- a/vendor/PicoFeed/Writer.php +++ b/vendor/PicoFeed/Writer.php @@ -7,8 +7,9 @@ use RuntimeException; /** * Base writer class * - * @author Frederic Guillot - * @package picofeed + * @author Frederic Guillot + * @package picofeed + * @property string $description Feed description */ abstract class Writer { @@ -16,7 +17,7 @@ abstract class Writer * Dom object * * @access protected - * @var DomDocument + * @var \DomDocument */ protected $dom; @@ -28,6 +29,46 @@ abstract class Writer */ public $items = array(); + /** + * Author + * + * @access public + * @var array + */ + public $author = array(); + + /** + * Feed URL + * + * @access public + * @var string + */ + public $feed_url = ''; + + /** + * Website URL + * + * @access public + * @var string + */ + public $site_url = ''; + + /** + * Feed title + * + * @access public + * @var string + */ + public $title = ''; + + /** + * Feed modification date (timestamp) + * + * @access public + * @var integer + */ + public $updated = 0; + /** * Generate the XML document * diff --git a/vendor/PicoFeed/Writers/Atom.php b/vendor/PicoFeed/Writers/Atom.php index e5be76e..ba49d3f 100644 --- a/vendor/PicoFeed/Writers/Atom.php +++ b/vendor/PicoFeed/Writers/Atom.php @@ -72,7 +72,7 @@ class Atom extends Writer $feed->appendChild($id); // - $this->addUpdated($feed, isset($this->updated) ? 
$this->updated : ''); + $this->addUpdated($feed, $this->updated); // $this->addLink($feed, $this->site_url); @@ -85,51 +85,8 @@ class Atom extends Writer // foreach ($this->items as $item) { - $this->checkRequiredProperties($this->required_item_properties, $item); - - $entry = $this->dom->createElement('entry'); - - // - $title = $this->dom->createElement('title'); - $title->appendChild($this->dom->createTextNode($item['title'])); - $entry->appendChild($title); - - // <id/> - $id = $this->dom->createElement('id'); - $id->appendChild($this->dom->createTextNode(isset($item['id']) ? $item['id'] : $item['url'])); - $entry->appendChild($id); - - // <updated/> - $this->addUpdated($entry, isset($item['updated']) ? $item['updated'] : ''); - - // <published/> - if (isset($item['published'])) { - $entry->appendChild($this->dom->createElement('published', date(DATE_ATOM, $item['published']))); - } - - // <link rel="alternate" type="text/html" href="http://example.org/"/> - $this->addLink($entry, $item['url']); - - // <summary/> - if (isset($item['summary'])) { - $summary = $this->dom->createElement('summary'); - $summary->appendChild($this->dom->createTextNode($item['summary'])); - $entry->appendChild($summary); - } - - // <content/> - if (isset($item['content'])) { - $content = $this->dom->createElement('content'); - $content->setAttribute('type', 'html'); - $content->appendChild($this->dom->createCDATASection($item['content'])); - $entry->appendChild($content); - } - - // <author/> - if (isset($item['author'])) $this->addAuthor($entry, $item['author']); - - $feed->appendChild($entry); + $feed->appendChild($this->createEntry($item)); } $this->dom->appendChild($feed); @@ -142,6 +99,61 @@ class Atom extends Writer } } + /** + * Create item entry + * + * @access public + * @param arrray $item Item properties + * @return DomElement + */ + public function createEntry(array $item) + { + $entry = $this->dom->createElement('entry'); + + // <title/> + $title = 
$this->dom->createElement('title'); + $title->appendChild($this->dom->createTextNode($item['title'])); + $entry->appendChild($title); + + // <id/> + $id = $this->dom->createElement('id'); + $id->appendChild($this->dom->createTextNode(isset($item['id']) ? $item['id'] : $item['url'])); + $entry->appendChild($id); + + // <updated/> + $this->addUpdated($entry, isset($item['updated']) ? $item['updated'] : ''); + + // <published/> + if (isset($item['published'])) { + $entry->appendChild($this->dom->createElement('published', date(DATE_ATOM, $item['published']))); + } + + // <link rel="alternate" type="text/html" href="http://example.org/"/> + $this->addLink($entry, $item['url']); + + // <summary/> + if (isset($item['summary'])) { + $summary = $this->dom->createElement('summary'); + $summary->appendChild($this->dom->createTextNode($item['summary'])); + $entry->appendChild($summary); + } + + // <content/> + if (isset($item['content'])) { + $content = $this->dom->createElement('content'); + $content->setAttribute('type', 'html'); + $content->appendChild($this->dom->createCDATASection($item['content'])); + $entry->appendChild($content); + } + + // <author/> + if (isset($item['author'])) { + $this->addAuthor($entry, $item['author']); + } + + return $entry; + } + /** * Add Link * @@ -165,9 +177,9 @@ class Atom extends Writer * * @access public * @param DomElement $xml XML node - * @param string $value Timestamp + * @param integer $value Timestamp */ - public function addUpdated(DomElement $xml, $value = '') + public function addUpdated(DomElement $xml, $value = 0) { $xml->appendChild($this->dom->createElement( 'updated', diff --git a/vendor/PicoFeed/Writers/Rss20.php b/vendor/PicoFeed/Writers/Rss20.php index 506e3c8..4974524 100644 --- a/vendor/PicoFeed/Writers/Rss20.php +++ b/vendor/PicoFeed/Writers/Rss20.php @@ -75,7 +75,7 @@ class Rss20 extends Writer $channel->appendChild($description); // <pubDate/> - $this->addPubDate($channel, isset($this->updated) ? 
$this->updated : ''); + $this->addPubDate($channel, $this->updated); // <atom:link/> $link = $this->dom->createElement('atom:link'); @@ -94,56 +94,8 @@ class Rss20 extends Writer // <item/> foreach ($this->items as $item) { - $this->checkRequiredProperties($this->required_item_properties, $item); - - $entry = $this->dom->createElement('item'); - - // <title/> - $title = $this->dom->createElement('title'); - $title->appendChild($this->dom->createTextNode($item['title'])); - $entry->appendChild($title); - - // <link/> - $link = $this->dom->createElement('link'); - $link->appendChild($this->dom->createTextNode($item['url'])); - $entry->appendChild($link); - - // <guid/> - if (isset($item['id'])) { - $guid = $this->dom->createElement('guid'); - $guid->setAttribute('isPermaLink', 'false'); - $guid->appendChild($this->dom->createTextNode($item['id'])); - $entry->appendChild($guid); - } - else { - $guid = $this->dom->createElement('guid'); - $guid->setAttribute('isPermaLink', 'true'); - $guid->appendChild($this->dom->createTextNode($item['url'])); - $entry->appendChild($guid); - } - - // <pubDate/> - $this->addPubDate($entry, isset($item['updated']) ? 
$item['updated'] : ''); - - // <description/> - if (isset($item['summary'])) { - $description = $this->dom->createElement('description'); - $description->appendChild($this->dom->createTextNode($item['summary'])); - $entry->appendChild($description); - } - - // <content/> - if (isset($item['content'])) { - $content = $this->dom->createElement('content:encoded'); - $content->appendChild($this->dom->createCDATASection($item['content'])); - $entry->appendChild($content); - } - - // <author/> - if (isset($item['author'])) $this->addAuthor($entry, 'author', $item['author']); - - $channel->appendChild($entry); + $channel->appendChild($this->createEntry($item)); } $rss->appendChild($channel); @@ -157,14 +109,74 @@ class Rss20 extends Writer } } + /** + * Create item entry + * + * @access public + * @param arrray $item Item properties + * @return DomElement + */ + public function createEntry(array $item) + { + $entry = $this->dom->createElement('item'); + + // <title/> + $title = $this->dom->createElement('title'); + $title->appendChild($this->dom->createTextNode($item['title'])); + $entry->appendChild($title); + + // <link/> + $link = $this->dom->createElement('link'); + $link->appendChild($this->dom->createTextNode($item['url'])); + $entry->appendChild($link); + + // <guid/> + if (isset($item['id'])) { + $guid = $this->dom->createElement('guid'); + $guid->setAttribute('isPermaLink', 'false'); + $guid->appendChild($this->dom->createTextNode($item['id'])); + $entry->appendChild($guid); + } + else { + $guid = $this->dom->createElement('guid'); + $guid->setAttribute('isPermaLink', 'true'); + $guid->appendChild($this->dom->createTextNode($item['url'])); + $entry->appendChild($guid); + } + + // <pubDate/> + $this->addPubDate($entry, isset($item['updated']) ? 
$item['updated'] : ''); + + // <description/> + if (isset($item['summary'])) { + $description = $this->dom->createElement('description'); + $description->appendChild($this->dom->createTextNode($item['summary'])); + $entry->appendChild($description); + } + + // <content/> + if (isset($item['content'])) { + $content = $this->dom->createElement('content:encoded'); + $content->appendChild($this->dom->createCDATASection($item['content'])); + $entry->appendChild($content); + } + + // <author/> + if (isset($item['author'])) { + $this->addAuthor($entry, 'author', $item['author']); + } + + return $entry; + } + /** * Add publication date * * @access public * @param DomElement $xml XML node - * @param string $value Timestamp + * @param integer $value Timestamp */ - public function addPubDate(DomElement $xml, $value = '') + public function addPubDate(DomElement $xml, $value = 0) { $xml->appendChild($this->dom->createElement( 'pubDate', diff --git a/vendor/PicoFeed/XmlParser.php b/vendor/PicoFeed/XmlParser.php index 0f0620c..fce8f7d 100644 --- a/vendor/PicoFeed/XmlParser.php +++ b/vendor/PicoFeed/XmlParser.php @@ -3,6 +3,7 @@ namespace PicoFeed; use DomDocument; +use DOMXPath; use SimpleXmlElement; /** @@ -110,6 +111,20 @@ class XmlParser return $dom; } + /** + * Convert a HTML document to XML + * + * @static + * @access public + * @param string $html HTML document + * @return string + */ + public static function HtmlToXml($html) + { + $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html); + return $dom->saveXML($dom->getElementsByTagName('body')->item(0)); + } + /** * Get XML parser errors * @@ -160,4 +175,58 @@ class XmlParser return $encoding; } + + /** + * Get xml:lang value + * + * @static + * @access public + * @param string $xml XML string + * @return string Language + */ + public static function getXmlLang($xml) + { + $dom = self::getDomDocument($xml); + + if ($dom === false) { + return ''; + } + + $xpath = new DOMXPath($dom); + return 
$xpath->evaluate('string(//@xml:lang[1])') ?: ''; + } + + /** + * Get a value from a XML namespace + * + * @static + * @access public + * @param SimpleXMLElement $xml XML element + * @param array $namespaces XML namespaces + * @param string $property XML tag name + * @param string $attribute XML attribute name + * @return string + */ + public static function getNamespaceValue(SimpleXMLElement $xml, array $namespaces, $property, $attribute = '') + { + foreach ($namespaces as $name => $url) { + $namespace = $xml->children($namespaces[$name]); + + if ($namespace->$property->count() > 0) { + + if ($attribute) { + + foreach ($namespace->$property->attributes() as $xml_attribute => $xml_value) { + if ($xml_attribute === $attribute && $xml_value) { + return (string) $xml_value; + } + } + } + + return (string) $namespace->$property; + } + } + + return ''; + } } diff --git a/vendor/Readability/JSLikeHTMLElement.php b/vendor/Readability/JSLikeHTMLElement.php deleted file mode 100755 index 238ba8a..0000000 --- a/vendor/Readability/JSLikeHTMLElement.php +++ /dev/null @@ -1,109 +0,0 @@ -<?php -/** -* JavaScript-like HTML DOM Element -* -* This class extends PHP's DOMElement to allow -* users to get and set the innerHTML property of -* HTML elements in the same way it's done in -* JavaScript. 
-* -* Example usage: -* @code -* require_once 'JSLikeHTMLElement.php'; -* header('Content-Type: text/plain'); -* $doc = new DOMDocument(); -* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); -* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>'); -* $elem = $doc->getElementsByTagName('div')->item(0); -* -* // print innerHTML -* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>' -* echo "\n\n"; -* -* // set innerHTML -* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>'; -* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>' -* echo "\n\n"; -* -* // print document (with our changes) -* echo $doc->saveXML(); -* @endcode -* -* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net -* @see http://fivefilters.org (the project this was written for) -*/ -class JSLikeHTMLElement extends DOMElement -{ - /** - * Used for setting innerHTML like it's done in JavaScript: - * @code - * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>'; - * @endcode - */ - public function __set($name, $value) { - if ($name == 'innerHTML') { - // first, empty the element - for ($x=$this->childNodes->length-1; $x>=0; $x--) { - $this->removeChild($this->childNodes->item($x)); - } - // $value holds our new inner HTML - if ($value != '') { - $f = $this->ownerDocument->createDocumentFragment(); - // appendXML() expects well-formed markup (XHTML) - $result = @$f->appendXML($value); // @ to suppress PHP warnings - if ($result) { - if ($f->hasChildNodes()) $this->appendChild($f); - } else { - // $value is probably ill-formed - $f = new DOMDocument(); - $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); - // Using <htmlfragment> will generate a warning, but so will bad HTML - // (and by this point, bad HTML is what we've got). - // We use it (and suppress the warning) because an HTML fragment will - // be wrapped around <html><body> tags which we don't really want to keep. 
- // Note: despite the warning, if loadHTML succeeds it will return true. - $result = @$f->loadHTML('<htmlfragment>'.$value.'</htmlfragment>'); - if ($result) { - $import = $f->getElementsByTagName('htmlfragment')->item(0); - foreach ($import->childNodes as $child) { - $importedNode = $this->ownerDocument->importNode($child, true); - $this->appendChild($importedNode); - } - } else { - // oh well, we tried, we really did. :( - // this element is now empty - } - } - } - } else { - $trace = debug_backtrace(); - trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE); - } - } - - /** - * Used for getting innerHTML like it's done in JavaScript: - * @code - * $string = $div->innerHTML; - * @endcode - */ - public function __get($name) - { - if ($name == 'innerHTML') { - $inner = ''; - foreach ($this->childNodes as $child) { - $inner .= $this->ownerDocument->saveXML($child); - } - return $inner; - } - - $trace = debug_backtrace(); - trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' 
on line '.$trace[0]['line'], E_USER_NOTICE); - return null; - } - - public function __toString() - { - return '['.$this->tagName.']'; - } -} \ No newline at end of file diff --git a/vendor/Readability/Readability.php b/vendor/Readability/Readability.php deleted file mode 100755 index be5892e..0000000 --- a/vendor/Readability/Readability.php +++ /dev/null @@ -1,1137 +0,0 @@ -<?php -/** -* Arc90's Readability ported to PHP for FiveFilters.org -* Based on readability.js version 1.7.1 (without multi-page support) -* Updated to allow HTML5 parsing with html5lib -* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds -* ------------------------------------------------------ -* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js -* Arc90's project URL: http://lab.arc90.com/experiments/readability/ -* JS Source: http://code.google.com/p/arc90labs-readability -* Ported by: Keyvan Minoukadeh, http://www.keyvan.net -* More information: http://fivefilters.org/content-only/ -* License: Apache License, Version 2.0 -* Requires: PHP5 -* Date: 2012-09-19 -* -* Differences between the PHP port and the original -* ------------------------------------------------------ -* Arc90's Readability is designed to run in the browser. It works on the DOM -* tree (the parsed HTML) after the page's CSS styles have been applied and -* Javascript code executed. This PHP port does not run inside a browser. -* We use PHP's ability to parse HTML to build our DOM tree, but we cannot -* rely on CSS or Javascript support. As such, the results will not always -* match Arc90's Readability. (For example, if a web page contains CSS style -* rules or Javascript code which hide certain HTML elements from display, -* Arc90's Readability will dismiss those from consideration but our PHP port, -* unable to understand CSS or Javascript, will not know any better.) 
-* -* Another significant difference is that the aim of Arc90's Readability is -* to re-present the main content block of a given web page so users can -* read it more easily in their browsers. Correct identification, clean up, -* and separation of the content block is only a part of this process. -* This PHP port is only concerned with this part, it does not include code -* that relates to presentation in the browser - Arc90 already do -* that extremely well, and for PDF output there's FiveFilters.org's -* PDF Newspaper: http://fivefilters.org/pdf-newspaper/. -* -* Finally, this class contains methods that might be useful for developers -* working on HTML document fragments. So without deviating too much from -* the original code (which I don't want to do because it makes debugging -* and updating more difficult), I've tried to make it a little more -* developer friendly. You should be able to use the methods here on -* existing DOMElement objects without passing an entire HTML document to -* be parsed. -*/ - -// This class allows us to do JavaScript like assignements to innerHTML -require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); - -// Alternative usage (for testing only!) -// uncomment the lines below and call Readability.php in your browser -// passing it the URL of the page you'd like content from, e.g.: -// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php - -/* -if (!isset($_GET['url']) || $_GET['url'] == '') { - die('Please pass a URL to the script. E.g. 
Readability.php?url=bla.com/story.html'); -} -$url = $_GET['url']; -if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; -$html = file_get_contents($url); -$r = new Readability($html, $url); -$r->init(); -echo $r->articleContent->innerHTML; -*/ - -class Readability -{ - public $version = '1.7.1-without-multi-page'; - public $convertLinksToFootnotes = false; - public $revertForcedParagraphElements = true; - public $articleTitle; - public $articleContent; - public $dom; - public $url = null; // optional - URL where HTML was retrieved - public $debug = false; - public $lightClean = true; // preserves more content (experimental) added 2012-09-19 - protected $body = null; // - protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later - protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. - protected $success = false; // indicates whether we were able to extract or not - - /** - * All of the regular expressions in use within readability. - * Defined up here so we don't instantiate them repeatedly in loops. 
- **/ - public $regexps = array( - 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', - 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', - 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', - 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', - 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', - 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', - 'replaceFonts' => '/<(\/?)font[^>]*>/i', - // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() - 'normalize' => '/\s{2,}/', - 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/', - 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', - 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' - ); - - /* constants */ - const FLAG_STRIP_UNLIKELYS = 1; - const FLAG_WEIGHT_CLASSES = 2; - const FLAG_CLEAN_CONDITIONALLY = 4; - - /** - * Create instance of Readability - * @param string UTF-8 encoded string - * @param string (optional) URL associated with HTML (used for footnotes) - * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') - */ - function __construct($html, $url=null, $parser='libxml') - { - $this->url = $url; - /* Turn all double br's into p's */ - $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); - $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); - $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); - if (trim($html) == '') $html = '<html></html>'; - if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { - // all good - } else { - $this->dom = new DOMDocument(); - $this->dom->preserveWhiteSpace = false; - @$this->dom->loadHTML($html); - } - 
$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); - } - - /** - * Get article title element - * @return DOMElement - */ - public function getTitle() { - return $this->articleTitle; - } - - /** - * Get article content element - * @return DOMElement - */ - public function getContent() { - return $this->articleContent; - } - - /** - * Runs readability. - * - * Workflow: - * 1. Prep the document by removing script tags, css, etc. - * 2. Build readability's DOM tree. - * 3. Grab the article content from the current dom tree. - * 4. Replace the current DOM tree with the new one. - * 5. Read peacefully. - * - * @return boolean true if we found content, false otherwise - **/ - public function init() - { - if (!isset($this->dom->documentElement)) return false; - $this->removeScripts($this->dom); - //die($this->getInnerHTML($this->dom->documentElement)); - - // Assume successful outcome - $this->success = true; - - $bodyElems = $this->dom->getElementsByTagName('body'); - if ($bodyElems->length > 0) { - if ($this->bodyCache == null) { - $this->bodyCache = $bodyElems->item(0)->innerHTML; - } - if ($this->body == null) { - $this->body = $bodyElems->item(0); - } - } - - $this->prepDocument(); - - //die($this->dom->documentElement->parentNode->nodeType); - //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); - //die($this->getInnerHTML($this->dom->documentElement)); - - /* Build readability's DOM tree */ - $overlay = $this->dom->createElement('div'); - $innerDiv = $this->dom->createElement('div'); - $articleTitle = $this->getArticleTitle(); - $articleContent = $this->grabArticle(); - - if (!$articleContent) { - $this->success = false; - $articleContent = $this->dom->createElement('div'); - $articleContent->setAttribute('id', 'readability-content'); - $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; - } - - $overlay->setAttribute('id', 'readOverlay'); - 
$innerDiv->setAttribute('id', 'readInner'); - - /* Glue the structure of our document together. */ - $innerDiv->appendChild($articleTitle); - $innerDiv->appendChild($articleContent); - $overlay->appendChild($innerDiv); - - /* Clear the old HTML, insert the new content. */ - $this->body->innerHTML = ''; - $this->body->appendChild($overlay); - //document.body.insertBefore(overlay, document.body.firstChild); - $this->body->removeAttribute('style'); - - $this->postProcessContent($articleContent); - - // Set title and content instance variables - $this->articleTitle = $articleTitle; - $this->articleContent = $articleContent; - - return $this->success; - } - - /** - * Debug - */ - protected function dbg($msg) { - if ($this->debug) echo '* ',$msg, "\n"; - } - - /** - * Run any post-process modifications to article content as necessary. - * - * @param DOMElement - * @return void - */ - public function postProcessContent($articleContent) { - if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { - $this->addFootnotes($articleContent); - } - } - - /** - * Get the article title as an H1. 
- * - * @return DOMElement - */ - protected function getArticleTitle() { - $curTitle = ''; - $origTitle = ''; - - try { - $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); - } catch(Exception $e) {} - - if (preg_match('/ [\|\-] /', $curTitle)) - { - $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); - - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); - } - } - else if (strpos($curTitle, ': ') !== false) - { - $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); - - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); - } - } - else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) - { - $hOnes = $this->dom->getElementsByTagName('h1'); - if($hOnes->length == 1) - { - $curTitle = $this->getInnerText($hOnes->item(0)); - } - } - - $curTitle = trim($curTitle); - - if (count(explode(' ', $curTitle)) <= 4) { - $curTitle = $origTitle; - } - - $articleTitle = $this->dom->createElement('h1'); - $articleTitle->innerHTML = $curTitle; - - return $articleTitle; - } - - /** - * Prepare the HTML document for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - * - * @return void - **/ - protected function prepDocument() { - /** - * In some cases a body element can't be found (if the HTML is totally hosed for example) - * so we create a new body node and append it to the document. 
- */ - if ($this->body == null) - { - $this->body = $this->dom->createElement('body'); - $this->dom->documentElement->appendChild($this->body); - } - $this->body->setAttribute('id', 'readabilityBody'); - - /* Remove all style tags in head */ - $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length-1; $i >= 0; $i--) - { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); - } - - /* Turn all double br's into p's */ - /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ - //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); - // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. - // Manipulating innerHTML as it's done in JS is not possible in PHP. - } - - /** - * For easier reading, convert this document to have footnotes at the bottom rather than inline links. 
- * @see http://www.roughtype.com/archives/2010/05/experiments_in.php - * - * @return void - **/ - public function addFootnotes($articleContent) { - $footnotesWrapper = $this->dom->createElement('div'); - $footnotesWrapper->setAttribute('id', 'readability-footnotes'); - $footnotesWrapper->innerHTML = '<h3>References</h3>'; - - $articleFootnotes = $this->dom->createElement('ol'); - $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); - $footnotesWrapper->appendChild($articleFootnotes); - - $articleLinks = $articleContent->getElementsByTagName('a'); - - $linkCount = 0; - for ($i = 0; $i < $articleLinks->length; $i++) - { - $articleLink = $articleLinks->item($i); - $footnoteLink = $articleLink->cloneNode(true); - $refLink = $this->dom->createElement('a'); - $footnote = $this->dom->createElement('li'); - $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); - if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); - //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, - $linkText = $this->getInnerText($articleLink); - - if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { - continue; - } - - $linkCount++; - - /** Add a superscript reference after the article link */ - $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); - $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; - $refLink->setAttribute('class', 'readability-DoNotFootnote'); - $refLink->setAttribute('style', 'color: inherit;'); - - //TODO: does this work or should we use DOMNode.isSameNode()? 
- if ($articleLink->parentNode->lastChild == $articleLink) { - $articleLink->parentNode->appendChild($refLink); - } else { - $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); - } - - $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); - $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); - - $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; - - $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); - $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); - - $footnote->appendChild($footnoteLink); - if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; - - $articleFootnotes->appendChild($footnote); - } - - if ($linkCount > 0) { - $articleContent->appendChild($footnotesWrapper); - } - } - - /** - * Reverts P elements with class 'readability-styled' - * to text nodes - which is what they were before. - * - * @param DOMElement - * @return void - */ - function revertReadabilityStyledElements($articleContent) { - $xpath = new DOMXPath($articleContent->ownerDocument); - $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); - //$elems = $articleContent->getElementsByTagName('p'); - for ($i = $elems->length-1; $i >= 0; $i--) { - $e = $elems->item($i); - $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); - //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') { - // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e); - //} - } - } - - /** - * Prepare the article node for display. Clean out any inline styles, - * iframes, forms, strip extraneous <p> tags, etc. 
- * - * @param DOMElement - * @return void - */ - function prepArticle($articleContent) { - $this->cleanStyles($articleContent); - $this->killBreaks($articleContent); - if ($this->revertForcedParagraphElements) { - $this->revertReadabilityStyledElements($articleContent); - } - - /* Clean out junk from the article content */ - $this->cleanConditionally($articleContent, 'form'); - $this->clean($articleContent, 'object'); - $this->clean($articleContent, 'h1'); - - /** - * If there is only one h2, they are probably using it - * as a header and not a subheader, so remove it since we already have a header. - ***/ - if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { - $this->clean($articleContent, 'h2'); - } - $this->clean($articleContent, 'iframe'); - - $this->cleanHeaders($articleContent); - - /* Do these last as the previous stuff may have removed junk that will affect these */ - $this->cleanConditionally($articleContent, 'table'); - $this->cleanConditionally($articleContent, 'ul'); - $this->cleanConditionally($articleContent, 'div'); - - /* Remove extra paragraphs */ - $articleParagraphs = $articleContent->getElementsByTagName('p'); - for ($i = $articleParagraphs->length-1; $i >= 0; $i--) - { - $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; - $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; - $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; - $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; - - if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') - { - $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); - } - } - - try { - $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML); - //articleContent.innerHTML = 
articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p'); - } - catch (Exception $e) { - $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); - } - } - - /** - * Initialize a node with the readability object. Also checks the - * className/id for special names to add to its score. - * - * @param Element - * @return void - **/ - protected function initializeNode($node) { - $readability = $this->dom->createAttribute('readability'); - $readability->value = 0; // this is our contentScore - $node->setAttributeNode($readability); - - switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case - case 'DIV': - $readability->value += 5; - break; - - case 'PRE': - case 'TD': - case 'BLOCKQUOTE': - $readability->value += 3; - break; - - case 'ADDRESS': - case 'OL': - case 'UL': - case 'DL': - case 'DD': - case 'DT': - case 'LI': - case 'FORM': - $readability->value -= 3; - break; - - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - case 'TH': - $readability->value -= 5; - break; - } - $readability->value += $this->getClassWeight($node); - } - - /*** - * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is - * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. - * - * @return DOMElement - **/ - protected function grabArticle($page=null) { - $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); - if (!$page) $page = $this->dom; - $allElements = $page->getElementsByTagName('*'); - /** - * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs - * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) - * - * Note: Assignment from index for performance. 
See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 - * TODO: Shouldn't this be a reverse traversal? - **/ - $node = null; - $nodesToScore = array(); - for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { - //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { - //$node = $targetList->item($nodeIndex); - $tagName = strtoupper($node->tagName); - /* Remove unlikely candidates */ - if ($stripUnlikelyCandidates) { - $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); - if ( - preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && - !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && - $tagName != 'BODY' - ) - { - $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); - //$nodesToRemove[] = $node; - $node->parentNode->removeChild($node); - $nodeIndex--; - continue; - } - } - - if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { - $nodesToScore[] = $node; - } - - /* Turn all divs that don't have children block level elements into p's */ - if ($tagName == 'DIV') { - if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { - //$this->dbg('Altering div to p'); - $newNode = $this->dom->createElement('p'); - try { - $newNode->innerHTML = $node->innerHTML; - //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); - $node->parentNode->replaceChild($newNode, $node); - $nodeIndex--; - $nodesToScore[] = $node; // or $newNode? - } - catch(Exception $e) { - $this->dbg('Could not alter div to p, reverting back to div.: ' . 
$e); - } - } - else - { - /* EXPERIMENTAL */ - // TODO: change these p elements back to text nodes after processing - for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { - $childNode = $node->childNodes->item($i); - if ($childNode->nodeType == 3) { // XML_TEXT_NODE - //$this->dbg('replacing text node with a p tag with the same content.'); - $p = $this->dom->createElement('p'); - $p->innerHTML = $childNode->nodeValue; - $p->setAttribute('style', 'display: inline;'); - $p->setAttribute('class', 'readability-styled'); - $childNode->parentNode->replaceChild($p, $childNode); - } - } - } - } - } - - /** - * Loop through all paragraphs, and assign a score to them based on how content-y they look. - * Then add their score to their parent node. - * - * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ - $candidates = array(); - for ($pt=0; $pt < count($nodesToScore); $pt++) { - $parentNode = $nodesToScore[$pt]->parentNode; - // $grandParentNode = $parentNode ? $parentNode->parentNode : null; - $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); - $innerText = $this->getInnerText($nodesToScore[$pt]); - - if (!$parentNode || !isset($parentNode->tagName)) { - continue; - } - - /* If this paragraph is less than 25 characters, don't even count it. */ - if(strlen($innerText) < 25) { - continue; - } - - /* Initialize readability data for the parent. */ - if (!$parentNode->hasAttribute('readability')) - { - $this->initializeNode($parentNode); - $candidates[] = $parentNode; - } - - /* Initialize readability data for the grandparent. */ - if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) - { - $this->initializeNode($grandParentNode); - $candidates[] = $grandParentNode; - } - - $contentScore = 0; - - /* Add a point for the paragraph itself as a base. 
*/ - $contentScore++; - - /* Add points for any commas within this paragraph */ - $contentScore += count(explode(',', $innerText)); - - /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ - $contentScore += min(floor(strlen($innerText) / 100), 3); - - /* Add the score to the parent. The grandparent gets half. */ - $parentNode->getAttributeNode('readability')->value += $contentScore; - - if ($grandParentNode) { - $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; - } - } - - /** - * After we've calculated scores, loop through all of the possible candidate nodes we found - * and find the one with the highest score. - **/ - $topCandidate = null; - for ($c=0, $cl=count($candidates); $c < $cl; $c++) - { - /** - * Scale the final candidates score based on link density. Good content should have a - * relatively small link density (5% or less) and be mostly unaffected by this operation. - **/ - $readability = $candidates[$c]->getAttributeNode('readability'); - $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); - - $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); - - if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { - $topCandidate = $candidates[$c]; - } - } - - /** - * If we still have no top candidate, just use the body as a last resort. - * We also have to copy the body node so it is something we can modify. - **/ - if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') - { - $topCandidate = $this->dom->createElement('div'); - if ($page instanceof DOMDocument) { - if (!isset($page->documentElement)) { - // we don't have a body either? what a mess! 
:) - } else { - $topCandidate->innerHTML = $page->documentElement->innerHTML; - $page->documentElement->innerHTML = ''; - $page->documentElement->appendChild($topCandidate); - } - } else { - $topCandidate->innerHTML = $page->innerHTML; - $page->innerHTML = ''; - $page->appendChild($topCandidate); - } - $this->initializeNode($topCandidate); - } - - /** - * Now that we have the top candidate, look through its siblings for content that might also be related. - * Things like preambles, content split by ads that we removed, etc. - **/ - $articleContent = $this->dom->createElement('div'); - $articleContent->setAttribute('id', 'readability-content'); - $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); - $siblingNodes = $topCandidate->parentNode->childNodes; - if (!isset($siblingNodes)) { - $siblingNodes = new stdClass; - $siblingNodes->length = 0; - } - - for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) - { - $siblingNode = $siblingNodes->item($s); - $append = false; - - $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); - - //dbg('Sibling has score ' . ($siblingNode->readability ? 
siblingNode.readability.contentScore : 'Unknown')); - - if ($siblingNode === $topCandidate) - // or if ($siblingNode->isSameNode($topCandidate)) - { - $append = true; - } - - $contentBonus = 0; - /* Give a bonus if sibling nodes and top candidates have the example same classname */ - if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { - $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; - } - - if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) - { - $append = true; - } - - if (strtoupper($siblingNode->nodeName) == 'P') { - $linkDensity = $this->getLinkDensity($siblingNode); - $nodeContent = $this->getInnerText($siblingNode); - $nodeLength = strlen($nodeContent); - - if ($nodeLength > 80 && $linkDensity < 0.25) - { - $append = true; - } - else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) - { - $append = true; - } - } - - if ($append) - { - $this->dbg('Appending node: ' . $siblingNode->nodeName); - - $nodeToAppend = null; - $sibNodeName = strtoupper($siblingNode->nodeName); - if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { - /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - - $this->dbg('Altering siblingNode of ' . $sibNodeName . 
' to div.'); - $nodeToAppend = $this->dom->createElement('div'); - try { - $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); - $nodeToAppend->innerHTML = $siblingNode->innerHTML; - } - catch(Exception $e) - { - $this->dbg('Could not alter siblingNode to div, reverting back to original.'); - $nodeToAppend = $siblingNode; - $s--; - $sl--; - } - } else { - $nodeToAppend = $siblingNode; - $s--; - $sl--; - } - - /* To ensure a node does not interfere with readability styles, remove its classnames */ - $nodeToAppend->removeAttribute('class'); - - /* Append sibling and subtract from our list because it removes the node when you append to another node */ - $articleContent->appendChild($nodeToAppend); - } - } - - /** - * So we have all of the content that we need. Now we clean it up for presentation. - **/ - $this->prepArticle($articleContent); - - /** - * Now that we've gone through the full algorithm, check to see if we got any meaningful content. - * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher - * likelihood of finding the content, and the sieve approach gives us a higher likelihood of - * finding the -right- content. - **/ - if (strlen($this->getInnerText($articleContent, false)) < 250) - { - // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 - // in the meantime, we check and create an empty element if it's not there. 
- if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); - $this->body->innerHTML = $this->bodyCache; - - if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { - $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); - return $this->grabArticle($this->body); - } - else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { - $this->removeFlag(self::FLAG_WEIGHT_CLASSES); - return $this->grabArticle($this->body); - } - else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { - $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); - return $this->grabArticle($this->body); - } - else { - return false; - } - } - return $articleContent; - } - - /** - * Remove script tags from document - * - * @param DOMElement - * @return void - */ - public function removeScripts($doc) { - $scripts = $doc->getElementsByTagName('script'); - for($i = $scripts->length-1; $i >= 0; $i--) - { - $scripts->item($i)->parentNode->removeChild($scripts->item($i)); - } - } - - /** - * Get the inner text of a node. - * This also strips out any excess whitespace to be found. - * - * @param DOMElement $ - * @param boolean $normalizeSpaces (default: true) - * @return string - **/ - public function getInnerText($e, $normalizeSpaces=true) { - $textContent = ''; - - if (!isset($e->textContent) || $e->textContent == '') { - return ''; - } - - $textContent = trim($e->textContent); - - if ($normalizeSpaces) { - return preg_replace($this->regexps['normalize'], ' ', $textContent); - } else { - return $textContent; - } - } - - /** - * Get the number of times a string $s appears in the node $e. - * - * @param DOMElement $e - * @param string - what to count. Default is "," - * @return number (integer) - **/ - public function getCharCount($e, $s=',') { - return substr_count($this->getInnerText($e), $s); - } - - /** - * Remove the style attribute on every $e and under. 
- * - * @param DOMElement $e - * @return void - */ - public function cleanStyles($e) { - if (!is_object($e)) return; - $elems = $e->getElementsByTagName('*'); - foreach ($elems as $elem) { - $elem->removeAttribute('style'); - } - } - - /** - * Get the density of links as a percentage of the content - * This is the amount of text that is inside a link divided by the total text in the node. - * - * @param DOMElement $e - * @return number (float) - */ - public function getLinkDensity($e) { - $links = $e->getElementsByTagName('a'); - $textLength = strlen($this->getInnerText($e)); - $linkLength = 0; - for ($i=0, $il=$links->length; $i < $il; $i++) - { - $linkLength += strlen($this->getInnerText($links->item($i))); - } - if ($textLength > 0) { - return $linkLength / $textLength; - } else { - return 0; - } - } - - /** - * Get an elements class/id weight. Uses regular expressions to tell if this - * element looks good or bad. - * - * @param DOMElement $e - * @return number (Integer) - */ - public function getClassWeight($e) { - if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { - return 0; - } - - $weight = 0; - - /* Look for a special classname */ - if ($e->hasAttribute('class') && $e->getAttribute('class') != '') - { - if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { - $weight -= 25; - } - if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { - $weight += 25; - } - } - - /* Look for a special ID */ - if ($e->hasAttribute('id') && $e->getAttribute('id') != '') - { - if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { - $weight -= 25; - } - if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { - $weight += 25; - } - } - return $weight; - } - - /** - * Remove extraneous break tags from a node. 
- * - * @param DOMElement $node - * @return void - */ - public function killBreaks($node) { - $html = $node->innerHTML; - $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); - $node->innerHTML = $html; - } - - /** - * Clean a node of all elements of type "tag". - * (Unless it's a youtube/vimeo video. People love movies.) - * - * Updated 2012-09-18 to preserve youtube/vimeo iframes - * - * @param DOMElement $e - * @param string $tag - * @return void - */ - public function clean($e, $tag) { - $targetList = $e->getElementsByTagName($tag); - $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); - - for ($y=$targetList->length-1; $y >= 0; $y--) { - /* Allow youtube and vimeo videos through as people usually want to see those. */ - if ($isEmbed) { - $attributeValues = ''; - for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { - $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) - } - - /* First, check the elements attributes to see if any of them contain youtube or vimeo */ - if (preg_match($this->regexps['video'], $attributeValues)) { - continue; - } - - /* Then check the elements inside this element for the same. */ - if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { - continue; - } - } - $targetList->item($y)->parentNode->removeChild($targetList->item($y)); - } - } - - /** - * Clean an element of all tags of type "tag" if they look fishy. - * "Fishy" is an algorithm based on content length, classnames, - * link density, number of images & embeds, etc. - * - * @param DOMElement $e - * @param string $tag - * @return void - */ - public function cleanConditionally($e, $tag) { - if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { - return; - } - - $tagsList = $e->getElementsByTagName($tag); - $curTagsLength = $tagsList->length; - - /** - * Gather counts for other typical elements embedded within. 
- * Traverse backwards so we can remove nodes at the same time without effecting the traversal. - * - * TODO: Consider taking into account original contentScore here. - */ - for ($i=$curTagsLength-1; $i >= 0; $i--) { - $weight = $this->getClassWeight($tagsList->item($i)); - $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; - - $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); - - if ($weight + $contentScore < 0) { - $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); - } - else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { - /** - * If there are not very many commas, and the number of - * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. 
- **/ - $p = $tagsList->item($i)->getElementsByTagName('p')->length; - $img = $tagsList->item($i)->getElementsByTagName('img')->length; - $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; - $input = $tagsList->item($i)->getElementsByTagName('input')->length; - $a = $tagsList->item($i)->getElementsByTagName('a')->length; - - $embedCount = 0; - $embeds = $tagsList->item($i)->getElementsByTagName('embed'); - for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { - if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; - } - } - $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); - for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { - if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; - } - } - - $linkDensity = $this->getLinkDensity($tagsList->item($i)); - $contentLength = strlen($this->getInnerText($tagsList->item($i))); - $toRemove = false; - - if ($this->lightClean) { - $this->dbg('Light clean...'); - if ( ($img > $p) && ($img > 4) ) { - $this->dbg(' more than 4 images and more image elements than paragraph elements'); - $toRemove = true; - } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { - $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); - $toRemove = true; - } else if ( $input > floor($p/3) ) { - $this->dbg(' too many <input> elements'); - $toRemove = true; - } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { - $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); - $toRemove = true; - } else if($weight < 25 && $linkDensity > 0.2) { - $this->dbg(' weight smaller than 25 and link density above 0.2'); - $toRemove = true; - } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { - $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); - $toRemove = true; - } else if($embedCount > 3) { - $this->dbg(' 
more than 3 embeds'); - $toRemove = true; - } - } else { - $this->dbg('Standard clean...'); - if ( $img > $p ) { - $this->dbg(' more image elements than paragraph elements'); - $toRemove = true; - } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { - $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); - $toRemove = true; - } else if ( $input > floor($p/3) ) { - $this->dbg(' too many <input> elements'); - $toRemove = true; - } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { - $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); - $toRemove = true; - } else if($weight < 25 && $linkDensity > 0.2) { - $this->dbg(' weight smaller than 25 and link density above 0.2'); - $toRemove = true; - } else if($weight >= 25 && $linkDensity > 0.5) { - $this->dbg(' weight above 25 but link density greater than 0.5'); - $toRemove = true; - } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { - $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); - $toRemove = true; - } - } - - if ($toRemove) { - //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); - $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); - } - } - } - } - - /** - * Clean out spurious headers from an Element. Checks things like classnames and link density. - * - * @param DOMElement $e - * @return void - */ - public function cleanHeaders($e) { - for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { - $headers = $e->getElementsByTagName('h' . 
$headerIndex); - for ($i=$headers->length-1; $i >=0; $i--) { - if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { - $headers->item($i)->parentNode->removeChild($headers->item($i)); - } - } - } - } - - public function flagIsActive($flag) { - return ($this->flags & $flag) > 0; - } - - public function addFlag($flag) { - $this->flags = $this->flags | $flag; - } - - public function removeFlag($flag) { - $this->flags = $this->flags & ~$flag; - } -} \ No newline at end of file