diff --git a/common.php b/common.php index 9ee3f11..00662f9 100644 --- a/common.php +++ b/common.php @@ -3,7 +3,6 @@ require __DIR__.'/lib/Translator.php'; require __DIR__.'/vendor/PicoDb/Database.php'; require __DIR__.'/vendor/PicoFeed/PicoFeed.php'; -require __DIR__.'/vendor/Readability/Readability.php'; require __DIR__.'/vendor/SimpleValidator/Validator.php'; require __DIR__.'/vendor/SimpleValidator/Base.php'; diff --git a/docs/full-article-download.markdown b/docs/full-article-download.markdown index 9e9d29c..229cac7 100644 --- a/docs/full-article-download.markdown +++ b/docs/full-article-download.markdown @@ -8,8 +8,7 @@ How the content grabber works? 1. Try with rules first (xpath patterns) for the domain name (see `PicoFeed\Rules\`) 2. Try to find the text content by using common attributes for class and id -3. Fallback to Readability if no content is found -4. Finally, if nothing is found, the feed content is displayed +3. Finally, if nothing is found, the feed content is displayed The content downloader use a fake user agent, actually Google Chrome under Mac Os X. diff --git a/models/item.php b/models/item.php index 6cdc889..8e1b57c 100644 --- a/models/item.php +++ b/models/item.php @@ -8,7 +8,6 @@ use PicoFeed\Logging; use PicoFeed\Grabber; use PicoFeed\Client; use PicoFeed\Filter; -use Readability; // Get all items without filtering function get_everything() @@ -535,12 +534,9 @@ function download_content_url($url) if ($grabber->parse()) { $content = $grabber->getcontent(); } - else { - $content = download_content_readability($grabber->getRawContent(), $url); - } if (! empty($content)) { - $filter = new Filter($content, $url); + $filter = Filter::html($content, $url); $filter->setConfig(Config\get_reader_config()); $content = $filter->execute(); } @@ -580,18 +576,3 @@ function download_content_id($item_id) 'content' => '' ); } - -// Download content with Readability PHP port -function download_content_readability($content, $url) -{ - if (! empty($content)) { - - $readability = new Readability($content, $url); - - if ($readability->init()) { - return $readability->getContent()->innerHTML; - } - } - - return ''; -} diff --git a/vendor/PicoDb/Database.php b/vendor/PicoDb/Database.php index 5d0beb8..c09d8a9 100644 --- a/vendor/PicoDb/Database.php +++ b/vendor/PicoDb/Database.php @@ -86,6 +86,11 @@ class Database public function escapeIdentifier($value) { + // Do not escape custom query + if (strpos($value, '.') !== false || strpos($value, ' ') !== false) { + return $value; + } + return $this->pdo->escapeIdentifier($value); } diff --git a/vendor/PicoDb/Drivers/Mysql.php b/vendor/PicoDb/Drivers/Mysql.php index 22277a0..96148a1 100644 --- a/vendor/PicoDb/Drivers/Mysql.php +++ b/vendor/PicoDb/Drivers/Mysql.php @@ -70,7 +70,6 @@ class Mysql extends \PDO { public function escapeIdentifier($value) { - if (strpos($value, '.') !== false) return $value; return '`'.$value.'`'; } } \ No newline at end of file diff --git a/vendor/PicoDb/Drivers/Sqlite.php b/vendor/PicoDb/Drivers/Sqlite.php index 83b61c4..38c823a 100644 --- a/vendor/PicoDb/Drivers/Sqlite.php +++ b/vendor/PicoDb/Drivers/Sqlite.php @@ -51,7 +51,6 @@ class Sqlite extends \PDO { public function escapeIdentifier($value) { - if (strpos($value, '.') !== false) return $value; return '"'.$value.'"'; } } \ No newline at end of file diff --git a/vendor/PicoDb/Table.php b/vendor/PicoDb/Table.php index cc63743..9c6bf4f 100644 --- a/vendor/PicoDb/Table.php +++ b/vendor/PicoDb/Table.php @@ -173,6 +173,10 @@ class Table public function buildSelectQuery() { + foreach ($this->columns as $key => $value) { + $this->columns[$key] = $this->db->escapeIdentifier($value); + } + return sprintf( 'SELECT %s %s FROM %s %s %s %s %s %s %s', $this->distinct ? 'DISTINCT' : '', @@ -350,7 +354,7 @@ class Table switch (strtolower($name)) { case 'in': - if (isset($arguments[1]) && is_array($arguments[1])) { + if (isset($arguments[1]) && is_array($arguments[1]) && ! empty($arguments[1])) { $sql = sprintf( '%s IN (%s)', @@ -361,7 +365,7 @@ class Table break; case 'notin': - if (isset($arguments[1]) && is_array($arguments[1])) { + if (isset($arguments[1]) && is_array($arguments[1]) && ! empty($arguments[1])) { $sql = sprintf( '%s NOT IN (%s)', diff --git a/vendor/PicoFeed/Client.php b/vendor/PicoFeed/Client.php index a79840c..59e9aa9 100644 --- a/vendor/PicoFeed/Client.php +++ b/vendor/PicoFeed/Client.php @@ -5,7 +5,6 @@ namespace PicoFeed; use LogicException; use Clients\Curl; use Clients\Stream; -use PicoFeed\Logging; /** * Client class @@ -23,6 +22,14 @@ abstract class Client */ private $is_modified = true; + /** + * Flag that say if the resource is a 404 + * + * @access private + * @var bool + */ + private $is_not_found = false; + /** * HTTP encoding * @@ -170,38 +177,110 @@ abstract class Client $response = $this->doRequest(); if (is_array($response)) { - - if ($response['status'] == 304) { - $this->is_modified = false; - Logging::setMessage(get_called_class().' Resource not modified'); - } - else if ($response['status'] == 404) { - Logging::setMessage(get_called_class().' Resource not found'); - } - else { - $etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : ''; - $last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : ''; - $this->content = $response['body']; - - if (isset($response['headers']['Content-Type'])) { - $result = explode('charset=', strtolower($response['headers']['Content-Type'])); - $this->encoding = isset($result[1]) ? $result[1] : ''; - } - - if (($this->etag && $this->etag === $etag) || ($this->last_modified && $last_modified === $this->last_modified)) { - $this->is_modified = false; - } - - $this->etag = $etag; - $this->last_modified = $last_modified; - } - + $this->handleNotModifiedResponse($response); + $this->handleNotFoundResponse($response); + $this->handleNormalResponse($response); return true; } return false; } + /** + * Handle not modified response + * + * @access public + * @param array $response Client response + */ + public function handleNotModifiedResponse(array $response) + { + if ($response['status'] == 304) { + $this->is_modified = false; + } + else if ($response['status'] == 200) { + + $etag = $this->getHeader($response, 'ETag'); + $last_modified = $this->getHeader($response, 'Last-Modified'); + + if ($this->isPropertyEquals('etag', $etag) || $this->isPropertyEquals('last_modified', $last_modified)) { + $this->is_modified = false; + } + + $this->etag = $etag; + $this->last_modified = $last_modified; + } + + if ($this->is_modified === false) { + Logging::setMessage(get_called_class().' Resource not modified'); + } + } + + /** + * Handle not found response + * + * @access public + * @param array $response Client response + */ + public function handleNotFoundResponse(array $response) + { + if ($response['status'] == 404) { + $this->is_not_found = true; + Logging::setMessage(get_called_class().' Resource not found'); + } + } + + /** + * Handle normal response + * + * @access public + * @param array $response Client response + */ + public function handleNormalResponse(array $response) + { + if ($response['status'] == 200) { + $this->content = $response['body']; + $this->encoding = $this->findCharset($response); + } + } + + /** + * Check if a class property equals to a value + * + * @access public + * @param string $property Class property + * @param string $value Value + * @return boolean + */ + private function isPropertyEquals($property, $value) + { + return $this->$property && $this->$property === $value; + } + + /** + * Find charset from response headers + * + * @access public + * @param array $response Client response + */ + public function findCharset(array $response) + { + $result = explode('charset=', strtolower($this->getHeader($response, 'Content-Type'))); + return isset($result[1]) ? $result[1] : ''; + } + + /** + * Get header value from a client response + * + * @access public + * @param array $response Client response + * @param string $header Header name + * @return string + */ + public function getHeader(array $response, $header) + { + return isset($response['headers'][$header]) ? $response['headers'][$header] : ''; + } + /** * Parse HTTP headers * @@ -340,6 +419,17 @@ abstract class Client return $this->is_modified; } + /** + * Return true if the remote resource is not found + * + * @access public + * @return bool + */ + public function isNotFound() + { + return $this->is_not_found; + } + /** * Set connection timeout * @@ -453,14 +543,16 @@ abstract class Client */ public function setConfig($config) { - $this->setTimeout($config->getGrabberTimeout()); - $this->setUserAgent($config->getGrabberUserAgent()); - $this->setMaxRedirections($config->getMaxRedirections()); - $this->setMaxBodySize($config->getMaxBodySize()); - $this->setProxyHostname($config->getProxyHostname()); - $this->setProxyPort($config->getProxyPort()); - $this->setProxyUsername($config->getProxyUsername()); - $this->setProxyPassword($config->getProxyPassword()); + if ($config !== null) { + $this->setTimeout($config->getGrabberTimeout()); + $this->setUserAgent($config->getGrabberUserAgent()); + $this->setMaxRedirections($config->getMaxRedirections()); + $this->setMaxBodySize($config->getMaxBodySize()); + $this->setProxyHostname($config->getProxyHostname()); + $this->setProxyPort($config->getProxyPort()); + $this->setProxyUsername($config->getProxyUsername()); + $this->setProxyPassword($config->getProxyPassword()); + } return $this; } diff --git a/vendor/PicoFeed/Clients/Curl.php b/vendor/PicoFeed/Clients/Curl.php index 66a4773..a1ee96d 100644 --- a/vendor/PicoFeed/Clients/Curl.php +++ b/vendor/PicoFeed/Clients/Curl.php @@ -97,36 +97,37 @@ class Curl extends Client } /** - * Do the HTTP request + * Prepare HTTP headers * - * @access public - * @param bool $follow_location Flag used when there is an open_basedir restriction - * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] + * @access private + * @return array */ - public function doRequest($follow_location = true) + private function prepareHeaders() { - $request_headers = array('Connection: close'); + $headers = array( + 'Connection: close', + 'User-Agent: '.$this->user_agent, + ); - if ($this->etag) $request_headers[] = 'If-None-Match: '.$this->etag; - if ($this->last_modified) $request_headers[] = 'If-Modified-Since: '.$this->last_modified; + if ($this->etag) { + $headers[] = 'If-None-Match: '.$this->etag; + } - $ch = curl_init(); + if ($this->last_modified) { + $headers[] = 'If-Modified-Since: '.$this->last_modified; + } - curl_setopt($ch, CURLOPT_URL, $this->url); - curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); - curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout); - curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); - curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent); - curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, ini_get('open_basedir') === ''); - curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects); - curl_setopt($ch, CURLOPT_ENCODING, ''); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates... - curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody')); - curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders')); - curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory'); - curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory'); + return $headers; + } + /** + * Prepare curl proxy context + * + * @access private + * @return resource + */ + private function prepareProxyContext($ch) + { if ($this->proxy_hostname) { Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); @@ -144,6 +145,47 @@ class Curl extends Client } } + return $ch; + } + + /** + * Prepare curl context + * + * @access private + * @return resource + */ + private function prepareContext() + { + $ch = curl_init(); + + curl_setopt($ch, CURLOPT_URL, $this->url); + curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_HTTPHEADER, $this->prepareHeaders()); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, ini_get('open_basedir') === ''); + curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects); + curl_setopt($ch, CURLOPT_ENCODING, ''); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates... + curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody')); + curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders')); + curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory'); + curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory'); + + $ch = $this->prepareProxyContext($ch); + + return $ch; + } + + /** + * Execute curl context + * + * @access private + * @return resource + */ + private function executeContext() + { + $ch = $this->prepareContext(); curl_exec($ch); Logging::setMessage(get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME)); @@ -153,44 +195,34 @@ class Curl extends Client Logging::setMessage(get_called_class().' cURL effective url: '.curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)); if (curl_errno($ch)) { - Logging::setMessage(get_called_class().' cURL error: '.curl_error($ch)); - curl_close($ch); return false; } curl_close($ch); + return true; + } + + /** + * Do the HTTP request + * + * @access public + * @param bool $follow_location Flag used when there is an open_basedir restriction + * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] + */ + public function doRequest($follow_location = true) + { + if (! $this->executeContext()) { + return false; + } + list($status, $headers) = $this->parseHeaders(explode("\r\n", $this->headers[$this->headers_counter - 1])); - if ($follow_location && ini_get('open_basedir') !== '' && ($status == 301 || $status == 302)) { - - $nb_redirects = 0; - $this->url = $headers['Location']; - $this->body = ''; - $this->body_length = 0; - $this->headers = array(); - $this->headers_counter = 0; - - while (true) { - - $nb_redirects++; - if ($nb_redirects >= $this->max_redirects) return false; - - $result = $this->doRequest(false); - - if ($result['status'] == 301 || $result['status'] == 302) { - $this->url = $result['headers']['Location']; - $this->body = ''; - $this->body_length = 0; - $this->headers = array(); - $this->headers_counter = 0; - } - else { - return $result; - } - } + // When resticted with open_basedir + if ($this->needToHandleRedirection($follow_location, $status)) { + return $this->handleRedirection($headers['Location']); } return array( @@ -199,4 +231,58 @@ class Curl extends Client 'headers' => $headers ); } + + /** + * Check if the redirection have to be handled manually + * + * @access private + * @param boolean $follow_location Flag + * @param integer $status HTTP status code + * @return boolean + */ + private function needToHandleRedirection($follow_location, $status) + { + return $follow_location && ini_get('open_basedir') !== '' && ($status == 301 || $status == 302); + } + + /** + * Handle manually redirections when there is an open base dir restriction + * + * @access private + * @param string $location Redirected URL + * @return boolean|array + */ + private function handleRedirection($location) + { + $nb_redirects = 0; + $this->url = $location; + $this->body = ''; + $this->body_length = 0; + $this->headers = array(); + $this->headers_counter = 0; + + while (true) { + + $nb_redirects++; + + if ($nb_redirects >= $this->max_redirects) { + return false; + } + + $result = $this->doRequest(false); + + if ($result['status'] == 301 || $result['status'] == 302) { + $this->url = $result['headers']['Location']; + $this->body = ''; + $this->body_length = 0; + $this->headers = array(); + $this->headers_counter = 0; + } + else { + return $result; + } + } + + return false; + } } diff --git a/vendor/PicoFeed/Clients/Stream.php b/vendor/PicoFeed/Clients/Stream.php index af5ae7e..f16952f 100644 --- a/vendor/PicoFeed/Clients/Stream.php +++ b/vendor/PicoFeed/Clients/Stream.php @@ -14,14 +14,13 @@ use \PicoFeed\Client; class Stream extends Client { /** - * Do the HTTP request + * Prepare HTTP headers * - * @access public - * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] + * @access private + * @return array */ - public function doRequest() + private function prepareHeaders() { - // Prepare HTTP headers for the request $headers = array( 'Connection: close', 'User-Agent: '.$this->user_agent, @@ -39,14 +38,27 @@ class Stream extends Client $headers[] = 'If-Modified-Since: '.$this->last_modified; } - // Create context - $context_options = array( + if ($this->proxy_username) { + $headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password); + } + + return $headers; + } + + /** + * Prepare stream context + * + * @access private + * @return array + */ + private function prepareContext() + { + $context = array( 'http' => array( 'method' => 'GET', 'protocol_version' => 1.1, 'timeout' => $this->timeout, 'max_redirects' => $this->max_redirects, - 'header' => implode("\r\n", $headers) ) ); @@ -54,31 +66,46 @@ class Stream extends Client Logging::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port); - $context_options['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port; - $context_options['http']['request_fulluri'] = true; + $context['http']['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port; + $context['http']['request_fulluri'] = true; if ($this->proxy_username) { Logging::setMessage(get_called_class().' Proxy credentials: Yes'); - - $headers[] = 'Proxy-Authorization: Basic '.base64_encode($this->proxy_username.':'.$this->proxy_password); - $context_options['http']['header'] = implode("\r\n", $headers); } else { Logging::setMessage(get_called_class().' Proxy credentials: No'); } } - $context = stream_context_create($context_options); + $context['http']['header'] = implode("\r\n", $this->prepareHeaders()); + + return $context; + } + + /** + * Do the HTTP request + * + * @access public + * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...] + */ + public function doRequest() + { + // Create context + $context = stream_context_create($this->prepareContext()); // Make HTTP request $stream = @fopen($this->url, 'r', false, $context); - if (! is_resource($stream)) return false; + if (! is_resource($stream)) { + return false; + } // Get the entire body until the max size $body = stream_get_contents($stream, $this->max_body_size + 1); // If the body size is too large abort everything - if (strlen($body) > $this->max_body_size) return false; + if (strlen($body) > $this->max_body_size) { + return false; + } // Get HTTP headers response $metadata = stream_get_meta_data($stream); @@ -87,6 +114,23 @@ class Stream extends Client fclose($stream); + return array( + 'status' => $status, + 'body' => $this->decodeBody($body, $headers), + 'headers' => $headers + ); + } + + /** + * Decode body response according to the HTTP headers + * + * @access public + * @param string $body Raw body + * @param array $headers HTTP headers + * @return string + */ + public function decodeBody($body, array $headers) + { if (isset($headers['Transfer-Encoding']) && $headers['Transfer-Encoding'] === 'chunked') { $body = $this->decodeChunked($body); } @@ -95,11 +139,7 @@ class Stream extends Client $body = @gzdecode($body); } - return array( - 'status' => $status, - 'body' => $body, - 'headers' => $headers - ); + return $body; } /** diff --git a/vendor/PicoFeed/Config.php b/vendor/PicoFeed/Config.php index 935e019..283ce23 100644 --- a/vendor/PicoFeed/Config.php +++ b/vendor/PicoFeed/Config.php @@ -7,6 +7,52 @@ namespace PicoFeed; * * @author Frederic Guillot * @package picofeed + * + * @method \PicoFeed\Config setClientTimeout(integer $value) + * @method \PicoFeed\Config setClientUserAgent(string $value) + * @method \PicoFeed\Config setMaxRedirections(integer $value) + * @method \PicoFeed\Config setMaxBodySize(integer $value) + * @method \PicoFeed\Config setProxyHostname(string $value) + * @method \PicoFeed\Config setProxyPort(integer $value) + * @method \PicoFeed\Config setProxyUsername(string $value) + * @method \PicoFeed\Config setProxyPassword(string $value) + * @method \PicoFeed\Config setGrabberTimeout(integer $value) + * @method \PicoFeed\Config setGrabberUserAgent(string $value) + * @method \PicoFeed\Config setParserHashAlgo(string $value) + * @method \PicoFeed\Config setContentFiltering(boolean $value) + * @method \PicoFeed\Config setTimezone(string $value) + * @method \PicoFeed\Config setFilterIframeWhitelist(array $value) + * @method \PicoFeed\Config setFilterIntegerAttributes(array $value) + * @method \PicoFeed\Config setFilterAttributeOverrides(array $value) + * @method \PicoFeed\Config setFilterRequiredAttributes(array $value) + * @method \PicoFeed\Config setFilterMediaBlacklist(array $value) + * @method \PicoFeed\Config setFilterMediaAttributes(array $value) + * @method \PicoFeed\Config setFilterSchemeWhitelist(array $value) + * @method \PicoFeed\Config setFilterWhitelistedTags(array $value) + * @method \PicoFeed\Config setFilterBlacklistedTags(array $value) + * + * @method integer getClientTimeout() + * @method string getClientUserAgent() + * @method integer getMaxRedirections() + * @method integer getMaxBodySize() + * @method string getProxyHostname() + * @method integer getProxyPort() + * @method string getProxyUsername() + * @method string getProxyPassword() + * @method integer getGrabberTimeout() + * @method string getGrabberUserAgent() + * @method string getParserHashAlgo() + * @method boolean getContentFiltering(bool $default_value) + * @method string getTimezone() + * @method array getFilterIframeWhitelist(array $default_value) + * @method array getFilterIntegerAttributes(array $default_value) + * @method array getFilterAttributeOverrides(array $default_value) + * @method array getFilterRequiredAttributes(array $default_value) + * @method array getFilterMediaBlacklist(array $default_value) + * @method array getFilterMediaAttributes(array $default_value) + * @method array getFilterSchemeWhitelist(array $default_value) + * @method array getFilterWhitelistedTags(array $default_value) + * @method array getFilterBlacklistedTags(array $default_value) */ class Config { diff --git a/vendor/PicoFeed/Export.php b/vendor/PicoFeed/Export.php index df03f98..5fa0c4b 100644 --- a/vendor/PicoFeed/Export.php +++ b/vendor/PicoFeed/Export.php @@ -58,23 +58,39 @@ class Export $body = $xml->addChild('body'); - foreach ($this->content as $feed) { + foreach ($this->content as $category => $values) { - $valid = true; - - foreach ($this->required_fields as $field) { - - if (! isset($feed[$field])) { - $valid = false; - break; - } + if (is_string($category)) { + $this->createCategory($body, $category, $values); } - - if (! $valid) { - continue; + else { + $this->createEntry($body, $values); } + } - $outline = $body->addChild('outline'); + return $xml->asXML(); + } + + /** + * Create a feed entry + * + * @access public + * @param SimpleXMLElement $parent Parent Element + * @param array $feed Feed properties + */ + public function createEntry(SimpleXMLElement $parent, array $feed) + { + $valid = true; + + foreach ($this->required_fields as $field) { + if (! isset($feed[$field])) { + $valid = false; + break; + } + } + + if ($valid) { + $outline = $parent->addChild('outline'); $outline->addAttribute('xmlUrl', $feed['feed_url']); $outline->addAttribute('htmlUrl', $feed['site_url']); $outline->addAttribute('title', $feed['title']); @@ -83,7 +99,34 @@ class Export $outline->addAttribute('type', 'rss'); $outline->addAttribute('version', 'RSS'); } + } - return $xml->asXML(); + /** + * Create entries for a feed list + * + * @access public + * @param SimpleXMLElement $parent Parent Element + * @param array $feeds Feed list + */ + public function createEntries(SimpleXMLElement $parent, array $feeds) + { + foreach ($feeds as $feed) { + $this->createEntry($parent, $feed); + } + } + + /** + * Create a category entry + * + * @access public + * @param SimpleXMLElement $parent Parent Element + * @param string $category Category + * @param array $feed Feed properties + */ + public function createCategory(SimpleXMLElement $parent, $category, array $feeds) + { + $outline = $parent->addChild('outline'); + $outline->addAttribute('text', $category); + $this->createEntries($outline, $feeds); } } diff --git a/vendor/PicoFeed/Favicon.php b/vendor/PicoFeed/Favicon.php new file mode 100644 index 0000000..ec87531 --- /dev/null +++ b/vendor/PicoFeed/Favicon.php @@ -0,0 +1,163 @@ +config = $config ?: new Config; + } + + /** + * Get the icon file content (available only after the download) + * + * @access public + * @return string + */ + public function getContent() + { + return $this->content; + } + + /** + * Download and check if a resource exists + * + * @access public + * @param string $url URL + * @return string Resource content + */ + public function download($url) + { + Logging::setMessage(get_called_class().' Download => '.$url); + + $client = Client::getInstance(); + $client->setConfig($this->config); + + if ($client->execute($url) && ! $client->isNotFound()) { + return $client->getContent(); + } + + return ''; + } + + /** + * Check if a remote file exists + * + * @access public + * @param string $url URL + * @return boolean + */ + public function exists($url) + { + return $this->download($url) !== ''; + } + + /** + * Get the icon link for a website + * + * @access public + * @param string $website_link URL + * @return string + */ + public function find($website_link) + { + $website = new Url($website_link); + + $icons = $this->extract($this->download($website->getBaseUrl('/'))); + $icons[] = $website->getBaseUrl('/favicon.ico'); + + foreach ($icons as $icon_link) { + + $icon_link = $this->convertLink($website, new Url($icon_link)); + $this->content = $this->download($icon_link); + + if ($this->content !== '') { + return $icon_link; + } + } + + return ''; + } + + /** + * Convert icon links to absolute url + * + * @access public + * @param \PicoFeed\Url $website Website url + * @param \PicoFeed\Url $icon Icon url + * @return string + */ + public function convertLink(Url $website, Url $icon) + { + $base_url = ''; + + if ($icon->isRelativeUrl()) { + $base_url = $website->getBaseUrl(); + } + else if ($icon->isProtocolRelative()) { + $icon->setScheme($website->getScheme()); + } + + return $icon->getAbsoluteUrl($base_url); + } + + /** + * Extract the icon links from the HTML + * + * @access public + * @param string $html HTML + * @return array + */ + public function extract($html) + { + $icons = array(); + + if (empty($html)) { + return $icons; + } + + $dom = XmlParser::getHtmlDocument($html); + + $xpath = new DOMXpath($dom); + $elements = $xpath->query("//link[contains(@rel, 'icon') and not(contains(@rel, 'apple'))]"); + + for ($i = 0; $i < $elements->length; $i++) { + $icons[] = $elements->item($i)->getAttribute('href'); + } + + return $icons; + } +} diff --git a/vendor/PicoFeed/Feed.php b/vendor/PicoFeed/Feed.php index 90ce0d8..6bd6392 100644 --- a/vendor/PicoFeed/Feed.php +++ b/vendor/PicoFeed/Feed.php @@ -35,7 +35,15 @@ class Feed public $title = ''; /** - * Item url + * Feed description + * + * @access public + * @var string + */ + public $description = ''; + + /** + * Feed url * * @access public * @var string @@ -43,7 +51,7 @@ class Feed public $url = ''; /** - * Item date + * Feed date * * @access public * @var integer @@ -51,13 +59,21 @@ class Feed public $date = 0; /** - * Item language + * Feed language * * @access public * @var string */ public $language = ''; + /** + * Feed logo URL (not the same as icon) + * + * @access public + * @var string + */ + public $logo = ''; + /** * Return feed information * @@ -68,7 +84,7 @@ class Feed { $output = ''; - foreach (array('id', 'title', 'url', 'date', 'language') as $property) { + foreach (array('id', 'title', 'url', 'date', 'language', 'description', 'logo') as $property) { $output .= 'Feed::'.$property.' = '.$this->$property.PHP_EOL; } @@ -93,6 +109,28 @@ class Feed return $this->title; } + /** + * Get description + * + * @access public + * $return string + */ + public function getDescription() + { + return $this->description; + } + + /** + * Get the logo url + * + * @access public + * $return string + */ + public function getLogo() + { + return $this->logo; + } + /** * Get url * diff --git a/vendor/PicoFeed/Filter.php b/vendor/PicoFeed/Filter.php index bbfd97a..fab3926 100644 --- a/vendor/PicoFeed/Filter.php +++ b/vendor/PicoFeed/Filter.php @@ -2,7 +2,7 @@ namespace PicoFeed; -use DOMDocument; +use PicoFeed\Filter\Html; /** * Filter class @@ -13,436 +13,18 @@ use DOMDocument; class Filter { /** - * Config object - * - * @access private - * @var \PicoFeed\Config - */ - private $config = null; - - /** - * Filtered XML data - * - * @access private - * @var string - */ - private $data = ''; - - /** - * Site URL (used to build absolute URL) - * - * @access private - * @var string - */ - private $url = ''; - - /** - * Unfiltered XML data - * - * @access private - * @var string - */ - private $input = ''; - - /** - * List of empty tags - * - * @access private - * @var array - */ - private $empty_tags = array(); - - /** - * Flag to remove the content of a tag - * - * @access private - * @var boolean - */ - private $strip_content = false; - - /** - * Flag to remember if the current payload is a source code
- * - * @access private - * @var boolean - */ - private $is_code = false; - - /** - * Tags and attribute whitelist - * - * @access private - * @var array - */ - private $whitelist_tags = array( - 'audio' => array('controls', 'src'), - 'video' => array('poster', 'controls', 'height', 'width', 'src'), - 'source' => array('src', 'type'), - 'dt' => array(), - 'dd' => array(), - 'dl' => array(), - 'table' => array(), - 'caption' => array(), - 'tr' => array(), - 'th' => array(), - 'td' => array(), - 'tbody' => array(), - 'thead' => array(), - 'h2' => array(), - 'h3' => array(), - 'h4' => array(), - 'h5' => array(), - 'h6' => array(), - 'strong' => array(), - 'em' => array(), - 'code' => array(), - 'pre' => array(), - 'blockquote' => array(), - 'p' => array(), - 'ul' => array(), - 'li' => array(), - 'ol' => array(), - 'br' => array(), - 'del' => array(), - 'a' => array('href'), - 'img' => array('src', 'title', 'alt'), - 'figure' => array(), - 'figcaption' => array(), - 'cite' => array(), - 'time' => array('datetime'), - 'abbr' => array('title'), - 'iframe' => array('width', 'height', 'frameborder', 'src'), - 'q' => array('cite') - ); - - /** - * Tags blacklist, strip the content of those tags - * - * @access private - * @var array - */ - private $blacklisted_tags = array( - 'script' - ); - - /** - * Scheme whitelist - * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme - * - * @access private - * @var array - */ - private $scheme_whitelist = array( - '//', - 'data:image/png;base64,', - 'data:image/gif;base64,', - 'data:image/jpg;base64,', - 'bitcoin:', - 'callto:', - 'ed2k://', - 'facetime://', - 'feed:', - 'ftp://', - 'geo:', - 'git://', - 'http://', - 'https://', - 'irc://', - 'irc6://', - 'ircs://', - 'jabber:', - 'magnet:', - 'mailto:', - 'nntp://', - 'rtmp://', - 'sftp://', - 'sip:', - 'sips:', - 'skype:', - 'smb://', - 'sms:', - 'spotify:', - 'ssh:', - 'steam:', - 'svn://', - 'tel:', - ); - - /** - * Attributes used for external resources - * - * @access private - * @var array - */ - private $media_attributes = array( - 'src', - 'href', - 'poster', - ); - - /** - * Blacklisted resources - * - * @access private - * @var array - */ - private $media_blacklist = array( - 'feeds.feedburner.com', - 'share.feedsportal.com', - 'da.feedsportal.com', - 'rss.feedsportal.com', - 'res.feedsportal.com', - 'res1.feedsportal.com', - 'res2.feedsportal.com', - 'res3.feedsportal.com', - 'pi.feedsportal.com', - 'rss.nytimes.com', - 'feeds.wordpress.com', - 'stats.wordpress.com', - 'rss.cnn.com', - 'twitter.com/home?status=', - 'twitter.com/share', - 'twitter_icon_large.png', - 'www.facebook.com/sharer.php', - 'facebook_icon_large.png', - 'plus.google.com/share', - 'www.gstatic.com/images/icons/gplus-16.png', - 'www.gstatic.com/images/icons/gplus-32.png', - 'www.gstatic.com/images/icons/gplus-64.png', - ); - - /** - * Mandatory attributes for specified tags - * - * @access private - * @var array - */ - private $required_attributes = array( - 'a' => array('href'), - 'img' => array('src'), - 'iframe' => array('src'), - 'audio' => array('src'), - 'source' => array('src'), - ); - - /** - * Add attributes to specified tags - * - * @access private - * @var array - */ - private $add_attributes = array( - 'a' => 'rel="noreferrer" target="_blank"' - ); - - /** - * Attributes that must be integer - * - * @access private - * @var array - */ - private $integer_attributes = array( - 'width', - 'height', - 'frameborder', - ); - - /** - * Iframe source whitelist, everything else is ignored - * - * @access private - * @var array - */ - private $iframe_whitelist = array( - '//www.youtube.com', - 'http://www.youtube.com', - 'https://www.youtube.com', - 'http://player.vimeo.com', - 'https://player.vimeo.com', - 'http://www.dailymotion.com', - 'https://www.dailymotion.com', - ); - - /** - * Initialize the filter, all inputs data must be encoded in UTF-8 before + * Get the Html filter instance * + * @static * @access public - * @param string $data XML content - * @param string $site_url Site URL (used to build absolute URL) + * @param string $html HTML content + * @param string $website Site URL (used to build absolute URL) + * @return PicoFeed\Filter\Html */ - public function __construct($data, $site_url) + public static function html($html, $website) { - $this->url = $site_url; - - libxml_use_internal_errors(true); - - // Convert bad formatted documents to XML - $dom = new DOMDocument; - $dom->loadHTML(''.$data); - $this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0)); - } - - /** - * Run tags/attributes filtering - * - * @access public - * @return string - */ - public function execute() - { - $parser = xml_parser_create(); - xml_set_object($parser, $this); - xml_set_element_handler($parser, 'startTag', 'endTag'); - xml_set_character_data_handler($parser, 'dataTag'); - xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false); - xml_parse($parser, $this->input, true); // We ignore parsing error (for old libxml) - xml_parser_free($parser); - - $this->data = $this->removeEmptyTags($this->data); - $this->data = $this->removeMultipleTags($this->data); - - return trim($this->data); - } - - /** - * Parse opening tag - * - * @access public - * @param resource $parser XML parser - * @param string $name Tag name - * @param array $attributes Tag attributes - */ - public function startTag($parser, $name, $attributes) - { - $empty_tag = false; - $this->strip_content = false; - - if ($this->is_code === false && $name === 'pre') $this->is_code = true; - - if ($this->isPixelTracker($name, $attributes)) { - - $empty_tag = true; - } - else if ($this->isAllowedTag($name)) { - - $attr_data = ''; - $used_attributes = array(); - - foreach ($attributes as $attribute => $value) { - - if ($value != '' && $this->isAllowedAttribute($name, $attribute)) { - - if ($this->isResource($attribute)) { - - if ($name === 'iframe') { - - if ($this->isAllowedIframeResource($value)) { - - $attr_data .= ' '.$attribute.'="'.$this->escape($value).'"'; - $used_attributes[] = $attribute; - } - } - else if ($this->isRelativePath($value)) { - - $attr_data .= ' '.$attribute.'="'.$this->escape($this->getAbsoluteUrl($value, $this->url)).'"'; - $used_attributes[] = $attribute; - } - else if ($this->isAllowedProtocol($value) && ! $this->isBlacklistedMedia($value)) { - - if ($attribute == 'src' && - isset($attributes['data-src']) && - $this->isAllowedProtocol($attributes['data-src']) && - ! $this->isBlacklistedMedia($attributes['data-src'])) { - - $value = $attributes['data-src']; - } - - // Replace protocol-relative url // by http:// - if (substr($value, 0, 2) === '//') $value = 'http:'.$value; - - $attr_data .= ' '.$attribute.'="'.$this->escape($value).'"'; - $used_attributes[] = $attribute; - } - } - else if ($this->validateAttributeValue($attribute, $value)) { - - $attr_data .= ' '.$attribute.'="'.$this->escape($value).'"'; - $used_attributes[] = $attribute; - } - } - } - - // Check for required attributes - if (isset($this->required_attributes[$name])) { - - foreach ($this->required_attributes[$name] as $required_attribute) { - - if (! in_array($required_attribute, $used_attributes)) { - - $empty_tag = true; - break; - } - } - } - - if (! $empty_tag) { - - $this->data .= '<'.$name.$attr_data; - - // Add custom attributes - if (isset($this->add_attributes[$name])) { - - $this->data .= ' '.$this->add_attributes[$name].' '; - } - - // If img or br, we don't close it here - if ($name !== 'img' && $name !== 'br') $this->data .= '>'; - } - } - - if (in_array($name, $this->blacklisted_tags)) { - $this->strip_content = true; - } - - $this->empty_tags[] = $empty_tag; - } - - /** - * Parse closing tag - * - * @access public - * @param resource $parser XML parser - * @param string $name Tag name - */ - public function endTag($parser, $name) - { - if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) { - $this->data .= $name !== 'img' && $name !== 'br' ? ''.$name.'>' : '/>'; - } - - if ($this->is_code && $name === 'pre') $this->is_code = false; - } - - /** - * Parse tag content - * - * @access public - * @param resource $parser XML parser - * @param string $content Tag content - */ - public function dataTag($parser, $content) - { - $content = str_replace("\xc2\xa0", ' ', $content); // Replace with normal space - - // Issue with Cyrillic characters - // Replace mutliple space by a single one - // if (! $this->is_code) { - // $content = preg_replace('!\s+!', ' ', $content); - // } - - if (! $this->strip_content) { - $this->data .= $this->escape($content); - } + $filter = new Html($html, $website); + return $filter; } /** @@ -454,222 +36,7 @@ class Filter */ public static function escape($content) { - return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false); - } - - /** - * Get the absolute url for a relative link - * - * @access public - * @param string $path Relative path - * @param string $url Site base url - * @return string - */ - public static function getAbsoluteUrl($path, $url) - { - $components = parse_url($url); - - if (! isset($components['scheme'])) $components['scheme'] = 'http'; - - if (! isset($components['host'])) { - - if ($url) { - - $components['host'] = $url; - $components['path'] = '/'; - } - else { - - return ''; - } - } - - if (! strlen($path)) return $url; - - if ($path{0} === '/') { - - // Absolute path - return $components['scheme'].'://'.$components['host'].$path; - } - else { - - // Relative path - $url_path = isset($components['path']) && ! empty($components['path']) ? $components['path'] : '/'; - $length = strlen($url_path); - - if ($length > 1 && $url_path{$length - 1} !== '/') { - $url_path = dirname($url_path).'/'; - } - - if (substr($path, 0, 2) === './') { - $path = substr($path, 2); - } - - return $components['scheme'].'://'.$components['host'].$url_path.$path; - } - } - - /** - * Check if an url is relative - * - * @access public - * @param string $value Attribute value - * @return boolean - */ - public static function isRelativePath($value) - { - if (strpos($value, 'data:') === 0) return false; - return strpos($value, '://') === false && strpos($value, '//') !== 0; - } - - /** - * Check if a tag is on the whitelist - * - * @access public - * @param string $name Tag name - * @return boolean - */ - public function isAllowedTag($name) - { - return isset($this->whitelist_tags[$name]); - } - - /** - * Check if an attribute is allowed for a given tag - * - * @access public - * @param string $tag Tag name - * @param array $attribute Attribute name - * @return boolean - */ - public function isAllowedAttribute($tag, $attribute) - { - return in_array($attribute, $this->whitelist_tags[$tag]); - } - - /** - * Check if an attribute name is an external resource - * - * @access public - * @param string $data Attribute name - * @return boolean - */ - public function isResource($attribute) - { - return in_array($attribute, $this->media_attributes); - } - - /** - * Check if an iframe url is allowed - * - * @access public - * @param string $value Attribute value - * @return boolean - */ - public function isAllowedIframeResource($value) - { - foreach ($this->iframe_whitelist as $url) { - - if (strpos($value, $url) === 0) { - return true; - } - } - - return false; - } - - /** - * Detect if the protocol is allowed or not - * - * @access public - * @param string $value Attribute value - * @return boolean - */ - public function isAllowedProtocol($value) - { - foreach ($this->scheme_whitelist as $protocol) { - - if (strpos($value, $protocol) === 0) { - return true; - } - } - - return false; - } - - /** - * Detect if an url is blacklisted - * - * @access public - * @param string $resouce Attribute value (URL) - * @return boolean - */ - public function isBlacklistedMedia($resource) - { - foreach ($this->media_blacklist as $name) { - - if (strpos($resource, $name) !== false) { - return true; - } - } - - return false; - } - - /** - * Detect if an image tag is a pixel tracker - * - * @access public - * @param string $tag Tag name - * @param array $attributes Tag attributes - * @return boolean - */ - public function isPixelTracker($tag, array $attributes) - { - return $tag === 'img' && - isset($attributes['height']) && isset($attributes['width']) && - $attributes['height'] == 1 && $attributes['width'] == 1; - } - - /** - * Check if an attribute value is integer - * - * @access public - * @param string $attribute Attribute name - * @param string $value Attribute value - * @return boolean - */ - public function validateAttributeValue($attribute, $value) - { - if (in_array($attribute, $this->integer_attributes)) { - return ctype_digit($value); - } - - return true; - } - - /** - * ReplacePara 1
Para 2
Para 1
Para 2
' -* echo "\n\n"; -* -* // set innerHTML -* $elem->innerHTML = 'FiveFilters.org'; -* echo $elem->innerHTML; // prints 'FiveFilters.org' -* echo "\n\n"; -* -* // print document (with our changes) -* echo $doc->saveXML(); -* @endcode -* -* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net -* @see http://fivefilters.org (the project this was written for) -*/ -class JSLikeHTMLElement extends DOMElement -{ - /** - * Used for setting innerHTML like it's done in JavaScript: - * @code - * $div->innerHTML = 'The story begins...
'; - * @endcode - */ - public function __set($name, $value) { - if ($name == 'innerHTML') { - // first, empty the element - for ($x=$this->childNodes->length-1; $x>=0; $x--) { - $this->removeChild($this->childNodes->item($x)); - } - // $value holds our new inner HTML - if ($value != '') { - $f = $this->ownerDocument->createDocumentFragment(); - // appendXML() expects well-formed markup (XHTML) - $result = @$f->appendXML($value); // @ to suppress PHP warnings - if ($result) { - if ($f->hasChildNodes()) $this->appendChild($f); - } else { - // $value is probably ill-formed - $f = new DOMDocument(); - $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); - // Using', $html); - $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); - $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); - if (trim($html) == '') $html = ''; - if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { - // all good - } else { - $this->dom = new DOMDocument(); - $this->dom->preserveWhiteSpace = false; - @$this->dom->loadHTML($html); - } - $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); - } - - /** - * Get article title element - * @return DOMElement - */ - public function getTitle() { - return $this->articleTitle; - } - - /** - * Get article content element - * @return DOMElement - */ - public function getContent() { - return $this->articleContent; - } - - /** - * Runs readability. - * - * Workflow: - * 1. Prep the document by removing script tags, css, etc. - * 2. Build readability's DOM tree. - * 3. Grab the article content from the current dom tree. - * 4. Replace the current DOM tree with the new one. - * 5. Read peacefully. - * - * @return boolean true if we found content, false otherwise - **/ - public function init() - { - if (!isset($this->dom->documentElement)) return false; - $this->removeScripts($this->dom); - //die($this->getInnerHTML($this->dom->documentElement)); - - // Assume successful outcome - $this->success = true; - - $bodyElems = $this->dom->getElementsByTagName('body'); - if ($bodyElems->length > 0) { - if ($this->bodyCache == null) { - $this->bodyCache = $bodyElems->item(0)->innerHTML; - } - if ($this->body == null) { - $this->body = $bodyElems->item(0); - } - } - - $this->prepDocument(); - - //die($this->dom->documentElement->parentNode->nodeType); - //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); - //die($this->getInnerHTML($this->dom->documentElement)); - - /* Build readability's DOM tree */ - $overlay = $this->dom->createElement('div'); - $innerDiv = $this->dom->createElement('div'); - $articleTitle = $this->getArticleTitle(); - $articleContent = $this->grabArticle(); - - if (!$articleContent) { - $this->success = false; - $articleContent = $this->dom->createElement('div'); - $articleContent->setAttribute('id', 'readability-content'); - $articleContent->innerHTML = '
Sorry, Readability was unable to parse this page for content.
'; - } - - $overlay->setAttribute('id', 'readOverlay'); - $innerDiv->setAttribute('id', 'readInner'); - - /* Glue the structure of our document together. */ - $innerDiv->appendChild($articleTitle); - $innerDiv->appendChild($articleContent); - $overlay->appendChild($innerDiv); - - /* Clear the old HTML, insert the new content. */ - $this->body->innerHTML = ''; - $this->body->appendChild($overlay); - //document.body.insertBefore(overlay, document.body.firstChild); - $this->body->removeAttribute('style'); - - $this->postProcessContent($articleContent); - - // Set title and content instance variables - $this->articleTitle = $articleTitle; - $this->articleContent = $articleContent; - - return $this->success; - } - - /** - * Debug - */ - protected function dbg($msg) { - if ($this->debug) echo '* ',$msg, "\n"; - } - - /** - * Run any post-process modifications to article content as necessary. - * - * @param DOMElement - * @return void - */ - public function postProcessContent($articleContent) { - if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { - $this->addFootnotes($articleContent); - } - } - - /** - * Get the article title as an H1. - * - * @return DOMElement - */ - protected function getArticleTitle() { - $curTitle = ''; - $origTitle = ''; - - try { - $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); - } catch(Exception $e) {} - - if (preg_match('/ [\|\-] /', $curTitle)) - { - $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); - - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); - } - } - else if (strpos($curTitle, ': ') !== false) - { - $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); - - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); - } - } - else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) - { - $hOnes = $this->dom->getElementsByTagName('h1'); - if($hOnes->length == 1) - { - $curTitle = $this->getInnerText($hOnes->item(0)); - } - } - - $curTitle = trim($curTitle); - - if (count(explode(' ', $curTitle)) <= 4) { - $curTitle = $origTitle; - } - - $articleTitle = $this->dom->createElement('h1'); - $articleTitle->innerHTML = $curTitle; - - return $articleTitle; - } - - /** - * Prepare the HTML document for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - * - * @return void - **/ - protected function prepDocument() { - /** - * In some cases a body element can't be found (if the HTML is totally hosed for example) - * so we create a new body node and append it to the document. - */ - if ($this->body == null) - { - $this->body = $this->dom->createElement('body'); - $this->dom->documentElement->appendChild($this->body); - } - $this->body->setAttribute('id', 'readabilityBody'); - - /* Remove all style tags in head */ - $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length-1; $i >= 0; $i--) - { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); - } - - /* Turn all double br's into p's */ - /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ - //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '').replace(readability.regexps.replaceFonts, '<$1span>'); - // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. - // Manipulating innerHTML as it's done in JS is not possible in PHP. - } - - /** - * For easier reading, convert this document to have footnotes at the bottom rather than inline links. - * @see http://www.roughtype.com/archives/2010/05/experiments_in.php - * - * @return void - **/ - public function addFootnotes($articleContent) { - $footnotesWrapper = $this->dom->createElement('div'); - $footnotesWrapper->setAttribute('id', 'readability-footnotes'); - $footnotesWrapper->innerHTML = '
tags, etc.
- *
- * @param DOMElement
- * @return void
- */
- function prepArticle($articleContent) {
- $this->cleanStyles($articleContent);
- $this->killBreaks($articleContent);
- if ($this->revertForcedParagraphElements) {
- $this->revertReadabilityStyledElements($articleContent);
- }
-
- /* Clean out junk from the article content */
- $this->cleanConditionally($articleContent, 'form');
- $this->clean($articleContent, 'object');
- $this->clean($articleContent, 'h1');
-
- /**
- * If there is only one h2, they are probably using it
- * as a header and not a subheader, so remove it since we already have a header.
- ***/
- if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
- $this->clean($articleContent, 'h2');
- }
- $this->clean($articleContent, 'iframe');
-
- $this->cleanHeaders($articleContent);
-
- /* Do these last as the previous stuff may have removed junk that will affect these */
- $this->cleanConditionally($articleContent, 'table');
- $this->cleanConditionally($articleContent, 'ul');
- $this->cleanConditionally($articleContent, 'div');
-
- /* Remove extra paragraphs */
- $articleParagraphs = $articleContent->getElementsByTagName('p');
- for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
- {
- $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
- $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
- $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
- $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
-
- if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
- {
- $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
- }
- }
-
- try {
- $articleContent->innerHTML = preg_replace('/
]*>\s*
innerHTML);
- //articleContent.innerHTML = articleContent.innerHTML.replace(/
]*>\s*
dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
- }
- }
-
- /**
- * Initialize a node with the readability object. Also checks the
- * className/id for special names to add to its score.
- *
- * @param Element
- * @return void
- **/
- protected function initializeNode($node) {
- $readability = $this->dom->createAttribute('readability');
- $readability->value = 0; // this is our contentScore
- $node->setAttributeNode($readability);
-
- switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case
- case 'DIV':
- $readability->value += 5;
- break;
-
- case 'PRE':
- case 'TD':
- case 'BLOCKQUOTE':
- $readability->value += 3;
- break;
-
- case 'ADDRESS':
- case 'OL':
- case 'UL':
- case 'DL':
- case 'DD':
- case 'DT':
- case 'LI':
- case 'FORM':
- $readability->value -= 3;
- break;
-
- case 'H1':
- case 'H2':
- case 'H3':
- case 'H4':
- case 'H5':
- case 'H6':
- case 'TH':
- $readability->value -= 5;
- break;
- }
- $readability->value += $this->getClassWeight($node);
- }
-
- /***
- * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
- * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
- *
- * @return DOMElement
- **/
- protected function grabArticle($page=null) {
- $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
- if (!$page) $page = $this->dom;
- $allElements = $page->getElementsByTagName('*');
- /**
- * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
- * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
- *
- * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
- * TODO: Shouldn't this be a reverse traversal?
- **/
- $node = null;
- $nodesToScore = array();
- for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
- //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
- //$node = $targetList->item($nodeIndex);
- $tagName = strtoupper($node->tagName);
- /* Remove unlikely candidates */
- if ($stripUnlikelyCandidates) {
- $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
- if (
- preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
- !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
- $tagName != 'BODY'
- )
- {
- $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
- //$nodesToRemove[] = $node;
- $node->parentNode->removeChild($node);
- $nodeIndex--;
- continue;
- }
- }
-
- if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
- $nodesToScore[] = $node;
- }
-
- /* Turn all divs that don't have children block level elements into p's */
- if ($tagName == 'DIV') {
- if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
- //$this->dbg('Altering div to p');
- $newNode = $this->dom->createElement('p');
- try {
- $newNode->innerHTML = $node->innerHTML;
- //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
- $node->parentNode->replaceChild($newNode, $node);
- $nodeIndex--;
- $nodesToScore[] = $node; // or $newNode?
- }
- catch(Exception $e) {
- $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
- }
- }
- else
- {
- /* EXPERIMENTAL */
- // TODO: change these p elements back to text nodes after processing
- for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
- $childNode = $node->childNodes->item($i);
- if ($childNode->nodeType == 3) { // XML_TEXT_NODE
- //$this->dbg('replacing text node with a p tag with the same content.');
- $p = $this->dom->createElement('p');
- $p->innerHTML = $childNode->nodeValue;
- $p->setAttribute('style', 'display: inline;');
- $p->setAttribute('class', 'readability-styled');
- $childNode->parentNode->replaceChild($p, $childNode);
- }
- }
- }
- }
- }
-
- /**
- * Loop through all paragraphs, and assign a score to them based on how content-y they look.
- * Then add their score to their parent node.
- *
- * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
- **/
- $candidates = array();
- for ($pt=0; $pt < count($nodesToScore); $pt++) {
- $parentNode = $nodesToScore[$pt]->parentNode;
- // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
- $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
- $innerText = $this->getInnerText($nodesToScore[$pt]);
-
- if (!$parentNode || !isset($parentNode->tagName)) {
- continue;
- }
-
- /* If this paragraph is less than 25 characters, don't even count it. */
- if(strlen($innerText) < 25) {
- continue;
- }
-
- /* Initialize readability data for the parent. */
- if (!$parentNode->hasAttribute('readability'))
- {
- $this->initializeNode($parentNode);
- $candidates[] = $parentNode;
- }
-
- /* Initialize readability data for the grandparent. */
- if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
- {
- $this->initializeNode($grandParentNode);
- $candidates[] = $grandParentNode;
- }
-
- $contentScore = 0;
-
- /* Add a point for the paragraph itself as a base. */
- $contentScore++;
-
- /* Add points for any commas within this paragraph */
- $contentScore += count(explode(',', $innerText));
-
- /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
- $contentScore += min(floor(strlen($innerText) / 100), 3);
-
- /* Add the score to the parent. The grandparent gets half. */
- $parentNode->getAttributeNode('readability')->value += $contentScore;
-
- if ($grandParentNode) {
- $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
- }
- }
-
- /**
- * After we've calculated scores, loop through all of the possible candidate nodes we found
- * and find the one with the highest score.
- **/
- $topCandidate = null;
- for ($c=0, $cl=count($candidates); $c < $cl; $c++)
- {
- /**
- * Scale the final candidates score based on link density. Good content should have a
- * relatively small link density (5% or less) and be mostly unaffected by this operation.
- **/
- $readability = $candidates[$c]->getAttributeNode('readability');
- $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
-
- $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
-
- if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
- $topCandidate = $candidates[$c];
- }
- }
-
- /**
- * If we still have no top candidate, just use the body as a last resort.
- * We also have to copy the body node so it is something we can modify.
- **/
- if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
- {
- $topCandidate = $this->dom->createElement('div');
- if ($page instanceof DOMDocument) {
- if (!isset($page->documentElement)) {
- // we don't have a body either? what a mess! :)
- } else {
- $topCandidate->innerHTML = $page->documentElement->innerHTML;
- $page->documentElement->innerHTML = '';
- $page->documentElement->appendChild($topCandidate);
- }
- } else {
- $topCandidate->innerHTML = $page->innerHTML;
- $page->innerHTML = '';
- $page->appendChild($topCandidate);
- }
- $this->initializeNode($topCandidate);
- }
-
- /**
- * Now that we have the top candidate, look through its siblings for content that might also be related.
- * Things like preambles, content split by ads that we removed, etc.
- **/
- $articleContent = $this->dom->createElement('div');
- $articleContent->setAttribute('id', 'readability-content');
- $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
- $siblingNodes = $topCandidate->parentNode->childNodes;
- if (!isset($siblingNodes)) {
- $siblingNodes = new stdClass;
- $siblingNodes->length = 0;
- }
-
- for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
- {
- $siblingNode = $siblingNodes->item($s);
- $append = false;
-
- $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
-
- //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
-
- if ($siblingNode === $topCandidate)
- // or if ($siblingNode->isSameNode($topCandidate))
- {
- $append = true;
- }
-
- $contentBonus = 0;
- /* Give a bonus if sibling nodes and top candidates have the example same classname */
- if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
- $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
- }
-
- if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
- {
- $append = true;
- }
-
- if (strtoupper($siblingNode->nodeName) == 'P') {
- $linkDensity = $this->getLinkDensity($siblingNode);
- $nodeContent = $this->getInnerText($siblingNode);
- $nodeLength = strlen($nodeContent);
-
- if ($nodeLength > 80 && $linkDensity < 0.25)
- {
- $append = true;
- }
- else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
- {
- $append = true;
- }
- }
-
- if ($append)
- {
- $this->dbg('Appending node: ' . $siblingNode->nodeName);
-
- $nodeToAppend = null;
- $sibNodeName = strtoupper($siblingNode->nodeName);
- if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
- /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
-
- $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
- $nodeToAppend = $this->dom->createElement('div');
- try {
- $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
- $nodeToAppend->innerHTML = $siblingNode->innerHTML;
- }
- catch(Exception $e)
- {
- $this->dbg('Could not alter siblingNode to div, reverting back to original.');
- $nodeToAppend = $siblingNode;
- $s--;
- $sl--;
- }
- } else {
- $nodeToAppend = $siblingNode;
- $s--;
- $sl--;
- }
-
- /* To ensure a node does not interfere with readability styles, remove its classnames */
- $nodeToAppend->removeAttribute('class');
-
- /* Append sibling and subtract from our list because it removes the node when you append to another node */
- $articleContent->appendChild($nodeToAppend);
- }
- }
-
- /**
- * So we have all of the content that we need. Now we clean it up for presentation.
- **/
- $this->prepArticle($articleContent);
-
- /**
- * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
- * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
- * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
- * finding the -right- content.
- **/
- if (strlen($this->getInnerText($articleContent, false)) < 250)
- {
- // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
- // in the meantime, we check and create an empty element if it's not there.
- if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
- $this->body->innerHTML = $this->bodyCache;
-
- if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
- $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
- return $this->grabArticle($this->body);
- }
- else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
- $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
- return $this->grabArticle($this->body);
- }
- else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
- $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
- return $this->grabArticle($this->body);
- }
- else {
- return false;
- }
- }
- return $articleContent;
- }
-
- /**
- * Remove script tags from document
- *
- * @param DOMElement
- * @return void
- */
- public function removeScripts($doc) {
- $scripts = $doc->getElementsByTagName('script');
- for($i = $scripts->length-1; $i >= 0; $i--)
- {
- $scripts->item($i)->parentNode->removeChild($scripts->item($i));
- }
- }
-
- /**
- * Get the inner text of a node.
- * This also strips out any excess whitespace to be found.
- *
- * @param DOMElement $
- * @param boolean $normalizeSpaces (default: true)
- * @return string
- **/
- public function getInnerText($e, $normalizeSpaces=true) {
- $textContent = '';
-
- if (!isset($e->textContent) || $e->textContent == '') {
- return '';
- }
-
- $textContent = trim($e->textContent);
-
- if ($normalizeSpaces) {
- return preg_replace($this->regexps['normalize'], ' ', $textContent);
- } else {
- return $textContent;
- }
- }
-
- /**
- * Get the number of times a string $s appears in the node $e.
- *
- * @param DOMElement $e
- * @param string - what to count. Default is ","
- * @return number (integer)
- **/
- public function getCharCount($e, $s=',') {
- return substr_count($this->getInnerText($e), $s);
- }
-
- /**
- * Remove the style attribute on every $e and under.
- *
- * @param DOMElement $e
- * @return void
- */
- public function cleanStyles($e) {
- if (!is_object($e)) return;
- $elems = $e->getElementsByTagName('*');
- foreach ($elems as $elem) {
- $elem->removeAttribute('style');
- }
- }
-
- /**
- * Get the density of links as a percentage of the content
- * This is the amount of text that is inside a link divided by the total text in the node.
- *
- * @param DOMElement $e
- * @return number (float)
- */
- public function getLinkDensity($e) {
- $links = $e->getElementsByTagName('a');
- $textLength = strlen($this->getInnerText($e));
- $linkLength = 0;
- for ($i=0, $il=$links->length; $i < $il; $i++)
- {
- $linkLength += strlen($this->getInnerText($links->item($i)));
- }
- if ($textLength > 0) {
- return $linkLength / $textLength;
- } else {
- return 0;
- }
- }
-
- /**
- * Get an elements class/id weight. Uses regular expressions to tell if this
- * element looks good or bad.
- *
- * @param DOMElement $e
- * @return number (Integer)
- */
- public function getClassWeight($e) {
- if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
- return 0;
- }
-
- $weight = 0;
-
- /* Look for a special classname */
- if ($e->hasAttribute('class') && $e->getAttribute('class') != '')
- {
- if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) {
- $weight -= 25;
- }
- if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) {
- $weight += 25;
- }
- }
-
- /* Look for a special ID */
- if ($e->hasAttribute('id') && $e->getAttribute('id') != '')
- {
- if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) {
- $weight -= 25;
- }
- if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) {
- $weight += 25;
- }
- }
- return $weight;
- }
-
- /**
- * Remove extraneous break tags from a node.
- *
- * @param DOMElement $node
- * @return void
- */
- public function killBreaks($node) {
- $html = $node->innerHTML;
- $html = preg_replace($this->regexps['killBreaks'], '
', $html);
- $node->innerHTML = $html;
- }
-
- /**
- * Clean a node of all elements of type "tag".
- * (Unless it's a youtube/vimeo video. People love movies.)
- *
- * Updated 2012-09-18 to preserve youtube/vimeo iframes
- *
- * @param DOMElement $e
- * @param string $tag
- * @return void
- */
- public function clean($e, $tag) {
- $targetList = $e->getElementsByTagName($tag);
- $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
-
- for ($y=$targetList->length-1; $y >= 0; $y--) {
- /* Allow youtube and vimeo videos through as people usually want to see those. */
- if ($isEmbed) {
- $attributeValues = '';
- for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) {
- $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test)
- }
-
- /* First, check the elements attributes to see if any of them contain youtube or vimeo */
- if (preg_match($this->regexps['video'], $attributeValues)) {
- continue;
- }
-
- /* Then check the elements inside this element for the same. */
- if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) {
- continue;
- }
- }
- $targetList->item($y)->parentNode->removeChild($targetList->item($y));
- }
- }
-
- /**
- * Clean an element of all tags of type "tag" if they look fishy.
- * "Fishy" is an algorithm based on content length, classnames,
- * link density, number of images & embeds, etc.
- *
- * @param DOMElement $e
- * @param string $tag
- * @return void
- */
- public function cleanConditionally($e, $tag) {
- if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
- return;
- }
-
- $tagsList = $e->getElementsByTagName($tag);
- $curTagsLength = $tagsList->length;
-
- /**
- * Gather counts for other typical elements embedded within.
- * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
- *
- * TODO: Consider taking into account original contentScore here.
- */
- for ($i=$curTagsLength-1; $i >= 0; $i--) {
- $weight = $this->getClassWeight($tagsList->item($i));
- $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0;
-
- $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : ''));
-
- if ($weight + $contentScore < 0) {
- $tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
- }
- else if ( $this->getCharCount($tagsList->item($i), ',') < 10) {
- /**
- * If there are not very many commas, and the number of
- * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
- **/
- $p = $tagsList->item($i)->getElementsByTagName('p')->length;
- $img = $tagsList->item($i)->getElementsByTagName('img')->length;
- $li = $tagsList->item($i)->getElementsByTagName('li')->length-100;
- $input = $tagsList->item($i)->getElementsByTagName('input')->length;
- $a = $tagsList->item($i)->getElementsByTagName('a')->length;
-
- $embedCount = 0;
- $embeds = $tagsList->item($i)->getElementsByTagName('embed');
- for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
- if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
- $embedCount++;
- }
- }
- $embeds = $tagsList->item($i)->getElementsByTagName('iframe');
- for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
- if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
- $embedCount++;
- }
- }
-
- $linkDensity = $this->getLinkDensity($tagsList->item($i));
- $contentLength = strlen($this->getInnerText($tagsList->item($i)));
- $toRemove = false;
-
- if ($this->lightClean) {
- $this->dbg('Light clean...');
- if ( ($img > $p) && ($img > 4) ) {
- $this->dbg(' more than 4 images and more image elements than paragraph elements');
- $toRemove = true;
- } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
- $this->dbg(' too many