From de8133b7eeddac127e6dcc57107a7ad033b60aab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 3 Oct 2013 23:14:39 -0400 Subject: [PATCH] Add support for CP1251 encoding --- model.php | 3 +-- vendor/PicoFeed/Client.php | 12 ++++++++++++ vendor/PicoFeed/Encoding.php | 5 +++++ vendor/PicoFeed/Filter.php | 28 +++++++++++++++++++++++++--- vendor/PicoFeed/Grabber.php | 15 +++++++++++++-- vendor/PicoFeed/Parser.php | 14 +++++++++++--- vendor/PicoFeed/Parsers/Rss20.php | 1 - vendor/PicoFeed/Reader.php | 15 +++++++++------ 8 files changed, 76 insertions(+), 17 deletions(-) diff --git a/model.php b/model.php index a20f65e..19a9dd8 100644 --- a/model.php +++ b/model.php @@ -432,8 +432,7 @@ function download_content($url) if (! empty($html)) { // Try first with PicoFeed grabber and with Readability after - $grabber = new \PicoFeed\Grabber($url); - $grabber->html = $html; + $grabber = new \PicoFeed\Grabber($url, $html, $client->getEncoding()); $content = ''; if ($grabber->parse()) { diff --git a/vendor/PicoFeed/Client.php b/vendor/PicoFeed/Client.php index 816d22a..a0912de 100644 --- a/vendor/PicoFeed/Client.php +++ b/vendor/PicoFeed/Client.php @@ -11,6 +11,7 @@ abstract class Client protected static $proxy_username = null; protected static $proxy_password = null; + public $encoding = ''; public $etag = ''; public $last_modified = ''; public $is_modified = true; @@ -70,6 +71,11 @@ abstract class Client $this->etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : ''; $this->last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : ''; $this->content = $response['body']; + + if (isset($response['headers']['Content-Type'])) { + $result = explode('charset=', strtolower($response['headers']['Content-Type'])); + $this->encoding = isset($result[1]) ? $result[1] : ''; + } } } } @@ -149,6 +155,12 @@ abstract class Client } + public function getEncoding() + { + return $this->encoding; + } + + public function isModified() { return $this->is_modified; diff --git a/vendor/PicoFeed/Encoding.php b/vendor/PicoFeed/Encoding.php index b757643..ebfa9a3 100644 --- a/vendor/PicoFeed/Encoding.php +++ b/vendor/PicoFeed/Encoding.php @@ -322,4 +322,9 @@ class Encoding { if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text); } + + public static function cp1251ToUtf8($input) + { + return iconv('CP1251', 'UTF-8//TRANSLIT', $input); + } } diff --git a/vendor/PicoFeed/Filter.php b/vendor/PicoFeed/Filter.php index 7bd7025..24279c0 100644 --- a/vendor/PicoFeed/Filter.php +++ b/vendor/PicoFeed/Filter.php @@ -164,6 +164,7 @@ class Filter ); + // All inputs data must be encoded in UTF-8 before public function __construct($data, $site_url) { $this->url = $site_url; @@ -301,10 +302,11 @@ class Filter { $content = str_replace("\xc2\xa0", ' ', $content); // Replace   with normal space + // Issue with Cyrillic characters // Replace mutliple space by a single one - if (! $this->is_code) { - $content = preg_replace('!\s+!', ' ', $content); - } + // if (! $this->is_code) { + // $content = preg_replace('!\s+!', ' ', $content); + // } if (! $this->strip_content) { $this->data .= htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false); @@ -470,4 +472,24 @@ class Filter { return preg_replace('//is', '', $data); } + + + public static function getEncodingFromXmlTag($data) + { + $encoding = ''; + + if (strpos($data, '')); + $data = str_replace("'", '"', $data); + + $p1 = strpos($data, 'encoding='); + $p2 = strpos($data, '"', $p1 + 10); + + $encoding = substr($data, $p1 + 10, $p2 - $p1 - 10); + $encoding = strtolower($encoding); + } + + return $encoding; + } } diff --git a/vendor/PicoFeed/Grabber.php b/vendor/PicoFeed/Grabber.php index 32a6a9a..cfb6c98 100644 --- a/vendor/PicoFeed/Grabber.php +++ b/vendor/PicoFeed/Grabber.php @@ -11,6 +11,7 @@ class Grabber { public $content = ''; public $html = ''; + public $encoding = ''; // Order is important, generic terms at the end public $candidatesAttributes = array( @@ -67,9 +68,11 @@ class Grabber ); - public function __construct($url) + public function __construct($url, $html = '', $encoding = 'utf-8') { $this->url = $url; + $this->html = $html; + $this->encoding = $encoding; } @@ -78,8 +81,16 @@ class Grabber if ($this->html) { Logging::log(\get_called_class().' Fix encoding'); + Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"'); + $this->html = Filter::stripMetaTags($this->html); - $this->html = Encoding::toUtf8($this->html); + + if ($this->encoding == 'windows-1251') { + $this->html = Encoding::cp1251ToUtf8($this->html); + } + else { + $this->html = Encoding::toUTF8($this->html); + } Logging::log(\get_called_class().' Try to find rules'); $rules = $this->getRules(); diff --git a/vendor/PicoFeed/Parser.php b/vendor/PicoFeed/Parser.php index 84d1b4d..667c20d 100644 --- a/vendor/PicoFeed/Parser.php +++ b/vendor/PicoFeed/Parser.php @@ -25,13 +25,21 @@ abstract class Parser abstract public function execute(); - public function __construct($content) + public function __construct($content, $http_encoding = '') { - // Strip XML tag to avoid multiple encoding/decoding in next XML processing + $xml_encoding = Filter::getEncodingFromXmlTag($content); + Logging::log(\get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); + + // Strip XML tag to avoid multiple encoding/decoding in the next XML processing $this->content = Filter::stripXmlTag($content); // Encode everything in UTF-8 - $this->content = Encoding::toUTF8($this->content); + if ($xml_encoding == 'windows-1251' || $http_encoding == 'windows-1251') { + $this->content = Encoding::cp1251ToUtf8($this->content); + } + else { + $this->content = Encoding::toUTF8($this->content); + } // Workarounds $this->content = $this->normalizeData($this->content); diff --git a/vendor/PicoFeed/Parsers/Rss20.php b/vendor/PicoFeed/Parsers/Rss20.php index 1c64516..bb7e82b 100644 --- a/vendor/PicoFeed/Parsers/Rss20.php +++ b/vendor/PicoFeed/Parsers/Rss20.php @@ -81,7 +81,6 @@ class Rss20 extends \PicoFeed\Parser if (empty($item->updated)) $item->updated = strtotime((string) $entry->pubDate) ?: $this->updated; if (empty($item->content)) { - $item->content = isset($entry->description) ? (string) $entry->description : ''; } diff --git a/vendor/PicoFeed/Reader.php b/vendor/PicoFeed/Reader.php index c76baee..4191c0a 100644 --- a/vendor/PicoFeed/Reader.php +++ b/vendor/PicoFeed/Reader.php @@ -11,11 +11,13 @@ class Reader { private $url = ''; private $content = ''; + private $encoding = ''; - public function __construct($content = '') + public function __construct($content = '', $encoding = '') { $this->content = $content; + $this->encoding = ''; return $this; } @@ -37,6 +39,7 @@ class Reader $this->content = $client->getContent(); $this->url = $client->getUrl(); + $this->encoding = $client->getEncoding(); return $client; } @@ -86,7 +89,7 @@ class Reader Logging::log(\get_called_class().': discover Atom feed'); require_once __DIR__.'/Parsers/Atom.php'; - return new Parsers\Atom($this->content); + return new Parsers\Atom($this->content, $this->encoding); } else if (strpos($first_tag, 'content); + return new Parsers\Rss20($this->content, $this->encoding); } else if (strpos($first_tag, 'content); + return new Parsers\Rss92($this->content, $this->encoding); } else if (strpos($first_tag, 'content); + return new Parsers\Rss91($this->content, $this->encoding); } else if (strpos($first_tag, 'content); + return new Parsers\Rss10($this->content, $this->encoding); } else if ($discover === true) {