Add support for CP1251 encoding

This commit is contained in:
Frédéric Guillot 2013-10-03 23:14:39 -04:00
parent 4d78a22684
commit de8133b7ee
8 changed files with 76 additions and 17 deletions

View File

@ -432,8 +432,7 @@ function download_content($url)
if (! empty($html)) {
// Try first with PicoFeed grabber and with Readability after
$grabber = new \PicoFeed\Grabber($url);
$grabber->html = $html;
$grabber = new \PicoFeed\Grabber($url, $html, $client->getEncoding());
$content = '';
if ($grabber->parse()) {

View File

@ -11,6 +11,7 @@ abstract class Client
protected static $proxy_username = null;
protected static $proxy_password = null;
public $encoding = '';
public $etag = '';
public $last_modified = '';
public $is_modified = true;
@ -70,6 +71,11 @@ abstract class Client
$this->etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : '';
$this->last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : '';
$this->content = $response['body'];
if (isset($response['headers']['Content-Type'])) {
$result = explode('charset=', strtolower($response['headers']['Content-Type']));
$this->encoding = isset($result[1]) ? $result[1] : '';
}
}
}
}
@ -149,6 +155,12 @@ abstract class Client
}
/**
 * Get the character encoding recorded for the last response.
 *
 * The value is the lowercased text that follows "charset=" in the
 * Content-Type response header, or an empty string when no charset
 * was found.
 *
 * @return string
 */
public function getEncoding()
{
return $this->encoding;
}
public function isModified()
{
return $this->is_modified;

View File

@ -322,4 +322,9 @@ class Encoding {
if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text);
}
/**
 * Convert a CP1251 (Windows-1251) encoded string to UTF-8.
 *
 * The conversion is delegated to iconv() with the //TRANSLIT flag, so
 * characters without a direct equivalent are approximated when possible.
 *
 * @param  string        $input  Text encoded in CP1251
 * @return string|false          Converted text, or false when iconv() fails
 */
public static function cp1251ToUtf8($input)
{
    $source_charset = 'CP1251';
    $target_charset = 'UTF-8//TRANSLIT';

    return iconv($source_charset, $target_charset, $input);
}
}

View File

@ -164,6 +164,7 @@ class Filter
);
// All input data must already be encoded in UTF-8
public function __construct($data, $site_url)
{
$this->url = $site_url;
@ -301,10 +302,11 @@ class Filter
{
$content = str_replace("\xc2\xa0", ' ', $content); // Replace   with normal space
// Issue with Cyrillic characters
// Replace multiple spaces with a single one
if (! $this->is_code) {
$content = preg_replace('!\s+!', ' ', $content);
}
// if (! $this->is_code) {
// $content = preg_replace('!\s+!', ' ', $content);
// }
if (! $this->strip_content) {
$this->data .= htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
@ -470,4 +472,24 @@ class Filter
{
return preg_replace('/<meta\s.*?\/>/is', '', $data);
}
/**
 * Extract the encoding declared in the XML declaration of a document.
 *
 * Only the first "<?xml ... ?>" declaration is inspected, so an
 * "encoding=" string appearing later in the document is never picked up.
 * Both quote styles and optional whitespace around "=" are accepted.
 *
 * @param  string  $data  Raw XML document
 * @return string         Lowercased encoding name, or an empty string when
 *                        there is no XML declaration or no encoding attribute
 */
public static function getEncodingFromXmlTag($data)
{
    // Isolate the XML declaration; the lazy quantifier stops at the first "?>"
    // and the mandatory whitespace rules out "<?xml-stylesheet ...?>"
    if (! preg_match('/<\?xml\s.*?\?>/is', $data, $declaration)) {
        return '';
    }

    // A declaration without an encoding attribute must yield an empty string,
    // not a slice taken at a guessed offset
    if (! preg_match('/\sencoding\s*=\s*(["\'])(.*?)\1/is', $declaration[0], $matches)) {
        return '';
    }

    return strtolower(trim($matches[2]));
}
}

View File

@ -11,6 +11,7 @@ class Grabber
{
public $content = '';
public $html = '';
public $encoding = '';
// Order is important, generic terms at the end
public $candidatesAttributes = array(
@ -67,9 +68,11 @@ class Grabber
);
public function __construct($url)
/**
 * Store the page URL, its HTML and the declared encoding on the instance.
 *
 * @param  string  $url       URL of the page
 * @param  string  $html      HTML of the page (defaults to an empty string)
 * @param  string  $encoding  Encoding label for $html (defaults to 'utf-8')
 */
public function __construct($url, $html = '', $encoding = 'utf-8')
{
$this->url = $url;
$this->html = $html;
$this->encoding = $encoding;
}
@ -78,8 +81,16 @@ class Grabber
if ($this->html) {
Logging::log(\get_called_class().' Fix encoding');
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Filter::stripMetaTags($this->html);
$this->html = Encoding::toUtf8($this->html);
if ($this->encoding == 'windows-1251') {
$this->html = Encoding::cp1251ToUtf8($this->html);
}
else {
$this->html = Encoding::toUTF8($this->html);
}
Logging::log(\get_called_class().' Try to find rules');
$rules = $this->getRules();

View File

@ -25,13 +25,21 @@ abstract class Parser
abstract public function execute();
public function __construct($content)
public function __construct($content, $http_encoding = '')
{
// Strip XML tag to avoid multiple encoding/decoding in next XML processing
$xml_encoding = Filter::getEncodingFromXmlTag($content);
Logging::log(\get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
// Strip XML tag to avoid multiple encoding/decoding in the next XML processing
$this->content = Filter::stripXmlTag($content);
// Encode everything in UTF-8
$this->content = Encoding::toUTF8($this->content);
if ($xml_encoding == 'windows-1251' || $http_encoding == 'windows-1251') {
$this->content = Encoding::cp1251ToUtf8($this->content);
}
else {
$this->content = Encoding::toUTF8($this->content);
}
// Workarounds
$this->content = $this->normalizeData($this->content);

View File

@ -81,7 +81,6 @@ class Rss20 extends \PicoFeed\Parser
if (empty($item->updated)) $item->updated = strtotime((string) $entry->pubDate) ?: $this->updated;
if (empty($item->content)) {
$item->content = isset($entry->description) ? (string) $entry->description : '';
}

View File

@ -11,11 +11,13 @@ class Reader
{
private $url = '';
private $content = '';
private $encoding = '';
public function __construct($content = '')
/**
 * Create a reader, optionally pre-loaded with content and its encoding.
 *
 * @param  string  $content   Feed or page content (defaults to an empty string)
 * @param  string  $encoding  Encoding label of $content (defaults to an empty string)
 */
public function __construct($content = '', $encoding = '')
{
    $this->content = $content;

    // Keep the caller-supplied encoding; assigning '' here would silently
    // discard the $encoding argument
    $this->encoding = $encoding;

    return $this;
}
@ -37,6 +39,7 @@ class Reader
$this->content = $client->getContent();
$this->url = $client->getUrl();
$this->encoding = $client->getEncoding();
return $client;
}
@ -86,7 +89,7 @@ class Reader
Logging::log(\get_called_class().': discover Atom feed');
require_once __DIR__.'/Parsers/Atom.php';
return new Parsers\Atom($this->content);
return new Parsers\Atom($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) {
@ -94,7 +97,7 @@ class Reader
Logging::log(\get_called_class().': discover RSS 2.0 feed');
require_once __DIR__.'/Parsers/Rss20.php';
return new Parsers\Rss20($this->content);
return new Parsers\Rss20($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) {
@ -102,7 +105,7 @@ class Reader
Logging::log(\get_called_class().': discover RSS 0.92 feed');
require_once __DIR__.'/Parsers/Rss92.php';
return new Parsers\Rss92($this->content);
return new Parsers\Rss92($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) {
@ -110,14 +113,14 @@ class Reader
Logging::log(\get_called_class().': discover RSS 0.91 feed');
require_once __DIR__.'/Parsers/Rss91.php';
return new Parsers\Rss91($this->content);
return new Parsers\Rss91($this->content, $this->encoding);
}
else if (strpos($first_tag, '<rdf:') !== false && strpos($first_tag, 'xmlns="http://purl.org/rss/1.0/"') !== false) {
Logging::log(\get_called_class().': discover RSS 1.0 feed');
require_once __DIR__.'/Parsers/Rss10.php';
return new Parsers\Rss10($this->content);
return new Parsers\Rss10($this->content, $this->encoding);
}
else if ($discover === true) {