Bug fixes: charset encoding/decoding

This commit is contained in:
Frédéric Guillot 2014-05-25 08:47:03 -04:00
parent 35e07a8903
commit c6f5606070
8 changed files with 71 additions and 90 deletions

View File

@ -609,6 +609,12 @@ a.bookmark-icon {
text-decoration: none; text-decoration: none;
} }
/* Wrapper around an item's enclosure media (audio, video or image player);
   the dashed bottom border and spacing set it apart from whatever follows it. */
#item-content-enclosure {
border-bottom: 1px dashed #ccc;
margin-bottom: 20px;
padding-bottom: 10px;
}
/* other pages */ /* other pages */
section li { section li {
margin-left: 15px; margin-left: 15px;

View File

@ -102,6 +102,8 @@ function relative_time($timestamp, $fallback_date_format = '%e %B %Y %k:%M')
{ {
$diff = time() - $timestamp; $diff = time() - $timestamp;
if ($diff < 0) return \dt($fallback_date_format, $timestamp);
if ($diff < 60) return \t('%d second'.($diff > 1 ? 's' : '').' ago', $diff); if ($diff < 60) return \t('%d second'.($diff > 1 ? 's' : '').' ago', $diff);
$diff = floor($diff / 60); $diff = floor($diff / 60);

View File

@ -90,9 +90,9 @@
</ul> </ul>
<div id="item-content" <?= Helper\isRTL($item['language']) ? 'dir="rtl"' : '' ?>> <div id="item-content" <?= Helper\isRTL($item['language']) ? 'dir="rtl"' : '' ?>>
<?= $item['content'] ?>
<?php if ($item['enclosure']): ?> <?php if ($item['enclosure']): ?>
<div id="item-content-enclosure">
<?php if (strpos($item['enclosure_type'], 'audio') !== false): ?> <?php if (strpos($item['enclosure_type'], 'audio') !== false): ?>
<audio controls> <audio controls>
<source src="<?= $item['enclosure'] ?>" type="<?= $item['enclosure_type'] ?>"> <source src="<?= $item['enclosure'] ?>" type="<?= $item['enclosure_type'] ?>">
@ -101,8 +101,13 @@
<video controls> <video controls>
<source src="<?= $item['enclosure'] ?>" type="<?= $item['enclosure_type'] ?>"> <source src="<?= $item['enclosure'] ?>" type="<?= $item['enclosure_type'] ?>">
</video> </video>
<?php elseif (strpos($item['enclosure_type'], 'image') !== false): ?>
<img src="<?= $item['enclosure'] ?>" alt="enclosure"/>
<?php endif ?> <?php endif ?>
</div>
<?php endif ?> <?php endif ?>
<?= $item['content'] ?>
</div> </div>
<?php if (isset($item_nav)): ?> <?php if (isset($item_nav)): ?>

View File

@ -42,36 +42,6 @@ class Encoding
159 => "\xc5\xb8" 159 => "\xc5\xb8"
); );
protected static $utf8ToWin1252 = array(
"\xe2\x82\xac" => "\x80",
"\xe2\x80\x9a" => "\x82",
"\xc6\x92" => "\x83",
"\xe2\x80\x9e" => "\x84",
"\xe2\x80\xa6" => "\x85",
"\xe2\x80\xa0" => "\x86",
"\xe2\x80\xa1" => "\x87",
"\xcb\x86" => "\x88",
"\xe2\x80\xb0" => "\x89",
"\xc5\xa0" => "\x8a",
"\xe2\x80\xb9" => "\x8b",
"\xc5\x92" => "\x8c",
"\xc5\xbd" => "\x8e",
"\xe2\x80\x98" => "\x91",
"\xe2\x80\x99" => "\x92",
"\xe2\x80\x9c" => "\x93",
"\xe2\x80\x9d" => "\x94",
"\xe2\x80\xa2" => "\x95",
"\xe2\x80\x93" => "\x96",
"\xe2\x80\x94" => "\x97",
"\xcb\x9c" => "\x98",
"\xe2\x84\xa2" => "\x99",
"\xc5\xa1" => "\x9a",
"\xe2\x80\xba" => "\x9b",
"\xc5\x93" => "\x9c",
"\xc5\xbe" => "\x9e",
"\xc5\xb8" => "\x9f"
);
/** /**
* Function Encoding::toUTF8 * Function Encoding::toUTF8
* *
@ -127,9 +97,7 @@ class Encoding
$i++; $i++;
} }
else { //not valid UTF8. Convert it. else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0"); $buf .= self::convertInvalidCharacter($c1);
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
} }
} }
else if ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8 else if ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8
@ -139,9 +107,7 @@ class Encoding
$i = $i + 2; $i = $i + 2;
} }
else { //not valid UTF8. Convert it. else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0"); $buf .= self::convertInvalidCharacter($c1);
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
} }
} }
else if ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8 else if ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8
@ -151,15 +117,11 @@ class Encoding
$i = $i + 2; $i = $i + 2;
} }
else { //not valid UTF8. Convert it. else { //not valid UTF8. Convert it.
$cc1 = (chr(ord($c1) / 64) | "\xc0"); $buf .= self::convertInvalidCharacter($c1);
$cc2 = ($c1 & "\x3f") | "\x80";
$buf .= $cc1 . $cc2;
} }
} }
else { //doesn't look like UTF8, but should be converted else { //doesn't look like UTF8, but should be converted
$cc1 = (chr(ord($c1) / 64) | "\xc0"); $buf .= self::convertInvalidCharacter($c1);
$cc2 = (($c1 & "\x3f") | "\x80");
$buf .= $cc1 . $cc2;
} }
} }
elseif (($c1 & "\xc0") == "\x80") { // needs conversion elseif (($c1 & "\xc0") == "\x80") { // needs conversion
@ -168,12 +130,10 @@ class Encoding
$buf .= self::$win1252ToUtf8[ord($c1)]; $buf .= self::$win1252ToUtf8[ord($c1)];
} }
else { else {
$cc1 = (chr(ord($c1) / 64) | "\xc0"); $buf .= self::convertInvalidCharacter($c1);
$cc2 = (($c1 & "\x3f") | "\x80");
$buf .= $cc1 . $cc2;
} }
} }
else { // it doesn't need convesion else { // it doesn't need conversion
$buf .= $c1; $buf .= $c1;
} }
} }
@ -185,8 +145,27 @@ class Encoding
} }
} }
public static function cp1251ToUtf8($input) public static function convertInvalidCharacter($c1)
{
$cc1 = chr(ord($c1) / 64) | "\xc0";
$cc2 = ($c1 & "\x3f") | "\x80";
return $cc1.$cc2;
}
public static function convert_CP_1251($input)
{ {
return iconv('CP1251', 'UTF-8//TRANSLIT', $input); return iconv('CP1251', 'UTF-8//TRANSLIT', $input);
} }
/**
 * Convert a string to UTF-8 according to its declared charset
 *
 * - "windows-1251" input is transcoded with convert_CP_1251()
 * - "utf-8" input is trusted and returned untouched
 * - anything else (including an empty/unknown charset) goes through toUTF8()
 *
 * @static
 * @access public
 * @param  string  $input     Input data
 * @param  string  $encoding  Charset label (HTTP header or XML declaration), any case
 * @return string
 */
public static function convert($input, $encoding)
{
    // Charset labels are case-insensitive and HTTP headers often send them
    // in upper case ("UTF-8", "Windows-1251"), so normalize before comparing.
    $encoding = strtolower(trim((string) $encoding));

    if ($encoding === 'windows-1251') {
        return self::convert_CP_1251($input);
    }

    if ($encoding === 'utf-8') {
        return $input;
    }

    return self::toUTF8($input);
}
} }

View File

@ -733,33 +733,6 @@ class Filter
return $data; return $data;
} }
/**
 * Get the encoding from a xml tag
 *
 * Only the XML declaration itself (<?xml ... ?>) is inspected. An empty
 * string is returned when there is no declaration or when the declaration
 * carries no encoding attribute.
 *
 * @static
 * @access public
 * @param  string  $data  Input data
 * @return string         Lowercased encoding name, or '' when not declared
 */
public static function getEncodingFromXmlTag($data)
{
    // "<?xml" must be followed by whitespace so that processing instructions
    // such as <?xml-stylesheet ...?> are not mistaken for the declaration.
    // [^>]*? keeps the search inside the declaration: it cannot run past "?>".
    // The value may be wrapped in single or double quotes.
    $pattern = '/<\?xml\s[^>]*?\bencoding\s*=\s*(["\'])([^"\'>]*)\1/';

    if (preg_match($pattern, $data, $matches) === 1) {
        return strtolower(trim($matches[2]));
    }

    return '';
}
/** /**
* Set whitelisted tags and attributes for each tag * Set whitelisted tags and attributes for each tag
* *

View File

@ -191,13 +191,7 @@ class Grabber
Logging::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"'); Logging::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Filter::stripHeadTags($this->html); $this->html = Filter::stripHeadTags($this->html);
$this->html = Encoding::convert($this->html, $this->encoding);
if ($this->encoding == 'windows-1251') {
$this->html = Encoding::cp1251ToUtf8($this->html);
}
else {
$this->html = Encoding::toUTF8($this->html);
}
Logging::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes'); Logging::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');
$rules = $this->getRules(); $rules = $this->getRules();

View File

@ -86,19 +86,14 @@ abstract class Parser
*/ */
public function __construct($content, $http_encoding = '') public function __construct($content, $http_encoding = '')
{ {
$xml_encoding = Filter::getEncodingFromXmlTag($content); $xml_encoding = XmlParser::getEncodingFromXmlTag($content);
Logging::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
// Strip XML tag to avoid multiple encoding/decoding in the next XML processing // Strip XML tag to avoid multiple encoding/decoding in the next XML processing
$this->content = Filter::stripXmlTag($content); $this->content = Filter::stripXmlTag($content);
// Encode everything in UTF-8 // Encode everything in UTF-8
if ($xml_encoding == 'windows-1251' || $http_encoding == 'windows-1251') { Logging::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
$this->content = Encoding::cp1251ToUtf8($this->content); $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
}
else {
$this->content = Encoding::toUTF8($this->content);
}
// Workarounds // Workarounds
$this->content = $this->normalizeData($this->content); $this->content = $this->normalizeData($this->content);

View File

@ -133,4 +133,31 @@ class XmlParser
return implode(', ', $errors); return implode(', ', $errors);
} }
/**
 * Get the encoding from a xml tag
 *
 * Only the XML declaration itself (<?xml ... ?>) is inspected. An empty
 * string is returned when there is no declaration or when the declaration
 * carries no encoding attribute.
 *
 * @static
 * @access public
 * @param  string  $data  Input data
 * @return string         Lowercased encoding name, or '' when not declared
 */
public static function getEncodingFromXmlTag($data)
{
    // "<?xml" must be followed by whitespace so that processing instructions
    // such as <?xml-stylesheet ...?> are not mistaken for the declaration.
    // [^>]*? keeps the search inside the declaration: it cannot run past "?>".
    // The value may be wrapped in single or double quotes.
    $pattern = '/<\?xml\s[^>]*?\bencoding\s*=\s*(["\'])([^"\'>]*)\1/';

    if (preg_match($pattern, $data, $matches) === 1) {
        return strtolower(trim($matches[2]));
    }

    return '';
}
} }