2013-06-01 14:57:17 -04:00
|
|
|
<?php
|
|
|
|
|
2014-12-23 21:28:26 -05:00
|
|
|
namespace PicoFeed\Encoding;
|
2013-06-01 14:57:17 -04:00
|
|
|
|
|
|
|
/**
 * Best-effort conversion of text that is presumed to be Windows-1252 or
 * ISO 8859-1 into UTF-8, leaving byte sequences that already look like
 * UTF-8 untouched.
 *
 * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
 * @package Encoding
 * @version 1.2
 * @link https://github.com/neitanod/forceutf8
 * @example https://github.com/neitanod/forceutf8
 * @license Revised BSD
 */
class Encoding
{
    /**
     * UTF-8 replacements for bytes in the 128-159 range, keyed by byte value.
     *
     * Bytes of that range which are absent from this table (129, 141, 143,
     * 144, 157) are handled by convertInvalidCharacter() instead.
     *
     * @var array
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",
        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",
        142 => "\xc5\xbd",
        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",
        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * Function Encoding::toUTF8
     *
     * Leaves UTF-8 sequences alone while converting almost every other
     * high byte to UTF-8. The input is assumed to be either Windows-1252 or
     * ISO 8859-1. Arrays are converted element by element (recursively);
     * values that are neither arrays nor strings are returned unchanged.
     *
     * Because detection is purely structural, a byte run that happens to
     * form a well-formed UTF-8 sequence is NOT converted. This occurs:
     *
     * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these: ("group B")
     *                                  ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
     *    For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
     *    The "«" (%AB) character will be converted, but the "É" followed by
     *    "»" (%C9%BB) is also a valid unicode character, and will be left
     *    unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
     * 3) when any of these: ðñòó are followed by THREE chars from group B.
     *
     * @name toUTF8
     * @param string|array|mixed $text Any string, or an array of values to convert.
     * @return string|array|mixed The same value, UTF-8 encoded where it was a string.
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }

            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = '';

        for ($i = 0; $i < $max; $i++) {
            $c1 = $text[$i];
            $byte = ord($c1);

            // Plain ASCII never needs conversion.
            if ($byte < 0x80) {
                $buf .= $c1;
                continue;
            }

            // 0x80-0xBF outside of a multi-byte sequence: a stray byte that
            // is translated through the Windows-1252 table when possible.
            if ($byte < 0xC0) {
                $buf .= isset(self::$win1252ToUtf8[$byte])
                    ? self::$win1252ToUtf8[$byte]
                    : self::convertInvalidCharacter($c1);
                continue;
            }

            // 0xC0 and above: possibly the lead byte of a UTF-8 sequence.
            $length = self::utf8SequenceLength($byte);

            if ($length > 0 && self::hasContinuationBytes($text, $i + 1, $length - 1)) {
                // Almost certainly UTF-8 already: copy the WHOLE sequence and
                // skip past every byte of it, so trailing bytes are never
                // re-examined as stray characters.
                $buf .= substr($text, $i, $length);
                $i += $length - 1;
            } else {
                // Not a well-formed sequence (or a 0xF8-0xFF byte): convert.
                $buf .= self::convertInvalidCharacter($c1);
            }
        }

        return $buf;
    }

    /**
     * Encodes a single byte as the two-byte UTF-8 sequence of the code point
     * with the same numeric value.
     *
     * @param string $c1 The byte to encode (only the first byte is used).
     * @return string Two bytes: 0xC0 | (byte >> 6) followed by 0x80 | (byte & 0x3F).
     */
    public static function convertInvalidCharacter($c1)
    {
        $byte = ord($c1);

        // Integer shift instead of a float division, so chr() always
        // receives an int.
        return chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
    }

    /**
     * Converts $input to UTF-8 according to the declared source encoding.
     *
     * 'utf-8' input is returned as is; the listed Windows code pages are
     * delegated to iconv() with transliteration; any other value falls back
     * to the heuristic toUTF8() conversion. The encoding name is compared
     * as given (lower-case names are expected).
     *
     * @param string $input    Text to convert.
     * @param string $encoding Declared source encoding.
     * @return string|false Converted text; iconv() may return false on failure.
     */
    public static function convert($input, $encoding)
    {
        switch ($encoding) {
            case 'utf-8':
                return $input;
            case 'windows-1251':
            case 'windows-1255':
            case 'windows-1256':
                return iconv($encoding, 'UTF-8//TRANSLIT', $input);
            default:
                return self::toUTF8($input);
        }
    }

    /**
     * Returns the total length of the UTF-8 sequence announced by a lead byte.
     *
     * @param int $byte Byte value (0-255).
     * @return int 2, 3 or 4 for lead bytes 0xC0-0xF7; 0 for any other value.
     */
    private static function utf8SequenceLength($byte)
    {
        if ($byte >= 0xC0 && $byte <= 0xDF) {
            return 2;
        }

        if ($byte >= 0xE0 && $byte <= 0xEF) {
            return 3;
        }

        if ($byte >= 0xF0 && $byte <= 0xF7) {
            return 4;
        }

        return 0;
    }

    /**
     * Tells whether $count bytes starting at $offset all exist and are
     * UTF-8 continuation bytes (0x80-0xBF).
     *
     * @param string $text   Haystack.
     * @param int    $offset Index of the first byte to inspect.
     * @param int    $count  Number of bytes to inspect.
     * @return bool
     */
    private static function hasContinuationBytes($text, $offset, $count)
    {
        for ($j = $offset, $end = $offset + $count; $j < $end; $j++) {
            // Running off the end of the string means the sequence is truncated.
            if (!isset($text[$j]) || (ord($text[$j]) & 0xC0) !== 0x80) {
                return false;
            }
        }

        return true;
    }
}
|