Add support for CP1251 encoding
This commit is contained in:
parent
4d78a22684
commit
de8133b7ee
@ -432,8 +432,7 @@ function download_content($url)
|
|||||||
if (! empty($html)) {
|
if (! empty($html)) {
|
||||||
|
|
||||||
// Try first with PicoFeed grabber and with Readability after
|
// Try first with PicoFeed grabber and with Readability after
|
||||||
$grabber = new \PicoFeed\Grabber($url);
|
$grabber = new \PicoFeed\Grabber($url, $html, $client->getEncoding());
|
||||||
$grabber->html = $html;
|
|
||||||
$content = '';
|
$content = '';
|
||||||
|
|
||||||
if ($grabber->parse()) {
|
if ($grabber->parse()) {
|
||||||
|
12
vendor/PicoFeed/Client.php
vendored
12
vendor/PicoFeed/Client.php
vendored
@ -11,6 +11,7 @@ abstract class Client
|
|||||||
protected static $proxy_username = null;
|
protected static $proxy_username = null;
|
||||||
protected static $proxy_password = null;
|
protected static $proxy_password = null;
|
||||||
|
|
||||||
|
public $encoding = '';
|
||||||
public $etag = '';
|
public $etag = '';
|
||||||
public $last_modified = '';
|
public $last_modified = '';
|
||||||
public $is_modified = true;
|
public $is_modified = true;
|
||||||
@ -70,6 +71,11 @@ abstract class Client
|
|||||||
$this->etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : '';
|
$this->etag = isset($response['headers']['ETag']) ? $response['headers']['ETag'] : '';
|
||||||
$this->last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : '';
|
$this->last_modified = isset($response['headers']['Last-Modified']) ? $response['headers']['Last-Modified'] : '';
|
||||||
$this->content = $response['body'];
|
$this->content = $response['body'];
|
||||||
|
|
||||||
|
if (isset($response['headers']['Content-Type'])) {
|
||||||
|
$result = explode('charset=', strtolower($response['headers']['Content-Type']));
|
||||||
|
$this->encoding = isset($result[1]) ? $result[1] : '';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -149,6 +155,12 @@ abstract class Client
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Get the character encoding recorded for the last response.
 *
 * @return string  Value of the encoding property (empty string by default)
 */
public function getEncoding()
{
    return $this->encoding;
}
|
||||||
|
|
||||||
|
|
||||||
public function isModified()
|
public function isModified()
|
||||||
{
|
{
|
||||||
return $this->is_modified;
|
return $this->is_modified;
|
||||||
|
5
vendor/PicoFeed/Encoding.php
vendored
5
vendor/PicoFeed/Encoding.php
vendored
@ -322,4 +322,9 @@ class Encoding {
|
|||||||
if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text);
|
if($encodingLabel == 'ISO-8859-1') return Encoding::toLatin1($text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Convert a Windows-1251 (CP1251) encoded string to UTF-8.
 *
 * Byte 0x98 is the only unassigned code point in Windows-1251. An iconv
 * implementation that rejects it makes iconv() return false for the whole
 * input, so the conversion is retried once with that byte removed instead
 * of handing false back to the caller.
 *
 * @param  string  $input  Data encoded in Windows-1251
 * @return string|false    UTF-8 data, or false if iconv still fails
 */
public static function cp1251ToUtf8($input)
{
    $output = iconv('CP1251', 'UTF-8//TRANSLIT', $input);

    // Only retry when the failure can be explained by the unassigned byte
    if ($output === false && is_string($input) && strpos($input, "\x98") !== false) {
        $output = iconv('CP1251', 'UTF-8//TRANSLIT', str_replace("\x98", '', $input));
    }

    return $output;
}
|
||||||
}
|
}
|
||||||
|
28
vendor/PicoFeed/Filter.php
vendored
28
vendor/PicoFeed/Filter.php
vendored
@ -164,6 +164,7 @@ class Filter
|
|||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// All input data must be encoded in UTF-8 beforehand
|
||||||
public function __construct($data, $site_url)
|
public function __construct($data, $site_url)
|
||||||
{
|
{
|
||||||
$this->url = $site_url;
|
$this->url = $site_url;
|
||||||
@ -301,10 +302,11 @@ class Filter
|
|||||||
{
|
{
|
||||||
$content = str_replace("\xc2\xa0", ' ', $content); // Replace with normal space
|
$content = str_replace("\xc2\xa0", ' ', $content); // Replace with normal space
|
||||||
|
|
||||||
|
// Issue with Cyrillic characters
|
||||||
// Replace multiple spaces by a single one
|
// Replace multiple spaces by a single one
|
||||||
if (! $this->is_code) {
|
// if (! $this->is_code) {
|
||||||
$content = preg_replace('!\s+!', ' ', $content);
|
// $content = preg_replace('!\s+!', ' ', $content);
|
||||||
}
|
// }
|
||||||
|
|
||||||
if (! $this->strip_content) {
|
if (! $this->strip_content) {
|
||||||
$this->data .= htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
|
$this->data .= htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
|
||||||
@ -470,4 +472,24 @@ class Filter
|
|||||||
{
|
{
|
||||||
return preg_replace('/<meta\s.*?\/>/is', '', $data);
|
return preg_replace('/<meta\s.*?\/>/is', '', $data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Extract the encoding declared in the XML declaration of a document.
 *
 * Example: '<?xml version="1.0" encoding="Windows-1251"?>' gives 'windows-1251'.
 *
 * Only the text between the first '<?xml' and the first following '?>' is
 * inspected, so later processing instructions or feed content cannot be
 * mistaken for the declaration.
 *
 * @param  string  $data  Raw XML document
 * @return string         Lowercased encoding name, or an empty string when
 *                        there is no declaration or no encoding attribute
 */
public static function getEncodingFromXmlTag($data)
{
    $start = strpos($data, '<?xml');

    if ($start === false) {
        return '';
    }

    // The declaration stops at the FIRST "?>" after its opening, not the last one of the document
    $end = strpos($data, '?>', $start);

    if ($end === false) {
        return '';
    }

    $declaration = substr($data, $start, $end - $start);

    // Accept single or double quotes and optional spaces around the equals sign
    if (preg_match('/\sencoding\s*=\s*(["\'])(.*?)\1/s', $declaration, $matches)) {
        return strtolower($matches[2]);
    }

    return '';
}
|
||||||
}
|
}
|
||||||
|
15
vendor/PicoFeed/Grabber.php
vendored
15
vendor/PicoFeed/Grabber.php
vendored
@ -11,6 +11,7 @@ class Grabber
|
|||||||
{
|
{
|
||||||
public $content = '';
|
public $content = '';
|
||||||
public $html = '';
|
public $html = '';
|
||||||
|
public $encoding = '';
|
||||||
|
|
||||||
// Order is important, generic terms at the end
|
// Order is important, generic terms at the end
|
||||||
public $candidatesAttributes = array(
|
public $candidatesAttributes = array(
|
||||||
@ -67,9 +68,11 @@ class Grabber
|
|||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
/**
 * Constructor
 *
 * @param  string  $url       Url stored on the grabber
 * @param  string  $html      HTML content, when it has already been downloaded
 * @param  string  $encoding  Charset label of $html (presumably the one reported over HTTP; confirm against callers)
 */
public function __construct($url, $html = '', $encoding = 'utf-8')
{
    $this->encoding = $encoding;
    $this->html = $html;
    $this->url = $url;
}
|
||||||
|
|
||||||
|
|
||||||
@ -78,8 +81,16 @@ class Grabber
|
|||||||
if ($this->html) {
|
if ($this->html) {
|
||||||
|
|
||||||
Logging::log(\get_called_class().' Fix encoding');
|
Logging::log(\get_called_class().' Fix encoding');
|
||||||
|
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');
|
||||||
|
|
||||||
$this->html = Filter::stripMetaTags($this->html);
|
$this->html = Filter::stripMetaTags($this->html);
|
||||||
$this->html = Encoding::toUtf8($this->html);
|
|
||||||
|
if ($this->encoding == 'windows-1251') {
|
||||||
|
$this->html = Encoding::cp1251ToUtf8($this->html);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
$this->html = Encoding::toUTF8($this->html);
|
||||||
|
}
|
||||||
|
|
||||||
Logging::log(\get_called_class().' Try to find rules');
|
Logging::log(\get_called_class().' Try to find rules');
|
||||||
$rules = $this->getRules();
|
$rules = $this->getRules();
|
||||||
|
12
vendor/PicoFeed/Parser.php
vendored
12
vendor/PicoFeed/Parser.php
vendored
@ -25,13 +25,21 @@ abstract class Parser
|
|||||||
abstract public function execute();
|
abstract public function execute();
|
||||||
|
|
||||||
|
|
||||||
public function __construct($content)
|
public function __construct($content, $http_encoding = '')
|
||||||
{
|
{
|
||||||
// Strip XML tag to avoid multiple encoding/decoding in next XML processing
|
$xml_encoding = Filter::getEncodingFromXmlTag($content);
|
||||||
|
Logging::log(\get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
|
||||||
|
|
||||||
|
// Strip XML tag to avoid multiple encoding/decoding in the next XML processing
|
||||||
$this->content = Filter::stripXmlTag($content);
|
$this->content = Filter::stripXmlTag($content);
|
||||||
|
|
||||||
// Encode everything in UTF-8
|
// Encode everything in UTF-8
|
||||||
|
if ($xml_encoding == 'windows-1251' || $http_encoding == 'windows-1251') {
|
||||||
|
$this->content = Encoding::cp1251ToUtf8($this->content);
|
||||||
|
}
|
||||||
|
else {
|
||||||
$this->content = Encoding::toUTF8($this->content);
|
$this->content = Encoding::toUTF8($this->content);
|
||||||
|
}
|
||||||
|
|
||||||
// Workarounds
|
// Workarounds
|
||||||
$this->content = $this->normalizeData($this->content);
|
$this->content = $this->normalizeData($this->content);
|
||||||
|
1
vendor/PicoFeed/Parsers/Rss20.php
vendored
1
vendor/PicoFeed/Parsers/Rss20.php
vendored
@ -81,7 +81,6 @@ class Rss20 extends \PicoFeed\Parser
|
|||||||
if (empty($item->updated)) $item->updated = strtotime((string) $entry->pubDate) ?: $this->updated;
|
if (empty($item->updated)) $item->updated = strtotime((string) $entry->pubDate) ?: $this->updated;
|
||||||
|
|
||||||
if (empty($item->content)) {
|
if (empty($item->content)) {
|
||||||
|
|
||||||
$item->content = isset($entry->description) ? (string) $entry->description : '';
|
$item->content = isset($entry->description) ? (string) $entry->description : '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
15
vendor/PicoFeed/Reader.php
vendored
15
vendor/PicoFeed/Reader.php
vendored
@ -11,11 +11,13 @@ class Reader
|
|||||||
{
|
{
|
||||||
private $url = '';
|
private $url = '';
|
||||||
private $content = '';
|
private $content = '';
|
||||||
|
private $encoding = '';
|
||||||
|
|
||||||
|
|
||||||
/**
 * Constructor
 *
 * @param  string  $content   Feed content, when it is already available
 * @param  string  $encoding  Character encoding of $content, empty when unknown
 */
public function __construct($content = '', $encoding = '')
{
    $this->content = $content;

    // Store the caller-supplied value: it is forwarded to the parsers as the
    // HTTP encoding, so resetting it to '' would silently discard it
    $this->encoding = $encoding;

    return $this;
}
|
||||||
|
|
||||||
@ -37,6 +39,7 @@ class Reader
|
|||||||
|
|
||||||
$this->content = $client->getContent();
|
$this->content = $client->getContent();
|
||||||
$this->url = $client->getUrl();
|
$this->url = $client->getUrl();
|
||||||
|
$this->encoding = $client->getEncoding();
|
||||||
|
|
||||||
return $client;
|
return $client;
|
||||||
}
|
}
|
||||||
@ -86,7 +89,7 @@ class Reader
|
|||||||
Logging::log(\get_called_class().': discover Atom feed');
|
Logging::log(\get_called_class().': discover Atom feed');
|
||||||
|
|
||||||
require_once __DIR__.'/Parsers/Atom.php';
|
require_once __DIR__.'/Parsers/Atom.php';
|
||||||
return new Parsers\Atom($this->content);
|
return new Parsers\Atom($this->content, $this->encoding);
|
||||||
}
|
}
|
||||||
else if (strpos($first_tag, '<rss') !== false &&
|
else if (strpos($first_tag, '<rss') !== false &&
|
||||||
(strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) {
|
(strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) {
|
||||||
@ -94,7 +97,7 @@ class Reader
|
|||||||
Logging::log(\get_called_class().': discover RSS 2.0 feed');
|
Logging::log(\get_called_class().': discover RSS 2.0 feed');
|
||||||
|
|
||||||
require_once __DIR__.'/Parsers/Rss20.php';
|
require_once __DIR__.'/Parsers/Rss20.php';
|
||||||
return new Parsers\Rss20($this->content);
|
return new Parsers\Rss20($this->content, $this->encoding);
|
||||||
}
|
}
|
||||||
else if (strpos($first_tag, '<rss') !== false &&
|
else if (strpos($first_tag, '<rss') !== false &&
|
||||||
(strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) {
|
(strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) {
|
||||||
@ -102,7 +105,7 @@ class Reader
|
|||||||
Logging::log(\get_called_class().': discover RSS 0.92 feed');
|
Logging::log(\get_called_class().': discover RSS 0.92 feed');
|
||||||
|
|
||||||
require_once __DIR__.'/Parsers/Rss92.php';
|
require_once __DIR__.'/Parsers/Rss92.php';
|
||||||
return new Parsers\Rss92($this->content);
|
return new Parsers\Rss92($this->content, $this->encoding);
|
||||||
}
|
}
|
||||||
else if (strpos($first_tag, '<rss') !== false &&
|
else if (strpos($first_tag, '<rss') !== false &&
|
||||||
(strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) {
|
(strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) {
|
||||||
@ -110,14 +113,14 @@ class Reader
|
|||||||
Logging::log(\get_called_class().': discover RSS 0.91 feed');
|
Logging::log(\get_called_class().': discover RSS 0.91 feed');
|
||||||
|
|
||||||
require_once __DIR__.'/Parsers/Rss91.php';
|
require_once __DIR__.'/Parsers/Rss91.php';
|
||||||
return new Parsers\Rss91($this->content);
|
return new Parsers\Rss91($this->content, $this->encoding);
|
||||||
}
|
}
|
||||||
else if (strpos($first_tag, '<rdf:') !== false && strpos($first_tag, 'xmlns="http://purl.org/rss/1.0/"') !== false) {
|
else if (strpos($first_tag, '<rdf:') !== false && strpos($first_tag, 'xmlns="http://purl.org/rss/1.0/"') !== false) {
|
||||||
|
|
||||||
Logging::log(\get_called_class().': discover RSS 1.0 feed');
|
Logging::log(\get_called_class().': discover RSS 1.0 feed');
|
||||||
|
|
||||||
require_once __DIR__.'/Parsers/Rss10.php';
|
require_once __DIR__.'/Parsers/Rss10.php';
|
||||||
return new Parsers\Rss10($this->content);
|
return new Parsers\Rss10($this->content, $this->encoding);
|
||||||
}
|
}
|
||||||
else if ($discover === true) {
|
else if ($discover === true) {
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user