Add some workarounds for broken feeds

This commit is contained in:
Frédéric Guillot 2014-02-01 14:52:33 -05:00
parent 541e9e5afb
commit a843312fff

View File

@ -86,7 +86,20 @@ abstract class Parser
// Dirty quickfix before XML parsing // Dirty quickfix before XML parsing
/**
 * Apply quick textual fixes to raw feed data before it is handed to the XML parser.
 *
 * Two passes are made, in this order:
 *  1. every occurrence of the byte pair 0xC3 0x20 is removed;
 *  2. the result is passed through replaceEntityAttribute().
 *
 * @param  string $data Raw feed content
 * @return string       Feed content after both passes
 */
public function normalizeData($data)
{
    $withoutStrayBytes = str_replace("\xc3\x20", '', $data);

    return $this->replaceEntityAttribute($withoutStrayBytes);
}
// For each href attribute, replace & by &amp;
// Useful for broken XML feeds
/**
 * Escape XML special characters inside every double-quoted href attribute.
 *
 * Each `href="..."` occurrence is run through htmlspecialchars() with
 * ENT_NOQUOTES and double encoding disabled, so a bare `&` becomes `&amp;`
 * while the surrounding quotes and already-encoded entities are left alone.
 * Text outside href attributes is never modified.
 *
 * @param  string $content Raw feed markup
 * @return string          Markup with href attribute values escaped; the
 *                         original markup if the regex engine fails
 */
public function replaceEntityAttribute($content)
{
    $result = preg_replace_callback('/href="[^"]+"/', function (array $matches) {
        $escaped = htmlspecialchars($matches[0], ENT_NOQUOTES, 'UTF-8', false);

        // htmlspecialchars() returns an empty string when the input is not
        // valid UTF-8. A match is never empty, so '' always signals that
        // failure: keep the attribute untouched instead of deleting it.
        return $escaped === '' ? $matches[0] : $escaped;
    }, $content);

    // preg_replace_callback() returns null on a PCRE error; fall back to the
    // unmodified content rather than discarding the whole feed.
    return $result === null ? $content : $result;
}
@ -172,7 +185,7 @@ abstract class Parser
// Hardcoded list of hostname/token to exclude from id generation // Hardcoded list of hostname/token to exclude from id generation
public function isExcludedFromId($url) public function isExcludedFromId($url)
{ {
$exclude_list = array('ap.org'); $exclude_list = array('ap.org', 'jacksonville.com');
foreach ($exclude_list as $token) { foreach ($exclude_list as $token) {
if (strpos($url, $token) !== false) return true; if (strpos($url, $token) !== false) return true;