Improve date parsing for invalid dates

This commit is contained in:
Frédéric Guillot 2013-10-23 18:39:21 -04:00
parent ebc835cf78
commit 431464036a
4 changed files with 63 additions and 13 deletions

View File

@ -105,4 +105,58 @@ abstract class Parser
// crc32b seems to be faster and shorter than other hash algorithms
return hash('crc32b', implode(func_get_args()));
}
/**
 * Convert a date string into a Unix timestamp.
 *
 * Each known format is tried in order against the trimmed value. Formats
 * mapped to an integer are matched against only the first N characters of
 * the value; formats mapped to null are matched against the whole value.
 * The first format that yields a timestamp greater than zero wins.
 *
 * @param  string  $value  Raw date string
 * @return integer         Parsed Unix timestamp, or the current time when no format matches
 */
public function parseDate($value)
{
    // Format => truncate to this length if not null
    $formats = array(
        DATE_ATOM => null,
        DATE_RSS => null,
        DATE_COOKIE => null,
        DATE_ISO8601 => null,
        DATE_RFC822 => null,
        DATE_RFC850 => null,
        DATE_RFC1036 => null,
        DATE_RFC1123 => null,
        DATE_RFC2822 => null,
        DATE_RFC3339 => null,
        'D, d M Y H:i:s' => 25,
        'D, d M Y h:i:s' => 25,
        'D M d Y H:i:s' => 24,
        'Y-m-d H:i:s' => 19,
        'Y-m-d\TH:i:s' => 19,
        'd/m/Y H:i:s' => 19,
        'D, d M Y' => 16,
        'Y-m-d' => 10,
        'd-m-Y' => 10,
        'm-d-Y' => 10,
        'd.m.Y' => 10,
        'm.d.Y' => 10,
        'd/m/Y' => 10,
        'm/d/Y' => 10,
    );

    $value = trim($value);

    foreach ($formats as $format => $length) {

        // Only truncate when a length is configured: before PHP 8.0,
        // substr($value, 0, null) treats null as 0 and returns an empty
        // string, which would make every "null" format impossible to match.
        $candidate = $length === null ? $value : substr($value, 0, $length);

        $timestamp = $this->getValidDate($format, $candidate);

        if ($timestamp > 0) {
            return $timestamp;
        }
    }

    // No format matched: fall back to the current time
    return time();
}
/**
 * Parse a value with one specific date format and return its timestamp.
 *
 * The value is accepted only when DateTime::createFromFormat() succeeds
 * and reports neither errors nor warnings.
 *
 * @param  string  $format  Format string understood by DateTime::createFromFormat()
 * @param  string  $value   Date string to parse
 * @return integer          Unix timestamp, or 0 when the value does not cleanly match the format
 */
public function getValidDate($format, $value)
{
    $date = \DateTime::createFromFormat($format, $value);

    if ($date === false) {
        return 0;
    }

    // As of PHP 8.2 getLastErrors() returns false (not an array of zero
    // counts) when the last parse produced no warnings or errors.
    $errors = \DateTime::getLastErrors();

    if ($errors === false || ($errors['error_count'] === 0 && $errors['warning_count'] === 0)) {
        return $date->getTimestamp();
    }

    return 0;
}
}

View File

@ -20,7 +20,7 @@ class Atom extends \PicoFeed\Parser
$this->url = $this->getUrl($xml);
$this->title = $this->stripWhiteSpace((string) $xml->title);
$this->id = (string) $xml->id;
$this->updated = strtotime((string) $xml->updated);
$this->updated = $this->parseDate((string) $xml->updated);
$author = (string) $xml->author->name;
foreach ($xml->entry as $entry) {
@ -36,7 +36,7 @@ class Atom extends \PicoFeed\Parser
$item->url = $this->getUrl($entry);
$item->id = $this->generateId($id !== $item->url ? $id : $item->url, $this->url);
$item->title = $this->stripWhiteSpace((string) $entry->title);
$item->updated = strtotime((string) $entry->updated);
$item->updated = $this->parseDate((string) $entry->updated);
$item->author = $author;
$item->content = $this->filterHtml($this->getContent($entry), $item->url);

View File

@ -26,7 +26,7 @@ class Rss10 extends \PicoFeed\Parser
if (isset($namespaces['dc'])) {
$ns_dc = $xml->channel->children($namespaces['dc']);
$this->updated = isset($ns_dc->date) ? strtotime($ns_dc->date) : time();
$this->updated = isset($ns_dc->date) ? $this->parseDate($ns_dc->date) : time();
}
else {
@ -48,8 +48,8 @@ class Rss10 extends \PicoFeed\Parser
if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink;
if (! $item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator;
if (! $item->updated && ! empty($namespace->date)) $item->updated = strtotime((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = strtotime((string) $namespace->updated);
if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated);
if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded;
}
@ -57,18 +57,15 @@ class Rss10 extends \PicoFeed\Parser
if (empty($item->updated)) $item->updated = $this->updated;
if (empty($item->content)) {
$item->content = isset($entry->description) ? (string) $entry->description : '';
}
if (empty($item->author)) {
if (isset($entry->author)) {
$item->author = (string) $entry->author;
}
else if (isset($xml->channel->webMaster)) {
$item->author = (string) $xml->channel->webMaster;
}
}

View File

@ -39,8 +39,7 @@ class Rss20 extends \PicoFeed\Parser
$this->title = $this->stripWhiteSpace((string) $xml->channel->title);
$this->id = $this->url;
$this->updated = isset($xml->channel->pubDate) ? (string) $xml->channel->pubDate : (string) $xml->channel->lastBuildDate;
$this->updated = $this->updated ? strtotime($this->updated) : time();
$this->updated = $this->parseDate(isset($xml->channel->pubDate) ? (string) $xml->channel->pubDate : (string) $xml->channel->lastBuildDate);
// RSS feed might be empty
if (! $xml->channel->item) {
@ -63,8 +62,8 @@ class Rss20 extends \PicoFeed\Parser
if (! $item->url && ! empty($namespace->origLink)) $item->url = (string) $namespace->origLink;
if (! $item->author && ! empty($namespace->creator)) $item->author = (string) $namespace->creator;
if (! $item->updated && ! empty($namespace->date)) $item->updated = strtotime((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = strtotime((string) $namespace->updated);
if (! $item->updated && ! empty($namespace->date)) $item->updated = $this->parseDate((string) $namespace->date);
if (! $item->updated && ! empty($namespace->updated)) $item->updated = $this->parseDate((string) $namespace->updated);
if (! $item->content && ! empty($namespace->encoded)) $item->content = (string) $namespace->encoded;
}
@ -78,7 +77,7 @@ class Rss20 extends \PicoFeed\Parser
}
}
if (empty($item->updated)) $item->updated = strtotime((string) $entry->pubDate) ?: $this->updated;
if (empty($item->updated)) $item->updated = $this->parseDate((string) $entry->pubDate) ?: $this->updated;
if (empty($item->content)) {
$item->content = isset($entry->description) ? (string) $entry->description : '';