Update of PicoFeed

This commit is contained in:
Frederic Guillot 2013-04-04 23:34:07 -04:00
parent d4c796f79c
commit fd2e034c01
8 changed files with 317 additions and 244 deletions

View File

@ -45,7 +45,8 @@ class Filter
'figcaption' => array(), 'figcaption' => array(),
'cite' => array(), 'cite' => array(),
'time' => array('datetime'), 'time' => array('datetime'),
'abbr' => array('title') 'abbr' => array('title'),
'iframe' => array('width', 'height', 'frameborder', 'src')
); );
public $strip_tags_content = array( public $strip_tags_content = array(
@ -82,6 +83,11 @@ class Filter
'a' => 'rel="noreferrer" target="_blank"' 'a' => 'rel="noreferrer" target="_blank"'
); );
public $iframe_allowed_resources = array(
'http://www.youtube.com/',
'http://player.vimeo.com/'
);
public function __construct($data, $url) public function __construct($data, $url)
{ {
@ -104,7 +110,7 @@ class Filter
if (! xml_parse($parser, $this->input, true)) { if (! xml_parse($parser, $this->input, true)) {
var_dump($this->input); //var_dump($this->input);
die(xml_get_current_line_number($parser).'|'.xml_error_string(xml_get_error_code($parser))); die(xml_get_current_line_number($parser).'|'.xml_error_string(xml_get_error_code($parser)));
} }
@ -130,11 +136,16 @@ class Filter
foreach ($attributes as $attribute => $value) { foreach ($attributes as $attribute => $value) {
if ($this->isAllowedAttribute($name, $attribute)) { if ($value != '' && $this->isAllowedAttribute($name, $attribute)) {
if ($this->isResource($attribute)) { if ($this->isResource($attribute)) {
if ($this->isRelativePath($value)) { if ($name === 'iframe' && $this->isAllowedIframeResource($value)) {
$attr_data .= ' '.$attribute.'="'.$value.'"';
$used_attributes[] = $attribute;
}
else if ($this->isRelativePath($value)) {
$attr_data .= ' '.$attribute.'="'.$this->getAbsoluteUrl($value, $this->url).'"'; $attr_data .= ' '.$attribute.'="'.$this->getAbsoluteUrl($value, $this->url).'"';
$used_attributes[] = $attribute; $used_attributes[] = $attribute;
@ -216,7 +227,6 @@ class Filter
else { else {
// Relative path // Relative path
$url_path = $components['path']; $url_path = $components['path'];
if ($url_path{strlen($url_path) - 1} !== '/') { if ($url_path{strlen($url_path) - 1} !== '/') {
@ -258,6 +268,20 @@ class Filter
} }
/**
 * Check an iframe URL against the whitelist of embeddable resources.
 *
 * @param  string  $value  URL found in the iframe attribute
 * @return boolean         True when $value begins with one of the
 *                         prefixes in $this->iframe_allowed_resources
 */
public function isAllowedIframeResource($value)
{
    $is_allowed = false;

    foreach ($this->iframe_allowed_resources as $prefix) {
        // Only a match at offset 0 counts: the URL must start with the prefix.
        if (0 === strpos($value, $prefix)) {
            $is_allowed = true;
            break;
        }
    }

    return $is_allowed;
}
public function isAllowedProtocol($value) public function isAllowedProtocol($value)
{ {
foreach ($this->allowed_protocols as $protocol) { foreach ($this->allowed_protocols as $protocol) {

View File

@ -14,6 +14,7 @@ abstract class Parser
public $title = ''; public $title = '';
public $updated = ''; public $updated = '';
public $items = array(); public $items = array();
public $debug = false;
abstract public function execute(); abstract public function execute();
@ -37,253 +38,25 @@ abstract class Parser
return $content; return $content;
} }
}
class Atom extends Parser public function displayXmlErrors()
{
public function execute()
{ {
try { foreach(\libxml_get_errors() as $error) {
\libxml_use_internal_errors(true); printf("Message: %s\nLine: %d\nColumn: %d\nCode: %d\n",
$error->message,
$xml = new \SimpleXMLElement($this->content); $error->line,
$error->column,
$this->url = $this->getUrl($xml); $error->code
$this->title = (string) $xml->title; );
$this->id = (string) $xml->id;
$this->updated = strtotime((string) $xml->updated);
$author = (string) $xml->author->name;
foreach ($xml->entry as $entry) {
if (isset($entry->author->name)) {
$author = $entry->author->name;
}
$item = new \StdClass;
$item->id = (string) $entry->id;
$item->title = (string) $entry->title;
$item->url = $this->getUrl($entry);
$item->updated = strtotime((string) $entry->updated);
$item->author = $author;
$item->content = $this->filterHtml($this->getContent($entry), $item->url);
$this->items[] = $item;
}
} }
catch (\Exception $e) {
}
return $this;
} }
public function getContent($entry) // Dirty quickfix before XML parsing
public function normalizeData($data)
{ {
if (isset($entry->content) && ! empty($entry->content)) { return str_replace("\xc3\x20", '', $data);
if (count($entry->content->children())) {
return (string) $entry->content->asXML();
}
else {
return (string) $entry->content;
}
}
else if (isset($entry->summary) && ! empty($entry->summary)) {
return (string) $entry->summary;
}
return '';
}
public function getUrl($xml)
{
foreach ($xml->link as $link) {
if ((string) $link['type'] === 'text/html') {
return (string) $link['href'];
}
}
return (string) $xml->link['href'];
} }
} }
class Rss20 extends Parser
{
public function execute()
{
try {
\libxml_use_internal_errors(true);
$xml = new \SimpleXMLElement($this->content);
$ns = $xml->getNamespaces(true);
$this->title = (string) $xml->channel->title;
$this->url = (string) $xml->channel->link;
$this->id = $this->url;
$this->updated = isset($xml->channel->pubDate) ? (string) $xml->channel->pubDate : (string) $xml->channel->lastBuildDate;
if ($this->updated) {
$this->updated = strtotime($this->updated);
}
else {
$this->updated = time();
}
foreach ($xml->channel->item as $entry) {
$author = '';
$content = '';
$pubdate = '';
$link = '';
if (isset($ns['feedburner'])) {
$ns_fb = $entry->children($ns['feedburner']);
$link = $ns_fb->origLink;
}
if (isset($ns['dc'])) {
$ns_dc = $entry->children($ns['dc']);
$author = (string) $ns_dc->creator;
$pubdate = (string) $ns_dc->date;
}
if (isset($ns['content'])) {
$ns_content = $entry->children($ns['content']);
$content = (string) $ns_content->encoded;
}
if ($content === '' && isset($entry->description)) {
$content = (string) $entry->description;
}
if ($author === '') {
if (isset($entry->author)) {
$author = (string) $entry->author;
}
else if (isset($xml->channel->webMaster)) {
$author = (string) $xml->channel->webMaster;
}
}
$item = new \StdClass;
$item->title = (string) $entry->title;
$item->url = $link ?: (string) $entry->link;
$item->id = isset($entry->guid) ? (string) $entry->guid : $item->url;
$item->updated = strtotime($pubdate ?: (string) $entry->pubDate) ?: $this->updated;
$item->content = $this->filterHtml($content, $item->url);
$item->author = $author;
$this->items[] = $item;
}
}
catch (\Exception $e) {
}
return $this;
}
}
class Rss10 extends Parser
{
public function execute()
{
try {
\libxml_use_internal_errors(true);
$xml = new \SimpleXMLElement($this->content);
$ns = $xml->getNamespaces(true);
$this->title = (string) $xml->channel->title;
$this->url = (string) $xml->channel->link;
$this->id = $this->url;
if (isset($ns['dc'])) {
$ns_dc = $xml->channel->children($ns['dc']);
$this->updated = isset($ns_dc->date) ? strtotime($ns_dc->date) : time();
}
else {
$this->updated = time();
}
foreach ($xml->item as $entry) {
$author = '';
$content = '';
$pubdate = '';
$link = '';
if (isset($ns['feedburner'])) {
$ns_fb = $entry->children($ns['feedburner']);
$link = $ns_fb->origLink;
}
if (isset($ns['dc'])) {
$ns_dc = $entry->children($ns['dc']);
$author = (string) $ns_dc->creator;
$pubdate = (string) $ns_dc->date;
}
if (isset($ns['content'])) {
$ns_content = $entry->children($ns['content']);
$content = (string) $ns_content->encoded;
}
if ($content === '' && isset($entry->description)) {
$content = (string) $entry->description;
}
$item = new \StdClass;
$item->title = (string) $entry->title;
$item->url = $link ?: (string) $entry->link;
$item->id = $item->url;
$item->updated = $pubdate ? strtotime($pubdate) : time();
$item->content = $this->filterHtml($content, $item->url);
$item->author = $author ?: (string) $xml->channel->webMaster;
$this->items[] = $item;
}
}
catch (\Exception $e) {
}
return $this;
}
}
class Rss92 extends Rss20 {}
class Rss91 extends Rss20 {}

View File

@ -0,0 +1,83 @@
<?php
namespace PicoFeed;
/**
 * Parser for Atom feeds.
 *
 * Reads the XML held in $this->content and fills the feed properties
 * (url, title, id, updated) together with the $this->items list.
 */
class Atom extends Parser
{
    /**
     * Parse the feed content.
     *
     * @return Atom|false  $this on success, false when the XML cannot be loaded
     */
    public function execute()
    {
        $this->content = $this->normalizeData($this->content);

        \libxml_use_internal_errors(true);
        $xml = \simplexml_load_string($this->content);

        if ($xml === false) {
            if ($this->debug) {
                $this->displayXmlErrors();
            }

            return false;
        }

        $this->url = $this->getUrl($xml);
        $this->title = (string) $xml->title;
        $this->id = (string) $xml->id;
        $this->updated = strtotime((string) $xml->updated);

        // Feed-level author, used for every entry that does not name its own.
        $feed_author = (string) $xml->author->name;

        foreach ($xml->entry as $entry) {
            // Resolved per entry so that one entry's author never carries over
            // to the following entries, and cast so the item holds a plain
            // string rather than a SimpleXMLElement.
            $author = isset($entry->author->name)
                ? (string) $entry->author->name
                : $feed_author;

            $item = new \StdClass;
            $item->id = (string) $entry->id;
            $item->title = (string) $entry->title;
            $item->url = $this->getUrl($entry);
            $item->updated = strtotime((string) $entry->updated);
            $item->author = $author;
            $item->content = $this->filterHtml($this->getContent($entry), $item->url);

            $this->items[] = $item;
        }

        return $this;
    }

    /**
     * Extract the body of an entry.
     *
     * Uses <content> when present: serialized as XML if it has child
     * elements, otherwise as text. Falls back to <summary>, then to ''.
     *
     * @param  \SimpleXMLElement  $entry  Atom entry node
     * @return string
     */
    public function getContent($entry)
    {
        if (isset($entry->content) && ! empty($entry->content)) {
            if (count($entry->content->children())) {
                return (string) $entry->content->asXML();
            }
            else {
                return (string) $entry->content;
            }
        }
        else if (isset($entry->summary) && ! empty($entry->summary)) {
            return (string) $entry->summary;
        }

        return '';
    }

    /**
     * Find the link of a feed or entry node.
     *
     * The first <link> whose type is "text/html" wins; otherwise the href
     * of the first <link> is returned.
     *
     * @param  \SimpleXMLElement  $xml  Feed or entry node
     * @return string
     */
    public function getUrl($xml)
    {
        foreach ($xml->link as $link) {
            if ((string) $link['type'] === 'text/html') {
                return (string) $link['href'];
            }
        }

        return (string) $xml->link['href'];
    }
}

View File

@ -0,0 +1,81 @@
<?php
namespace PicoFeed;
/**
 * Parser for RSS 1.0 (RDF) feeds.
 *
 * Reads the XML held in $this->content and fills the feed properties
 * (title, url, id, updated) together with the $this->items list.
 */
class Rss10 extends Parser
{
    /**
     * Parse the feed content.
     *
     * @return Rss10|false  $this on success, false when the XML cannot be loaded
     */
    public function execute()
    {
        $this->content = $this->normalizeData($this->content);

        \libxml_use_internal_errors(true);
        $xml = \simplexml_load_string($this->content);

        if ($xml === false) {
            if ($this->debug) {
                $this->displayXmlErrors();
            }

            return false;
        }

        $ns = $xml->getNamespaces(true);

        $this->title = (string) $xml->channel->title;
        $this->url = (string) $xml->channel->link;
        $this->id = $this->url;

        $feed_date = '';

        if (isset($ns['dc'])) {
            $ns_dc = $xml->channel->children($ns['dc']);
            $feed_date = (string) $ns_dc->date;
        }

        // strtotime() returns false for an empty or unparseable date;
        // use the current time instead of storing false.
        $this->updated = strtotime($feed_date) ?: time();

        foreach ($xml->item as $entry) {
            $author = '';
            $content = '';
            $pubdate = '';
            $link = '';

            if (isset($ns['feedburner'])) {
                $ns_fb = $entry->children($ns['feedburner']);
                // Cast so url/id end up as strings, not SimpleXMLElement objects.
                $link = (string) $ns_fb->origLink;
            }

            if (isset($ns['dc'])) {
                $ns_dc = $entry->children($ns['dc']);
                $author = (string) $ns_dc->creator;
                $pubdate = (string) $ns_dc->date;
            }

            if (isset($ns['content'])) {
                $ns_content = $entry->children($ns['content']);
                $content = (string) $ns_content->encoded;
            }

            if ($content === '' && isset($entry->description)) {
                $content = (string) $entry->description;
            }

            $item = new \StdClass;
            $item->title = (string) $entry->title;
            $item->url = $link ?: (string) $entry->link;
            $item->id = $item->url;
            // Missing or unparseable dates both fall back to the current time.
            $item->updated = strtotime($pubdate) ?: time();
            $item->content = $this->filterHtml($content, $item->url);
            $item->author = $author ?: (string) $xml->channel->webMaster;

            $this->items[] = $item;
        }

        return $this;
    }
}

View File

@ -0,0 +1,93 @@
<?php
namespace PicoFeed;
/**
 * Parser for RSS 2.0 feeds.
 *
 * Reads the XML held in $this->content and fills the feed properties
 * (title, url, id, updated) together with the $this->items list.
 */
class Rss20 extends Parser
{
    /**
     * Parse the feed content.
     *
     * @return Rss20|false  $this on success, false when the XML cannot be loaded
     */
    public function execute()
    {
        $this->content = $this->normalizeData($this->content);

        \libxml_use_internal_errors(true);
        $xml = \simplexml_load_string($this->content);

        if ($xml === false) {
            if ($this->debug) {
                $this->displayXmlErrors();
            }

            return false;
        }

        $ns = $xml->getNamespaces(true);

        $this->title = (string) $xml->channel->title;
        $this->url = (string) $xml->channel->link;
        $this->id = $this->url;

        $feed_date = isset($xml->channel->pubDate)
            ? (string) $xml->channel->pubDate
            : (string) $xml->channel->lastBuildDate;

        // strtotime() returns false for an empty or unparseable date;
        // use the current time instead of storing false.
        $this->updated = strtotime($feed_date) ?: time();

        foreach ($xml->channel->item as $entry) {
            $author = '';
            $content = '';
            $pubdate = '';
            $link = '';

            if (isset($ns['feedburner'])) {
                $ns_fb = $entry->children($ns['feedburner']);
                // Cast so url/id end up as strings, not SimpleXMLElement objects.
                $link = (string) $ns_fb->origLink;
            }

            if (isset($ns['dc'])) {
                $ns_dc = $entry->children($ns['dc']);
                $author = (string) $ns_dc->creator;
                $pubdate = (string) $ns_dc->date;
            }

            if (isset($ns['content'])) {
                $ns_content = $entry->children($ns['content']);
                $content = (string) $ns_content->encoded;
            }

            if ($content === '' && isset($entry->description)) {
                $content = (string) $entry->description;
            }

            // No dc:creator: try the item's <author>, then the channel's <webMaster>.
            if ($author === '') {
                if (isset($entry->author)) {
                    $author = (string) $entry->author;
                }
                else if (isset($xml->channel->webMaster)) {
                    $author = (string) $xml->channel->webMaster;
                }
            }

            $item = new \StdClass;
            $item->title = (string) $entry->title;
            $item->url = $link ?: (string) $entry->link;
            $item->id = isset($entry->guid) ? (string) $entry->guid : $item->url;
            // dc:date first, then <pubDate>; the feed date when neither parses.
            $item->updated = strtotime($pubdate ?: (string) $entry->pubDate) ?: $this->updated;
            $item->content = $this->filterHtml($content, $item->url);
            $item->author = $author;

            $this->items[] = $item;
        }

        return $this;
    }
}

View File

@ -0,0 +1,7 @@
<?php
namespace PicoFeed;
require_once __DIR__.'/Rss20.php';
/**
 * Parser for RSS 0.91 feeds.
 *
 * Adds nothing of its own: parsing is inherited unchanged from Rss20.
 */
class Rss91 extends Rss20 {}

View File

@ -0,0 +1,7 @@
<?php
namespace PicoFeed;
require_once __DIR__.'/Rss20.php';
/**
 * Parser for RSS 0.92 feeds.
 *
 * Adds nothing of its own: parsing is inherited unchanged from Rss20.
 */
class Rss92 extends Rss20 {}

View File

@ -69,25 +69,30 @@ class Reader
if (strpos($first_tag, '<feed ') !== false) { if (strpos($first_tag, '<feed ') !== false) {
require_once __DIR__.'/Parsers/Atom.php';
return new Atom($this->content); return new Atom($this->content);
} }
else if (strpos($first_tag, '<rss ') !== false && else if (strpos($first_tag, '<rss ') !== false &&
(strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) { (strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) {
require_once __DIR__.'/Parsers/Rss20.php';
return new Rss20($this->content); return new Rss20($this->content);
} }
else if (strpos($first_tag, '<rss ') !== false && else if (strpos($first_tag, '<rss ') !== false &&
(strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) { (strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) {
require_once __DIR__.'/Parsers/Rss92.php';
return new Rss92($this->content); return new Rss92($this->content);
} }
else if (strpos($first_tag, '<rss ') !== false && else if (strpos($first_tag, '<rss ') !== false &&
(strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) { (strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) {
require_once __DIR__.'/Parsers/Rss91.php';
return new Rss91($this->content); return new Rss91($this->content);
} }
else if (strpos($first_tag, '<rdf:') !== false && strpos($first_tag, 'xmlns="http://purl.org/rss/1.0/"') !== false) { else if (strpos($first_tag, '<rdf:') !== false && strpos($first_tag, 'xmlns="http://purl.org/rss/1.0/"') !== false) {
require_once __DIR__.'/Parsers/Rss10.php';
return new Rss10($this->content); return new Rss10($this->content);
} }
else if ($discover === true) { else if ($discover === true) {