Update of picoFeed

This commit is contained in:
Frederic Guillot 2013-04-06 21:14:52 -04:00
parent 402b2fe7a9
commit 3a1050b749
3 changed files with 211 additions and 51 deletions

View File

@ -7,11 +7,9 @@ class Filter
private $data = ''; private $data = '';
private $url = ''; private $url = '';
private $input = ''; private $input = '';
private $empty_tag = false; private $empty_tags = array();
private $strip_content = false; private $strip_content = false;
public $ignored_tags = array();
public $allowed_tags = array( public $allowed_tags = array(
'dt' => array(), 'dt' => array(),
'dd' => array(), 'dd' => array(),
@ -68,15 +66,20 @@ class Filter
public $blacklist_media = array( public $blacklist_media = array(
'feeds.feedburner.com', 'feeds.feedburner.com',
'feedsportal.com', 'da.feedsportal.com',
'rss.feedsportal.com',
'res.feedsportal.com',
'pi.feedsportal.com',
'rss.nytimes.com', 'rss.nytimes.com',
'feeds.wordpress.com', 'feeds.wordpress.com',
'stats.wordpress.com' 'stats.wordpress.com',
'rss.cnn.com'
); );
public $required_attributes = array( public $required_attributes = array(
'a' => array('href'), 'a' => array('href'),
'img' => array('src') 'img' => array('src'),
'iframe' => array('src')
); );
public $add_attributes = array( public $add_attributes = array(
@ -85,13 +88,15 @@ class Filter
public $iframe_allowed_resources = array( public $iframe_allowed_resources = array(
'http://www.youtube.com/', 'http://www.youtube.com/',
'http://player.vimeo.com/' 'https://www.youtube.com/',
'http://player.vimeo.com/',
'https://player.vimeo.com/'
); );
public function __construct($data, $url) public function __construct($data, $site_url)
{ {
$this->url = $url; $this->url = $site_url;
// Convert bad formatted documents to XML // Convert bad formatted documents to XML
$dom = new \DOMDocument; $dom = new \DOMDocument;
@ -122,12 +127,12 @@ class Filter
public function startTag($parser, $name, $attributes) public function startTag($parser, $name, $attributes)
{ {
$this->empty_tag = false; $empty_tag = false;
$this->strip_content = false; $this->strip_content = false;
if ($this->isPixelTracker($name, $attributes)) { if ($this->isPixelTracker($name, $attributes)) {
$this->empty_tag = true; $empty_tag = true;
} }
else if ($this->isAllowedTag($name)) { else if ($this->isAllowedTag($name)) {
@ -140,10 +145,13 @@ class Filter
if ($this->isResource($attribute)) { if ($this->isResource($attribute)) {
if ($name === 'iframe' && $this->isAllowedIframeResource($value)) { if ($name === 'iframe') {
$attr_data .= ' '.$attribute.'="'.$value.'"'; if ($this->isAllowedIframeResource($value)) {
$used_attributes[] = $attribute;
$attr_data .= ' '.$attribute.'="'.$value.'"';
$used_attributes[] = $attribute;
}
} }
else if ($this->isRelativePath($value)) { else if ($this->isRelativePath($value)) {
@ -164,45 +172,46 @@ class Filter
} }
} }
// Check for required attributes
if (isset($this->required_attributes[$name])) { if (isset($this->required_attributes[$name])) {
foreach ($this->required_attributes[$name] as $required_attribute) { foreach ($this->required_attributes[$name] as $required_attribute) {
if (! in_array($required_attribute, $used_attributes)) { if (! in_array($required_attribute, $used_attributes)) {
$this->empty_tag = true; $empty_tag = true;
break; break;
} }
} }
} }
if (! $this->empty_tag) { if (! $empty_tag) {
$this->data .= '<'.$name.$attr_data; $this->data .= '<'.$name.$attr_data;
// Add custom attributes
if (isset($this->add_attributes[$name])) { if (isset($this->add_attributes[$name])) {
$this->data .= ' '.$this->add_attributes[$name].' '; $this->data .= ' '.$this->add_attributes[$name].' ';
} }
// If img or br, we don't close it here
if ($name !== 'img' && $name !== 'br') $this->data .= '>'; if ($name !== 'img' && $name !== 'br') $this->data .= '>';
} }
} }
else {
$this->ignored_tags[] = $name;
}
if (in_array($name, $this->strip_tags_content)) { if (in_array($name, $this->strip_tags_content)) {
$this->strip_content = true; $this->strip_content = true;
} }
$this->empty_tags[] = $empty_tag;
} }
public function endTag($parser, $name) public function endTag($parser, $name)
{ {
if (! $this->empty_tag && $this->isAllowedTag($name)) { if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) {
$this->data .= $name !== 'img' && $name !== 'br' ? '</'.$name.'>' : '/>'; $this->data .= $name !== 'img' && $name !== 'br' ? '</'.$name.'>' : '/>';
} }

View File

@ -2,6 +2,9 @@
namespace PicoFeed; namespace PicoFeed;
require_once __DIR__.'/Parser.php';
require_once __DIR__.'/RemoteResource.php';
class Reader class Reader
{ {
private $url = ''; private $url = '';
@ -16,40 +19,22 @@ class Reader
} }
public function download($url, $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)') public function download($url, $last_modified = '', $etag = '', $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)')
{ {
if (strpos($url, 'http') !== 0) { if (strpos($url, 'http') !== 0) {
$url = 'http://'.$url; $url = 'http://'.$url;
} }
$this->url = $url; $resource = new RemoteResource($url, $timeout, $user_agent);
$this->content = $this->fetchRemoteFile($url, $timeout, $user_agent); $resource->setLastModified($last_modified);
$resource->setEtag($etag);
$resource->execute();
return $this; $this->content = $resource->getContent();
} $this->url = $resource->getUrl();
return $resource;
public function fetchRemoteFile($url, $timeout, $user_agent)
{
if (! \function_exists('curl_init')) {
return @file_get_contents($this->url);
}
$ch = \curl_init();
\curl_setopt($ch, CURLOPT_URL, $url);
\curl_setopt($ch, CURLOPT_HEADER, false);
\curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
\curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
\curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
$content = \curl_exec($ch);
\curl_close($ch);
return $content;
} }
@ -90,24 +75,24 @@ class Reader
{ {
$first_tag = $this->getFirstTag($this->content); $first_tag = $this->getFirstTag($this->content);
if (strpos($first_tag, '<feed ') !== false) { if (strpos($first_tag, '<feed') !== false) {
require_once __DIR__.'/Parsers/Atom.php'; require_once __DIR__.'/Parsers/Atom.php';
return new Atom($this->content); return new Atom($this->content);
} }
else if (strpos($first_tag, '<rss ') !== false && else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) { (strpos($first_tag, 'version="2.0"') !== false || strpos($first_tag, 'version=\'2.0\'') !== false)) {
require_once __DIR__.'/Parsers/Rss20.php'; require_once __DIR__.'/Parsers/Rss20.php';
return new Rss20($this->content); return new Rss20($this->content);
} }
else if (strpos($first_tag, '<rss ') !== false && else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) { (strpos($first_tag, 'version="0.92"') !== false || strpos($first_tag, 'version=\'0.92\'') !== false)) {
require_once __DIR__.'/Parsers/Rss92.php'; require_once __DIR__.'/Parsers/Rss92.php';
return new Rss92($this->content); return new Rss92($this->content);
} }
else if (strpos($first_tag, '<rss ') !== false && else if (strpos($first_tag, '<rss') !== false &&
(strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) { (strpos($first_tag, 'version="0.91"') !== false || strpos($first_tag, 'version=\'0.91\'') !== false)) {
require_once __DIR__.'/Parsers/Rss91.php'; require_once __DIR__.'/Parsers/Rss91.php';

View File

@ -0,0 +1,166 @@
<?php
namespace PicoFeed;
/**
 * Fetches a remote document over HTTP with conditional-request support.
 *
 * The caller may seed the object with the validators (ETag / Last-Modified)
 * obtained from a previous fetch; execute() then sends them as
 * If-None-Match / If-Modified-Since and reports through isModified()
 * whether the server answered "304 Not Modified".
 */
class RemoteResource
{
    public $user_agent;
    public $timeout;
    public $url;
    public $etag;
    public $last_modified;
    public $is_modified = true;
    public $content = '';

    /**
     * @param string  $url         Address of the resource to fetch
     * @param integer $timeout     Connection timeout in seconds (passed to CURLOPT_CONNECTTIMEOUT)
     * @param string  $user_agent  Value of the User-Agent request header
     */
    public function __construct($url, $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)')
    {
        $this->url = $url;
        $this->timeout = $timeout;
        $this->user_agent = $user_agent;
    }

    /**
     * Set the Last-Modified validator sent as If-Modified-Since.
     *
     * @param  string $last_modified
     * @return RemoteResource  Current instance, for chaining
     */
    public function setLastModified($last_modified)
    {
        $this->last_modified = $last_modified;
        return $this;
    }

    /**
     * @return string  Last-Modified value set by the caller or by the last execute()
     */
    public function getLastModified()
    {
        return $this->last_modified;
    }

    /**
     * Set the ETag validator sent as If-None-Match.
     *
     * @param  string $etag
     * @return RemoteResource  Current instance, for chaining
     */
    public function setEtag($etag)
    {
        $this->etag = $etag;
        return $this;
    }

    /**
     * @return string  ETag value set by the caller or by the last execute()
     */
    public function getEtag()
    {
        return $this->etag;
    }

    /**
     * @return string  Current URL; replaced by the redirect target when execute() follows a redirect
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * @return string  Response body stored by execute(), empty string if none
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * @return boolean  False once execute() has received a 304 response
     */
    public function isModified()
    {
        return $this->is_modified;
    }

    /**
     * Perform the request, following 301/302 redirects, and update the
     * object state (url, etag, last_modified, is_modified, content).
     *
     * Redirects are followed at most $max_redirects times so that a
     * redirect loop cannot run forever. A redirect without a usable
     * Location header stops the processing and leaves the content untouched.
     * The Location value is used verbatim as the next URL.
     *
     * @param integer $max_redirects  Maximum number of redirects to follow
     */
    public function execute($max_redirects = 5)
    {
        $redirects = 0;

        while (true) {

            $response = $this->makeRequest();
            $status = (int) $response['status'];

            // HTTP header names are case-insensitive, normalize before lookup
            $headers = array_change_key_case($response['headers'], CASE_LOWER);
            $etag = isset($headers['etag']) ? $headers['etag'] : '';
            $last_modified = isset($headers['last-modified']) ? $headers['last-modified'] : '';

            if ($status === 304) {

                // A 304 may omit the validators: keep the ones that were
                // sent with the request instead of erasing them
                if ($etag !== '') $this->etag = $etag;
                if ($last_modified !== '') $this->last_modified = $last_modified;

                $this->is_modified = false;
                return;
            }

            $this->etag = $etag;
            $this->last_modified = $last_modified;

            if ($status === 301 || $status === 302) {

                if (empty($headers['location']) || $redirects >= $max_redirects) {
                    return;
                }

                $this->url = $headers['location'];
                $redirects++;
                continue;
            }

            $this->content = $response['body'];
            return;
        }
    }

    /**
     * Send a single HTTP request for the current URL, without following redirects.
     *
     * When the cURL extension is not available, file_get_contents() is used:
     * in that case the status is always reported as 200, no header is
     * returned and no conditional header is sent.
     *
     * @return array  Keys: 'status' (HTTP code, as reported by cURL),
     *                'body' (string, empty on failure),
     *                'headers' (name => value, names as sent by the server)
     */
    public function makeRequest()
    {
        $http_code = 200;
        $http_body = '';
        $http_headers = array();

        if (! function_exists('curl_init')) {

            $content = @file_get_contents($this->url);

            // file_get_contents() returns false on failure, keep the body a string
            if ($content !== false) {
                $http_body = $content;
            }
        }
        else {

            $headers = array('Connection: close');

            if ($this->etag) $headers[] = 'If-None-Match: '.$this->etag;
            if ($this->last_modified) $headers[] = 'If-Modified-Since: '.$this->last_modified;

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $this->url);
            curl_setopt($ch, CURLOPT_HEADER, true);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
            curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);

            $http_response = curl_exec($ch);
            $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            // curl_exec() returns false when the transfer failed
            if ($http_response !== false) {
                list($http_headers, $http_body) = $this->parseResponse($http_response);
            }
        }

        return array(
            'status' => $http_code,
            'body' => $http_body,
            'headers' => $http_headers
        );
    }

    /**
     * Split a raw HTTP response (headers + body) at the first blank line.
     *
     * @param  string $http_response  Raw response as returned by cURL with CURLOPT_HEADER enabled
     * @return array                  array(headers as name => value, body string)
     */
    private function parseResponse($http_response)
    {
        $headers = array();
        $body = '';
        $separator = strpos($http_response, "\r\n\r\n");

        if ($separator === false) {
            $header_block = $http_response;
        }
        else {
            $header_block = substr($http_response, 0, $separator);
            $body = (string) substr($http_response, $separator + 4);
        }

        foreach (explode("\r\n", $header_block) as $line) {

            // The status line has no colon and is skipped
            if (($p = strpos($line, ':')) !== false) {
                $headers[trim(substr($line, 0, $p))] = trim(substr($line, $p + 1));
            }
        }

        return array($headers, $body);
    }
}