2013-02-18 03:48:21 +01:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed;
|
|
|
|
|
2013-07-06 20:29:45 +02:00
|
|
|
require_once __DIR__.'/Logging.php';
|
2013-04-07 03:14:52 +02:00
|
|
|
require_once __DIR__.'/Parser.php';
|
2013-07-17 01:54:44 +02:00
|
|
|
require_once __DIR__.'/Client.php';
|
2013-08-31 17:05:45 +02:00
|
|
|
require_once __DIR__.'/Filter.php';
|
2013-04-07 03:14:52 +02:00
|
|
|
|
2014-03-30 00:48:29 +01:00
|
|
|
/**
 * Reader class
 *
 * Holds the raw content of a feed (or of an HTML page advertising one),
 * detects the feed format from the first XML tag and returns the matching
 * parser instance.
 *
 * @author  Frederic Guillot
 * @package parser
 */
class Reader
{
    /**
     * Feed or site URL
     *
     * @access private
     * @var string
     */
    private $url = '';

    /**
     * Feed content
     *
     * @access private
     * @var string
     */
    private $content = '';

    /**
     * HTTP encoding
     *
     * @access private
     * @var string
     */
    private $encoding = '';

    /**
     * Constructor
     *
     * @access public
     * @param  string  $content   Feed content
     * @param  string  $encoding  Feed encoding
     */
    public function __construct($content = '', $encoding = '')
    {
        $this->content = $content;

        // The encoding given by the caller must be kept: it is forwarded
        // to the parser created by getParser().
        $this->encoding = $encoding;
    }

    /**
     * Download a feed
     *
     * The content, the final URL and the encoding reported by the client
     * are stored on the reader after the request has been executed.
     *
     * @access public
     * @param  string   $url            Feed or site URL ("http://" is prepended when no HTTP scheme is present)
     * @param  string   $last_modified  Last modified HTTP header
     * @param  string   $etag           Etag HTTP header
     * @param  integer  $timeout        Client connection timeout
     * @param  string   $user_agent     HTTP user-agent
     * @return Client
     */
    public function download($url, $last_modified = '', $etag = '', $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)')
    {
        if (! $this->hasHttpScheme($url)) {
            $url = 'http://'.$url;
        }

        $client = Client::create();
        $client->url = $url;
        $client->timeout = $timeout;
        $client->user_agent = $user_agent;
        $client->last_modified = $last_modified;
        $client->etag = $etag;
        $client->execute();

        $this->content = $client->getContent();
        $this->url = $client->getUrl();
        $this->encoding = $client->getEncoding();

        return $client;
    }

    /**
     * Get the downloaded content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get the final URL
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get the first XML tag
     *
     * HTML comments, the doctype and the XML declaration are removed first.
     * The returned string starts at the opening "<" and stops just before
     * the matching closing bracket (which is not included).
     *
     * @access public
     * @param  string  $data  Feed content
     * @return string         First tag, or an empty string when no complete tag is found
     */
    public function getFirstTag($data)
    {
        // Strip HTML comments (max of 5,000 characters long to prevent crashing).
        // The quantifier is lazy so that each comment is removed on its own
        // and the markup located between two comments is preserved.
        $stripped = preg_replace('/<!--.{0,5000}?-->/is', '', $data);

        // preg_replace() returns null on failure: keep the previous data in that case
        if ($stripped !== null) {
            $data = $stripped;
        }

        /* Strip Doctype:
         * Doctype needs to be within the first 100 characters. (Ideally the first!)
         * If it's not found by then, we need to stop looking to prevent PREG
         * from reaching max backtrack depth and crashing.
         */
        $stripped = preg_replace('/^.{0,100}<!DOCTYPE([^>]*)>/Uis', '', $data);

        if ($stripped !== null) {
            $data = $stripped;
        }

        // Strip <?xml version....
        $data = Filter::stripXmlTag($data);

        // Find the first tag
        $open_tag = strpos($data, '<');

        if ($open_tag === false) {
            return '';
        }

        // Look for the closing bracket after the opening one only
        $close_tag = strpos($data, '>', $open_tag);

        if ($close_tag === false) {
            return '';
        }

        // The third argument of substr() is a length, not an end offset
        return substr($data, $open_tag, $close_tag - $open_tag);
    }

    /**
     * Discover feed format and return a parser instance
     *
     * When the format is not recognized, the content is handled as a HTML
     * document: discover() is called once and the detection is done again
     * on the downloaded feed.
     *
     * @access public
     * @param  boolean  $discover  True when the autodiscovery has already been done (no new attempt is made)
     * @return mixed               False on failure or Parser instance
     */
    public function getParser($discover = false)
    {
        $format = $this->detectFormat($this->getFirstTag($this->content));

        switch ($format) {
            case 'atom':
                Logging::log(\get_called_class().': discover Atom feed');
                require_once __DIR__.'/Parsers/Atom.php';
                return new Parsers\Atom($this->content, $this->encoding);

            case 'rss20':
                Logging::log(\get_called_class().': discover RSS 2.0 feed');
                require_once __DIR__.'/Parsers/Rss20.php';
                return new Parsers\Rss20($this->content, $this->encoding);

            case 'rss92':
                Logging::log(\get_called_class().': discover RSS 0.92 feed');
                require_once __DIR__.'/Parsers/Rss92.php';
                return new Parsers\Rss92($this->content, $this->encoding);

            case 'rss91':
                Logging::log(\get_called_class().': discover RSS 0.91 feed');
                require_once __DIR__.'/Parsers/Rss91.php';
                return new Parsers\Rss91($this->content, $this->encoding);

            case 'rss10':
                Logging::log(\get_called_class().': discover RSS 1.0 feed');
                require_once __DIR__.'/Parsers/Rss10.php';
                return new Parsers\Rss10($this->content, $this->encoding);
        }

        // Second pass: the discovered document is not a supported feed either
        if ($discover === true) {
            Logging::log(\get_called_class().': Format not supported or malformed');
            Logging::log(\get_called_class().':'.PHP_EOL.$this->content);

            return false;
        }

        // First pass: try to find a feed link inside the document
        if ($this->discover()) {
            return $this->getParser(true);
        }

        Logging::log(\get_called_class().': Subscription not found');
        Logging::log(\get_called_class().': Content => '.PHP_EOL.$this->content);

        return false;
    }

    /**
     * Discover feed url inside a HTML document and download the feed
     *
     * Atom links are searched before RSS links, only the first link of each
     * type is considered.
     *
     * @access public
     * @return boolean  True when a feed link has been found and downloaded
     */
    public function discover()
    {
        if (! $this->content) {
            return false;
        }

        Logging::log(\get_called_class().': Try to discover a subscription');

        \libxml_use_internal_errors(true);

        $dom = new \DOMDocument;
        $dom->loadHTML($this->content);

        // Parsing errors are buffered by libxml: release them, nothing reads them here
        \libxml_clear_errors();

        $xpath = new \DOMXPath($dom);

        $queries = array(
            "//link[@type='application/atom+xml']",
            "//link[@type='application/rss+xml']"
        );

        foreach ($queries as $query) {

            $nodes = $xpath->query($query);

            if ($nodes->length === 0) {
                continue;
            }

            $href = trim($nodes->item(0)->getAttribute('href'));

            if ($href === '') {
                continue;
            }

            $link = $this->resolveLink($href);

            // A relative link cannot be resolved without a valid base URL
            if ($link === '') {
                continue;
            }

            Logging::log(\get_called_class().': Find subscription link: '.$link);
            $this->download($link);

            return true;
        }

        return false;
    }

    /**
     * Detect the feed format from the first XML tag
     *
     * @access private
     * @param  string  $first_tag  First tag of the document
     * @return string              "atom", "rss20", "rss92", "rss91", "rss10" or an empty string
     */
    private function detectFormat($first_tag)
    {
        if (strpos($first_tag, '<feed') !== false) {
            return 'atom';
        }

        if (strpos($first_tag, '<rss') !== false) {

            $versions = array(
                array('2.0', 'rss20'),
                array('0.92', 'rss92'),
                array('0.91', 'rss91'),
            );

            foreach ($versions as $version) {
                if ($this->hasAttributeValue($first_tag, 'version', $version[0])) {
                    return $version[1];
                }
            }
        }

        if (strpos($first_tag, '<rdf:') !== false && $this->hasAttributeValue($first_tag, 'xmlns', 'http://purl.org/rss/1.0/')) {
            return 'rss10';
        }

        return '';
    }

    /**
     * Check if a tag contains an attribute with the given value (single or double quoted)
     *
     * @access private
     * @param  string  $tag    Tag content
     * @param  string  $name   Attribute name
     * @param  string  $value  Expected attribute value
     * @return boolean
     */
    private function hasAttributeValue($tag, $name, $value)
    {
        return strpos($tag, $name.'="'.$value.'"') !== false
            || strpos($tag, $name."='".$value."'") !== false;
    }

    /**
     * Build an absolute URL from a link found in the current document
     *
     * @access private
     * @param  string  $link  Absolute, protocol-relative, root-relative or document-relative link
     * @return string         Absolute URL, or an empty string when the link cannot be resolved
     */
    private function resolveLink($link)
    {
        if ($link === '') {
            return '';
        }

        if ($this->hasHttpScheme($link)) {
            return $link;
        }

        $base = parse_url($this->url);

        if ($base === false || empty($base['host'])) {
            return '';
        }

        $scheme = isset($base['scheme']) ? $base['scheme'] : 'http';

        // Protocol-relative link: //example.org/feed.xml
        if (strpos($link, '//') === 0) {
            return $scheme.':'.$link;
        }

        $origin = $scheme.'://'.$base['host'];

        if (isset($base['port'])) {
            $origin .= ':'.$base['port'];
        }

        // Root-relative link: /feed.xml
        if ($link[0] === '/') {
            return $origin.$link;
        }

        $path = isset($base['path']) ? $base['path'] : '/';

        // Query-only link: applies to the current document
        if ($link[0] === '?') {
            return $origin.$path.$link;
        }

        // Document-relative link: resolved against the directory of the current document
        $slash = strrpos($path, '/');
        $directory = $slash === false ? '/' : substr($path, 0, $slash + 1);

        return $origin.$directory.$link;
    }

    /**
     * Check if a URL starts with the http:// or https:// scheme
     *
     * @access private
     * @param  string  $url  URL to check
     * @return boolean
     */
    private function hasHttpScheme($url)
    {
        return preg_match('#^https?://#i', $url) === 1;
    }
}
|