2013-02-17 21:48:21 -05:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed;
|
|
|
|
|
2014-05-20 14:20:27 -04:00
|
|
|
use DOMXPath;
|
|
|
|
use PicoFeed\Config;
|
|
|
|
use PicoFeed\XmlParser;
|
|
|
|
use PicoFeed\Logging;
|
|
|
|
use PicoFeed\Filter;
|
|
|
|
use PicoFeed\Client;
|
|
|
|
use PicoFeed\Parser;
|
2013-04-06 21:14:52 -04:00
|
|
|
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
|
|
|
|
* Reader class
|
|
|
|
*
|
|
|
|
* @author Frederic Guillot
|
2014-05-20 14:20:27 -04:00
|
|
|
* @package picofeed
|
2014-03-29 19:48:29 -04:00
|
|
|
*/
|
2013-02-17 21:48:21 -05:00
|
|
|
class Reader
|
|
|
|
{
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
 * URL of the feed or site.
 *
 * Overwritten by download() with the URL reported by the HTTP client,
 * or set explicitly through setUrl().
 *
 * @access private
 * @var string
 */
private $url = '';

/**
 * Raw document body.
 *
 * Filled by download() or setContent(); this is what the format
 * detection and the feed discovery work on.
 *
 * @access private
 * @var string
 */
private $content = '';

/**
 * Encoding reported by the HTTP client for the downloaded document.
 *
 * @access private
 * @var string
 */
private $encoding = '';

/**
 * Configuration used to set up the HTTP client and the parsers.
 *
 * Always assigned by the constructor.
 *
 * @access private
 * @var \PicoFeed\Config
 */
private $config;
|
|
|
|
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
 * Build a reader.
 *
 * Stores the given configuration (a default Config is created when none
 * is supplied) and forwards its timezone to the logger.
 *
 * @access public
 * @param  \PicoFeed\Config   $config   Config class instance (optional)
 */
public function __construct(Config $config = null)
{
    // Fall back to a default configuration when the caller gave none
    if (! $config) {
        $config = new Config;
    }

    $this->config = $config;

    Logging::setTimezone($this->config->getTimezone());
}
|
|
|
|
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
 * Download a feed
 *
 * Configures the client returned by Client::getInstance() from the config
 * and fetches the URL. When execute() reports success, the content, the URL
 * and the encoding returned by the client are stored on the reader; otherwise
 * the values already stored are left untouched.
 *
 * @access public
 * @param  string  $url              Feed or site URL ("http://" is prepended when no http(s) scheme is present)
 * @param  string  $last_modified    Last modified HTTP header
 * @param  string  $etag             Etag HTTP header
 * @return \PicoFeed\Client
 */
public function download($url, $last_modified = '', $etag = '')
{
    // Look for a real http(s) scheme, case-insensitively. A plain
    // "starts with http" test would leave hosts such as "httpbin.org/feed"
    // without any scheme and would mangle "HTTP://example.org".
    if (preg_match('#^https?://#i', $url) !== 1) {
        $url = 'http://'.$url;
    }

    $client = Client::getInstance();
    $client->setTimeout($this->config->getClientTimeout())
           ->setUserAgent($this->config->getClientUserAgent())
           ->setMaxRedirections($this->config->getMaxRedirections())
           ->setMaxBodySize($this->config->getMaxBodySize())
           ->setProxyHostname($this->config->getProxyHostname())
           ->setProxyPort($this->config->getProxyPort())
           ->setProxyUsername($this->config->getProxyUsername())
           ->setProxyPassword($this->config->getProxyPassword())
           ->setLastModified($last_modified)
           ->setEtag($etag);

    // Only overwrite the reader state when the request succeeded
    if ($client->execute($url)) {
        $this->content = $client->getContent();
        $this->url = $client->getUrl();
        $this->encoding = $client->getEncoding();
    }

    return $client;
}
|
|
|
|
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
 * Create a parser for the current content and apply the reader config to it
 *
 * The parser file is loaded from the Parsers directory next to this file,
 * then the class is instantiated with the stored content and encoding.
 *
 * @access public
 * @param  string  $name  Parser name (e.g. "Atom", "Rss20")
 * @return \PicoFeed\Parser
 */
public function getParserInstance($name)
{
    // Load the parser implementation
    $parser_file = __DIR__.'/Parsers/'.ucfirst($name).'.php';
    require_once $parser_file;

    // Fully-qualified class name of the parser
    $parser_class = '\PicoFeed\Parsers\\'.$name;

    $parser = new $parser_class($this->content, $this->encoding);

    // Propagate the reader configuration to the parser
    $parser->setHashAlgo($this->config->getParserHashAlgo());
    $parser->setTimezone($this->config->getTimezone());
    $parser->setConfig($this->config);

    return $parser;
}
|
|
|
|
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
 * Get the first XML tag
 *
 * HTML comments, a leading DOCTYPE and the XML declaration are removed first;
 * the returned string then runs from the first "<" up to and including the
 * first ">" that follows it.
 *
 * @access public
 * @param  string  $data  Feed content
 * @return string         First tag, or an empty string when no complete tag is found
 */
public function getFirstTag($data)
{
    // Strip HTML comments (max of 5,000 characters long to prevent crashing).
    // The quantifier must be lazy so that each comment is removed on its own:
    // combining the "U" modifier with "?" makes it greedy and would also
    // delete any markup located between two comments.
    $stripped = preg_replace('/<!--.{0,5000}?-->/is', '', $data);

    // preg_replace() returns null on a PCRE error: keep the input in that case
    if ($stripped !== null) {
        $data = $stripped;
    }

    /* Strip Doctype:
     * Doctype needs to be within the first 100 characters. (Ideally the first!)
     * If it's not found by then, we need to stop looking to prevent PREG
     * from reaching max backtrack depth and crashing.
     */
    $stripped = preg_replace('/^.{0,100}<!DOCTYPE([^>]*)>/Uis', '', $data);

    if ($stripped !== null) {
        $data = $stripped;
    }

    // Strip <?xml version....
    $data = Filter::stripXmlTag($data);

    // Find the first tag
    $open_tag = strpos($data, '<');

    if ($open_tag === false) {
        return '';
    }

    // The closing bracket has to be searched after the opening one
    $close_tag = strpos($data, '>', $open_tag);

    if ($close_tag === false) {
        return '';
    }

    // substr() expects a length, not an end position
    return substr($data, $open_tag, $close_tag - $open_tag + 1);
}
|
|
|
|
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
 * Check whether a tag matches a feed format
 *
 * The format matches when every needle is found in the haystack
 * (an empty needle list therefore always matches).
 *
 * @access public
 * @param  string  $parser_name  Parser name
 * @param  string  $haystack     First XML tag
 * @param  array   $needles      List of strings that need to be there
 * @return mixed                 False on failure or Parser instance
 */
public function detectFormat($parser_name, $haystack, array $needles)
{
    // Count the needles that are absent from the tag
    $missing = 0;

    foreach ($needles as $needle) {
        if (strpos($haystack, $needle) === false) {
            $missing++;
        }
    }

    if ($missing > 0) {
        return false;
    }

    Logging::setMessage(get_called_class().': Format detected => '.$parser_name);

    return $this->getParserInstance($parser_name);
}
|
2013-07-06 14:29:45 -04:00
|
|
|
|
2014-05-20 14:20:27 -04:00
|
|
|
/**
 * Detect the feed format of the current content and return a parser instance
 *
 * The first tag of the content is tested against each known format, in order.
 * When nothing matches and $discover is false, discover() is called to look
 * for a feed link in the document and, if it succeeds, the detection is run
 * once more on the new content.
 *
 * @access public
 * @param  boolean  $discover  True when discovery must not be attempted (used for the
 *                             second pass that follows a successful discover())
 * @return mixed               False on failure or Parser instance
 */
public function getParser($discover = false)
{
    $first_tag = $this->getFirstTag($this->content);

    // Parser name => strings that must all appear in the first tag.
    // The order matters: the first matching format wins.
    $signatures = array(
        'Atom'  => array('<feed'),
        'Rss20' => array('<rss', '2.0'),
        'Rss92' => array('<rss', '0.92'),
        'Rss91' => array('<rss', '0.91'),
        // Only the rdf prefix is checked; the xmlns="http://purl.org/rss/1.0/" needle is disabled
        'Rss10' => array('<rdf:'),
    );

    foreach ($signatures as $parser_name => $needles) {

        $parser = $this->detectFormat($parser_name, $first_tag, $needles);

        if ($parser !== false) {
            return $parser;
        }
    }

    // First pass: try to find a feed link in the document
    if ($discover !== true) {

        if ($this->discover()) {
            return $this->getParser(true);
        }

        Logging::setMessage(get_called_class().': Subscription not found');
        Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content);

        return false;
    }

    // Second pass: the discovered document is not a supported feed either
    Logging::setMessage(get_called_class().': Format not supported or feed malformed');
    Logging::setMessage(get_called_class().': Content => '.PHP_EOL.$this->content);

    return false;
}
|
|
|
|
|
2014-03-29 19:48:29 -04:00
|
|
|
/**
 * Discover the feed url inside a HTML document and download the feed
 *
 * The document is searched for RSS then Atom <link> elements. Only the first
 * element of each kind is considered; the first one with a non-empty href is
 * resolved to an absolute URL and passed to download().
 *
 * @access public
 * @return boolean  True when a feed link was found (the result of the download is not checked)
 */
public function discover()
{
    if (! $this->content) {
        return false;
    }

    Logging::setMessage(get_called_class().': Try to discover a subscription');

    $dom = XmlParser::getHtmlDocument($this->content);
    $xpath = new DOMXPath($dom);

    $queries = array(
        '//link[@type="application/rss+xml"]',
        '//link[@type="application/atom+xml"]',
    );

    foreach ($queries as $query) {

        $nodes = $xpath->query($query);

        if ($nodes->length !== 0) {

            $link = $nodes->item(0)->getAttribute('href');

            if (! empty($link)) {

                $link = $this->resolveFeedLink($link);

                Logging::setMessage(get_called_class().': Find subscription link: '.$link);
                $this->download($link);

                return true;
            }
        }
    }

    return false;
}

/**
 * Turn the href of a feed link into an absolute URL, using the reader URL as base
 *
 * The reader URL itself is never modified.
 *
 * @access private
 * @param  string  $link  Non-empty href value
 * @return string
 */
private function resolveFeedLink($link)
{
    // Already absolute
    if (preg_match('#^https?://#i', $link) === 1) {
        return $link;
    }

    // parse_url() may return false; isset()/empty() below cope with that
    $parts = parse_url($this->url);
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

    // Protocol-relative link: //example.org/feed.xml
    if (strpos($link, '//') === 0) {
        return $scheme.':'.$link;
    }

    // Root-relative link: resolve against the host, not against the page path
    if (strpos($link, '/') === 0 && ! empty($parts['host'])) {

        $authority = $parts['host'];

        if (isset($parts['user'])) {
            $credentials = $parts['user'];

            if (isset($parts['pass'])) {
                $credentials .= ':'.$parts['pass'];
            }

            $authority = $credentials.'@'.$authority;
        }

        if (isset($parts['port'])) {
            $authority .= ':'.$parts['port'];
        }

        return $scheme.'://'.$authority.$link;
    }

    // Other relative links: append to the reader URL with exactly one slash in between
    if (strpos($link, '/') === 0) {
        $link = substr($link, 1);
    }

    $base = $this->url;

    // substr() with a negative offset is safe on an empty string
    if (substr($base, -1) !== '/') {
        $base .= '/';
    }

    return $base.$link;
}
|
2014-05-20 14:20:27 -04:00
|
|
|
|
|
|
|
/**
 * Return the document body currently held by the reader
 *
 * This is the content stored by download() or setContent().
 *
 * @access public
 * @return string
 */
public function getContent()
{
    return $this->content;
}
|
|
|
|
|
|
|
|
/**
 * Replace the document body held by the reader
 *
 * @access public
 * @param  string  $content  Page content
 * @return \PicoFeed\Reader  The reader itself, to allow chaining
 */
public function setContent($content)
{
    $this->content = $content;

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Return the URL currently held by the reader
 *
 * After a successful download() this is the URL reported by the HTTP client.
 *
 * @access public
 * @return string
 */
public function getUrl()
{
    return $this->url;
}
|
|
|
|
|
|
|
|
/**
 * Replace the URL held by the reader
 *
 * @access public
 * @param  string  $url  URL
 * @return \PicoFeed\Reader  The reader itself, to allow chaining
 */
public function setUrl($url)
{
    $this->url = $url;

    return $this;
}
|
2013-02-17 21:48:21 -05:00
|
|
|
}
|