2014-12-23 21:28:26 -05:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed\Reader;
|
|
|
|
|
|
|
|
use DOMXPath;
|
2016-03-24 17:49:50 -04:00
|
|
|
use PicoFeed\Base;
|
2014-12-23 21:28:26 -05:00
|
|
|
use PicoFeed\Client\Client;
|
|
|
|
use PicoFeed\Client\Url;
|
|
|
|
use PicoFeed\Logging\Logger;
|
|
|
|
use PicoFeed\Parser\XmlParser;
|
|
|
|
|
|
|
|
/**
 * Reader class.
 *
 * Downloads feeds, discovers feed urls advertised by HTML pages and
 * instantiates the parser matching the detected feed format.
 *
 * @author Frederic Guillot
 */
|
2016-03-24 17:49:50 -04:00
|
|
|
class Reader extends Base
|
2014-12-23 21:28:26 -05:00
|
|
|
{
|
|
|
|
/**
 * XPath queries used to recognise each supported feed format.
 *
 * Each key is the name returned by detectFormat() and appended to the
 * \PicoFeed\Parser\ namespace by getParser(); each value is the XPath
 * expression evaluated against the parsed document. Entries are tried
 * in declaration order.
 *
 * @var array
 */
private $formats = array(
    'Atom' => '//feed',
    'Rss20' => '//rss[@version="2.0"]',
    'Rss92' => '//rss[@version="0.92"]',
    'Rss91' => '//rss[@version="0.91"]',
    'Rss10' => '//rdf',
);
|
|
|
|
|
|
|
|
/**
 * Fetch a feed from the given url without attempting any discovery.
 *
 * The url is first normalised with prependScheme(), then requested through
 * the HTTP client set up with this reader's config, the supplied
 * Last-Modified/Etag values and the optional credentials.
 *
 * @param string $url           Feed url
 * @param string $last_modified Last modified HTTP header
 * @param string $etag          Etag HTTP header
 * @param string $username      HTTP basic auth username
 * @param string $password      HTTP basic auth password
 *
 * @return \PicoFeed\Client\Client
 */
public function download($url, $last_modified = '', $etag = '', $username = '', $password = '')
{
    $feedUrl = $this->prependScheme($url);

    // Configure the client completely before firing the request
    $client = Client::getInstance()
        ->setConfig($this->config)
        ->setLastModified($last_modified)
        ->setEtag($etag)
        ->setUsername($username)
        ->setPassword($password);

    return $client->execute($feedUrl);
}
|
|
|
|
|
|
|
|
/**
 * Download a url and, if it is not a feed itself, follow the first feed
 * link found in the returned document.
 *
 * @param string $url           Feed or website url
 * @param string $last_modified Last modified HTTP header
 * @param string $etag          Etag HTTP header
 * @param string $username      HTTP basic auth username
 * @param string $password      HTTP basic auth password
 *
 * @throws SubscriptionNotFoundException When the document contains no feed link
 *
 * @return \PicoFeed\Client\Client
 */
public function discover($url, $last_modified = '', $etag = '', $username = '', $password = '')
{
    $client = $this->download($url, $last_modified, $etag, $username, $password);

    // The resource was not modified: nothing to inspect
    if (!$client->isModified()) {
        return $client;
    }

    // The response is already a feed in a recognised format
    if ($this->detectFormat($client->getContent())) {
        return $client;
    }

    // Otherwise look for subscriptions advertised by the document
    $candidates = $this->find($client->getUrl(), $client->getContent());

    if (empty($candidates)) {
        throw new SubscriptionNotFoundException('Unable to find a subscription');
    }

    // Only the first candidate is downloaded
    return $this->download($candidates[0], $last_modified, $etag, $username, $password);
}
|
|
|
|
|
|
|
|
/**
 * Find feed urls inside a HTML document.
 *
 * Collects the href of every <link> element whose type attribute is
 * "application/rss+xml" or "application/atom+xml". All RSS links are
 * listed before Atom links because the queries run one after the other.
 * Relative hrefs are resolved against the base url of the website.
 *
 * @param string $url  Website url
 * @param string $html HTML content
 *
 * @return array List of feed links
 */
public function find($url, $html)
{
    Logger::setMessage(get_called_class().': Try to discover subscriptions');

    $dom = XmlParser::getHtmlDocument($html);
    $xpath = new DOMXPath($dom);
    $links = array();

    // The website url is identical for every link, so its Url object is
    // created at most once, and only if a relative link actually needs it
    $siteUrl = null;

    $queries = array(
        '//link[@type="application/rss+xml"]',
        '//link[@type="application/atom+xml"]',
    );

    foreach ($queries as $query) {
        $nodes = $xpath->query($query);

        foreach ($nodes as $node) {
            $link = $node->getAttribute('href');

            // Skip <link> elements without a usable href
            if (empty($link)) {
                continue;
            }

            $feedUrl = new Url($link);
            $baseUrl = '';

            if ($feedUrl->isRelativeUrl()) {
                if ($siteUrl === null) {
                    $siteUrl = new Url($url);
                }

                $baseUrl = $siteUrl->getBaseUrl();
            }

            $links[] = $feedUrl->getAbsoluteUrl($baseUrl);
        }
    }

    Logger::setMessage(get_called_class().': '.implode(', ', $links));

    return $links;
}
|
|
|
|
|
|
|
|
/**
 * Build the parser matching the format of the given feed content.
 *
 * The format name returned by detectFormat() is appended to the
 * \PicoFeed\Parser\ namespace to obtain the parser class, which then
 * receives the hash algorithm and the config of this reader.
 *
 * @param string $url      Site url
 * @param string $content  Feed content
 * @param string $encoding HTTP encoding
 *
 * @throws UnsupportedFeedFormatException When no known format is detected
 *
 * @return \PicoFeed\Parser\Parser
 */
public function getParser($url, $content, $encoding)
{
    $format = $this->detectFormat($content);

    if (!$format) {
        throw new UnsupportedFeedFormatException('Unable to detect feed format');
    }

    $parserClass = '\PicoFeed\Parser\\'.$format;
    $instance = new $parserClass($content, $encoding, $url);

    // The hash algorithm is applied before the config, as callers expect
    $instance->setHashAlgo($this->config->getParserHashAlgo());
    $instance->setConfig($this->config);

    return $instance;
}
|
|
|
|
|
|
|
|
/**
 * Detect the feed format.
 *
 * Runs each XPath query of $this->formats against the parsed content, in
 * declaration order, and stops at the first query that matches exactly
 * one node.
 *
 * @param string $content Feed content
 *
 * @return string Name of the matching format, or an empty string if none matches
 */
public function detectFormat($content)
{
    $xpath = new DOMXPath(XmlParser::getHtmlDocument($content));
    $detected = '';

    foreach ($this->formats as $name => $expression) {
        $matches = $xpath->query($expression);

        // Exactly one root-like element is required to accept a format
        if ($matches->length === 1) {
            $detected = $name;
            break;
        }
    }

    return $detected;
}
|
|
|
|
|
|
|
|
/**
 * Add the prefix "http://" if the end-user just entered a domain name.
 *
 * The url is returned untouched when it already starts with "http://" or
 * "https://". The scheme is matched case-insensitively, so an url such as
 * "HTTP://example.org" is not prefixed a second time.
 *
 * @param string $url Url
 *
 * @return string
 */
public function prependScheme($url)
{
    // URI schemes are case-insensitive, hence the "i" modifier
    if (!preg_match('%^https?://%i', $url)) {
        $url = 'http://'.$url;
    }

    return $url;
}
|
|
|
|
}
|