<?php
// miniflux-legacy/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php
namespace PicoFeed\Scraper;
use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url;
use PicoFeed\Config\Config;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
/**
 * Scraper class.
 *
 * Downloads the page behind a URL and extracts its relevant content, using
 * either the rules loaded for that URL or the generic candidate parser.
 *
 * @author Frederic Guillot
 */
class Scraper
{
    /**
     * URL.
     *
     * @var string
     */
    private $url = '';

    /**
     * Relevant content.
     *
     * @var string
     */
    private $content = '';

    /**
     * HTML content.
     *
     * @var string
     */
    private $html = '';

    /**
     * HTML content encoding.
     *
     * @var string
     */
    private $encoding = '';

    /**
     * Flag to enable candidates parsing.
     *
     * @var bool
     */
    private $enableCandidateParser = true;

    /**
     * Config object.
     *
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Constructor.
     *
     * Stores the configuration and forwards its timezone to the logger.
     *
     * @param \PicoFeed\Config\Config $config Config class instance
     */
    public function __construct(Config $config)
    {
        $this->config = $config;
        Logger::setTimezone($this->config->getTimezone());
    }

    /**
     * Disable candidates parsing.
     *
     * @return Scraper
     */
    public function disableCandidateParser()
    {
        $this->enableCandidateParser = false;

        return $this;
    }

    /**
     * Get encoding.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Set encoding.
     *
     * @param string $encoding
     *
     * @return Scraper
     */
    public function setEncoding($encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }

    /**
     * Get URL to download.
     *
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL to download.
     *
     * @param string $url URL
     *
     * @return Scraper
     */
    public function setUrl($url)
    {
        $this->url = $url;

        return $this;
    }

    /**
     * Return true if the scraper found relevant content.
     *
     * @return bool
     */
    public function hasRelevantContent()
    {
        return !empty($this->content);
    }

    /**
     * Get relevant content.
     *
     * @return string
     */
    public function getRelevantContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered).
     *
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Set raw content (unfiltered).
     *
     * @param string $html
     *
     * @return Scraper
     */
    public function setRawContent($html)
    {
        $this->html = $html;

        return $this;
    }

    /**
     * Get filtered relevant content.
     *
     * Runs the relevant content through the HTML filter, configured with
     * this scraper's config object.
     *
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, $this->url);
        $filter->setConfig($this->config);

        return $filter->execute();
    }

    /**
     * Download the HTML content.
     *
     * When a URL is set, the previous HTML, relevant content and encoding
     * are cleared before the request. On success the URL is replaced by the
     * one reported by the client. A ClientException is logged, not rethrown.
     *
     * @return bool True when the download succeeded, false when no URL is
     *              set or the client raised a ClientException
     */
    public function download()
    {
        if (!empty($this->url)) {
            // Clear everything
            $this->html = '';
            $this->content = '';
            $this->encoding = '';

            try {
                $client = Client::getInstance();
                $client->setConfig($this->config);
                $client->setTimeout($this->config->getGrabberTimeout());
                $client->setUserAgent($this->config->getGrabberUserAgent());
                $client->execute($this->url);

                $this->url = $client->getUrl();
                $this->html = $client->getContent();
                $this->encoding = $client->getEncoding();

                return true;
            } catch (ClientException $e) {
                Logger::setMessage(get_called_class().': '.$e->getMessage());
            }
        }

        return false;
    }

    /**
     * Execute the scraper.
     *
     * Downloads the page, then (unless processing must be skipped) prepares
     * the HTML and stores the output of the selected parser as the relevant
     * content. The result of download() is deliberately not checked here:
     * skipProcessing() stops the run when the raw HTML is empty.
     */
    public function execute()
    {
        $this->download();

        if (!$this->skipProcessing()) {
            $this->prepareHtml();

            $parser = $this->getParser();

            if ($parser !== null) {
                $this->content = $parser->execute();
                Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
            }
        }
    }

    /**
     * Returns true if the parsing must be skipped.
     *
     * Parsing is skipped for streaming videos, for PDF documents, and when
     * there is no raw HTML to work with.
     *
     * @return bool
     */
    public function skipProcessing()
    {
        // Same order as before: the video check runs first because it also
        // fills in the relevant content as a side effect.
        if ($this->detectStreamingVideos() || $this->detectPdfFiles()) {
            return true;
        }

        if (empty($this->html)) {
            Logger::setMessage(get_called_class().': Raw HTML is empty');

            return true;
        }

        return false;
    }

    /**
     * Get the parser.
     *
     * A RuleParser is returned for the first grabber rule whose pattern
     * matches the full path of the URL. The candidate parser is only used
     * when no grabber rules exist for the URL and it has not been disabled.
     *
     * @return ParserInterface|null Null when rules exist but none matches,
     *                              or when the candidate parser is disabled
     */
    public function getParser()
    {
        $ruleLoader = new RuleLoader($this->config);
        $rules = $ruleLoader->getRules($this->url);

        if (!empty($rules['grabber'])) {
            Logger::setMessage(get_called_class().': Parse content with rules');

            // The URL does not change while iterating, so resolve its full
            // path once instead of once per rule.
            $url = new Url($this->url);
            $sub_url = $url->getFullPath();

            foreach ($rules['grabber'] as $pattern => $rule) {
                if (preg_match($pattern, $sub_url)) {
                    Logger::setMessage(get_called_class().': Matched url '.$sub_url);

                    return new RuleParser($this->html, $rule);
                }
            }
        } elseif ($this->enableCandidateParser) {
            Logger::setMessage(get_called_class().': Parse content with candidates');

            return new CandidateParser($this->html);
        }

        return null;
    }

    /**
     * Normalize encoding and strip head tag.
     *
     * The encoding declared in the HTML meta tag takes precedence over the
     * encoding reported by the HTTP client.
     */
    public function prepareHtml()
    {
        $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

        $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
    }

    /**
     * Return the Youtube embed player and skip processing.
     *
     * When the URL contains an 11-character video id preceded by one of the
     * known markers, the relevant content is set to an embed iframe.
     *
     * @return bool True when a video id was found in the URL
     */
    public function detectStreamingVideos()
    {
        // The dot in "youtu.be" is escaped so that it only matches a literal
        // dot rather than any character.
        if (preg_match('#(?<=v=|v/|vi=|vi/|youtu\.be/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
            $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';

            return true;
        }

        return false;
    }

    /**
     * Skip processing for PDF documents.
     *
     * The match is case-insensitive and succeeds when either the whole URL
     * ends with "pdf" or the path component of the URL has a "pdf" extension.
     *
     * @return bool
     */
    public function detectPdfFiles()
    {
        // Historical check, kept for backward compatibility: the URL itself
        // ends with "pdf".
        if (strcasecmp((string) substr($this->url, -3), 'pdf') === 0) {
            return true;
        }

        // Look at the path alone so that a query string or fragment after
        // the file name does not hide the extension.
        $path = (string) parse_url($this->url, PHP_URL_PATH);

        return strcasecmp(pathinfo($path, PATHINFO_EXTENSION), 'pdf') === 0;
    }
}