2015-04-28 18:08:42 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed\Scraper;
|
|
|
|
|
2016-03-24 17:49:50 -04:00
|
|
|
use PicoFeed\Base;
|
2015-04-28 18:08:42 +02:00
|
|
|
use PicoFeed\Client\Client;
|
|
|
|
use PicoFeed\Client\ClientException;
|
|
|
|
use PicoFeed\Client\Url;
|
|
|
|
use PicoFeed\Encoding\Encoding;
|
|
|
|
use PicoFeed\Filter\Filter;
|
|
|
|
use PicoFeed\Logging\Logger;
|
|
|
|
use PicoFeed\Parser\XmlParser;
|
|
|
|
|
|
|
|
/**
 * Scraper class.
 *
 * Downloads a web page, normalizes its markup and extracts the relevant
 * content, using a rule-based parser when a rule pattern matches the URL
 * and the candidate parser otherwise (unless it has been disabled).
 *
 * @author  Frederic Guillot
 */
class Scraper extends Base
{
    /**
     * URL of the page to download.
     *
     * @var string
     */
    private $url = '';

    /**
     * Relevant content extracted by the parser.
     *
     * @var string
     */
    private $content = '';

    /**
     * Raw HTML content of the downloaded page.
     *
     * @var string
     */
    private $html = '';

    /**
     * Encoding reported by the HTTP client for the HTML content.
     *
     * @var string
     */
    private $encoding = '';

    /**
     * Flag to enable candidates parsing.
     *
     * @var bool
     */
    private $enableCandidateParser = true;

    /**
     * Disable candidates parsing.
     *
     * Once disabled, getParser() only returns a parser when a rule matches the URL.
     *
     * @return Scraper
     */
    public function disableCandidateParser()
    {
        $this->enableCandidateParser = false;

        return $this;
    }

    /**
     * Get encoding.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Set encoding.
     *
     * @param string $encoding
     *
     * @return Scraper
     */
    public function setEncoding($encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }

    /**
     * Get URL to download.
     *
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL to download.
     *
     * @param string $url URL
     *
     * @return Scraper
     */
    public function setUrl($url)
    {
        $this->url = $url;

        return $this;
    }

    /**
     * Return true if the scraper found relevant content.
     *
     * @return bool
     */
    public function hasRelevantContent()
    {
        return !empty($this->content);
    }

    /**
     * Get relevant content.
     *
     * @return string
     */
    public function getRelevantContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered).
     *
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Set raw content (unfiltered).
     *
     * NOTE(review): execute() resets the raw content before downloading, so a
     * value set here does not survive a call to execute() - confirm this is intended.
     *
     * @param string $html
     *
     * @return Scraper
     */
    public function setRawContent($html)
    {
        $this->html = $html;

        return $this;
    }

    /**
     * Get filtered relevant content.
     *
     * Runs the relevant content through the HTML filter, configured with this
     * scraper's config and the current URL.
     *
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, $this->url);
        $filter->setConfig($this->config);

        return $filter->execute();
    }

    /**
     * Download the HTML content.
     *
     * Resets the HTML, relevant content and encoding, then fetches the current
     * URL with the HTTP client. On success the URL, HTML and encoding are
     * replaced by the values reported by the client.
     *
     * @return bool True on success, false when no URL is set or the client
     *              raised a ClientException (the message is logged)
     */
    public function download()
    {
        if (empty($this->url)) {
            return false;
        }

        // Clear everything
        $this->html = '';
        $this->content = '';
        $this->encoding = '';

        try {
            $client = Client::getInstance();
            $client->setConfig($this->config);
            $client->setTimeout($this->config->getGrabberTimeout());
            $client->setUserAgent($this->config->getGrabberUserAgent());
            $client->execute($this->url);

            // Use the values reported by the client, which may differ from the request
            $this->url = $client->getUrl();
            $this->html = $client->getContent();
            $this->encoding = $client->getEncoding();

            return true;
        } catch (ClientException $e) {
            Logger::setMessage(get_called_class().': '.$e->getMessage());
        }

        return false;
    }

    /**
     * Execute the scraper.
     *
     * Downloads the current URL, prepares the HTML and parses it. When the
     * parser finds a link to a next page, the scraper follows it recursively
     * and concatenates the content of every page.
     *
     * @param string $pageContent    Content accumulated from previous pages (used by the recursion)
     * @param int    $recursionDepth Number of pages already followed (used by the recursion)
     */
    public function execute($pageContent = '', $recursionDepth = 0)
    {
        $this->html = '';
        $this->encoding = '';
        $this->content = '';
        $this->download();
        $this->prepareHtml();

        $parser = $this->getParser();

        if ($parser === null) {
            // No parser applies to this page: keep what previous pages produced
            // instead of dropping it (empty string on the first page).
            $this->content = $pageContent;

            return;
        }

        // Fall back to 25 pages when the configuration does not define a limit
        $maxRecursions = $this->config->getMaxRecursions();

        if ($maxRecursions === null) {
            $maxRecursions = 25;
        }

        $pageContent .= $parser->execute();
        $nextLink = $parser->findNextLink();

        if ($nextLink !== null && $recursionDepth < $maxRecursions) {
            // Follow the link to the next page; the innermost call stores the final content
            $this->setUrl(Url::resolve($nextLink, $this->url));
            $this->execute($pageContent, $recursionDepth + 1);
        } else {
            $this->content = $pageContent;
        }

        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
    }

    /**
     * Get the parser.
     *
     * Returns a rule parser when one of the "grabber" rule patterns loaded for
     * the current URL matches the URL's full path. Otherwise returns the
     * candidate parser, unless it was turned off with disableCandidateParser().
     *
     * @return ParserInterface|null Null when no rule matches and candidates parsing is disabled
     */
    public function getParser()
    {
        $ruleLoader = new RuleLoader($this->config);
        $rules = $ruleLoader->getRules($this->url);

        if (!empty($rules['grabber'])) {
            Logger::setMessage(get_called_class().': Parse content with rules');

            // The URL does not change while iterating, so compute its path once
            $url = new Url($this->url);
            $sub_url = $url->getFullPath();

            foreach ($rules['grabber'] as $pattern => $rule) {
                if (preg_match($pattern, $sub_url)) {
                    Logger::setMessage(get_called_class().': Matched url '.$sub_url);

                    return new RuleParser($this->html, $rule);
                }
            }
        }

        if ($this->enableCandidateParser) {
            Logger::setMessage(get_called_class().': Parse content with candidates');

            return new CandidateParser($this->html);
        }

        return null;
    }

    /**
     * Normalize encoding and strip head tag.
     *
     * The encoding declared in the HTML meta tag takes precedence; the encoding
     * reported by the HTTP client is used when the meta tag yields nothing.
     */
    public function prepareHtml()
    {
        $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

        $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
    }
}
|