2015-04-28 18:08:42 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed\Scraper;
|
|
|
|
|
2016-03-24 17:49:50 -04:00
|
|
|
use PicoFeed\Base;
|
2015-04-28 18:08:42 +02:00
|
|
|
use PicoFeed\Logging\Logger;
|
|
|
|
|
|
|
|
/**
 * RuleLoader class.
 *
 * Resolves the hostname of an URL to a list of candidate rule file names and
 * loads the first matching PHP rule file found in the configured rule folders.
 *
 * @author  Frederic Guillot
 * @author  Bernhard Posselt
 */
class RuleLoader extends Base
{
    /**
     * Get the rules for an URL.
     *
     * The folders returned by getRulesFolders() are searched in order and the
     * first non-empty rule set wins.
     *
     * @param string $url the URL that should be looked up
     *
     * @return array the array containing the rules, or an empty array when
     *               the URL has no usable hostname or no rule file matches
     */
    public function getRules($url)
    {
        $hostname = parse_url($url, PHP_URL_HOST);

        // parse_url() returns false for seriously malformed URLs and null
        // when the URL has no host component, so both must be rejected.
        if (!is_string($hostname) || $hostname === '') {
            return array();
        }

        // The hostname becomes part of an include path: refuse anything that
        // could escape the rules folder (directory separators, NUL byte).
        if (strpbrk($hostname, "/\\\0") !== false) {
            return array();
        }

        $files = $this->getRulesFileList($hostname);

        foreach ($this->getRulesFolders() as $folder) {
            $rule = $this->loadRuleFile($folder, $files);

            if (!empty($rule)) {
                return $rule;
            }
        }

        return array();
    }

    /**
     * Get the list of possible rules file names for a given hostname.
     *
     * The list is ordered from the most specific name to the least specific
     * one. Only the first label of the hostname is stripped, so for
     * "a.b.domain.tld" the second candidate is "b.domain.tld".
     *
     * @param string $hostname Hostname
     *
     * @return array List of file names (without extension)
     */
    public function getRulesFileList($hostname)
    {
        $files = array($hostname);                  // subdomain.domain.tld
        $parts = explode('.', $hostname);
        $len = count($parts);

        if ($len > 2) {
            $subdomain = array_shift($parts);
            $files[] = implode('.', $parts);        // domain.tld
            $files[] = '.'.implode('.', $parts);    // .domain.tld
            $files[] = $subdomain;                  // subdomain
        } elseif ($len === 2) {
            $files[] = '.'.implode('.', $parts);    // .domain.tld
            $files[] = $parts[0];                   // domain
        }

        return $files;
    }

    /**
     * Load a rule file from the defined folder.
     *
     * Candidates are tried in the given order; the first one that exists as
     * a regular file "<folder>/<name>.php" is included and its return value
     * is handed back to the caller.
     *
     * @param string $folder Rule directory
     * @param array  $files  List of possible file names
     *
     * @return array Value returned by the included rule file, or an empty
     *               array when none of the candidates exists
     */
    public function loadRuleFile($folder, array $files)
    {
        foreach ($files as $file) {
            $filename = $folder.'/'.$file.'.php';

            // is_file() rather than file_exists(): a directory carrying the
            // same name must never be handed to include.
            if (is_file($filename)) {
                Logger::setMessage(get_called_class().' Load rule: '.$file);

                return include $filename;
            }
        }

        return array();
    }

    /**
     * Get the list of folders that contains rules.
     *
     * The folder defined in the configuration (if any) comes first so that
     * its rules take precedence over the bundled ones.
     *
     * @return array
     */
    public function getRulesFolders()
    {
        $folders = array();

        if ($this->config !== null) {
            $customFolder = $this->config->getGrabberRulesFolder();

            if ($customFolder !== null) {
                $folders[] = $customFolder;
            }
        }

        $folders[] = __DIR__.'/../Rules';

        return $folders;
    }
}
|