2015-04-28 18:08:42 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PicoFeed\Scraper;
|
|
|
|
|
|
|
|
use DOMXPath;
|
|
|
|
use PicoFeed\Parser\XmlParser;
|
|
|
|
|
|
|
|
/**
|
2015-10-19 22:49:30 -04:00
|
|
|
* Rule Parser.
|
2015-04-28 18:08:42 +02:00
|
|
|
*
|
|
|
|
* @author Frederic Guillot
|
|
|
|
*/
|
|
|
|
class RuleParser implements ParserInterface
{
    /**
     * Document returned by XmlParser::getHtmlDocument() for the given markup.
     * It is handed to DOMXPath and used to serialize matched nodes.
     */
    private $dom;

    /**
     * XPath evaluator bound to $dom.
     */
    private $xpath;

    /**
     * Rule set; the keys read by this class are 'strip', 'body' and
     * 'next_page', each expected to hold an array of XPath expressions.
     */
    private $rules = array();

    /**
     * Constructor.
     *
     * @param string $html  Markup to parse
     * @param array  $rules Rule set (see the 'strip', 'body' and 'next_page' keys)
     */
    public function __construct($html, array $rules)
    {
        $this->rules = $rules;

        // The XML declaration is prepended before parsing; presumably this
        // forces the HTML loader to treat the input as UTF-8 (confirm against
        // XmlParser::getHtmlDocument()).
        $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Get the relevant content with predefined rules.
     *
     * Applies the 'strip' rules first, then collects the 'body' matches.
     *
     * @return string
     */
    public function execute()
    {
        $this->stripTags();

        return $this->findContent();
    }

    /**
     * Remove from the document every node matched by the 'strip' rules.
     *
     * Attribute nodes are not children of their owner element, so they are
     * detached through the owner element. Nodes without a parent (such as the
     * document node itself) are skipped instead of causing a call on null.
     */
    public function stripTags()
    {
        foreach ($this->getPatterns('strip') as $pattern) {
            foreach ($this->queryNodes($pattern) as $node) {
                if ($node instanceof \DOMAttr) {
                    if ($node->ownerElement !== null) {
                        $node->ownerElement->removeAttributeNode($node);
                    }
                } elseif ($node instanceof \DOMNode && $node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    /**
     * Fetch content based on Xpath rules.
     *
     * Every node matched by every 'body' rule is serialized with saveXML()
     * and appended, in rule order then match order.
     *
     * @return string Concatenated markup, or an empty string when nothing matches
     */
    public function findContent()
    {
        $content = '';

        foreach ($this->getPatterns('body') as $pattern) {
            foreach ($this->queryNodes($pattern) as $node) {
                $content .= $this->dom->saveXML($node);
            }
        }

        return $content;
    }

    /**
     * Fetch next link based on Xpath rules.
     *
     * The 'next_page' rules are tried in order. For the first rule that yields
     * a usable node, the "href" attribute of the first matched element is
     * returned (an empty string if the element has no such attribute). When a
     * rule selects an attribute node directly (e.g. ".../@href"), the value of
     * that attribute is returned. Other node types are ignored.
     *
     * @return string|null Link target, or null when no rule matches
     */
    public function findNextLink()
    {
        foreach ($this->getPatterns('next_page') as $pattern) {
            foreach ($this->queryNodes($pattern) as $node) {
                if ($node instanceof \DOMElement) {
                    return $node->getAttribute('href');
                }

                if ($node instanceof \DOMAttr) {
                    return $node->value;
                }
            }
        }

        return null;
    }

    /**
     * Get the XPath expressions registered under a rule key.
     *
     * @param string $name Rule key ('strip', 'body' or 'next_page')
     *
     * @return array Expressions, or an empty array when the key is missing or not an array
     */
    private function getPatterns($name)
    {
        if (isset($this->rules[$name]) && is_array($this->rules[$name])) {
            return $this->rules[$name];
        }

        return array();
    }

    /**
     * Evaluate an XPath expression against the document.
     *
     * @param string $pattern XPath expression
     *
     * @return \DOMNodeList|array Matched nodes; an empty array when query() returns false
     */
    private function queryNodes($pattern)
    {
        $nodes = $this->xpath->query($pattern);

        return $nodes === false ? array() : $nodes;
    }
}
|