url = $url;
$this->html = $html;
$this->encoding = $encoding;
}
/**
* Set config object
*
* The instance is only stored here; download() reads its grabber and proxy
* settings when it is not null.
*
* @access public
* @param \PicoFeed\Config $config Config instance
* @return \PicoFeed\Grabber Current instance, so that calls can be chained
*/
public function setConfig($config)
{
$this->config = $config;
return $this;
}
/**
* Get relevant content
*
* Returns the content property as filled by parseContentWithRules() or
* parseContentWithCandidates(), which parse() calls.
*
* @access public
* @return string
*/
public function getContent()
{
return $this->content;
}
/**
* Get raw content (unfiltered)
*
* Returns the html property: the markup received by the constructor or
* fetched by download(). Note that parse() overwrites this property with
* the result of Filter::stripHeadTags() and of the UTF-8 conversion, so
* after parse() the value is no longer the markup as it was received.
*
* @access public
* @return string
*/
public function getRawContent()
{
return $this->html;
}
/**
 * Parse the HTML content
 *
 * Passes the markup through Filter::stripHeadTags(), converts it to UTF-8 and
 * extracts the relevant content: with the predefined rules when getRules()
 * returns an array, with the candidate attributes otherwise.
 * Nothing is extracted when no HTML is loaded.
 *
 * @access public
 * @return bool    True when the content is not an empty string
 */
public function parse()
{
    $caller = get_called_class();

    if ($this->html) {
        Logging::setMessage($caller.' Fix encoding');
        Logging::setMessage($caller.': HTTP Encoding "'.$this->encoding.'"');

        $this->html = Filter::stripHeadTags($this->html);

        // Charset names are case-insensitive: "Windows-1251" must take the cp1251 path too
        $charset = strtolower(trim((string) $this->encoding));

        if ($charset === 'windows-1251') {
            $this->html = Encoding::cp1251ToUtf8($this->html);
        }
        else {
            $this->html = Encoding::toUTF8($this->html);
        }

        Logging::setMessage($caller.' Content length: '.strlen($this->html).' bytes');

        $rules = $this->getRules();

        if (is_array($rules)) {
            Logging::setMessage($caller.' Parse content with rules');
            $this->parseContentWithRules($rules);
        }
        else {
            Logging::setMessage($caller.' Parse content with candidates');
            $this->parseContentWithCandidates();
        }
    }
    else {
        Logging::setMessage($caller.' No content fetched');
    }

    // This second "Content length" entry is the size of the extracted content, not of the page
    Logging::setMessage($caller.' Content length: '.strlen($this->content).' bytes');
    Logging::setMessage($caller.' Grabber done');

    return $this->content !== '';
}
/**
 * Download the HTML content
 *
 * Requests the URL with the client returned by Client::getInstance(). When a
 * config object was set, its grabber and proxy settings are applied to the
 * client first. The body and the encoding reported by the client are kept
 * in the html and encoding properties.
 *
 * @access public
 * @return string    HTML content returned by the client
 */
public function download()
{
    $client = Client::getInstance();
    $config = $this->config;

    if (null !== $config) {
        $client
            ->setTimeout($config->getGrabberTimeout())
            ->setUserAgent($config->getGrabberUserAgent())
            ->setMaxRedirections($config->getMaxRedirections())
            ->setMaxBodySize($config->getMaxBodySize())
            ->setProxyHostname($config->getProxyHostname())
            ->setProxyPort($config->getProxyPort())
            ->setProxyUsername($config->getProxyUsername())
            ->setProxyPassword($config->getProxyPassword());
    }

    $client->execute($this->url);

    $this->html = $client->getContent();
    $this->encoding = $client->getEncoding();

    return $this->html;
}
/**
 * Try to find a predefined rule
 *
 * Looks in the Rules directory for a file named after the hostname of the URL.
 * Names are tried in this order: the full hostname, the hostname without its
 * "www." prefix, the part starting at the first dot (dot included) and the
 * part before the first dot.
 *
 * @access public
 * @return mixed    Value returned by the first rule file found, false when there is none
 */
public function getRules()
{
    $hostname = parse_url($this->url, PHP_URL_HOST);

    // parse_url() gives null or false when the URL has no host or is malformed
    if (! is_string($hostname) || $hostname === '') {
        return false;
    }

    // The hostname becomes part of an include path: never let it leave the Rules directory
    if (strpbrk($hostname, "/\\\0") !== false) {
        return false;
    }

    $files = array($hostname);

    if (strncmp($hostname, 'www.', 4) === 0) {
        $files[] = substr($hostname, 4);
    }

    if (($pos = strpos($hostname, '.')) !== false) {
        // Keeps the leading dot, e.g. ".example.org" for "blog.example.org"
        $files[] = substr($hostname, $pos);
        $files[] = substr($hostname, 0, $pos);
    }

    foreach ($files as $file) {

        $filename = __DIR__.'/Rules/'.$file.'.php';

        if (file_exists($filename)) {
            Logging::setMessage(get_called_class().' Load rule: '.$file);
            return include $filename;
        }
    }

    return false;
}
/**
 * Get the relevant content with predefined rules
 *
 * Every XPath expression of the "strip" list is evaluated first and the matching
 * nodes are removed from the document. Then the nodes matched by each expression
 * of the "body" list are serialized and appended to the content.
 * Expressions that fail or match nothing are ignored.
 *
 * @access public
 * @param  array   $rules   Rules
 */
public function parseContentWithRules(array $rules)
{
    $dom = XmlParser::getHtmlDocument((string) $this->html);
    $xpath = new DOMXPath($dom);

    $stripPatterns = (isset($rules['strip']) && is_array($rules['strip'])) ? $rules['strip'] : array();

    foreach ($stripPatterns as $expression) {

        $matches = $xpath->query($expression);

        if ($matches === false || $matches->length < 1) {
            continue;
        }

        foreach ($matches as $match) {
            $match->parentNode->removeChild($match);
        }
    }

    $bodyPatterns = (isset($rules['body']) && is_array($rules['body'])) ? $rules['body'] : array();

    foreach ($bodyPatterns as $expression) {

        $matches = $xpath->query($expression);

        if ($matches === false || $matches->length < 1) {
            continue;
        }

        foreach ($matches as $match) {
            $this->content .= $dom->saveXML($match);
        }
    }
}
/**
 * Get the relevant content with the list of potential attributes
 *
 * Lookup order:
 * 1. the first element whose class contains, or whose id equals, one of the
 *    candidate attributes (elements with a "nav" or "page" class are skipped)
 * 2. the first article tag when no candidate gave any content
 * 3. the root element of the document when less than 50 bytes were found
 *
 * stripGarbage() is always applied to the result.
 *
 * @access public
 */
public function parseContentWithCandidates()
{
    $caller = get_called_class();
    $dom = XmlParser::getHtmlDocument(''.$this->html);
    $xpath = new DOMXPath($dom);

    // Try to lookup in each tag
    foreach ($this->candidatesAttributes as $candidate) {

        Logging::setMessage($caller.' Try this candidate: "'.$candidate.'"');

        $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
            Logging::setMessage($caller.' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
            break;
        }
    }

    // Try to fetch the article tag
    if (! $this->content) {

        $nodes = $xpath->query('//article');

        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
            Logging::setMessage($caller.' Find tag ('.strlen($this->content).' bytes)');
        }
    }

    if (strlen($this->content) < 50) {
        Logging::setMessage($caller.' No enought content fetched, get the full body');

        // Use the root element: the first child of a parsed HTML document is usually
        // the doctype (or a leading comment), which would yield no markup at all
        $this->content = $dom->saveXML($dom->documentElement);
    }

    Logging::setMessage($caller.' Strip garbage');
    $this->stripGarbage();
}
/**
 * Strip useless tags
 *
 * Loads the current content in a DOM document, removes every element named in
 * the stripTags list, then every element whose class or id contains one of the
 * stripAttributes values, and stores the serialized root element back in the
 * content. The content is left untouched when XmlParser::getDomDocument()
 * returns false.
 *
 * @access public
 */
public function stripGarbage()
{
    $dom = XmlParser::getDomDocument($this->content);

    if ($dom === false) {
        return;
    }

    $xpath = new DOMXPath($dom);

    foreach ($this->stripTags as $tag) {

        $matches = $xpath->query('//'.$tag);

        if ($matches === false || $matches->length < 1) {
            continue;
        }

        Logging::setMessage(get_called_class().' Strip tag: "'.$tag.'"');

        foreach ($matches as $match) {
            $match->parentNode->removeChild($match);
        }
    }

    foreach ($this->stripAttributes as $attribute) {

        $matches = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

        if ($matches === false || $matches->length < 1) {
            continue;
        }

        Logging::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');

        foreach ($matches as $match) {
            $match->parentNode->removeChild($match);
        }
    }

    $this->content = $dom->saveXML($dom->documentElement);
}
}