url = $url; $this->html = $html; $this->encoding = $encoding; }

    /**
     * Set config object
     *
     * @access public
     * @param  \PicoFeed\Config\Config  $config   Config instance
     * @return Grabber
     */
    public function setConfig($config)
    {
        $this->config = $config;
        return $this;
    }

    /**
     * Get relevant content
     *
     * Returns whatever parse() extracted so far, without any filtering.
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered)
     *
     * Returns the HTML held by the grabber; parse() overwrites it with the
     * re-encoded, head-stripped version.
     *
     * @access public
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Get filtered relevant content
     *
     * Runs the extracted content through the HTML filter, using the base of
     * the page URL and the current config.
     *
     * @access public
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, Url::base($this->url));
        $filter->setConfig($this->config);
        return $filter->execute();
    }

    /**
     * Parse the HTML content
     *
     * Converts the encoding, strips the head tags, then extracts the content
     * either with a predefined rule set (when getRules() returns an array) or
     * with the generic candidate lookup.
     *
     * @access public
     * @return bool    True when the extracted content is not an empty string
     */
    public function parse()
    {
        if ($this->html) {

            Logger::setMessage(get_called_class().' Fix encoding');
            Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');

            $this->html = Encoding::convert($this->html, $this->encoding);
            $this->html = Filter::stripHeadTags($this->html);

            Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');

            $rules = $this->getRules();

            if (is_array($rules)) {
                Logger::setMessage(get_called_class().' Parse content with rules');
                $this->parseContentWithRules($rules);
            }
            else {
                Logger::setMessage(get_called_class().' Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        }
        else {
            Logger::setMessage(get_called_class().' No content fetched');
        }

        Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');

        // The message text contains a line break; it is kept exactly as it was
        Logger::setMessage(get_called_class()." \nGrabber done");

        return $this->content !== '';
    }

    /**
     * Download the HTML content
     *
     * Executes the HTTP client on the current URL and stores the URL, body
     * and encoding reported back by the client.
     *
     * @access public
     * @return string    Downloaded HTML content
     */
    public function download()
    {
        $client = Client::getInstance();
        $client->setConfig($this->config);
        $client->execute($this->url);

        $this->url = $client->getUrl();
        $this->html = $client->getContent();
        $this->encoding = $client->getEncoding();

        return $this->html;
    }

    /**
     * Try to find a predefined rule
     *
     * Candidate rule files are derived from the URL hostname and looked up,
     * in order, in the ../Rules directory: full hostname, hostname without
     * "www.", hostname from the first dot (with and without the dot), and
     * the label before the first dot.
     *
     * @access public
     * @return mixed    Value returned by the included rule file, or false when no rule applies
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() returns false for a malformed URL and null when the URL has no host
        if (! is_string($hostname) || $hostname === '') {
            return false;
        }

        // The hostname ends up in an include path: refuse anything that could leave the Rules directory
        if (strpbrk($hostname, "/\\\0") !== false) {
            return false;
        }

        $files = array($hostname);

        if (substr($hostname, 0, 4) === 'www.') {
            $files[] = substr($hostname, 4);
        }

        if (($pos = strpos($hostname, '.')) !== false) {
            $files[] = substr($hostname, $pos);
            $files[] = substr($hostname, $pos + 1);
            $files[] = substr($hostname, 0, $pos);
        }

        foreach ($files as $file) {

            // A leading or trailing dot produces an empty candidate, there is nothing to look up
            if (! is_string($file) || $file === '') {
                continue;
            }

            $filename = __DIR__.'/../Rules/'.$file.'.php';

            if (file_exists($filename)) {
                Logger::setMessage(get_called_class().' Load rule: '.$file);
                return include $filename;
            }
        }

        return false;
    }

    /**
     * Get the relevant content with predefined rules
     *
     * Nodes matching the "strip" XPath patterns are removed first, then the
     * serialization of every node matching the "body" patterns is appended
     * to the content.
     *
     * @access public
     * @param  array   $rules   Rules
     */
    public function parseContentWithRules(array $rules)
    {
        // NOTE(review): the empty-string prefix is a no-op apart from the string cast - confirm nothing was lost here
        $dom = XmlParser::getHtmlDocument(''.$this->html);
        $xpath = new DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {
            foreach ($rules['strip'] as $pattern) {
                $this->removeNodes($xpath->query($pattern));
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Get the relevant content with the list of potential attributes
     *
     * The first element whose class contains (or whose id equals) one of the
     * candidate names is used, then the first article element, and finally
     * the first child of the document when less than 50 bytes were found.
     * The result is cleaned up with stripGarbage().
     *
     * @access public
     */
    public function parseContentWithCandidates()
    {
        // NOTE(review): the empty-string prefix is a no-op apart from the string cast - confirm nothing was lost here
        $dom = XmlParser::getHtmlDocument(''.$this->html);
        $xpath = new DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {

            Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');

            $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
                break;
            }
        }

        // No candidate matched: fall back to the first article element
        if (! $this->content) {

            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));

                // The message text contains a line break; it is kept exactly as it was
                Logger::setMessage(get_called_class()." Find\ntag (".strlen($this->content).' bytes)');
            }
        }

        if (strlen($this->content) < 50) {
            Logger::setMessage(get_called_class().' No enought content fetched, get the full body');
            $this->content = $dom->saveXML($dom->firstChild);
        }

        Logger::setMessage(get_called_class().' Strip garbage');
        $this->stripGarbage();
    }

    /**
     * Strip useless tags
     *
     * Removes every element listed in stripTags and every element whose
     * class or id contains one of the stripAttributes values, then stores
     * the serialized document element as the new content. The content is
     * left untouched when it cannot be loaded as a DOM document.
     *
     * @access public
     */
    public function stripGarbage()
    {
        $dom = XmlParser::getDomDocument($this->content);

        if ($dom !== false) {

            $xpath = new DOMXPath($dom);

            foreach ($this->stripTags as $tag) {

                $nodes = $xpath->query('//'.$tag);

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
                    $this->removeNodes($nodes);
                }
            }

            foreach ($this->stripAttributes as $attribute) {

                $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
                    $this->removeNodes($nodes);
                }
            }

            $this->content = $dom->saveXML($dom->documentElement);
        }
    }

    /**
     * Detach every node of an XPath result from its parent
     *
     * @access private
     * @param  mixed   $nodes   Node list returned by DOMXPath::query(), or false when the query failed
     */
    private function removeNodes($nodes)
    {
        if ($nodes === false || $nodes->length === 0) {
            return;
        }

        foreach ($nodes as $node) {

            // A node without parent cannot be detached, calling removeChild() on null would be fatal
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }
}