dom = XmlParser::getHtmlDocument(''.$html); $this->xpath = new DOMXPath($this->dom); }

    /**
     * Extract the most relevant chunk of the parsed document.
     *
     * The candidate attributes are tried first. A result shorter than 200 bytes
     * is replaced by the article lookup, and a result shorter than 50 bytes is
     * then replaced by the body lookup. Whatever is left goes through
     * stripGarbage() before being returned.
     *
     * @return string
     */
    public function execute()
    {
        $markup = $this->findContentWithCandidates();

        // Each fallback only runs when the current result is below its byte threshold.
        $markup = strlen($markup) >= 200 ? $markup : $this->findContentWithArticle();
        $markup = strlen($markup) >= 50 ? $markup : $this->findContentWithBody();

        return $this->stripGarbage($markup);
    }

    /**
     * Look for the content using the configured candidate attribute values.
     *
     * An element matches when its class contains the candidate or its id equals
     * the candidate, and its class contains neither "nav" nor "page". Candidates
     * are tried in order; the first element of the first successful query wins.
     *
     * @return string Serialized markup of the match, or '' when no candidate matches
     */
    public function findContentWithCandidates()
    {
        foreach ($this->candidatesAttributes as $candidate) {
            $caller = get_called_class();
            Logger::setMessage($caller.': Try this candidate: "'.$candidate.'"');

            $expression = '//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'")'
                .' and not (contains(@class, "nav") or contains(@class, "page"))]';
            $matches = $this->xpath->query($expression);

            // A failed query or an empty result set means: move on to the next candidate.
            if ($matches === false || $matches->length <= 0) {
                continue;
            }

            Logger::setMessage($caller.': Find candidate "'.$candidate.'"');

            return $this->dom->saveXML($matches->item(0));
        }

        return '';
    }

    /**
     * Find
the first <article/> element of the document.
     *
     * @return string Serialized markup of the first match, or '' when the query fails or matches nothing
     */
    public function findContentWithArticle()
    {
        $nodes = $this->xpath->query('//article');

        if ($nodes !== false && $nodes->length > 0) {
            // Single-line message: the tag name is spelled out so the log entry stays one record.
            Logger::setMessage(get_called_class().': Find <article/> tag');

            return $this->dom->saveXML($nodes->item(0));
        }

        return '';
    }

    /**
     * Find the first <body/> element of the document.
     *
     * @return string Serialized markup of the first match, or '' when the query fails or matches nothing
     */
    public function findContentWithBody()
    {
        $nodes = $this->xpath->query('//body');

        if ($nodes !== false && $nodes->length > 0) {
            // Same "<class>: <message>" layout as the other log entries of this class.
            Logger::setMessage(get_called_class().': Find <body/> tag');

            return $this->dom->saveXML($nodes->item(0));
        }

        return '';
    }

    /**
     * Strip useless tags and blocks from a content fragment.
     *
     * The fragment is parsed with XmlParser::getDomDocument(); when that call
     * returns false the input is handed back untouched. Otherwise blacklisted
     * tags and blacklisted class/id blocks are removed and the remaining root
     * element is serialized.
     *
     * @param  string $content
     * @return string Cleaned markup, '' when nothing is left, or the input when it cannot be parsed
     */
    public function stripGarbage($content)
    {
        $dom = XmlParser::getDomDocument($content);

        if ($dom === false) {
            return $content;
        }

        $xpath = new DOMXPath($dom);

        $this->stripTags($xpath);
        $this->stripAttributes($dom, $xpath);

        // When the stripping removed the root element itself, saveXML(null) would
        // dump the whole (now empty) document, i.e. a bare XML declaration.
        if ($dom->documentElement === null) {
            return '';
        }

        return $dom->saveXML($dom->documentElement);
    }

    /**
     * Remove blacklisted tags.
     *
     * Every element whose name is listed in $this->stripTags is detached from
     * its parent, together with its subtree.
     *
     * @param DOMXPath $xpath XPath evaluator bound to the document to clean
     */
    public function stripTags(DOMXPath $xpath)
    {
        foreach ($this->stripTags as $tag) {
            $nodes = $xpath->query('//'.$tag);

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');

                foreach ($nodes as $node) {
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    /**
     * Remove blacklisted attributes.
     *
     * Every element whose class or id contains a value listed in
     * $this->stripAttributes is detached from its parent, unless shouldRemove()
     * returns false for it.
     *
     * @param DomDocument $dom   Document being cleaned
     * @param DOMXPath    $xpath XPath evaluator bound to $dom
     */
    public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
    {
        foreach ($this->stripAttributes as $attribute) {
            $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');

                foreach ($nodes as $node) {
                    if ($this->shouldRemove($dom, $node)) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }
    }

    /**
     * Find link for next page of the article.
     *
     * This implementation performs no lookup and always returns null.
     *
     * @return string|null
     */
    public function findNextLink()
    {
        return null;
    }

    /**
     * Return false if the node should not be removed.
* * @param DomDocument $dom * @param \DomNode $node * @return bool */ public function shouldRemove(DomDocument $dom, $node) { $document_length = strlen($dom->textContent); $node_length = strlen($node->textContent); if ($document_length === 0) { return true; } $ratio = $node_length * 100 / $document_length; if ($ratio >= 90) { Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%'); return false; } return true; } }