284 lines
6.4 KiB
PHP
Raw Normal View History

2015-04-28 18:08:42 +02:00
<?php
namespace PicoFeed\Scraper;
use DomDocument;
use DOMXPath;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
/**
* Candidate Parser.
2015-04-28 18:08:42 +02:00
*
* @author Frederic Guillot
*/
class CandidateParser implements ParserInterface
{
private $dom;
private $xpath;
/**
* List of attributes to try to get the content, order is important, generic terms at the end.
2015-04-28 18:08:42 +02:00
*
* @var array
*/
private $candidatesAttributes = array(
'articleBody',
'articlebody',
'article-body',
'articleContent',
'articlecontent',
'article-content',
'articlePage',
'post-content',
'post_content',
'entry-content',
'entry-body',
'main-content',
'story_content',
'storycontent',
'entryBox',
'entrytext',
'comic',
'post',
'article',
'content',
'main',
);
/**
* List of attributes to strip.
2015-04-28 18:08:42 +02:00
*
* @var array
*/
private $stripAttributes = array(
'comment',
'share',
'links',
'toolbar',
'fb',
'footer',
'credit',
'bottom',
'nav',
'header',
'social',
'tag',
'metadata',
'entry-utility',
'related-posts',
'tweet',
'categories',
'post_title',
'by_line',
'byline',
'sponsors',
);
/**
* Tags to remove.
2015-04-28 18:08:42 +02:00
*
* @var array
*/
private $stripTags = array(
'nav',
'header',
'footer',
'aside',
'form',
);
/**
* Constructor.
2015-04-28 18:08:42 +02:00
*
* @param string $html
2015-04-28 18:08:42 +02:00
*/
public function __construct($html)
{
$this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
$this->xpath = new DOMXPath($this->dom);
}
/**
* Get the relevant content with the list of potential attributes.
2015-04-28 18:08:42 +02:00
*
* @return string
*/
public function execute()
{
$content = $this->findContentWithCandidates();
if (strlen($content) < 200) {
$content = $this->findContentWithArticle();
}
if (strlen($content) < 50) {
$content = $this->findContentWithBody();
}
return $this->stripGarbage($content);
}
/**
* Find content based on the list of tag candidates.
2015-04-28 18:08:42 +02:00
*
* @return string
*/
public function findContentWithCandidates()
{
foreach ($this->candidatesAttributes as $candidate) {
Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
$nodes = $this->xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'"');
2015-04-28 18:08:42 +02:00
return $this->dom->saveXML($nodes->item(0));
}
}
return '';
}
/**
* Find <article/> tag.
2015-04-28 18:08:42 +02:00
*
* @return string
*/
public function findContentWithArticle()
{
$nodes = $this->xpath->query('//article');
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().': Find <article/> tag');
2015-04-28 18:08:42 +02:00
return $this->dom->saveXML($nodes->item(0));
}
return '';
}
/**
* Find <body/> tag.
2015-04-28 18:08:42 +02:00
*
* @return string
*/
public function findContentWithBody()
{
$nodes = $this->xpath->query('//body');
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().' Find <body/>');
2015-04-28 18:08:42 +02:00
return $this->dom->saveXML($nodes->item(0));
}
return '';
}
/**
* Strip useless tags.
*
* @param string $content
2015-04-28 18:08:42 +02:00
*
* @return string
*/
public function stripGarbage($content)
{
$dom = XmlParser::getDomDocument($content);
if ($dom !== false) {
$xpath = new DOMXPath($dom);
$this->stripTags($xpath);
$this->stripAttributes($dom, $xpath);
$content = $dom->saveXML($dom->documentElement);
}
return $content;
}
/**
* Remove blacklisted tags.
2015-04-28 18:08:42 +02:00
*
* @param DOMXPath $xpath
2015-04-28 18:08:42 +02:00
*/
public function stripTags(DOMXPath $xpath)
{
foreach ($this->stripTags as $tag) {
$nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
}
}
}
}
/**
* Remove blacklisted attributes.
2015-04-28 18:08:42 +02:00
*
* @param DomDocument $dom
* @param DOMXPath $xpath
2015-04-28 18:08:42 +02:00
*/
public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
{
foreach ($this->stripAttributes as $attribute) {
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) {
Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
foreach ($nodes as $node) {
if ($this->shouldRemove($dom, $node)) {
$node->parentNode->removeChild($node);
}
}
}
}
}
2016-12-26 17:32:18 -05:00
/**
* Find link for next page of the article.
*
* @return string
*/
public function findNextLink()
{
return null;
}
2015-04-28 18:08:42 +02:00
/**
* Return false if the node should not be removed.
*
* @param DomDocument $dom
* @param DomNode $node
2015-04-28 18:08:42 +02:00
*
* @return bool
2015-04-28 18:08:42 +02:00
*/
public function shouldRemove(DomDocument $dom, $node)
{
$document_length = strlen($dom->textContent);
$node_length = strlen($node->textContent);
if ($document_length === 0) {
return true;
}
$ratio = $node_length * 100 / $document_length;
if ($ratio >= 90) {
Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
2015-04-28 18:08:42 +02:00
return false;
}
return true;
}
}