2015-04-28 18:08:42 +02:00
< ? php
namespace PicoFeed\Scraper ;
use DomDocument ;
use DOMXPath ;
use PicoFeed\Logging\Logger ;
use PicoFeed\Parser\XmlParser ;
/**
 * Candidate Parser.
 *
 * @author Frederic Guillot
 */
class CandidateParser implements ParserInterface
{
    /**
     * Document returned by XmlParser::getHtmlDocument() for the HTML given to the constructor.
     *
     * @var DomDocument
     */
    private $dom;

    /**
     * XPath evaluator bound to $dom.
     *
     * @var DOMXPath
     */
    private $xpath;

    /**
     * List of attributes to try to get the content, order is important, generic terms at the end.
     *
     * Each value is matched as a substring of the "class" attribute or as the
     * exact value of the "id" attribute (see findContentWithCandidates()).
     *
     * @var array
     */
    private $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'entry-body',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * List of attributes to strip.
     *
     * Each value is matched as a substring of the "class" or "id" attribute
     * (see stripAttributes()).
     *
     * @var array
     */
    private $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
        'post_title',
        'by_line',
        'byline',
        'sponsors',
    );

    /**
     * Tags to remove.
     *
     * @var array
     */
    private $stripTags = array(
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Constructor.
     *
     * @param string $html
     */
    public function __construct($html)
    {
        // NOTE(review): the XML prolog prefix is presumably an encoding hint for the
        // HTML parser - confirm against XmlParser::getHtmlDocument().
        $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">' . $html);
        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Get the relevant content with the list of potential attributes.
     *
     * Lookup order: attribute candidates first; when that result is shorter
     * than 200 bytes the first <article/> is used instead; when the result is
     * then shorter than 50 bytes the <body/> is used. The selected markup is
     * finally passed through stripGarbage().
     *
     * @return string
     */
    public function execute()
    {
        $content = $this->findContentWithCandidates();

        if (strlen($content) < 200) {
            // The <article/> lookup replaces the candidate result, even when it finds nothing.
            $content = $this->findContentWithArticle();
        }

        if (strlen($content) < 50) {
            $content = $this->findContentWithBody();
        }

        return $this->stripGarbage($content);
    }

    /**
     * Find content based on the list of tag candidates.
     *
     * Candidates are tried in declaration order. Elements whose class contains
     * "nav" or "page" are never selected.
     *
     * @return string Serialized first matching element, or an empty string when nothing matches
     */
    public function findContentWithCandidates()
    {
        foreach ($this->candidatesAttributes as $candidate) {
            Logger::setMessage(get_called_class() . ': Try this candidate: "' . $candidate . '"');

            $node = $this->findFirstNode('//*[(contains(@class, "' . $candidate . '") or @id="' . $candidate . '") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($node !== null) {
                Logger::setMessage(get_called_class() . ': Find candidate "' . $candidate . '"');

                return $this->dom->saveXML($node);
            }
        }

        return '';
    }

    /**
     * Find <article/> tag.
     *
     * @return string Serialized first <article/> element, or an empty string when there is none
     */
    public function findContentWithArticle()
    {
        $node = $this->findFirstNode('//article');

        if ($node !== null) {
            Logger::setMessage(get_called_class() . ': Find <article/> tag');

            return $this->dom->saveXML($node);
        }

        return '';
    }

    /**
     * Find <body/> tag.
     *
     * @return string Serialized first <body/> element, or an empty string when there is none
     */
    public function findContentWithBody()
    {
        $node = $this->findFirstNode('//body');

        if ($node !== null) {
            Logger::setMessage(get_called_class() . ' Find <body/>');

            return $this->dom->saveXML($node);
        }

        return '';
    }

    /**
     * Strip useless tags.
     *
     * When XmlParser::getDomDocument() returns false the content is returned
     * unchanged. When stripping removes the root element itself, an empty
     * string is returned.
     *
     * @param string $content
     *
     * @return string
     */
    public function stripGarbage($content)
    {
        $dom = XmlParser::getDomDocument($content);

        if ($dom === false) {
            return $content;
        }

        $xpath = new DOMXPath($dom);

        $this->stripTags($xpath);
        $this->stripAttributes($dom, $xpath);

        // The root itself can be stripped (e.g. the selected content is a <form> or
        // <aside>). saveXML(null) would then serialize the whole empty document,
        // i.e. a bare XML declaration, instead of "no content".
        if ($dom->documentElement === null) {
            return '';
        }

        return $dom->saveXML($dom->documentElement);
    }

    /**
     * Remove blacklisted tags.
     *
     * Every element whose tag name is listed in $stripTags is detached from
     * its parent.
     *
     * @param DOMXPath $xpath
     */
    public function stripTags(DOMXPath $xpath)
    {
        foreach ($this->stripTags as $tag) {
            $nodes = $xpath->query('//' . $tag);

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class() . ': Strip tag: "' . $tag . '"');

                foreach ($nodes as $node) {
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    /**
     * Remove blacklisted attributes.
     *
     * Elements whose class or id contains one of the $stripAttributes values
     * are detached from their parent, unless shouldRemove() vetoes the removal.
     *
     * @param DomDocument $dom
     * @param DOMXPath    $xpath
     */
    public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
    {
        foreach ($this->stripAttributes as $attribute) {
            $nodes = $xpath->query('//*[contains(@class, "' . $attribute . '") or contains(@id, "' . $attribute . '")]');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class() . ': Strip attribute: "' . $attribute . '"');

                foreach ($nodes as $node) {
                    if ($this->shouldRemove($dom, $node)) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }
    }

    /**
     * Return false if the node should not be removed.
     *
     * A node is kept when its text accounts for 90% or more of the text of the
     * whole document (lengths are compared in bytes). A document without any
     * text never vetoes a removal.
     *
     * @param DomDocument $dom
     * @param \DOMNode    $node
     *
     * @return bool
     */
    public function shouldRemove(DomDocument $dom, $node)
    {
        $document_length = strlen($dom->textContent);
        $node_length = strlen($node->textContent);

        // Also protects the division below.
        if ($document_length === 0) {
            return true;
        }

        $ratio = $node_length * 100 / $document_length;

        if ($ratio >= 90) {
            Logger::setMessage(get_called_class() . ': Should not remove this node (' . $node->nodeName . ') ratio: ' . $ratio . '%');

            return false;
        }

        return true;
    }

    /**
     * Run an XPath query on the parsed document and return the first match.
     *
     * @param string $query XPath expression
     *
     * @return \DOMNode|null First matching node, or null when the query fails or matches nothing
     */
    private function findFirstNode($query)
    {
        $nodes = $this->xpath->query($query);

        if ($nodes === false || $nodes->length === 0) {
            return null;
        }

        return $nodes->item(0);
    }
}