2013-08-31 17:05:45 +02:00
< ? php
2014-12-24 03:28:26 +01:00
namespace PicoFeed\Client ;
2013-08-31 17:05:45 +02:00
2014-05-20 20:20:27 +02:00
use DOMXPath ;
2014-12-24 03:28:26 +01:00
use PicoFeed\Encoding\Encoding ;
use PicoFeed\Logging\Logger ;
use PicoFeed\Filter\Filter ;
use PicoFeed\Parser\XmlParser ;
2014-05-20 20:20:27 +02:00
/**
 * Grabber class
 *
 * Extracts the relevant part of a HTML page, either with predefined
 * per-site XPath rules or with a list of well-known class/id names.
 *
 * @author  Frederic Guillot
 * @package Client
 */
class Grabber
{
    /**
     * URL
     *
     * @access private
     * @var string
     */
    private $url = '';

    /**
     * Relevant content
     *
     * @access private
     * @var string
     */
    private $content = '';

    /**
     * HTML content
     *
     * @access private
     * @var string
     */
    private $html = '';

    /**
     * HTML content encoding
     *
     * @access private
     * @var string
     */
    private $encoding = '';

    /**
     * List of attributes to try to get the content, order is important, generic terms at the end
     *
     * @access private
     * @var array
     */
    private $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * List of attributes to strip
     *
     * @access private
     * @var array
     */
    private $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
    );

    /**
     * Tags to remove
     *
     * @access private
     * @var array
     */
    private $stripTags = array(
        'script',
        'style',
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Config object
     *
     * @access private
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Constructor
     *
     * @access public
     * @param  string   $url       Url
     * @param  string   $html      HTML content
     * @param  string   $encoding  Charset
     */
    public function __construct($url, $html = '', $encoding = 'utf-8')
    {
        $this->url = $url;
        $this->html = $html;
        $this->encoding = $encoding;
    }

    /**
     * Set config object
     *
     * @access public
     * @param  \PicoFeed\Config\Config  $config   Config instance
     * @return Grabber
     */
    public function setConfig($config)
    {
        $this->config = $config;
        return $this;
    }

    /**
     * Get relevant content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered)
     *
     * @access public
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Get filtered relevant content
     *
     * The extracted content is passed through the HTML filter, using the
     * base of the current URL and the config object set on this instance.
     *
     * @access public
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, Url::base($this->url));
        $filter->setConfig($this->config);
        return $filter->execute();
    }

    /**
     * Parse the HTML content
     *
     * Converts the raw HTML with the known encoding, strips the head tags,
     * then extracts the content with the site rules when getRules() returns
     * an array, otherwise with the generic candidates.
     *
     * @access public
     * @return bool   True if some content has been extracted
     */
    public function parse()
    {
        // Start from a clean slate: the rule-based parser appends to the
        // content, so a second call would otherwise duplicate or reuse it.
        $this->content = '';

        if ($this->html) {

            Logger::setMessage(get_called_class().' Fix encoding');
            Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');

            $this->html = Encoding::convert($this->html, $this->encoding);
            $this->html = Filter::stripHeadTags($this->html);

            Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes');

            $rules = $this->getRules();

            if (is_array($rules)) {
                Logger::setMessage(get_called_class().' Parse content with rules');
                $this->parseContentWithRules($rules);
            }
            else {
                Logger::setMessage(get_called_class().' Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        }
        else {
            Logger::setMessage(get_called_class().' No content fetched');
        }

        Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes');
        Logger::setMessage(get_called_class().' Grabber done');

        return $this->content !== '';
    }

    /**
     * Download the HTML content
     *
     * The URL, the HTML and the encoding of this instance are replaced by
     * the values reported by the HTTP client after the request.
     *
     * @access public
     * @return string   HTML content
     */
    public function download()
    {
        $client = Client::getInstance();
        $client->setConfig($this->config);
        $client->execute($this->url);

        $this->url = $client->getUrl();
        $this->html = $client->getContent();
        $this->encoding = $client->getEncoding();

        return $this->html;
    }

    /**
     * Try to find a predefined rule
     *
     * Candidate file names are tried in this order: the full hostname, the
     * hostname without "www.", the part after the first label with and
     * without its leading dot, and finally the first label alone.
     *
     * @access public
     * @return array|false   Value returned by the rule file, false if no rule file matches
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() returns false for a malformed URL and null when the
        // URL has no host component: there is nothing to look up in both cases
        if (! is_string($hostname) || $hostname === '') {
            return false;
        }

        $files = array($hostname);

        if (substr($hostname, 0, 4) === 'www.') {
            $files[] = substr($hostname, 4);
        }

        if (($pos = strpos($hostname, '.')) !== false) {
            $files[] = substr($hostname, $pos);
            $files[] = substr($hostname, $pos + 1);
            $files[] = substr($hostname, 0, $pos);
        }

        foreach ($files as $file) {

            // The hostname comes from an external URL: ignore empty candidates
            // and anything that could point outside of the rules directory
            if (! is_string($file) || $file === '' || basename($file) !== $file) {
                continue;
            }

            $filename = __DIR__.'/../Rules/'.$file.'.php';

            if (file_exists($filename)) {
                Logger::setMessage(get_called_class().' Load rule: '.$file);
                return include $filename;
            }
        }

        return false;
    }

    /**
     * Get the relevant content with predefined rules
     *
     * Nodes matching the XPath expressions of $rules['strip'] are removed,
     * then every node matching $rules['body'] is appended to the content.
     *
     * @access public
     * @param  array   $rules   Rules
     */
    public function parseContentWithRules(array $rules)
    {
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    $this->removeNodes($nodes);
                }
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Get the relevant content with the list of potential attributes
     *
     * The first node matching a candidate wins. When nothing matches, the
     * first <article/> tag is used, and when less than 50 bytes have been
     * found the first child of the document is taken instead.
     *
     * @access public
     */
    public function parseContentWithCandidates()
    {
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {

            Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"');

            $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
                break;
            }
        }

        // Try to fetch <article/>
        if (! $this->content) {

            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
            }
        }

        if (strlen($this->content) < 50) {
            Logger::setMessage(get_called_class().' No enought content fetched, get the full body');
            $this->content = $dom->saveXML($dom->firstChild);
        }

        Logger::setMessage(get_called_class().' Strip garbage');
        $this->stripGarbage();
    }

    /**
     * Strip useless tags
     *
     * Removes the blacklisted tags and the nodes with a blacklisted word in
     * their class or id. The root element of the content is always kept,
     * otherwise there would be nothing left to serialize.
     *
     * @access public
     */
    public function stripGarbage()
    {
        $dom = XmlParser::getDomDocument($this->content);

        if ($dom !== false) {

            $xpath = new DOMXPath($dom);
            $root = $dom->documentElement;

            foreach ($this->stripTags as $tag) {

                $nodes = $xpath->query('//'.$tag);

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"');
                    $this->removeNodes($nodes, $root);
                }
            }

            foreach ($this->stripAttributes as $attribute) {

                $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"');
                    $this->removeNodes($nodes, $root);
                }
            }

            $this->content = $dom->saveXML($dom->documentElement);
        }
    }

    /**
     * Detach a list of nodes from their document
     *
     * @access private
     * @param  \DOMNodeList   $nodes       Nodes to remove
     * @param  \DOMNode|null  $protected   Node that must never be removed
     */
    private function removeNodes($nodes, $protected = null)
    {
        foreach ($nodes as $node) {

            // A node without parent (the document itself) cannot be detached
            if ($node->parentNode === null) {
                continue;
            }

            if ($protected !== null && $node->isSameNode($protected)) {
                continue;
            }

            $node->parentNode->removeChild($node);
        }
    }
}