2013-08-31 17:05:45 +02:00
< ? php
2014-12-24 03:28:26 +01:00
namespace PicoFeed\Client ;
2013-08-31 17:05:45 +02:00
2014-05-20 20:20:27 +02:00
use DOMXPath ;
2014-12-24 03:28:26 +01:00
use PicoFeed\Encoding\Encoding ;
use PicoFeed\Logging\Logger ;
use PicoFeed\Filter\Filter ;
use PicoFeed\Parser\XmlParser ;
2014-05-20 20:20:27 +02:00
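/**
 * Grabber class
 *
 * Downloads a web page and extracts its relevant content, using predefined
 * XPath rules when some exist for the hostname and generic candidates otherwise.
 *
 * @author  Frederic Guillot
 * @package Client
 */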
2013-08-31 17:05:45 +02:00
class Grabber
{
2014-05-20 20:20:27 +02:00
/**
 * Address of the page to grab
 *
 * @access private
 * @var string
 */
private $url = '';

/**
 * Relevant content extracted from the page
 *
 * @access private
 * @var string
 */
private $content = '';

/**
 * Raw HTML of the page
 *
 * @access private
 * @var string
 */
private $html = '';

/**
 * Charset of the raw HTML
 *
 * @access private
 * @var string
 */
private $encoding = '';

/**
 * When true, download() and parse() do nothing
 *
 * @access private
 * @var boolean
 */
private $skip_processing = false;
2014-05-20 20:20:27 +02:00
/**
 * Class/id values that may hold the article content
 *
 * The list is tried from top to bottom and the first match wins,
 * so specific names come first and generic ones last.
 *
 * @access private
 * @var array
 */
private $candidatesAttributes = array(
    'articleBody',
    'articlebody',
    'article-body',
    'articleContent',
    'articlecontent',
    'article-content',
    'articlePage',
    'post-content',
    'post_content',
    'entry-content',
    'entry-body',
    'main-content',
    'story_content',
    'storycontent',
    'entryBox',
    'entrytext',
    'comic',
    'post',
    'article',
    'content',
    'main',
);
2014-05-20 20:20:27 +02:00
/**
 * Class/id fragments that identify elements to strip from the content
 *
 * An element is a removal candidate when its class or id contains
 * one of these values.
 *
 * @access private
 * @var array
 */
private $stripAttributes = array(
    'comment',
    'share',
    'links',
    'toolbar',
    'fb',
    'footer',
    'credit',
    'bottom',
    'nav',
    'header',
    'social',
    'tag',
    'metadata',
    'entry-utility',
    'related-posts',
    'tweet',
    'categories',
    'post_title',
    'by_line',
    'byline',
    'sponsors',
);
2014-05-20 20:20:27 +02:00
/**
 * Element names that are always removed from the content
 *
 * @access private
 * @var array
 */
private $stripTags = array(
    'nav',
    'header',
    'footer',
    'aside',
    'form',
);
2014-05-20 20:20:27 +02:00
/**
 * Configuration instance (null until setConfig() is called)
 *
 * @access private
 * @var \PicoFeed\Config\Config
 */
private $config;
2014-05-20 20:20:27 +02:00
/**
 * Constructor
 *
 * Stores the initial state, then inspects the URL to decide whether
 * processing must be skipped (PDF documents, streaming videos).
 *
 * @access public
 * @param  string   $url        Url
 * @param  string   $html       HTML content
 * @param  string   $encoding   Charset
 */
public function __construct($url, $html = '', $encoding = 'utf-8')
{
    $this->encoding = $encoding;
    $this->html = $html;
    $this->url = $url;

    // Both checks only look at the URL stored above
    $this->handleFiles();
    $this->handleStreamingVideos();
}
2014-05-20 20:20:27 +02:00
/**
 * Attach the configuration used for downloading, rules lookup and filtering
 *
 * @access public
 * @param  \PicoFeed\Config\Config   $config   Config instance
 * @return Grabber                             The current instance (fluent interface)
 */
public function setConfig($config)
{
    $this->config = $config;

    return $this;
}
2013-08-31 17:05:45 +02:00
2015-01-18 15:20:36 +01:00
/**
 * Get the URL of the page to grab.
 *
 * After download() this is the URL reported by the HTTP client.
 *
 * @access public
 * @return string
 */
public function getUrl()
{
    return $this->url;
}
/**
 * Set URL to download and reset object to use for another grab.
 *
 * The HTML, the extracted content, the encoding and the "skip processing"
 * flag are all cleared before the new URL is inspected. Without clearing
 * the flag, a previous PDF or video URL would prevent any later URL from
 * being downloaded and parsed.
 *
 * No value is returned.
 *
 * @access public
 * @param  string   $url    URL
 */
public function setUrl($url)
{
    $this->url = $url;
    $this->html = '';
    $this->content = '';
    $this->encoding = '';

    // Forget the decision taken for the previous URL
    $this->skip_processing = false;

    $this->handleFiles();
    $this->handleStreamingVideos();
}
2014-05-20 20:20:27 +02:00
/**
 * Get the relevant content extracted so far (not filtered)
 *
 * @access public
 * @return string
 */
public function getContent()
{
    return $this->content;
}
/**
 * Get the HTML of the page as currently stored (unfiltered)
 *
 * @access public
 * @return string
 */
public function getRawContent()
{
    return $this->html;
}
2014-12-24 03:28:26 +01:00
/**
 * Get the relevant content after it went through the HTML filter
 *
 * The filter receives the extracted content, the page URL and the
 * current config object.
 *
 * @access public
 * @return string
 */
public function getFilteredContent()
{
    $html_filter = Filter::html($this->content, $this->url);
    $html_filter->setConfig($this->config);

    return $html_filter->execute();
}
2015-01-02 21:25:11 +01:00
/**
 * Use the Youtube embed player as content and skip processing
 *
 * When the URL contains an 11 characters video id preceded by "v=", "v/",
 * "vi=", "vi/" or "youtu.be/", the content becomes an <iframe> pointing to
 * the embed player and the page is neither downloaded nor parsed.
 *
 * No value is returned.
 *
 * @access public
 */
public function handleStreamingVideos()
{
    // The dot of "youtu.be" is escaped: unescaped it would match any character
    $pattern = '#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#';

    if (preg_match($pattern, $this->url, $matches)) {
        $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
        $this->skip_processing = true;
    }
}
/**
 * Skip processing for PDF documents
 *
 * A URL is considered a PDF when the extension of its path is "pdf"
 * (case-insensitive, so query strings and fragments are ignored) or,
 * as before, when the whole URL ends with "pdf".
 *
 * No value is returned.
 *
 * @access public
 */
public function handleFiles()
{
    $path = parse_url($this->url, PHP_URL_PATH);
    $extension = is_string($path) ? strtolower(pathinfo($path, PATHINFO_EXTENSION)) : '';

    // The historical suffix test is kept so every URL skipped before is still skipped
    $has_pdf_suffix = strtolower((string) substr($this->url, -3)) === 'pdf';

    if ($extension === 'pdf' || $has_pdf_suffix) {
        $this->skip_processing = true;
        Logger::setMessage(get_called_class().': PDF document => processing skipped');
    }
}
2014-05-20 20:20:27 +02:00
/**
 * Extract the relevant content from the stored HTML
 *
 * Returns immediately when processing is skipped. Otherwise the HTML is
 * converted to UTF-8, its head tags are stripped and the content is extracted
 * with the predefined rules of the website or, when there is none, with the
 * generic candidates.
 *
 * @access public
 * @return bool   True when processing is skipped or when the content is not empty
 */
public function parse()
{
    if ($this->skip_processing) {
        return true;
    }

    $caller = get_called_class();

    if (! $this->html) {
        Logger::setMessage($caller.': No content fetched');
    } else {
        $meta_encoding = XmlParser::getEncodingFromMetaTag($this->html);

        // Encode everything in UTF-8, the charset of the meta tag wins over the HTTP one
        Logger::setMessage($caller.': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$meta_encoding.'"');
        $this->html = Encoding::convert($this->html, $meta_encoding ?: $this->encoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage($caller.': Content length: '.strlen($this->html).' bytes');

        $rules = $this->getRules();

        if (empty($rules)) {
            Logger::setMessage($caller.': Parse content with candidates');
            $this->parseContentWithCandidates();
        } else {
            Logger::setMessage($caller.': Parse content with rules');
            $this->parseContentWithRules($rules);
        }
    }

    Logger::setMessage($caller.': Content length: '.strlen($this->content).' bytes');
    Logger::setMessage($caller.': Grabber done');

    return $this->content !== '';
}
2014-05-20 20:20:27 +02:00
/**
 * Download the HTML content
 *
 * Nothing is fetched when processing is skipped or when the URL is empty.
 * After a successful request the URL, the HTML and the encoding are replaced
 * by the values reported by the HTTP client. A ClientException is logged and
 * leaves the current state untouched.
 *
 * @access public
 * @return string   HTML content currently stored
 */
public function download()
{
    if ($this->skip_processing || $this->url == '') {
        return $this->html;
    }

    try {
        $client = Client::getInstance();

        // Grabber specific settings are only available with a config object
        if (null !== $this->config) {
            $client->setConfig($this->config);
            $client->setTimeout($this->config->getGrabberTimeout());
            $client->setUserAgent($this->config->getGrabberUserAgent());
        }

        $client->execute($this->url);

        $this->url = $client->getUrl();
        $this->html = $client->getContent();
        $this->encoding = $client->getEncoding();
    } catch (ClientException $e) {
        Logger::setMessage(get_called_class().': '.$e->getMessage());
    }

    return $this->html;
}
2014-05-20 20:20:27 +02:00
/**
 * Try to find a predefined rule
 *
 * The hostname of the current URL gives a list of possible rule file names,
 * which is looked up in each rules folder in turn. The first non-empty
 * rule wins.
 *
 * @access public
 * @return array   Rules of the website, or an empty array when there is none
 */
public function getRules()
{
    $hostname = parse_url($this->url, PHP_URL_HOST);

    // parse_url() gives false for a malformed URL and null when the URL has no host
    if (! is_string($hostname) || $hostname === '') {
        return array();
    }

    $files = $this->getRulesFileList($hostname);

    foreach ($this->getRulesFolders() as $folder) {
        $rule = $this->loadRuleFile($folder, $files);

        if (! empty($rule)) {
            return $rule;
        }
    }

    return array();
}
/**
 * Build the list of rule file names to try for a hostname
 *
 * The full hostname always comes first. A hostname made of more than two
 * parts then adds the parent domain, the dotted parent domain and the first
 * label; a two parts hostname adds its dotted form and its first label.
 *
 * @access public
 * @param  string   $hostname   Hostname
 * @return array
 */
public function getRulesFileList($hostname)
{
    $labels = explode('.', $hostname);
    $total = count($labels);
    $candidates = array($hostname);                 // subdomain.domain.tld

    if ($total === 2) {
        $candidates[] = '.'.$hostname;              // .domain.tld
        $candidates[] = $labels[0];                 // domain
    } elseif ($total > 2) {
        $parent = implode('.', array_slice($labels, 1));

        $candidates[] = $parent;                    // domain.tld
        $candidates[] = '.'.$parent;                // .domain.tld
        $candidates[] = $labels[0];                 // subdomain
    }

    return $candidates;
}
2013-08-31 17:05:45 +02:00
2015-04-11 02:34:48 +02:00
/**
 * Load the first existing rule file of a folder
 *
 * Each name is tried in order as "<folder>/<name>.php"; the value returned
 * by the first file found is handed back to the caller.
 *
 * @access public
 * @param  string   $folder   Rule directory
 * @param  array    $files    List of possible file names
 * @return array              Value returned by the rule file, or an empty array when no file exists
 */
public function loadRuleFile($folder, array $files)
{
    foreach ($files as $name) {
        $path = $folder.'/'.$name.'.php';

        if (! file_exists($path)) {
            continue;
        }

        Logger::setMessage(get_called_class().' Load rule: '.$name);

        return include $path;
    }

    return array();
}
/**
 * Get the folders searched for rules
 *
 * The bundled "Rules" directory always comes first, followed by the
 * folder defined in the config object when there is one.
 *
 * @access public
 * @return array
 */
public function getRulesFolders()
{
    $folders = array(__DIR__.'/../Rules');

    if ($this->config === null || $this->config->getGrabberRulesFolder() === null) {
        return $folders;
    }

    $folders[] = $this->config->getGrabberRulesFolder();

    return $folders;
}
2014-05-20 20:20:27 +02:00
/**
 * Get the relevant content with predefined rules
 *
 * Every node matched by a "strip" XPath expression is removed from the
 * document first, then every node matched by a "body" expression is
 * serialized and appended to the content.
 *
 * @access public
 * @param  array   $rules   Rules
 */
public function parseContentWithRules(array $rules)
{
    $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
    $finder = new DOMXPath($document);

    if (isset($rules['strip']) && is_array($rules['strip'])) {
        foreach ($rules['strip'] as $expression) {
            $matches = $finder->query($expression);

            if ($matches === false || $matches->length <= 0) {
                continue;
            }

            foreach ($matches as $match) {
                $match->parentNode->removeChild($match);
            }
        }
    }

    if (isset($rules['body']) && is_array($rules['body'])) {
        foreach ($rules['body'] as $expression) {
            $matches = $finder->query($expression);

            if ($matches === false || $matches->length <= 0) {
                continue;
            }

            foreach ($matches as $match) {
                $this->content .= $document->saveXML($match);
            }
        }
    }
}
2014-05-20 20:20:27 +02:00
/**
 * Get the relevant content with the list of potential attributes
 *
 * Three attempts are made, each one only when the previous result is too
 * short: the first element matching a candidate class/id, then the first
 * <article/> (below 200 bytes), then the whole <body/> (below 50 bytes).
 * The result always goes through stripGarbage().
 *
 * @access public
 */
public function parseContentWithCandidates()
{
    $caller = get_called_class();
    $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
    $finder = new DOMXPath($document);

    // Try to lookup in each tag, the first candidate found wins
    foreach ($this->candidatesAttributes as $candidate) {
        Logger::setMessage($caller.': Try this candidate: "'.$candidate.'"');

        $expression = '//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]';
        $matches = $finder->query($expression);

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        $this->content = $document->saveXML($matches->item(0));
        Logger::setMessage($caller.': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
        break;
    }

    // Try to fetch <article/>
    if (strlen($this->content) < 200) {
        $articles = $finder->query('//article');

        if ($articles !== false && $articles->length > 0) {
            $this->content = $document->saveXML($articles->item(0));
            Logger::setMessage($caller.': Find <article/> tag ('.strlen($this->content).' bytes)');
        }
    }

    // Get everything
    if (strlen($this->content) < 50) {
        $bodies = $finder->query('//body');

        if ($bodies !== false && $bodies->length > 0) {
            Logger::setMessage($caller.' No enought content fetched, get //body');
            $this->content = $document->saveXML($bodies->item(0));
        }
    }

    Logger::setMessage($caller.': Strip garbage');
    $this->stripGarbage();
}
2014-05-20 20:20:27 +02:00
/**
 * Strip useless tags
 *
 * Elements listed in $stripTags are always removed. Elements whose class or
 * id contains a value of $stripAttributes are removed only when
 * shouldRemove() agrees. The content is left untouched when it cannot be
 * loaded as a document.
 *
 * @access public
 */
public function stripGarbage()
{
    $document = XmlParser::getDomDocument($this->content);

    if ($document === false) {
        return;
    }

    $caller = get_called_class();
    $finder = new DOMXPath($document);

    foreach ($this->stripTags as $tag) {
        $matches = $finder->query('//'.$tag);

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        Logger::setMessage($caller.': Strip tag: "'.$tag.'"');

        foreach ($matches as $match) {
            $match->parentNode->removeChild($match);
        }
    }

    foreach ($this->stripAttributes as $attribute) {
        $matches = $finder->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        Logger::setMessage($caller.': Strip attribute: "'.$attribute.'"');

        foreach ($matches as $match) {
            // Protects elements that hold almost all the text of the document
            if ($this->shouldRemove($document, $match)) {
                $match->parentNode->removeChild($match);
            }
        }
    }

    $this->content = $document->saveXML($document->documentElement);
}
2015-01-02 21:25:11 +01:00
/**
 * Tell if a node can be removed from the document
 *
 * A node holding 90% or more of the text of the document is kept, which
 * is logged; any other node can be removed. A document without text
 * never protects its nodes.
 *
 * @access public
 * @param  DomDocument   $dom    Document that contains the node
 * @param  DomNode       $node   Node to check
 * @return boolean               False if the node should not be removed
 */
public function shouldRemove($dom, $node)
{
    $total_bytes = strlen($dom->textContent);
    $node_bytes = strlen($node->textContent);

    if ($total_bytes === 0) {
        return true;
    }

    $ratio = $node_bytes * 100 / $total_bytes;

    if ($ratio < 90) {
        return true;
    }

    Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');

    return false;
}
2013-08-31 17:05:45 +02:00
}