2013-08-31 17:05:45 +02:00
< ? php
namespace PicoFeed ;
require_once __DIR__ . '/Client.php' ;
require_once __DIR__ . '/Encoding.php' ;
require_once __DIR__ . '/Logging.php' ;
2013-09-01 00:37:26 +02:00
require_once __DIR__ . '/Filter.php' ;
2013-08-31 17:05:45 +02:00
/**
 * Content grabber.
 *
 * Extracts the main article markup from an HTML page, either with
 * site-specific XPath rules (see getRules()) or with generic heuristics
 * based on well-known class/id names, then removes unwanted elements.
 */
class Grabber
{
    /**
     * Extracted content, serialized as an XML fragment (empty when nothing was found).
     */
    public $content = '';

    /**
     * Raw HTML of the page, either given to the constructor or set by download().
     */
    public $html = '';

    /**
     * Character set name reported for the page (logged by parse() as "HTTP Encoding").
     */
    public $encoding = '';

    /**
     * URL of the page; its hostname selects the rule file in getRules().
     */
    public $url = '';

    /**
     * Class/id values that identify the element holding the article.
     *
     * Order is important, generic terms at the end.
     */
    public $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * Substrings of class/id values; matching elements are removed by stripGarbage().
     */
    public $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
    );

    /**
     * Tag names removed by stripGarbage().
     */
    public $stripTags = array(
        'script',
        'style',
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Constructor.
     *
     * @param string $url      URL of the page
     * @param string $html     HTML of the page (leave empty and call download() to fetch it)
     * @param string $encoding Character set name of the page
     */
    public function __construct($url, $html = '', $encoding = 'utf-8')
    {
        $this->url = $url;
        $this->html = $html;
        $this->encoding = $encoding;
    }

    /**
     * Convert the HTML to UTF-8 and extract the relevant content into $content.
     *
     * Site-specific rules are used when getRules() returns an array,
     * otherwise the generic candidate heuristics are applied.
     *
     * @return bool True when some content has been extracted
     */
    public function parse()
    {
        if ($this->html) {
            Logging::log(\get_called_class() . ' Fix encoding');
            Logging::log(\get_called_class() . ': HTTP Encoding "' . $this->encoding . '"');

            $this->html = Filter::stripMetaTags($this->html);

            // Charset names are case-insensitive ("Windows-1251" must match too).
            if (strcasecmp((string) $this->encoding, 'windows-1251') === 0) {
                $this->html = Encoding::cp1251ToUtf8($this->html);
            } else {
                $this->html = Encoding::toUTF8($this->html);
            }

            Logging::log(\get_called_class() . ' Content length: ' . strlen($this->html) . ' bytes');

            $rules = $this->getRules();

            if (is_array($rules)) {
                Logging::log(\get_called_class() . ' Parse content with rules');
                $this->parseContentWithRules($rules);
            } else {
                Logging::log(\get_called_class() . ' Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        } else {
            Logging::log(\get_called_class() . ' No content fetched');
        }

        Logging::log(\get_called_class() . ' Content length: ' . strlen($this->content) . ' bytes');
        Logging::log(\get_called_class() . ' Grabber done');

        return $this->content !== '';
    }

    /**
     * Fetch the page at $url with the HTTP client and store the body in $html.
     *
     * @param int    $timeout    Timeout handed to the client
     * @param string $user_agent User agent handed to the client
     *
     * @return mixed Whatever the client returns as content (also stored in $html)
     */
    public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')
    {
        $client = Client::create();
        $client->url = $this->url;
        $client->timeout = $timeout;
        $client->user_agent = $user_agent;
        $client->execute();

        $this->html = $client->getContent();

        return $this->html;
    }

    /**
     * Load the rule file matching the hostname of $url, if there is one.
     *
     * Candidates are tried in this order: the full hostname, the hostname
     * without a leading "www.", everything from the first dot on
     * (".example.org") and the label before the first dot.
     *
     * @return mixed Value returned by the included rule file, or false when no file matches
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() gives null/false when the URL has no usable host.
        if (! is_string($hostname) || $hostname === '') {
            return false;
        }

        $files = array($hostname);

        if (substr($hostname, 0, 4) == 'www.') {
            $files[] = substr($hostname, 4);
        }

        if (($pos = strpos($hostname, '.')) !== false) {
            $files[] = substr($hostname, $pos);
            $files[] = substr($hostname, 0, $pos);
        }

        foreach ($files as $file) {
            // The name ends up in an include path: ignore empty candidates and
            // anything that could point outside of the Rules directory.
            if (! is_string($file) || $file === '' || strpbrk($file, "/\\\0") !== false) {
                continue;
            }

            $filename = __DIR__ . '/Rules/' . $file . '.php';

            if (file_exists($filename)) {
                Logging::log(\get_called_class() . ' Load rule: ' . $file);

                return include $filename;
            }
        }

        return false;
    }

    /**
     * Extract the content with site-specific XPath rules.
     *
     * $rules['strip'] lists XPath patterns of nodes to remove first,
     * $rules['body'] lists XPath patterns whose matches are appended to $content.
     *
     * @param array $rules Rules as returned by getRules()
     */
    public function parseContentWithRules(array $rules)
    {
        \libxml_use_internal_errors(true);

        $dom = new \DOMDocument;
        // The XML declaration prefix is a hint for libxml that the markup is UTF-8.
        $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">' . $this->html);
        $xpath = new \DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {
            foreach ($rules['strip'] as $pattern) {
                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        // Patterns are arbitrary: a match may have no parent (e.g. the document itself).
                        if ($node->parentNode !== null) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {
            foreach ($rules['body'] as $pattern) {
                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Extract the content with generic heuristics, then strip the garbage.
     *
     * Lookup order: first element matching one of $candidatesAttributes,
     * then the first <article/>, and finally the whole body when less than
     * 50 bytes have been found.
     */
    public function parseContentWithCandidates()
    {
        \libxml_use_internal_errors(true);

        $dom = new \DOMDocument;
        // The XML declaration prefix is a hint for libxml that the markup is UTF-8.
        $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">' . $this->html);
        $xpath = new \DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {
            Logging::log(\get_called_class() . ' Try this candidate: "' . $candidate . '"');

            $nodes = $xpath->query('//*[(contains(@class, "' . $candidate . '") or @id="' . $candidate . '") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logging::log(\get_called_class() . ' Find candidate "' . $candidate . '" (' . strlen($this->content) . ' bytes)');
                break;
            }
        }

        // Try to fetch <article/>
        if (! $this->content) {
            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logging::log(\get_called_class() . ' Find <article/> tag (' . strlen($this->content) . ' bytes)');
            }
        }

        if (strlen($this->content) < 50) {
            Logging::log(\get_called_class() . ' No enought content fetched, get the full body');

            // The first child of the document may be the injected XML declaration
            // or the doctype, not markup: take <body/> (or the root element) instead.
            $nodes = $xpath->query('//body');

            if ($nodes !== false && $nodes->length > 0) {
                $fallback = $nodes->item(0);
            } else {
                $fallback = $dom->documentElement;
            }

            $this->content = $fallback === null ? '' : $dom->saveXML($fallback);
        }

        Logging::log(\get_called_class() . ' Strip garbage');
        $this->stripGarbage();
    }

    /**
     * Remove unwanted elements from $content.
     *
     * Elements named in $stripTags are removed first, then every element whose
     * class or id contains one of the $stripAttributes values. $content is left
     * untouched when it is empty or cannot be parsed as XML, and becomes an
     * empty string when the root element itself is removed.
     */
    public function stripGarbage()
    {
        // DOMDocument::loadXML() refuses an empty source, nothing to strip anyway.
        if (! is_string($this->content) || $this->content === '') {
            return;
        }

        \libxml_use_internal_errors(true);

        $dom = new \DOMDocument;

        // Do not overwrite the content with an empty document when parsing fails.
        if (! $dom->loadXML($this->content)) {
            return;
        }

        $xpath = new \DOMXPath($dom);

        foreach ($this->stripTags as $tag) {
            $nodes = $xpath->query('//' . $tag);

            if ($nodes !== false && $nodes->length > 0) {
                Logging::log(\get_called_class() . ' Strip tag: "' . $tag . '"');

                foreach ($nodes as $node) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        foreach ($this->stripAttributes as $attribute) {
            $nodes = $xpath->query('//*[contains(@class, "' . $attribute . '") or contains(@id, "' . $attribute . '")]');

            if ($nodes !== false && $nodes->length > 0) {
                Logging::log(\get_called_class() . ' Strip attribute: "' . $attribute . '"');

                foreach ($nodes as $node) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        // Without a root element saveXML() would serialize the bare XML declaration.
        $root = $dom->documentElement;
        $this->content = $root === null ? '' : $dom->saveXML($root);
    }
}