534 lines
13 KiB
PHP
Raw Normal View History

<?php
namespace PicoFeed\Client;
2014-05-20 14:20:27 -04:00
use DOMXPath;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Logging\Logger;
use PicoFeed\Filter\Filter;
use PicoFeed\Parser\XmlParser;
2014-05-20 14:20:27 -04:00
/**
 * Grabber class
 *
 * Downloads the HTML page behind a URL and tries to extract the relevant
 * part of it, either with a predefined rule file or with a list of
 * well-known class/id names.
 *
 * @author  Frederic Guillot
 * @package Client
 */
class Grabber
{
    /**
     * URL
     *
     * @access private
     * @var string
     */
    private $url = '';

    /**
     * Relevant content
     *
     * @access private
     * @var string
     */
    private $content = '';

    /**
     * HTML content
     *
     * @access private
     * @var string
     */
    private $html = '';

    /**
     * HTML content encoding
     *
     * @access private
     * @var string
     */
    private $encoding = '';

    /**
     * Flag to skip download and parsing
     *
     * @access private
     * @var boolean
     */
    private $skip_processing = false;

    /**
     * List of attributes to try to get the content, order is important, generic terms at the end
     *
     * @access private
     * @var array
     */
    private $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'entry-body',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * List of attributes to strip
     *
     * @access private
     * @var array
     */
    private $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
        'post_title',
        'by_line',
        'byline',
        'sponsors',
    );

    /**
     * Tags to remove
     *
     * @access private
     * @var array
     */
    private $stripTags = array(
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Config object
     *
     * @access private
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Constructor
     *
     * Stores the arguments, then flags PDF and YouTube URLs so that they
     * are neither downloaded nor parsed.
     *
     * @access public
     * @param  string   $url       Url
     * @param  string   $html      HTML content
     * @param  string   $encoding  Charset
     */
    public function __construct($url, $html = '', $encoding = 'utf-8')
    {
        $this->url = $url;
        $this->html = $html;
        $this->encoding = $encoding;

        $this->handleFiles();
        $this->handleStreamingVideos();
    }

    /**
     * Set config object
     *
     * @access public
     * @param  \PicoFeed\Config\Config  $config   Config instance
     * @return Grabber
     */
    public function setConfig($config)
    {
        $this->config = $config;
        return $this;
    }

    /**
     * Get URL to download.
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL to download and reset object to use for another grab.
     *
     * @access public
     * @param  string   $url    URL
     * @return Grabber
     */
    public function setUrl($url)
    {
        $this->url = $url;
        $this->html = "";
        $this->content = "";
        $this->encoding = "";

        // The flag may still be set by the previous URL (PDF or video):
        // without this reset every following grab would be skipped.
        $this->skip_processing = false;

        $this->handleFiles();
        $this->handleStreamingVideos();

        return $this;
    }

    /**
     * Get relevant content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered)
     *
     * @access public
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Get filtered relevant content
     *
     * @access public
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, $this->url);
        $filter->setConfig($this->config);

        return $filter->execute();
    }

    /**
     * Use the Youtube embed player as content and skip processing
     *
     * Only URLs hosted on a YouTube domain are considered: the video id
     * pattern alone ("v/" or "v=" followed by 11 id characters) also
     * matches unrelated URLs such as "http://example.com/tv/some-long-title".
     *
     * @access public
     * @return void
     */
    public function handleStreamingVideos()
    {
        $host = parse_url($this->url, PHP_URL_HOST);

        if (! is_string($host) || ! preg_match('#(^|\.)(youtube(-nocookie)?\.com|youtu\.be)$#i', $host)) {
            return;
        }

        if (preg_match('#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
            $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
            $this->skip_processing = true;
        }
    }

    /**
     * Skip processing for PDF documents
     *
     * A URL is considered a PDF when it ends with "pdf" or when its path
     * ends with ".pdf" (case-insensitive), so that "file.PDF" and
     * "file.pdf?download=1" are detected as well.
     *
     * @access public
     * @return void
     */
    public function handleFiles()
    {
        $path = parse_url($this->url, PHP_URL_PATH);

        $url_suffix = strtolower((string) substr($this->url, -3));
        $path_suffix = is_string($path) ? strtolower((string) substr($path, -4)) : '';

        if ($url_suffix === 'pdf' || $path_suffix === '.pdf') {
            $this->skip_processing = true;
            Logger::setMessage(get_called_class().': PDF document => processing skipped');
        }
    }

    /**
     * Parse the HTML content
     *
     * @access public
     * @return bool     True if processing is skipped or if some content has been extracted
     */
    public function parse()
    {
        if ($this->skip_processing) {
            return true;
        }

        if ($this->html) {

            $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

            // Encode everything in UTF-8
            // (the encoding found in the HTML meta tag takes precedence over the HTTP one)
            Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
            $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
            $this->html = Filter::stripHeadTags($this->html);

            Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
            $rules = $this->getRules();

            if (is_array($rules)) {
                Logger::setMessage(get_called_class().': Parse content with rules');
                $this->parseContentWithRules($rules);
            }
            else {
                Logger::setMessage(get_called_class().': Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        }
        else {
            Logger::setMessage(get_called_class().': No content fetched');
        }

        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
        Logger::setMessage(get_called_class().': Grabber done');

        return $this->content !== '';
    }

    /**
     * Download the HTML content
     *
     * Nothing is fetched when processing is skipped or when the URL is empty.
     * Client errors are logged, in that case the current HTML is returned unchanged.
     *
     * @access public
     * @return string   HTML content
     */
    public function download()
    {
        if (! $this->skip_processing && $this->url != '') {

            try {

                $client = Client::getInstance();
                $client->setConfig($this->config);
                $client->execute($this->url);

                // Take the URL back from the client, it may differ from the requested one
                $this->url = $client->getUrl();
                $this->html = $client->getContent();
                $this->encoding = $client->getEncoding();
            }
            catch (ClientException $e) {
                Logger::setMessage(get_called_class().': '.$e->getMessage());
            }
        }

        return $this->html;
    }

    /**
     * Try to find a predefined rule
     *
     * Candidate rule files are tried in this order: the full hostname,
     * the hostname without "www.", the hostname starting at its first dot
     * (with and without that dot) and finally the first label of the hostname.
     *
     * @access public
     * @return mixed    Value returned by the included rule file, or false if there is no rule
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() returns null (not false) when the URL has no host component
        if (! is_string($hostname) || $hostname === '') {
            return false;
        }

        // The hostname is used to build a path that is included below:
        // refuse anything that could escape the Rules directory.
        if (strpbrk($hostname, "/\\\0") !== false) {
            return false;
        }

        $files = array($hostname);

        if (substr($hostname, 0, 4) === 'www.') {
            $files[] = substr($hostname, 4);
        }

        if (($pos = strpos($hostname, '.')) !== false) {
            $files[] = substr($hostname, $pos);
            $files[] = substr($hostname, $pos + 1);
            $files[] = substr($hostname, 0, $pos);
        }

        foreach ($files as $file) {

            $filename = __DIR__.'/../Rules/'.$file.'.php';

            if (file_exists($filename)) {
                Logger::setMessage(get_called_class().' Load rule: '.$file);
                return include $filename;
            }
        }

        return false;
    }

    /**
     * Get the relevant content with predefined rules
     *
     * Nodes matching the XPath expressions of $rules['strip'] are removed first,
     * then every node matching the expressions of $rules['body'] is appended
     * to the content.
     *
     * @access public
     * @param  array   $rules   Rules
     */
    public function parseContentWithRules(array $rules)
    {
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Get the relevant content with the list of potential attributes
     *
     * Fallback chain: first matching candidate attribute, then the first
     * <article/> tag (content below 200 bytes), then the whole <body/>
     * (content below 50 bytes). The result is cleaned with stripGarbage().
     *
     * @access public
     */
    public function parseContentWithCandidates()
    {
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {

            Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');

            $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                // Only the first matching node is kept
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
                break;
            }
        }

        // Try to fetch <article/>
        if (strlen($this->content) < 200) {

            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
            }
        }

        // Get everything
        if (strlen($this->content) < 50) {

            $nodes = $xpath->query('//body');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().' No enought content fetched, get //body');
                $this->content = $dom->saveXML($nodes->item(0));
            }
        }

        Logger::setMessage(get_called_class().': Strip garbage');
        $this->stripGarbage();
    }

    /**
     * Strip useless tags
     *
     * Removes every tag listed in $stripTags, then every node whose class or id
     * contains one of the $stripAttributes terms, unless shouldRemove() refuses.
     * The content is left untouched when it cannot be loaded as a document.
     *
     * @access public
     */
    public function stripGarbage()
    {
        $dom = XmlParser::getDomDocument($this->content);

        if ($dom !== false) {

            $xpath = new DOMXPath($dom);

            foreach ($this->stripTags as $tag) {

                $nodes = $xpath->query('//'.$tag);

                if ($nodes !== false && $nodes->length > 0) {

                    Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');

                    foreach ($nodes as $node) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }

            foreach ($this->stripAttributes as $attribute) {

                $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

                if ($nodes !== false && $nodes->length > 0) {

                    Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');

                    foreach ($nodes as $node) {
                        if ($this->shouldRemove($dom, $node)) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }

            $this->content = $dom->saveXML($dom->documentElement);
        }
    }

    /**
     * Return false if the node should not be removed
     *
     * A node holding 90% or more of the document text (in bytes) is kept,
     * a node of a document without any text is always removable.
     *
     * @access public
     * @param  DomDocument  $dom
     * @param  DomNode      $node
     * @return boolean
     */
    public function shouldRemove($dom, $node)
    {
        $document_length = strlen($dom->textContent);
        $node_length = strlen($node->textContent);

        // Avoid a division by zero below
        if ($document_length === 0) {
            return true;
        }

        $ratio = $node_length * 100 / $document_length;

        if ($ratio >= 90) {
            Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
            return false;
        }

        return true;
    }
}