Improve content grabber

This commit is contained in:
Frédéric Guillot 2013-08-31 18:37:26 -04:00
parent 14d67d85e8
commit 242234c0a0
10 changed files with 107 additions and 53 deletions

View File

@ -367,11 +367,9 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag)
function parse_content_with_readability($content, $url)
{
require_once 'vendor/Readability/Readability.php';
require_once 'vendor/PicoFeed/Encoding.php';
if (! empty($content)) {
$content = \PicoFeed\Encoding::toUTF8($content);
$readability = new \Readability($content, $url);
if ($readability->init()) {
@ -400,13 +398,14 @@ function download_content($url)
// Try first with PicoFeed grabber and with Readability after
$grabber = new \PicoFeed\Grabber($url);
$grabber->html = $html;
$content = '';
if ($grabber->parse()) {
$content = $grabber->content;
}
if (empty($content)) {
$content = parse_content_with_readability($html, $url);
$content = parse_content_with_readability($grabber->html, $url);
}
// Filter content

View File

@ -464,4 +464,10 @@ class Filter
return $data;
}
/**
 * Remove every <meta> element from an HTML string.
 *
 * The match is bounded to a single tag, so both the XHTML form
 * (<meta ... />) and the plain HTML form (<meta ...>) are removed without
 * touching the markup that follows. Quoted attribute values may contain
 * a ">" character.
 *
 * @param  string $data HTML markup
 * @return string       Markup without <meta> tags; the input is returned
 *                      unchanged if the regular expression engine fails
 */
public static function stripMetaTags($data)
{
    // A lazy ".*?\/>" would run from an unclosed "<meta ...>" up to the next
    // "/>" anywhere in the document (e.g. a later <br />) and delete real
    // content. Consume attributes only: unquoted text or whole quoted values.
    // Possessive quantifiers keep the scan linear on malformed input.
    $pattern = '/<meta\s(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>/i';
    $stripped = preg_replace($pattern, '', $data);

    // preg_replace() returns null on a PCRE error; keep the original markup
    // instead of silently discarding the whole document.
    return $stripped === null ? $data : $stripped;
}
}

View File

@ -5,6 +5,7 @@ namespace PicoFeed;
require_once __DIR__.'/Client.php';
require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Filter.php';
class Grabber
{
@ -20,6 +21,7 @@ class Grabber
'articlecontent',
'articlePage',
'post-content',
'entry-content',
'content',
'main',
);
@ -36,6 +38,7 @@ class Grabber
'nav',
'header',
'social',
'entry-utility',
);
public $stripTags = array(
@ -58,34 +61,23 @@ class Grabber
{
if ($this->html) {
Logging::log(\get_called_class().' HTML fetched');
Logging::log(\get_called_class().' Fix encoding');
$this->html = Filter::stripMetaTags($this->html);
$this->html = Encoding::toUtf8($this->html);
Logging::log(\get_called_class().' Try to find rules');
$rules = $this->getRules();
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadHTML($this->html);
if (is_array($rules)) {
Logging::log(\get_called_class().' Parse content with rules');
$this->parseContentWithRules($dom, $rules);
$this->parseContentWithRules($rules);
}
else {
Logging::log(\get_called_class().' Parse content with candidates');
$this->parseContentWithCandidates($dom);
if (strlen($this->content) < 50) {
Logging::log(\get_called_class().' No enought content fetched, get the full body');
$this->content = $dom->saveXML($dom->firstChild);
}
Logging::log(\get_called_class().' Strip garbage');
$this->stripGarbage();
$this->parseContentWithCandidates();
}
}
else {
Logging::log(\get_called_class().' No content fetched');
}
@ -129,8 +121,11 @@ class Grabber
}
public function parseContentWithRules($dom, array $rules)
public function parseContentWithRules(array $rules)
{
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom);
if (isset($rules['strip']) && is_array($rules['strip'])) {
@ -147,21 +142,6 @@ class Grabber
}
}
if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) {
foreach ($rules['strip_id_or_class'] as $pattern) {
$pattern = strtr($pattern, array("'" => '', '"' => ''));
$nodes = $xpath->query("//*[contains(@class, '$pattern') or contains(@id, '$pattern')]");
if ($nodes !== false && $nodes->length > 0) {
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
}
}
}
}
if (isset($rules['body']) && is_array($rules['body'])) {
foreach ($rules['body'] as $pattern) {
@ -178,8 +158,11 @@ class Grabber
}
public function parseContentWithCandidates($dom)
public function parseContentWithCandidates()
{
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom);
// Try to fetch <article/>
@ -187,19 +170,28 @@ class Grabber
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
return;
}
// Try to lookup in each <div/>
foreach ($this->candidatesAttributes as $candidate) {
if (! $this->content) {
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
foreach ($this->candidatesAttributes as $candidate) {
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
return;
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
}
}
}
if (strlen($this->content) < 50) {
Logging::log(\get_called_class().' No enought content fetched, get the full body');
$this->content = $dom->saveXML($dom->firstChild);
}
Logging::log(\get_called_class().' Strip garbage');
$this->stripGarbage();
}
@ -207,7 +199,7 @@ class Grabber
{
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadXML($this->content);
$dom->loadXML('<?xml version="1.0" encoding="UTF-8">'.$this->content);
$xpath = new \DOMXPath($dom);
foreach ($this->stripTags as $tag) {

View File

@ -18,8 +18,8 @@ abstract class Parser
public $items = array();
public $grabber = false;
public $grabber_ignore_urls = array();
public $grabber_timeout = 5;
public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
public $grabber_timeout = null;
public $grabber_user_agent = null;
abstract public function execute();
@ -45,8 +45,7 @@ abstract class Parser
if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
$grabber = new Grabber($item_url);
$grabber->download($this->grabber_timeout, $this->grabber_user_agent);
$grabber->parse();
if ($grabber->content) $item_content = $grabber->content;
if ($grabber->parse()) $item_content = $grabber->content;
}
if ($item_content) {

View File

@ -6,5 +6,5 @@ return array(
),
'strip' => array(
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
)
),
);

View File

@ -1,8 +1,6 @@
<?php
return array(
'title' => '//header/h1',
'test_url' => 'http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/',
'test_url' => 'http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/',
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
'body' => array(
'//div[@class="postContent"]',

View File

@ -0,0 +1,25 @@
<?php
// Grabber rule set for Wikipedia article pages; the array is returned to
// whatever code includes this file.
return array(
// Sample article URL. NOTE(review): presumably used to exercise these rules — confirm against the tooling that reads 'test_url'.
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
// XPath expressions selecting the element(s) that hold the article content.
'body' => array(
'//div[@id="bodyContent"]',
),
// XPath expressions selecting nodes that are not part of the article text.
// NOTE(review): presumably these nodes are removed before the body is extracted — verify in the grabber.
'strip' => array(
"//div[@id='toc']",
"//div[@id='catlinks']",
"//div[@id='jump-to-nav']",
"//div[@class='thumbcaption']//div[@class='magnify']",
"//table[@class='navbox']",
"//table[contains(@class, 'infobox')]",
"//div[@class='dablink']",
"//div[@id='contentSub']",
"//div[@id='siteSub']",
"//table[@id='persondata']",
"//table[contains(@class, 'metadata')]",
"//*[contains(@class, 'noprint')]",
"//*[contains(@class, 'printfooter')]",
"//*[contains(@class, 'editsection')]",
"//*[contains(@class, 'error')]",
"//span[@title='pronunciation:']",
),
);

View File

@ -0,0 +1,12 @@
<?php
// Grabber rule set for techcrunch.com articles; the array is returned to
// whatever code includes this file.
return array(
// Sample article URL. NOTE(review): presumably used to exercise these rules — confirm against the tooling that reads 'test_url'.
'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/',
// XPath expressions selecting the elements that hold the article content:
// any div whose class contains "media-container", and the div whose class
// is exactly "body-copy".
'body' => array(
'//div[contains(@class, "media-container")]',
'//div[@class="body-copy"]',
),
// XPath expressions selecting <script> and <style> elements.
// NOTE(review): presumably these nodes are removed from the extracted content — verify in the grabber.
'strip' => array(
'//script',
'//style',
)
);

View File

@ -2,7 +2,21 @@
// Grabber rule set for cnn.com articles; the array is returned to whatever
// code includes this file.
return array(
    // Sample article URL for this rule set.
    'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
    // XPath expressions selecting the elements that hold the article content.
    // Each predicate must be closed by exactly one "]": a doubled "]]" is not
    // valid XPath and makes the query fail instead of matching.
    // NOTE(review): the first two patterns may select nodes that also live
    // inside the "cnn_strycntntlft" container — confirm how the grabber
    // combines several body matches before relying on all three.
    'body' => array(
        '//*[contains(@class, "cnn_storypgraphtxt")]',
        '//*[contains(@class, "cnnvideo_wrapper")]',
        '//div[@class="cnn_strycntntlft"]',
    ),
    // XPath expressions selecting scripts, styles, sharing widgets, gallery
    // controls and comment containers that are not part of the article text.
    'strip' => array(
        '//script',
        '//style',
        '//div[@class="cnn_stryshrwdgtbtm"]',
        '//div[@class="cnn_strybtmcntnt"]',
        '//div[@class="cnn_strylftcntnt"]',
        '//div[contains(@class, "cnnGalleryContainer")]',
        '//div[contains(@class, "cnn_strylftcexpbx")]',
        '//div[contains(@class, "articleGalleryNavContainer")]',
        '//div[contains(@class, "cnnArticleGalleryCaptionControl")]',
        '//div[contains(@class, "cnnArticleGalleryNavPrevNextDisabled")]',
        '//div[contains(@class, "cnnArticleGalleryNavPrevNext")]',
        '//div[contains(@class, "cnn_html_media_title_new")]',
        '//div[contains(@id, "disqus")]',
    ),
);

View File

@ -0,0 +1,9 @@
<?php
// Grabber rule set for theguardian.com articles; the array is returned to
// whatever code includes this file.
return array(
// Sample article URL. NOTE(review): presumably used to exercise these rules — confirm against the tooling that reads 'test_url'.
'test_url' => 'http://www.theguardian.com/law/2013/aug/31/microsoft-google-sue-us-fisa',
// XPath expression selecting the element that holds the article content.
'body' => array(
'//div[@id="article-wrapper"]',
),
// No strip expressions are defined for this site.
'strip' => array(
),
);