Improve content downloader and add GitHub rule

This commit is contained in:
Frédéric Guillot 2013-09-02 14:04:10 -04:00
parent 3b8d62a237
commit 67d9fbd944
6 changed files with 44 additions and 19 deletions

View File

@ -428,6 +428,7 @@ nav .active a {
line-height: 1.5em; line-height: 1.5em;
font-size: 100%; font-size: 100%;
font-family: Georgia, serif; font-family: Georgia, serif;
overflow: auto;
} }
.items #current-item { .items #current-item {

View File

@ -433,12 +433,16 @@ function download_item($item_id)
->save(array('content' => $content)); ->save(array('content' => $content));
} }
write_debug();
return array( return array(
'result' => true, 'result' => true,
'content' => $content 'content' => $content
); );
} }
write_debug();
return array( return array(
'result' => false, 'result' => false,
'content' => '' 'content' => ''

View File

@ -12,16 +12,20 @@ class Grabber
public $content = ''; public $content = '';
public $html = ''; public $html = '';
// Order is important // Order is important, generic terms at the end
public $candidatesAttributes = array( public $candidatesAttributes = array(
'article',
'articleBody', 'articleBody',
'articlebody', 'articlebody',
'article-body',
'articleContent', 'articleContent',
'articlecontent', 'articlecontent',
'article-content',
'articlePage', 'articlePage',
'post-content', 'post-content',
'entry-content', 'entry-content',
'main-content',
'comic',
'article',
'content', 'content',
'main', 'main',
); );
@ -81,6 +85,7 @@ class Grabber
Logging::log(\get_called_class().' No content fetched'); Logging::log(\get_called_class().' No content fetched');
} }
Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes');
Logging::log(\get_called_class().' Grabber done'); Logging::log(\get_called_class().' Grabber done');
return $this->content !== ''; return $this->content !== '';
@ -165,23 +170,28 @@ class Grabber
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html); $dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom); $xpath = new \DOMXPath($dom);
// Try to fetch <article/> // Try to lookup in each tag
$nodes = $xpath->query('//article'); foreach ($this->candidatesAttributes as $candidate) {
if ($nodes !== false && $nodes->length > 0) { Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"');
$this->content = $dom->saveXML($nodes->item(0));
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
break;
}
} }
// Try to lookup in each <div/> // Try to fetch <article/>
if (! $this->content) { if (! $this->content) {
foreach ($this->candidatesAttributes as $candidate) { $nodes = $xpath->query('//article');
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
if ($nodes !== false && $nodes->length > 0) { Logging::log(\get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
$this->content = $dom->saveXML($nodes->item(0));
}
} }
} }
@ -199,7 +209,7 @@ class Grabber
{ {
\libxml_use_internal_errors(true); \libxml_use_internal_errors(true);
$dom = new \DOMDocument; $dom = new \DOMDocument;
$dom->loadXML('<?xml version="1.0" encoding="UTF-8">'.$this->content); $dom->loadXML($this->content);
$xpath = new \DOMXPath($dom); $xpath = new \DOMXPath($dom);
foreach ($this->stripTags as $tag) { foreach ($this->stripTags as $tag) {
@ -207,6 +217,7 @@ class Grabber
$nodes = $xpath->query('//'.$tag); $nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
Logging::log(\get_called_class().' Strip tag: "'.$tag.'"');
foreach ($nodes as $node) { foreach ($nodes as $node) {
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
@ -218,16 +229,13 @@ class Grabber
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) { if ($nodes !== false && $nodes->length > 0) {
Logging::log(\get_called_class().' Strip attribute: "'.$tag.'"');
foreach ($nodes as $node) { foreach ($nodes as $node) {
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
} }
} }
$this->content = ''; $this->content = $dom->saveXML($dom->documentElement);
foreach($dom->childNodes as $node) {
$this->content .= $dom->saveXML($node);
}
} }
} }

10
vendor/PicoFeed/Rules/github.com.php vendored Normal file
View File

@ -0,0 +1,10 @@
<?php
// Content rule for github.com: returns an array with the keys
// 'test_url', 'body' and 'strip'.
// NOTE(review): presumably the grabber keeps the nodes matched by 'body' and
// removes the nodes matched by 'strip' — confirm against the rule loader.
return array(
// Example github.com URL for this rule (presumably used to test it — TODO confirm).
'test_url' => 'https://github.com/audreyr/favicon-cheat-sheet',
// XPath expressions for the 'body' key.
'body' => array(
// Matches any <article> element whose class attribute contains "entry-content".
'//article[contains(@class, "entry-content")]',
),
// XPath expressions for the 'strip' key.
'strip' => array(
// Matches every <h1> element in the document.
'//h1'
)
);

View File

@ -8,5 +8,6 @@ return array(
'strip' => array( 'strip' => array(
'//script', '//script',
'//style', '//style',
'//*[contains(@class, "module-crunchbase")]'
) )
); );

View File

@ -3,6 +3,7 @@ return array(
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833', 'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
'body' => array( 'body' => array(
'//div[@class="story-body"]', '//div[@class="story-body"]',
'//div[@class="indPost"]'
), ),
'strip' => array( 'strip' => array(
'//script', '//script',