Improve content downloader and add GitHub rule

This commit is contained in:
Frédéric Guillot 2013-09-02 14:04:10 -04:00
parent 3b8d62a237
commit 67d9fbd944
6 changed files with 44 additions and 19 deletions

View File

@ -428,6 +428,7 @@ nav .active a {
line-height: 1.5em;
font-size: 100%;
font-family: Georgia, serif;
overflow: auto;
}
.items #current-item {

View File

@ -433,12 +433,16 @@ function download_item($item_id)
->save(array('content' => $content));
}
write_debug();
return array(
'result' => true,
'content' => $content
);
}
write_debug();
return array(
'result' => false,
'content' => ''

View File

@ -12,16 +12,20 @@ class Grabber
public $content = '';
public $html = '';
// Order is important
// Order is important, generic terms at the end
public $candidatesAttributes = array(
'article',
'articleBody',
'articlebody',
'article-body',
'articleContent',
'articlecontent',
'article-content',
'articlePage',
'post-content',
'entry-content',
'main-content',
'comic',
'article',
'content',
'main',
);
@ -81,6 +85,7 @@ class Grabber
Logging::log(\get_called_class().' No content fetched');
}
Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes');
Logging::log(\get_called_class().' Grabber done');
return $this->content !== '';
@ -165,23 +170,28 @@ class Grabber
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
$xpath = new \DOMXPath($dom);
// Try to lookup in each tag
foreach ($this->candidatesAttributes as $candidate) {
Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"');
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
break;
}
}
// Try to fetch <article/>
if (! $this->content) {
$nodes = $xpath->query('//article');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
}
// Try to lookup in each <div/>
if (! $this->content) {
foreach ($this->candidatesAttributes as $candidate) {
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
if ($nodes !== false && $nodes->length > 0) {
$this->content = $dom->saveXML($nodes->item(0));
}
Logging::log(\get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
}
}
@ -199,7 +209,7 @@ class Grabber
{
\libxml_use_internal_errors(true);
$dom = new \DOMDocument;
$dom->loadXML('<?xml version="1.0" encoding="UTF-8">'.$this->content);
$dom->loadXML($this->content);
$xpath = new \DOMXPath($dom);
foreach ($this->stripTags as $tag) {
@ -207,6 +217,7 @@ class Grabber
$nodes = $xpath->query('//'.$tag);
if ($nodes !== false && $nodes->length > 0) {
Logging::log(\get_called_class().' Strip tag: "'.$tag.'"');
foreach ($nodes as $node) {
$node->parentNode->removeChild($node);
}
@ -218,16 +229,13 @@ class Grabber
// Select every element whose class or id attribute contains the current strip attribute.
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
if ($nodes !== false && $nodes->length > 0) {
    // Log the attribute actually being stripped. $tag is the loop variable of the
    // tag-stripping loop, so using it here would report a stale, unrelated value.
    Logging::log(\get_called_class().' Strip attribute: "'.$attribute.'"');
    // Detach each matched element from its parent so it is absent from the saved output.
    foreach ($nodes as $node) {
        $node->parentNode->removeChild($node);
    }
}
}
$this->content = '';
foreach($dom->childNodes as $node) {
$this->content .= $dom->saveXML($node);
}
$this->content = $dom->saveXML($dom->documentElement);
}
}

10
vendor/PicoFeed/Rules/github.com.php vendored Normal file
View File

@ -0,0 +1,10 @@
<?php
// Rule set returned for github.com pages (file: vendor/PicoFeed/Rules/github.com.php).
// NOTE(review): the meaning of each key below is inferred from its name and the
// XPath values; the code that consumes this array is not visible here — confirm.
return array(
// Sample GitHub page for this rule; presumably used to exercise the rule — TODO confirm.
'test_url' => 'https://github.com/audreyr/favicon-cheat-sheet',
// XPath queries; presumably they select the node(s) kept as the article content.
'body' => array(
'//article[contains(@class, "entry-content")]',
),
// XPath queries; presumably matching nodes are removed from the selected content.
'strip' => array(
'//h1'
)
);

View File

@ -8,5 +8,6 @@ return array(
'strip' => array(
'//script',
'//style',
'//*[contains(@class, "module-crunchbase")]'
)
);

View File

@ -3,6 +3,7 @@ return array(
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
'body' => array(
'//div[@class="story-body"]',
'//div[@class="indPost"]'
),
'strip' => array(
'//script',