Improve content downloader and add GitHub rule
This commit is contained in:
parent
3b8d62a237
commit
67d9fbd944
@ -428,6 +428,7 @@ nav .active a {
|
|||||||
line-height: 1.5em;
|
line-height: 1.5em;
|
||||||
font-size: 100%;
|
font-size: 100%;
|
||||||
font-family: Georgia, serif;
|
font-family: Georgia, serif;
|
||||||
|
overflow: auto;
|
||||||
}
|
}
|
||||||
|
|
||||||
.items #current-item {
|
.items #current-item {
|
||||||
|
@ -433,12 +433,16 @@ function download_item($item_id)
|
|||||||
->save(array('content' => $content));
|
->save(array('content' => $content));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
write_debug();
|
||||||
|
|
||||||
return array(
|
return array(
|
||||||
'result' => true,
|
'result' => true,
|
||||||
'content' => $content
|
'content' => $content
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
write_debug();
|
||||||
|
|
||||||
return array(
|
return array(
|
||||||
'result' => false,
|
'result' => false,
|
||||||
'content' => ''
|
'content' => ''
|
||||||
|
48
vendor/PicoFeed/Grabber.php
vendored
48
vendor/PicoFeed/Grabber.php
vendored
@ -12,16 +12,20 @@ class Grabber
|
|||||||
public $content = '';
|
public $content = '';
|
||||||
public $html = '';
|
public $html = '';
|
||||||
|
|
||||||
// Order is important
|
// Order is important, generic terms at the end
|
||||||
public $candidatesAttributes = array(
|
public $candidatesAttributes = array(
|
||||||
'article',
|
|
||||||
'articleBody',
|
'articleBody',
|
||||||
'articlebody',
|
'articlebody',
|
||||||
|
'article-body',
|
||||||
'articleContent',
|
'articleContent',
|
||||||
'articlecontent',
|
'articlecontent',
|
||||||
|
'article-content',
|
||||||
'articlePage',
|
'articlePage',
|
||||||
'post-content',
|
'post-content',
|
||||||
'entry-content',
|
'entry-content',
|
||||||
|
'main-content',
|
||||||
|
'comic',
|
||||||
|
'article',
|
||||||
'content',
|
'content',
|
||||||
'main',
|
'main',
|
||||||
);
|
);
|
||||||
@ -81,6 +85,7 @@ class Grabber
|
|||||||
Logging::log(\get_called_class().' No content fetched');
|
Logging::log(\get_called_class().' No content fetched');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Logging::log(\get_called_class().' Content length: '.strlen($this->content).' bytes');
|
||||||
Logging::log(\get_called_class().' Grabber done');
|
Logging::log(\get_called_class().' Grabber done');
|
||||||
|
|
||||||
return $this->content !== '';
|
return $this->content !== '';
|
||||||
@ -165,23 +170,28 @@ class Grabber
|
|||||||
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
|
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
|
||||||
$xpath = new \DOMXPath($dom);
|
$xpath = new \DOMXPath($dom);
|
||||||
|
|
||||||
|
// Try to lookup in each tag
|
||||||
|
foreach ($this->candidatesAttributes as $candidate) {
|
||||||
|
|
||||||
|
Logging::log(\get_called_class().' Try this candidate: "'.$candidate.'"');
|
||||||
|
|
||||||
|
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
||||||
|
|
||||||
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
|
$this->content = $dom->saveXML($nodes->item(0));
|
||||||
|
Logging::log(\get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Try to fetch <article/>
|
// Try to fetch <article/>
|
||||||
|
if (! $this->content) {
|
||||||
|
|
||||||
$nodes = $xpath->query('//article');
|
$nodes = $xpath->query('//article');
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
$this->content = $dom->saveXML($nodes->item(0));
|
||||||
}
|
Logging::log(\get_called_class().' Find <article/> tag ('.strlen($this->content).' bytes)');
|
||||||
|
|
||||||
// Try to lookup in each <div/>
|
|
||||||
if (! $this->content) {
|
|
||||||
|
|
||||||
foreach ($this->candidatesAttributes as $candidate) {
|
|
||||||
|
|
||||||
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -199,7 +209,7 @@ class Grabber
|
|||||||
{
|
{
|
||||||
\libxml_use_internal_errors(true);
|
\libxml_use_internal_errors(true);
|
||||||
$dom = new \DOMDocument;
|
$dom = new \DOMDocument;
|
||||||
$dom->loadXML('<?xml version="1.0" encoding="UTF-8">'.$this->content);
|
$dom->loadXML($this->content);
|
||||||
$xpath = new \DOMXPath($dom);
|
$xpath = new \DOMXPath($dom);
|
||||||
|
|
||||||
foreach ($this->stripTags as $tag) {
|
foreach ($this->stripTags as $tag) {
|
||||||
@ -207,6 +217,7 @@ class Grabber
|
|||||||
$nodes = $xpath->query('//'.$tag);
|
$nodes = $xpath->query('//'.$tag);
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
|
Logging::log(\get_called_class().' Strip tag: "'.$tag.'"');
|
||||||
foreach ($nodes as $node) {
|
foreach ($nodes as $node) {
|
||||||
$node->parentNode->removeChild($node);
|
$node->parentNode->removeChild($node);
|
||||||
}
|
}
|
||||||
@ -218,16 +229,13 @@ class Grabber
|
|||||||
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
|
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
|
Logging::log(\get_called_class().' Strip attribute: "'.$tag.'"');
|
||||||
foreach ($nodes as $node) {
|
foreach ($nodes as $node) {
|
||||||
$node->parentNode->removeChild($node);
|
$node->parentNode->removeChild($node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->content = '';
|
$this->content = $dom->saveXML($dom->documentElement);
|
||||||
|
|
||||||
foreach($dom->childNodes as $node) {
|
|
||||||
$this->content .= $dom->saveXML($node);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
10
vendor/PicoFeed/Rules/github.com.php
vendored
Normal file
10
vendor/PicoFeed/Rules/github.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'test_url' => 'https://github.com/audreyr/favicon-cheat-sheet',
|
||||||
|
'body' => array(
|
||||||
|
'//article[contains(@class, "entry-content")]',
|
||||||
|
),
|
||||||
|
'strip' => array(
|
||||||
|
'//h1'
|
||||||
|
)
|
||||||
|
);
|
1
vendor/PicoFeed/Rules/techcrunch.com.php
vendored
1
vendor/PicoFeed/Rules/techcrunch.com.php
vendored
@ -8,5 +8,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//script',
|
'//script',
|
||||||
'//style',
|
'//style',
|
||||||
|
'//*[contains(@class, "module-crunchbase")]'
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
1
vendor/PicoFeed/Rules/www.bbc.co.uk.php
vendored
1
vendor/PicoFeed/Rules/www.bbc.co.uk.php
vendored
@ -3,6 +3,7 @@ return array(
|
|||||||
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
|
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="story-body"]',
|
'//div[@class="story-body"]',
|
||||||
|
'//div[@class="indPost"]'
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//script',
|
'//script',
|
||||||
|
Loading…
Reference in New Issue
Block a user