Improve content grabber
This commit is contained in:
parent
14d67d85e8
commit
242234c0a0
@ -367,11 +367,9 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag)
|
|||||||
function parse_content_with_readability($content, $url)
|
function parse_content_with_readability($content, $url)
|
||||||
{
|
{
|
||||||
require_once 'vendor/Readability/Readability.php';
|
require_once 'vendor/Readability/Readability.php';
|
||||||
require_once 'vendor/PicoFeed/Encoding.php';
|
|
||||||
|
|
||||||
if (! empty($content)) {
|
if (! empty($content)) {
|
||||||
|
|
||||||
$content = \PicoFeed\Encoding::toUTF8($content);
|
|
||||||
$readability = new \Readability($content, $url);
|
$readability = new \Readability($content, $url);
|
||||||
|
|
||||||
if ($readability->init()) {
|
if ($readability->init()) {
|
||||||
@ -400,13 +398,14 @@ function download_content($url)
|
|||||||
// Try first with PicoFeed grabber and with Readability after
|
// Try first with PicoFeed grabber and with Readability after
|
||||||
$grabber = new \PicoFeed\Grabber($url);
|
$grabber = new \PicoFeed\Grabber($url);
|
||||||
$grabber->html = $html;
|
$grabber->html = $html;
|
||||||
|
$content = '';
|
||||||
|
|
||||||
if ($grabber->parse()) {
|
if ($grabber->parse()) {
|
||||||
$content = $grabber->content;
|
$content = $grabber->content;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (empty($content)) {
|
if (empty($content)) {
|
||||||
$content = parse_content_with_readability($html, $url);
|
$content = parse_content_with_readability($grabber->html, $url);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Filter content
|
// Filter content
|
||||||
|
6
vendor/PicoFeed/Filter.php
vendored
6
vendor/PicoFeed/Filter.php
vendored
@ -464,4 +464,10 @@ class Filter
|
|||||||
|
|
||||||
return $data;
|
return $data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static function stripMetaTags($data)
|
||||||
|
{
|
||||||
|
return preg_replace('/<meta\s.*?\/>/is', '', $data);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
74
vendor/PicoFeed/Grabber.php
vendored
74
vendor/PicoFeed/Grabber.php
vendored
@ -5,6 +5,7 @@ namespace PicoFeed;
|
|||||||
require_once __DIR__.'/Client.php';
|
require_once __DIR__.'/Client.php';
|
||||||
require_once __DIR__.'/Encoding.php';
|
require_once __DIR__.'/Encoding.php';
|
||||||
require_once __DIR__.'/Logging.php';
|
require_once __DIR__.'/Logging.php';
|
||||||
|
require_once __DIR__.'/Filter.php';
|
||||||
|
|
||||||
class Grabber
|
class Grabber
|
||||||
{
|
{
|
||||||
@ -20,6 +21,7 @@ class Grabber
|
|||||||
'articlecontent',
|
'articlecontent',
|
||||||
'articlePage',
|
'articlePage',
|
||||||
'post-content',
|
'post-content',
|
||||||
|
'entry-content',
|
||||||
'content',
|
'content',
|
||||||
'main',
|
'main',
|
||||||
);
|
);
|
||||||
@ -36,6 +38,7 @@ class Grabber
|
|||||||
'nav',
|
'nav',
|
||||||
'header',
|
'header',
|
||||||
'social',
|
'social',
|
||||||
|
'entry-utility',
|
||||||
);
|
);
|
||||||
|
|
||||||
public $stripTags = array(
|
public $stripTags = array(
|
||||||
@ -58,34 +61,23 @@ class Grabber
|
|||||||
{
|
{
|
||||||
if ($this->html) {
|
if ($this->html) {
|
||||||
|
|
||||||
Logging::log(\get_called_class().' HTML fetched');
|
Logging::log(\get_called_class().' Fix encoding');
|
||||||
|
$this->html = Filter::stripMetaTags($this->html);
|
||||||
|
$this->html = Encoding::toUtf8($this->html);
|
||||||
|
|
||||||
|
Logging::log(\get_called_class().' Try to find rules');
|
||||||
$rules = $this->getRules();
|
$rules = $this->getRules();
|
||||||
|
|
||||||
\libxml_use_internal_errors(true);
|
|
||||||
$dom = new \DOMDocument;
|
|
||||||
$dom->loadHTML($this->html);
|
|
||||||
|
|
||||||
if (is_array($rules)) {
|
if (is_array($rules)) {
|
||||||
Logging::log(\get_called_class().' Parse content with rules');
|
Logging::log(\get_called_class().' Parse content with rules');
|
||||||
$this->parseContentWithRules($dom, $rules);
|
$this->parseContentWithRules($rules);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
||||||
Logging::log(\get_called_class().' Parse content with candidates');
|
Logging::log(\get_called_class().' Parse content with candidates');
|
||||||
$this->parseContentWithCandidates($dom);
|
$this->parseContentWithCandidates();
|
||||||
|
|
||||||
if (strlen($this->content) < 50) {
|
|
||||||
Logging::log(\get_called_class().' No enought content fetched, get the full body');
|
|
||||||
$this->content = $dom->saveXML($dom->firstChild);
|
|
||||||
}
|
|
||||||
|
|
||||||
Logging::log(\get_called_class().' Strip garbage');
|
|
||||||
$this->stripGarbage();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
||||||
Logging::log(\get_called_class().' No content fetched');
|
Logging::log(\get_called_class().' No content fetched');
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -129,8 +121,11 @@ class Grabber
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public function parseContentWithRules($dom, array $rules)
|
public function parseContentWithRules(array $rules)
|
||||||
{
|
{
|
||||||
|
\libxml_use_internal_errors(true);
|
||||||
|
$dom = new \DOMDocument;
|
||||||
|
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
|
||||||
$xpath = new \DOMXPath($dom);
|
$xpath = new \DOMXPath($dom);
|
||||||
|
|
||||||
if (isset($rules['strip']) && is_array($rules['strip'])) {
|
if (isset($rules['strip']) && is_array($rules['strip'])) {
|
||||||
@ -147,21 +142,6 @@ class Grabber
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) {
|
|
||||||
|
|
||||||
foreach ($rules['strip_id_or_class'] as $pattern) {
|
|
||||||
|
|
||||||
$pattern = strtr($pattern, array("'" => '', '"' => ''));
|
|
||||||
$nodes = $xpath->query("//*[contains(@class, '$pattern') or contains(@id, '$pattern')]");
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
foreach ($nodes as $node) {
|
|
||||||
$node->parentNode->removeChild($node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isset($rules['body']) && is_array($rules['body'])) {
|
if (isset($rules['body']) && is_array($rules['body'])) {
|
||||||
|
|
||||||
foreach ($rules['body'] as $pattern) {
|
foreach ($rules['body'] as $pattern) {
|
||||||
@ -178,8 +158,11 @@ class Grabber
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public function parseContentWithCandidates($dom)
|
public function parseContentWithCandidates()
|
||||||
{
|
{
|
||||||
|
\libxml_use_internal_errors(true);
|
||||||
|
$dom = new \DOMDocument;
|
||||||
|
$dom->loadHTML('<?xml version="1.0" encoding="UTF-8">'.$this->html);
|
||||||
$xpath = new \DOMXPath($dom);
|
$xpath = new \DOMXPath($dom);
|
||||||
|
|
||||||
// Try to fetch <article/>
|
// Try to fetch <article/>
|
||||||
@ -187,19 +170,28 @@ class Grabber
|
|||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
$this->content = $dom->saveXML($nodes->item(0));
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to lookup in each <div/>
|
// Try to lookup in each <div/>
|
||||||
foreach ($this->candidatesAttributes as $candidate) {
|
if (! $this->content) {
|
||||||
|
|
||||||
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
foreach ($this->candidatesAttributes as $candidate) {
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
$nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
|
||||||
return;
|
if ($nodes !== false && $nodes->length > 0) {
|
||||||
|
$this->content = $dom->saveXML($nodes->item(0));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (strlen($this->content) < 50) {
|
||||||
|
Logging::log(\get_called_class().' No enought content fetched, get the full body');
|
||||||
|
$this->content = $dom->saveXML($dom->firstChild);
|
||||||
|
}
|
||||||
|
|
||||||
|
Logging::log(\get_called_class().' Strip garbage');
|
||||||
|
$this->stripGarbage();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -207,7 +199,7 @@ class Grabber
|
|||||||
{
|
{
|
||||||
\libxml_use_internal_errors(true);
|
\libxml_use_internal_errors(true);
|
||||||
$dom = new \DOMDocument;
|
$dom = new \DOMDocument;
|
||||||
$dom->loadXML($this->content);
|
$dom->loadXML('<?xml version="1.0" encoding="UTF-8">'.$this->content);
|
||||||
$xpath = new \DOMXPath($dom);
|
$xpath = new \DOMXPath($dom);
|
||||||
|
|
||||||
foreach ($this->stripTags as $tag) {
|
foreach ($this->stripTags as $tag) {
|
||||||
|
7
vendor/PicoFeed/Parser.php
vendored
7
vendor/PicoFeed/Parser.php
vendored
@ -18,8 +18,8 @@ abstract class Parser
|
|||||||
public $items = array();
|
public $items = array();
|
||||||
public $grabber = false;
|
public $grabber = false;
|
||||||
public $grabber_ignore_urls = array();
|
public $grabber_ignore_urls = array();
|
||||||
public $grabber_timeout = 5;
|
public $grabber_timeout = null;
|
||||||
public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
|
public $grabber_user_agent = null;
|
||||||
|
|
||||||
|
|
||||||
abstract public function execute();
|
abstract public function execute();
|
||||||
@ -45,8 +45,7 @@ abstract class Parser
|
|||||||
if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
|
if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
|
||||||
$grabber = new Grabber($item_url);
|
$grabber = new Grabber($item_url);
|
||||||
$grabber->download($this->grabber_timeout, $this->grabber_user_agent);
|
$grabber->download($this->grabber_timeout, $this->grabber_user_agent);
|
||||||
$grabber->parse();
|
if ($grabber->parse()) $item_content = $grabber->content;
|
||||||
if ($grabber->content) $item_content = $grabber->content;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($item_content) {
|
if ($item_content) {
|
||||||
|
2
vendor/PicoFeed/Rules/.blog.lemonde.fr.php
vendored
2
vendor/PicoFeed/Rules/.blog.lemonde.fr.php
vendored
@ -6,5 +6,5 @@ return array(
|
|||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
|
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
|
||||||
)
|
),
|
||||||
);
|
);
|
2
vendor/PicoFeed/Rules/.blogs.nytimes.com.php
vendored
2
vendor/PicoFeed/Rules/.blogs.nytimes.com.php
vendored
@ -1,8 +1,6 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
'title' => '//header/h1',
|
'title' => '//header/h1',
|
||||||
'test_url' => 'http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/',
|
|
||||||
'test_url' => 'http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/',
|
|
||||||
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
|
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="postContent"]',
|
'//div[@class="postContent"]',
|
||||||
|
25
vendor/PicoFeed/Rules/.wikipedia.org.php
vendored
Normal file
25
vendor/PicoFeed/Rules/.wikipedia.org.php
vendored
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="bodyContent"]',
|
||||||
|
),
|
||||||
|
'strip' => array(
|
||||||
|
"//div[@id='toc']",
|
||||||
|
"//div[@id='catlinks']",
|
||||||
|
"//div[@id='jump-to-nav']",
|
||||||
|
"//div[@class='thumbcaption']//div[@class='magnify']",
|
||||||
|
"//table[@class='navbox']",
|
||||||
|
"//table[contains(@class, 'infobox')]",
|
||||||
|
"//div[@class='dablink']",
|
||||||
|
"//div[@id='contentSub']",
|
||||||
|
"//div[@id='siteSub']",
|
||||||
|
"//table[@id='persondata']",
|
||||||
|
"//table[contains(@class, 'metadata')]",
|
||||||
|
"//*[contains(@class, 'noprint')]",
|
||||||
|
"//*[contains(@class, 'printfooter')]",
|
||||||
|
"//*[contains(@class, 'editsection')]",
|
||||||
|
"//*[contains(@class, 'error')]",
|
||||||
|
"//span[@title='pronunciation:']",
|
||||||
|
),
|
||||||
|
);
|
12
vendor/PicoFeed/Rules/techcrunch.com.php
vendored
Normal file
12
vendor/PicoFeed/Rules/techcrunch.com.php
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/',
|
||||||
|
'body' => array(
|
||||||
|
'//div[contains(@class, "media-container")]',
|
||||||
|
'//div[@class="body-copy"]',
|
||||||
|
),
|
||||||
|
'strip' => array(
|
||||||
|
'//script',
|
||||||
|
'//style',
|
||||||
|
)
|
||||||
|
);
|
18
vendor/PicoFeed/Rules/www.cnn.com.php
vendored
18
vendor/PicoFeed/Rules/www.cnn.com.php
vendored
@ -2,7 +2,21 @@
|
|||||||
return array(
|
return array(
|
||||||
'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
|
'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//*[contains(@class, "cnn_storypgraphtxt")]]',
|
'//div[@class="cnn_strycntntlft"]',
|
||||||
'//*[contains(@class, "cnnvideo_wrapper")]]',
|
|
||||||
),
|
),
|
||||||
|
'strip' => array(
|
||||||
|
'//script',
|
||||||
|
'//style',
|
||||||
|
'//div[@class="cnn_stryshrwdgtbtm"]',
|
||||||
|
'//div[@class="cnn_strybtmcntnt"]',
|
||||||
|
'//div[@class="cnn_strylftcntnt"]',
|
||||||
|
'//div[contains(@class, "cnnGalleryContainer")]',
|
||||||
|
'//div[contains(@class, "cnn_strylftcexpbx")]',
|
||||||
|
'//div[contains(@class, "articleGalleryNavContainer")]',
|
||||||
|
'//div[contains(@class, "cnnArticleGalleryCaptionControl")]',
|
||||||
|
'//div[contains(@class, "cnnArticleGalleryNavPrevNextDisabled")]',
|
||||||
|
'//div[contains(@class, "cnnArticleGalleryNavPrevNext")]',
|
||||||
|
'//div[contains(@class, "cnn_html_media_title_new")]',
|
||||||
|
'//div[contains(@id, "disqus")]',
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
9
vendor/PicoFeed/Rules/www.theguardian.com.php
vendored
Normal file
9
vendor/PicoFeed/Rules/www.theguardian.com.php
vendored
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'test_url' => 'http://www.theguardian.com/law/2013/aug/31/microsoft-google-sue-us-fisa',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="article-wrapper"]',
|
||||||
|
),
|
||||||
|
'strip' => array(
|
||||||
|
),
|
||||||
|
);
|
Loading…
Reference in New Issue
Block a user