Improve content grabber: add rules for specific websites and add automatic download for feeds
This commit is contained in:
parent
1429c2f44e
commit
e77b785263
@ -12,7 +12,7 @@ require 'model.php';
|
||||
if (file_exists('config.php')) require 'config.php';
|
||||
|
||||
defined('APP_VERSION') or define('APP_VERSION', 'master');
|
||||
defined('HTTP_TIMEOUT') or define('HTTP_TIMEOUT', 10);
|
||||
defined('HTTP_TIMEOUT') or define('HTTP_TIMEOUT', 20);
|
||||
defined('DB_FILENAME') or define('DB_FILENAME', 'data/db.sqlite');
|
||||
defined('DEBUG') or define('DEBUG', true);
|
||||
defined('DEBUG_FILENAME') or define('DEBUG_FILENAME', 'data/debug.log');
|
||||
|
36
index.php
36
index.php
@ -322,6 +322,38 @@ Router\get_action('refresh-all', function() {
|
||||
});
|
||||
|
||||
|
||||
// Turn off full-content downloading for one subscription, then return to the feed list
Router\get_action('disable-grabber-feed', function() {

    $feed_id = Request\int_param('feed_id');

    // Only hit the model when a usable feed id was supplied
    $disabled = $feed_id && Model\disable_grabber_feed($feed_id);

    if (! $disabled) {
        Session\flash_error(t('Unable to disable the content grabber for this subscription.'));
    }
    else {
        Session\flash(t('The content grabber is disabled successfully.'));
    }

    Response\redirect('?action=feeds');
});
|
||||
|
||||
|
||||
// Turn on full-content downloading for one subscription, then return to the feed list
Router\get_action('enable-grabber-feed', function() {

    $feed_id = Request\int_param('feed_id');

    // Only hit the model when a usable feed id was supplied
    $enabled = $feed_id && Model\enable_grabber_feed($feed_id);

    if (! $enabled) {
        Session\flash_error(t('Unable to activate the content grabber for this subscription.'));
    }
    else {
        Session\flash(t('The content grabber is enabled successfully.'));
    }

    Response\redirect('?action=feeds');
});
|
||||
|
||||
|
||||
// Confirmation box to disable a feed
|
||||
Router\get_action('confirm-disable-feed', function() {
|
||||
|
||||
@ -467,7 +499,7 @@ Router\get_action('add', function() {
|
||||
// Add the feed
|
||||
Router\post_action('add', function() {
|
||||
|
||||
$result = Model\import_feed(trim($_POST['url']));
|
||||
$result = Model\import_feed(trim($_POST['url']), isset($_POST['download_content']) && $_POST['download_content'] == 1);
|
||||
|
||||
if ($result) {
|
||||
|
||||
@ -590,11 +622,9 @@ Router\post_action('config', function() {
|
||||
if ($valid) {
|
||||
|
||||
if (Model\save_config($values)) {
|
||||
|
||||
Session\flash(t('Your preferences are updated.'));
|
||||
}
|
||||
else {
|
||||
|
||||
Session\flash_error(t('Unable to update your preferences.'));
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,13 @@
|
||||
<?php
|
||||
|
||||
return array(
|
||||
'The content grabber is enabled successfully.' => 'Le téléchargement de contenu est activé avec succès.',
|
||||
'Unable to activate the content grabber for this subscription.' => 'Impossible d\'activer le téléchargement de contenu pour cet abonnement.',
|
||||
'enable full content' => 'télécharger le contenu complet',
|
||||
'disable full content' => 'désactiver le téléchargement du contenu',
|
||||
'Download full content' => 'Télécharger le contenu complet',
|
||||
'Downloading full content is slower because Miniflux grab the content from the original website. You should use that for subscriptions that display only a summary. This feature doesn\'t work with all websites.' =>
|
||||
'Le téléchargement complet du contenu est plus lent car Miniflux va chercher le contenu sur le site original. Vous devriez utiliser cela uniquement pour les abonnements qui affichent seulement un résumé. Cette fonctionnalité ne marche pas avec tous les sites web.',
|
||||
'No message' => 'Aucun message',
|
||||
'flush messages' => 'supprimer les messages',
|
||||
'API endpoint:' => 'URL de l\'API : ',
|
||||
|
149
model.php
149
model.php
@ -2,7 +2,6 @@
|
||||
|
||||
namespace Model;
|
||||
|
||||
require_once 'vendor/PicoFeed/Encoding.php';
|
||||
require_once 'vendor/PicoFeed/Filter.php';
|
||||
require_once 'vendor/PicoFeed/Client.php';
|
||||
require_once 'vendor/PicoFeed/Export.php';
|
||||
@ -25,8 +24,9 @@ use PicoFeed\Reader;
|
||||
use PicoFeed\Export;
|
||||
|
||||
|
||||
const DB_VERSION = 14;
|
||||
const DB_VERSION = 15;
|
||||
const HTTP_USERAGENT = 'Miniflux - http://miniflux.net';
|
||||
const HTTP_FAKE_USERAGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36';
|
||||
const LIMIT_ALL = -1;
|
||||
|
||||
|
||||
@ -169,7 +169,7 @@ function import_feeds($content)
|
||||
}
|
||||
|
||||
|
||||
function import_feed($url)
|
||||
function import_feed($url, $grabber = false)
|
||||
{
|
||||
$reader = new Reader;
|
||||
$resource = $reader->download($url, '', '', HTTP_TIMEOUT, HTTP_USERAGENT);
|
||||
@ -178,6 +178,7 @@ function import_feed($url)
|
||||
|
||||
if ($parser !== false) {
|
||||
|
||||
$parser->grabber = $grabber;
|
||||
$feed = $parser->execute();
|
||||
|
||||
if ($feed === false || ! $feed->title || ! $feed->url) {
|
||||
@ -193,13 +194,14 @@ function import_feed($url)
|
||||
$rs = $db->table('feeds')->save(array(
|
||||
'title' => $feed->title,
|
||||
'site_url' => $feed->url,
|
||||
'feed_url' => $reader->getUrl()
|
||||
'feed_url' => $reader->getUrl(),
|
||||
'download_content' => $grabber ? 1 : 0
|
||||
));
|
||||
|
||||
if ($rs) {
|
||||
|
||||
$feed_id = $db->getConnection()->getLastId();
|
||||
update_items($feed_id, $feed->items);
|
||||
update_items($feed_id, $feed->items, $grabber);
|
||||
write_debug();
|
||||
|
||||
return (int) $feed_id;
|
||||
@ -255,12 +257,25 @@ function update_feed($feed_id)
|
||||
|
||||
if ($parser !== false) {
|
||||
|
||||
$feed = $parser->execute();
|
||||
if ($feed['download_content']) {
|
||||
|
||||
if ($feed !== false) {
|
||||
// Don't fetch previous items, only new one
|
||||
$parser->grabber_ignore_urls = \PicoTools\singleton('db')
|
||||
->table('items')
|
||||
->eq('feed_id', $feed_id)
|
||||
->findAllByColumn('url');
|
||||
|
||||
$parser->grabber = true;
|
||||
$parser->grabber_timeout = HTTP_TIMEOUT;
|
||||
$parser->grabber_user_agent = HTTP_FAKE_USERAGENT;
|
||||
}
|
||||
|
||||
$result = $parser->execute();
|
||||
|
||||
if ($result !== false) {
|
||||
|
||||
update_feed_cache_infos($feed_id, $resource->getLastModified(), $resource->getEtag());
|
||||
update_items($feed_id, $feed->items);
|
||||
update_items($feed_id, $result->items, $parser->grabber);
|
||||
write_debug();
|
||||
|
||||
return true;
|
||||
@ -349,52 +364,82 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag)
|
||||
}
|
||||
|
||||
|
||||
function download_item($item_id)
|
||||
function parse_content_with_readability($content, $url)
|
||||
{
|
||||
require_once 'vendor/Readability/Readability.php';
|
||||
|
||||
$item = get_item($item_id);
|
||||
|
||||
$client = \PicoFeed\Client::create();
|
||||
$client->url = $item['url'];
|
||||
$client->timeout = HTTP_TIMEOUT;
|
||||
$client->user_agent = HTTP_USERAGENT;
|
||||
$client->execute();
|
||||
|
||||
$content = $client->getContent();
|
||||
require_once 'vendor/PicoFeed/Encoding.php';
|
||||
|
||||
if (! empty($content)) {
|
||||
|
||||
$content = \PicoFeed\Encoding::toUTF8($content);
|
||||
|
||||
$readability = new \Readability($content, $item['url']);
|
||||
$readability = new \Readability($content, $url);
|
||||
|
||||
if ($readability->init()) {
|
||||
|
||||
// Get relevant content
|
||||
$content = $readability->getContent()->innerHTML;
|
||||
|
||||
// Filter content
|
||||
$filter = new \PicoFeed\Filter($content, $item['url']);
|
||||
$content = $filter->execute();
|
||||
|
||||
$nocontent = (bool) get_config_value('nocontent');
|
||||
if ($nocontent === false) {
|
||||
|
||||
// Save content
|
||||
\PicoTools\singleton('db')
|
||||
->table('items')
|
||||
->eq('id', $item['id'])
|
||||
->save(array('content' => $content));
|
||||
}
|
||||
|
||||
return array(
|
||||
'result' => true,
|
||||
'content' => $content
|
||||
);
|
||||
return $readability->getContent()->innerHTML;
|
||||
}
|
||||
}
|
||||
|
||||
return '';
|
||||
}
|
||||
|
||||
|
||||
/**
 * Fetch a web page and return its filtered main content.
 *
 * The page is downloaded with the browser-like user agent, then the
 * PicoFeed grabber is tried first; Readability is used when the grabber
 * yields nothing. The extracted markup is passed through the PicoFeed
 * filter before being returned.
 *
 * @param  string  $url  Address of the page to download
 * @return string        Filtered content, or '' when the download returned nothing
 */
function download_content($url)
{
    require_once 'vendor/PicoFeed/Grabber.php';

    $http = \PicoFeed\Client::create();
    $http->url = $url;
    $http->timeout = HTTP_TIMEOUT;
    $http->user_agent = HTTP_FAKE_USERAGENT;
    $http->execute();

    $page = $http->getContent();

    // Nothing downloaded: nothing to extract
    if (empty($page)) {
        return '';
    }

    // First attempt: PicoFeed grabber (site rules or generic candidates)
    $extractor = new \PicoFeed\Grabber($url);
    $extractor->html = $page;

    $content = $extractor->parse() ? $extractor->content : '';

    // Second attempt: Readability
    if (empty($content)) {
        $content = parse_content_with_readability($page, $url);
    }

    $filter = new \PicoFeed\Filter($content, $url);

    return $filter->execute();
}
|
||||
|
||||
|
||||
function download_item($item_id)
|
||||
{
|
||||
$item = get_item($item_id);
|
||||
$content = download_content($item['url']);
|
||||
|
||||
if (! empty($content)) {
|
||||
|
||||
if (! get_config_value('nocontent')) {
|
||||
|
||||
// Save content
|
||||
\PicoTools\singleton('db')
|
||||
->table('items')
|
||||
->eq('id', $item['id'])
|
||||
->save(array('content' => $content));
|
||||
}
|
||||
|
||||
return array(
|
||||
'result' => true,
|
||||
'content' => $content
|
||||
);
|
||||
}
|
||||
|
||||
return array(
|
||||
'result' => false,
|
||||
'content' => ''
|
||||
@ -427,6 +472,18 @@ function disable_feed($feed_id)
|
||||
}
|
||||
|
||||
|
||||
/**
 * Flag a subscription so that full content is downloaded for it.
 *
 * @param  integer  $feed_id  Feed id
 * @return mixed              Result of the underlying save() call
 */
function enable_grabber_feed($feed_id)
{
    $values = array('download_content' => 1);

    return \PicoTools\singleton('db')
        ->table('feeds')
        ->eq('id', $feed_id)
        ->save($values);
}
|
||||
|
||||
|
||||
/**
 * Clear the full-content download flag of a subscription.
 *
 * @param  integer  $feed_id  Feed id
 * @return mixed              Result of the underlying save() call
 */
function disable_grabber_feed($feed_id)
{
    $values = array('download_content' => 0);

    return \PicoTools\singleton('db')
        ->table('feeds')
        ->eq('id', $feed_id)
        ->save($values);
}
|
||||
|
||||
|
||||
function get_items($status, $offset = null, $limit = null)
|
||||
{
|
||||
return \PicoTools\singleton('db')
|
||||
@ -727,7 +784,7 @@ function autoflush()
|
||||
}
|
||||
|
||||
|
||||
function update_items($feed_id, array $items)
|
||||
function update_items($feed_id, array $items, $grabber = false)
|
||||
{
|
||||
$nocontent = (bool) get_config_value('nocontent');
|
||||
|
||||
@ -744,6 +801,10 @@ function update_items($feed_id, array $items)
|
||||
// Insert only new item
|
||||
if ($db->table('items')->eq('id', $item->id)->count() !== 1) {
|
||||
|
||||
if (! $item->content && ! $nocontent && $grabber) {
|
||||
$item->content = download_content($item->url);
|
||||
}
|
||||
|
||||
$db->table('items')->save(array(
|
||||
'id' => $item->id,
|
||||
'title' => $item->title,
|
||||
|
@ -3,6 +3,12 @@
|
||||
namespace Schema;
|
||||
|
||||
|
||||
/**
 * Schema migration 15: add the per-feed "download_content" flag (defaults to 0).
 *
 * @param  object  $pdo  Connection exposing exec()
 */
function version_15($pdo)
{
    $sql = 'ALTER TABLE feeds ADD COLUMN download_content INTEGER DEFAULT 0';

    $pdo->exec($sql);
}
|
||||
|
||||
|
||||
function version_14($pdo)
|
||||
{
|
||||
$pdo->exec('ALTER TABLE config ADD COLUMN feed_token TEXT DEFAULT "'.\Model\generate_token().'"');
|
||||
|
@ -10,6 +10,8 @@
|
||||
<form method="post" action="?action=add" autocomplete="off">
|
||||
<?= Helper\form_label(t('Website or Feed URL'), 'url') ?>
|
||||
<?= Helper\form_text('url', $values, array(), array('required', 'autofocus', 'placeholder="'.t('http://website/').'"')) ?>
|
||||
<?= Helper\form_checkbox('download_content', t('Download full content'), 1, isset($values['download_content']) ? $values['download_content'] : false) ?><br/>
|
||||
<p class="form-help"><?= t('Downloading full content is slower because Miniflux grab the content from the original website. You should use that for subscriptions that display only a summary. This feature doesn\'t work with all websites.') ?></p>
|
||||
<div class="form-actions">
|
||||
<button type="submit" class="btn btn-blue"><?= t('Add') ?></button>
|
||||
</div>
|
||||
|
@ -28,7 +28,7 @@
|
||||
<span id="loading-feed-<?= $feed['id'] ?>"></span>
|
||||
<?php endif ?>
|
||||
|
||||
<a href="<?= $feed['site_url'] ?>" rel="noreferrer" target="_blank"><?= Helper\escape($feed['title']) ?></a>
|
||||
<a href="?action=feed-items&feed_id=<?= $feed['id'] ?>"><?= Helper\escape($feed['title']) ?></a>
|
||||
|
||||
<?php if ($feed['enabled']): ?>
|
||||
<?php if ($feed['last_checked']): ?>
|
||||
@ -47,14 +47,18 @@
|
||||
|
||||
<span class="hide-mobile"><a href="?action=confirm-remove-feed&feed_id=<?= $feed['id'] ?>"><?= t('remove') ?></a> |</span>
|
||||
|
||||
<?php if ($feed['enabled']): ?>
|
||||
<span class="hide-mobile"><a href="?action=confirm-disable-feed&feed_id=<?= $feed['id'] ?>"><?= t('disable') ?></a> |</span>
|
||||
<a href="?action=refresh-feed&feed_id=<?= $feed['id'] ?>" data-feed-id="<?= $feed['id'] ?>" data-action="refresh-feed"><?= t('refresh') ?></a> |
|
||||
<?php if ($feed['download_content']): ?>
|
||||
<span class="hide-mobile"><a href="?action=disable-grabber-feed&feed_id=<?= $feed['id'] ?>"><strong><?= t('disable full content') ?></strong></a> |</span>
|
||||
<?php else: ?>
|
||||
<span class="hide-mobile"><a href="?action=enable-feed&feed_id=<?= $feed['id'] ?>"><?= t('enable') ?></a> |</span>
|
||||
<span class="hide-mobile"><a href="?action=enable-grabber-feed&feed_id=<?= $feed['id'] ?>"><?= t('enable full content') ?></a> |</span>
|
||||
<?php endif ?>
|
||||
|
||||
<span class="hide-mobile"><a href="?action=feed-items&feed_id=<?= $feed['id'] ?>"><?= t('items') ?></a></span>
|
||||
<?php if ($feed['enabled']): ?>
|
||||
<span class="hide-mobile"><a href="?action=confirm-disable-feed&feed_id=<?= $feed['id'] ?>"><?= t('disable') ?></a> |</span>
|
||||
<a href="?action=refresh-feed&feed_id=<?= $feed['id'] ?>" data-feed-id="<?= $feed['id'] ?>" data-action="refresh-feed"><?= t('refresh') ?></a>
|
||||
<?php else: ?>
|
||||
<span class="hide-mobile"><a href="?action=enable-feed&feed_id=<?= $feed['id'] ?>"><?= t('enable') ?></a></span>
|
||||
<?php endif ?>
|
||||
</p>
|
||||
</article>
|
||||
<?php endforeach ?>
|
||||
|
1
vendor/PicoDb/Table.php
vendored
1
vendor/PicoDb/Table.php
vendored
@ -138,6 +138,7 @@ class Table
|
||||
|
||||
public function findAllByColumn($column)
|
||||
{
|
||||
$this->columns = array($column);
|
||||
$rq = $this->db->execute($this->buildSelectQuery(), $this->values);
|
||||
if (false === $rq) return false;
|
||||
|
||||
|
2
vendor/PicoFeed/Clients/Curl.php
vendored
2
vendor/PicoFeed/Clients/Curl.php
vendored
@ -64,6 +64,8 @@ class Curl extends \PicoFeed\Client
|
||||
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates...
|
||||
curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody'));
|
||||
curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders'));
|
||||
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
|
||||
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
|
||||
curl_exec($ch);
|
||||
|
||||
Logging::log(\get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME));
|
||||
|
10
vendor/PicoFeed/Filter.php
vendored
10
vendor/PicoFeed/Filter.php
vendored
@ -454,4 +454,14 @@ class Filter
|
||||
{
|
||||
return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Remove everything up to and including the last XML declaration /
 * processing-instruction terminator.
 *
 * The data is returned untouched when it holds no "<?xml" marker, or when
 * the marker is present but never terminated (previously the missing
 * terminator made strrpos() return false and the first two bytes were
 * silently dropped).
 *
 * @param  string  $data  Raw document
 * @return string         Document without its leading XML declaration
 */
public static function stripXmlTag($data)
{
    if (strpos($data, '<?xml') !== false) {

        $end = strrpos($data, '?>');

        // Only cut when a terminator really exists
        if ($end !== false) {
            $data = substr($data, $end + 2);
        }
    }

    return $data;
}
|
||||
}
|
||||
|
241
vendor/PicoFeed/Grabber.php
vendored
Normal file
241
vendor/PicoFeed/Grabber.php
vendored
Normal file
@ -0,0 +1,241 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed;
|
||||
|
||||
require_once __DIR__.'/Client.php';
|
||||
require_once __DIR__.'/Encoding.php';
|
||||
require_once __DIR__.'/Logging.php';
|
||||
|
||||
/**
 * Extract the main content of a web page.
 *
 * Site-specific XPath rules (files under Rules/) are used when one matches
 * the page hostname; otherwise a list of well-known container names is tried
 * and common page furniture is stripped from the result.
 */
class Grabber
{
    // Extracted markup; parse() reports success when this is not empty
    public $content = '';

    // Raw HTML of the page, filled by download() or assigned by the caller
    public $html = '';

    // Address of the page (declared explicitly instead of being created on the fly)
    public $url = '';

    // Order is important
    public $candidatesAttributes = array(
        'article',
        'articleBody',
        'articlebody',
        'articleContent',
        'articlecontent',
        'articlePage',
        'post-content',
        'content',
        'main',
    );

    // Elements whose class or id contains one of these values are removed by stripGarbage()
    public $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
    );

    // Elements with these tag names are removed by stripGarbage()
    public $stripTags = array(
        'script',
        'style',
        'nav',
        'header',
        'footer',
        'aside',
    );

    /**
     * @param  string  $url  Address of the page to grab
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Extract the content from $this->html into $this->content.
     *
     * With a matching rule file only the rules are applied; without one the
     * generic candidates are tried and the result is cleaned by stripGarbage().
     *
     * @return boolean  True when some content was extracted
     */
    public function parse()
    {
        if ($this->html) {

            Logging::log(\get_called_class().' HTML fetched');

            $rules = $this->getRules();

            \libxml_use_internal_errors(true);
            $dom = new \DOMDocument;
            $dom->loadHTML($this->html);

            if (is_array($rules)) {
                Logging::log(\get_called_class().' Parse content with rules');
                $this->parseContentWithRules($dom, $rules);
            }
            else {

                Logging::log(\get_called_class().' Parse content with candidates');
                $this->parseContentWithCandidates($dom);

                if (strlen($this->content) < 50) {
                    Logging::log(\get_called_class().' No enought content fetched, get the full body');

                    // NOTE(review): after loadHTML() the first child is presumably the
                    // DOCTYPE node rather than <html>, so this may not serialize the
                    // body as the log message says - confirm before relying on it.
                    $this->content = $dom->saveXML($dom->firstChild);
                }

                Logging::log(\get_called_class().' Strip garbage');
                $this->stripGarbage();
            }
        }
        else {
            Logging::log(\get_called_class().' No content fetched');
        }

        Logging::log(\get_called_class().' Grabber done');

        return $this->content !== '';
    }

    /**
     * Download the page and keep its body in $this->html.
     *
     * @param  integer  $timeout     Client timeout
     * @param  string   $user_agent  User agent sent with the request
     * @return string                Downloaded HTML
     */
    public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')
    {
        $client = Client::create();
        $client->url = $this->url;
        $client->timeout = $timeout;
        $client->user_agent = $user_agent;
        $client->execute();
        $this->html = $client->getContent();

        return $this->html;
    }

    /**
     * Load the rule file that matches the page hostname.
     *
     * Lookup order: the full hostname, the hostname without "www.", then the
     * hostname from its first dot onward (e.g. ".nytimes.com").
     *
     * @return mixed  Value returned by the rule file (an array of rules), or false when none matches
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // Malformed URL: no host, so no rule can match
        if (! is_string($hostname) || $hostname === '') {
            return false;
        }

        // Hostnames are case-insensitive while rule files are named in lowercase
        $hostname = strtolower($hostname);

        // The hostname comes from a feed-supplied URL and ends up in an include
        // path: refuse anything that is not a plain hostname.
        if (! preg_match('/^[a-z0-9._-]+$/', $hostname)) {
            return false;
        }

        $files = array($hostname);

        if (substr($hostname, 0, 4) == 'www.') $files[] = substr($hostname, 4);
        if (($pos = strpos($hostname, '.')) !== false) $files[] = substr($hostname, $pos);

        foreach ($files as $file) {

            $filename = __DIR__.'/Rules/'.$file.'.php';

            if (file_exists($filename)) {
                return include $filename;
            }
        }

        return false;
    }

    /**
     * Apply site-specific rules to the document.
     *
     * 'strip' (XPath queries) and 'strip_id_or_class' (class/id fragments)
     * remove nodes first; every match of the 'body' XPath queries is then
     * appended to $this->content.
     *
     * @param  \DOMDocument  $dom    Parsed page
     * @param  array         $rules  Rules returned by getRules()
     */
    public function parseContentWithRules($dom, array $rules)
    {
        $xpath = new \DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {
                $this->removeNodes($xpath, $pattern);
            }
        }

        if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) {

            foreach ($rules['strip_id_or_class'] as $pattern) {

                // Quotes are dropped so the value cannot break out of the XPath string literal
                $pattern = strtr($pattern, array("'" => '', '"' => ''));
                $this->removeNodes($xpath, "//*[contains(@class, '$pattern') or contains(@id, '$pattern')]");
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Generic extraction: take the first <article/>, otherwise the first <div/>
     * matching one of the candidate names (in order).
     *
     * @param  \DOMDocument  $dom  Parsed page
     */
    public function parseContentWithCandidates($dom)
    {
        $xpath = new \DOMXPath($dom);

        // Try to fetch <article/>
        $nodes = $xpath->query('//article');

        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
            return;
        }

        // Try to lookup in each <div/>
        foreach ($this->candidatesAttributes as $candidate) {

            $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                return;
            }
        }
    }

    /**
     * Remove unwanted tags and elements (see $stripTags / $stripAttributes)
     * from $this->content, which is re-parsed as XML.
     *
     * When the content cannot be parsed the document stays empty and
     * $this->content ends up as an empty string.
     */
    public function stripGarbage()
    {
        // Nothing to clean; loading an empty string is an error for DOMDocument
        if ($this->content === '') {
            return;
        }

        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
        $dom->loadXML($this->content);
        $xpath = new \DOMXPath($dom);

        foreach ($this->stripTags as $tag) {
            $this->removeNodes($xpath, '//'.$tag);
        }

        foreach ($this->stripAttributes as $attribute) {
            $this->removeNodes($xpath, '//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
        }

        $this->content = '';

        foreach ($dom->childNodes as $node) {
            $this->content .= $dom->saveXML($node);
        }
    }

    /**
     * Detach every node matched by an XPath query from its document.
     *
     * @param  \DOMXPath  $xpath       XPath bound to the document
     * @param  string     $expression  XPath query
     */
    protected function removeNodes(\DOMXPath $xpath, $expression)
    {
        $nodes = $xpath->query($expression);

        // query() returns false for an invalid expression
        if ($nodes === false) {
            return;
        }

        foreach ($nodes as $node) {

            // Matches without a parent (e.g. the document node itself) cannot be detached
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }
}
|
30
vendor/PicoFeed/Parser.php
vendored
30
vendor/PicoFeed/Parser.php
vendored
@ -5,6 +5,7 @@ namespace PicoFeed;
|
||||
require_once __DIR__.'/Logging.php';
|
||||
require_once __DIR__.'/Filter.php';
|
||||
require_once __DIR__.'/Encoding.php';
|
||||
require_once __DIR__.'/Grabber.php';
|
||||
|
||||
abstract class Parser
|
||||
{
|
||||
@ -15,6 +16,10 @@ abstract class Parser
|
||||
public $title = '';
|
||||
public $updated = '';
|
||||
public $items = array();
|
||||
public $grabber = false;
|
||||
public $grabber_ignore_urls = array();
|
||||
public $grabber_timeout = 5;
|
||||
public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
|
||||
|
||||
|
||||
abstract public function execute();
|
||||
@ -23,7 +28,7 @@ abstract class Parser
|
||||
public function __construct($content)
|
||||
{
|
||||
// Strip XML tag to avoid multiple encoding/decoding in next XML processing
|
||||
$this->content = $this->stripXmlTag($content);
|
||||
$this->content = Filter::stripXmlTag($content);
|
||||
|
||||
// Encode everything in UTF-8
|
||||
$this->content = Encoding::toUTF8($this->content);
|
||||
@ -33,13 +38,19 @@ abstract class Parser
|
||||
}
|
||||
|
||||
|
||||
public function filterHtml($str, $item_url)
|
||||
public function filterHtml($item_content, $item_url)
|
||||
{
|
||||
$content = '';
|
||||
|
||||
if ($str) {
|
||||
if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
|
||||
$grabber = new Grabber($item_url);
|
||||
$grabber->download($this->grabber_timeout, $this->grabber_user_agent);
|
||||
$grabber->parse();
|
||||
if ($grabber->content) $item_content = $grabber->content;
|
||||
}
|
||||
|
||||
$filter = new Filter($str, $item_url);
|
||||
if ($item_content) {
|
||||
$filter = new Filter($item_content, $item_url);
|
||||
$content = $filter->execute();
|
||||
}
|
||||
|
||||
@ -72,17 +83,6 @@ abstract class Parser
|
||||
}
|
||||
|
||||
|
||||
public function stripXmlTag($data)
|
||||
{
|
||||
if (strpos($data, '<?xml') !== false) {
|
||||
|
||||
$data = substr($data, strrpos($data, '?>') + 2);
|
||||
}
|
||||
|
||||
return $data;
|
||||
}
|
||||
|
||||
|
||||
// Trim whitespace from the beginning, the end and the inside of a string without breaking UTF-8 strings
|
||||
public function stripWhiteSpace($value)
|
||||
{
|
||||
|
22
vendor/PicoFeed/Reader.php
vendored
22
vendor/PicoFeed/Reader.php
vendored
@ -5,6 +5,7 @@ namespace PicoFeed;
|
||||
require_once __DIR__.'/Logging.php';
|
||||
require_once __DIR__.'/Parser.php';
|
||||
require_once __DIR__.'/Client.php';
|
||||
require_once __DIR__.'/Filter.php';
|
||||
|
||||
class Reader
|
||||
{
|
||||
@ -59,25 +60,20 @@ class Reader
|
||||
$data = preg_replace('/<!--(.{0,5000}?)-->/Uis', '', $data);
|
||||
|
||||
/* Strip Doctype:
|
||||
* Doctype needs to be within the first 500 characters. (Ideally the first!)
|
||||
* Doctype needs to be within the first 100 characters. (Ideally the first!)
|
||||
* If it's not found by then, we need to stop looking to prevent PREG
|
||||
* from reaching max backtrack depth and crashing.
|
||||
*/
|
||||
$data = preg_replace('/^.{0,500}<!DOCTYPE([^>]*)>/Uis', '', $data);
|
||||
$data = preg_replace('/^.{0,100}<!DOCTYPE([^>]*)>/Uis', '', $data);
|
||||
|
||||
// Find <?xml version....
|
||||
if (strpos($data, '<?xml') !== false) {
|
||||
// Strip <?xml version....
|
||||
$data = Filter::stripXmlTag($data);
|
||||
|
||||
$data = substr($data, strrpos($data, '?>') + 2);
|
||||
// Find the first tag
|
||||
$open_tag = strpos($data, '<');
|
||||
$close_tag = strpos($data, '>');
|
||||
|
||||
// Find the first tag
|
||||
$open_tag = strpos($data, '<');
|
||||
$close_tag = strpos($data, '>');
|
||||
|
||||
return substr($data, $open_tag, $close_tag);
|
||||
}
|
||||
|
||||
return $data;
|
||||
return substr($data, $open_tag, $close_tag);
|
||||
}
|
||||
|
||||
|
||||
|
10
vendor/PicoFeed/Rules/.blog.lemonde.fr.php
vendored
Normal file
10
vendor/PicoFeed/Rules/.blog.lemonde.fr.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for hosts of the form
// <name>.blog.lemonde.fr (the file name is the hostname from its first dot onward).
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' holds the XPath queries whose matches are removed first.
return array(
|
||||
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
|
||||
'body' => array(
|
||||
'//div[@class="entry-content"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
|
||||
)
|
||||
);
|
13
vendor/PicoFeed/Rules/.blogs.nytimes.com.php
vendored
Normal file
13
vendor/PicoFeed/Rules/.blogs.nytimes.com.php
vendored
Normal file
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for hosts of the form
// <name>.blogs.nytimes.com.
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' holds the XPath queries whose matches are removed first.
//
// An array can hold a single 'test_url': when the key was repeated only the
// last value survived, so the other sample pages are kept here as comments:
//   http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/
//   http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/
return array(
    'title' => '//header/h1',
    'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
    'body' => array(
        '//div[@class="postContent"]',
    ),
    'strip' => array(
        '//*[@class="shareToolsBox"]',
    ),
);
|
8
vendor/PicoFeed/Rules/.nytimes.com.php
vendored
Normal file
8
vendor/PicoFeed/Rules/.nytimes.com.php
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for hosts of the form
// <name>.nytimes.com (e.g. www.nytimes.com).
// 'body' holds the XPath queries whose matches become the article content.
// NOTE(review): 'title' and 'test_url' are not read by Grabber::parseContentWithRules();
// presumably they are used by other tooling - confirm.
return array(
|
||||
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
|
||||
'title' => '//h1[@class="articleHeadline"]',
|
||||
'body' => array(
|
||||
'//div[@class="articleBody"]',
|
||||
),
|
||||
);
|
16
vendor/PicoFeed/Rules/.slate.com.php
vendored
Normal file
16
vendor/PicoFeed/Rules/.slate.com.php
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for hosts of the form
// <name>.slate.com (e.g. www.slate.com).
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' holds the XPath queries whose matches are removed first.
return array(
|
||||
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
|
||||
'body' => array(
|
||||
'//div[@class="sl-art-body"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools") or contains(@class, "sl-art-pag")]',
|
||||
'//*[@id="mys_slate_logged_in"]',
|
||||
'//*[@id="sl_article_tools_myslate_bottom"]',
|
||||
'//*[@id="mys_myslate"]',
|
||||
'//*[@class="sl-viral-container"]',
|
||||
'//*[@class="sl-art-creds-cntr"]',
|
||||
'//*[@class="sl-art-ad-midflex"]',
|
||||
)
|
||||
);
|
11
vendor/PicoFeed/Rules/.wsj.com.php
vendored
Normal file
11
vendor/PicoFeed/Rules/.wsj.com.php
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for hosts of the form
// <name>.wsj.com (e.g. online.wsj.com).
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' holds the XPath queries whose matches are removed first.
return array(
|
||||
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
|
||||
'body' => array(
|
||||
'//div[@class="articlePage"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//*[@id="articleThumbnail_2"]',
|
||||
'//*[@class="socialByline"]',
|
||||
)
|
||||
);
|
9
vendor/PicoFeed/Rules/rue89.feedsportal.com.php
vendored
Normal file
9
vendor/PicoFeed/Rules/rue89.feedsportal.com.php
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for the host
// rue89.feedsportal.com.
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' is empty, so nothing is removed beforehand.
return array(
|
||||
'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm',
|
||||
'body' => array(
|
||||
'//*[@id="article"]/div[contains(@class, "content")]',
|
||||
),
|
||||
'strip' => array(
|
||||
)
|
||||
);
|
20
vendor/PicoFeed/Rules/www.bbc.co.uk.php
vendored
Normal file
20
vendor/PicoFeed/Rules/www.bbc.co.uk.php
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for the host www.bbc.co.uk.
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' holds the XPath queries whose matches are removed first.
return array(
|
||||
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
|
||||
'body' => array(
|
||||
'//div[@class="story-body"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//script',
|
||||
'//form',
|
||||
'//style',
|
||||
'//*[@class="story-date"]',
|
||||
'//*[@class="story-header"]',
|
||||
'//*[@class="story-related"]',
|
||||
'//*[contains(@class, "byline")]',
|
||||
'//*[contains(@class, "story-feature")]',
|
||||
'//*[@id="video-carousel-container"]',
|
||||
'//*[@id="also-related-links"]',
|
||||
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
|
||||
)
|
||||
);
|
8
vendor/PicoFeed/Rules/www.cnn.com.php
vendored
Normal file
8
vendor/PicoFeed/Rules/www.cnn.com.php
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for the host www.cnn.com.
// 'body' holds the XPath queries whose matches are concatenated into the
// article content. Both queries previously ended with a stray "]", which made
// them invalid XPath, so no content could ever be extracted with them.
return array(
    'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
    'body' => array(
        '//*[contains(@class, "cnn_storypgraphtxt")]',
        '//*[contains(@class, "cnnvideo_wrapper")]',
    ),
);
|
8
vendor/PicoFeed/Rules/www.egscomics.com.php
vendored
Normal file
8
vendor/PicoFeed/Rules/www.egscomics.com.php
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for the host www.egscomics.com.
// 'body' holds the XPath queries whose matches become the article content
// (here the comic image itself).
// NOTE(review): 'title' and 'test_url' are not read by Grabber::parseContentWithRules();
// presumably they are used by other tooling - confirm.
return array(
|
||||
'test_url' => 'http://www.egscomics.com/index.php?id=1690',
|
||||
'title' => '/html/head/title',
|
||||
'body' => array(
|
||||
'//img[@id="comic"]'
|
||||
)
|
||||
);
|
9
vendor/PicoFeed/Rules/www.lemonde.fr.php
vendored
Normal file
9
vendor/PicoFeed/Rules/www.lemonde.fr.php
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for the host www.lemonde.fr.
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' is empty, so nothing is removed beforehand.
return array(
|
||||
'test_url' => 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html',
|
||||
'body' => array(
|
||||
'//div[@id="articleBody"]',
|
||||
),
|
||||
'strip' => array(
|
||||
),
|
||||
);
|
10
vendor/PicoFeed/Rules/www.numerama.com.php
vendored
Normal file
10
vendor/PicoFeed/Rules/www.numerama.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for the host www.numerama.com.
// 'body' holds the XPath queries whose matches are concatenated, in order,
// into the article content; 'strip' is empty, so nothing is removed beforehand.
// NOTE(review): the first query is a positional path through table/tbody -
// verify that it still matches the DOM produced by DOMDocument::loadHTML().
return array(
|
||||
'test_url' => 'http://www.numerama.com/magazine/26857-bientot-des-robots-dans-les-cuisines-de-mcdo.html',
|
||||
'body' => array(
|
||||
'//*[@id="general_content"]/table/tbody/tr/td[1]/div/div/div[6]/h2',
|
||||
'//div[@id="newstext"]',
|
||||
),
|
||||
'strip' => array(
|
||||
)
|
||||
);
|
17
vendor/PicoFeed/Rules/www.slate.fr.php
vendored
Normal file
17
vendor/PicoFeed/Rules/www.slate.fr.php
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
<?php
|
||||
// Content-grabber rules picked up by Grabber::getRules() for the host www.slate.fr.
// 'body' holds the XPath queries whose matches become the article content;
// 'strip' holds the XPath queries whose matches are removed first.
return array(
|
||||
'test_url' => 'http://www.slate.fr/monde/77034/allemagne-2013-couacs-campagne',
|
||||
'body' => array(
|
||||
'//div[@class="article_content"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//script',
|
||||
'//style',
|
||||
'//*[@id="slate_associated_bn"]',
|
||||
'//*[@id="ligatus-article"]',
|
||||
'//*[@id="article_sidebar"]',
|
||||
'//div[contains(@id, "reseaux")]',
|
||||
'//*[contains(@class, "smart") or contains(@class, "article_tags") or contains(@class, "article_reactions")]',
|
||||
'//*[contains(@class, "OUTBRAIN") or contains(@class, "related_item") or contains(@class, "share")]',
|
||||
)
|
||||
);
|
Loading…
Reference in New Issue
Block a user