Improve content grabber: add rules for specific websites and add automatic download for feeds

This commit is contained in:
Frédéric Guillot 2013-08-31 11:05:45 -04:00
parent 1429c2f44e
commit e77b785263
25 changed files with 581 additions and 82 deletions

View File

@ -12,7 +12,7 @@ require 'model.php';
if (file_exists('config.php')) require 'config.php'; if (file_exists('config.php')) require 'config.php';
defined('APP_VERSION') or define('APP_VERSION', 'master'); defined('APP_VERSION') or define('APP_VERSION', 'master');
defined('HTTP_TIMEOUT') or define('HTTP_TIMEOUT', 10); defined('HTTP_TIMEOUT') or define('HTTP_TIMEOUT', 20);
defined('DB_FILENAME') or define('DB_FILENAME', 'data/db.sqlite'); defined('DB_FILENAME') or define('DB_FILENAME', 'data/db.sqlite');
defined('DEBUG') or define('DEBUG', true); defined('DEBUG') or define('DEBUG', true);
defined('DEBUG_FILENAME') or define('DEBUG_FILENAME', 'data/debug.log'); defined('DEBUG_FILENAME') or define('DEBUG_FILENAME', 'data/debug.log');

View File

@ -322,6 +322,38 @@ Router\get_action('refresh-all', function() {
}); });
// Turn off the content grabber (full content download) for one subscription,
// then go back to the subscriptions list with a flash message.
Router\get_action('disable-grabber-feed', function() {

    $feed_id = Request\int_param('feed_id');

    // The model is only called when a non-zero feed id was supplied
    $disabled = $feed_id && Model\disable_grabber_feed($feed_id);

    if (! $disabled) {
        Session\flash_error(t('Unable to disable the content grabber for this subscription.'));
    }
    else {
        Session\flash(t('The content grabber is disabled successfully.'));
    }

    Response\redirect('?action=feeds');
});
// Turn on the content grabber (full content download) for one subscription,
// then go back to the subscriptions list with a flash message.
Router\get_action('enable-grabber-feed', function() {

    $feed_id = Request\int_param('feed_id');

    // The model is only called when a non-zero feed id was supplied
    $enabled = $feed_id && Model\enable_grabber_feed($feed_id);

    if (! $enabled) {
        Session\flash_error(t('Unable to activate the content grabber for this subscription.'));
    }
    else {
        Session\flash(t('The content grabber is enabled successfully.'));
    }

    Response\redirect('?action=feeds');
});
// Confirmation box to disable a feed // Confirmation box to disable a feed
Router\get_action('confirm-disable-feed', function() { Router\get_action('confirm-disable-feed', function() {
@ -467,7 +499,7 @@ Router\get_action('add', function() {
// Add the feed // Add the feed
Router\post_action('add', function() { Router\post_action('add', function() {
$result = Model\import_feed(trim($_POST['url'])); $result = Model\import_feed(trim($_POST['url']), isset($_POST['download_content']) && $_POST['download_content'] == 1);
if ($result) { if ($result) {
@ -590,11 +622,9 @@ Router\post_action('config', function() {
if ($valid) { if ($valid) {
if (Model\save_config($values)) { if (Model\save_config($values)) {
Session\flash(t('Your preferences are updated.')); Session\flash(t('Your preferences are updated.'));
} }
else { else {
Session\flash_error(t('Unable to update your preferences.')); Session\flash_error(t('Unable to update your preferences.'));
} }

View File

@ -1,6 +1,13 @@
<?php <?php
return array( return array(
'The content grabber is enabled successfully.' => 'Le téléchargement de contenu est activé avec succès.',
'Unable to activate the content grabber for this subscription.' => 'Impossible d\'activer le téléchargement de contenu pour cet abonnement.',
'enable full content' => 'télécharger le contenu complet',
'disable full content' => 'désactiver le téléchargement du contenu',
'Download full content' => 'Télécharger le contenu complet',
'Downloading full content is slower because Miniflux grab the content from the original website. You should use that for subscriptions that display only a summary. This feature doesn\'t work with all websites.' =>
'Le téléchargement complet du contenu est plus lent car Miniflux va chercher le contenu sur le site original. Vous devriez utiliser cela uniquement pour les abonnements qui affichent seulement un résumé. Cette fonctionnalité ne marche pas avec tous les sites web.',
'No message' => 'Aucun message', 'No message' => 'Aucun message',
'flush messages' => 'supprimer les messages', 'flush messages' => 'supprimer les messages',
'API endpoint:' => 'URL de l\'API : ', 'API endpoint:' => 'URL de l\'API : ',

149
model.php
View File

@ -2,7 +2,6 @@
namespace Model; namespace Model;
require_once 'vendor/PicoFeed/Encoding.php';
require_once 'vendor/PicoFeed/Filter.php'; require_once 'vendor/PicoFeed/Filter.php';
require_once 'vendor/PicoFeed/Client.php'; require_once 'vendor/PicoFeed/Client.php';
require_once 'vendor/PicoFeed/Export.php'; require_once 'vendor/PicoFeed/Export.php';
@ -25,8 +24,9 @@ use PicoFeed\Reader;
use PicoFeed\Export; use PicoFeed\Export;
const DB_VERSION = 14; const DB_VERSION = 15;
const HTTP_USERAGENT = 'Miniflux - http://miniflux.net'; const HTTP_USERAGENT = 'Miniflux - http://miniflux.net';
const HTTP_FAKE_USERAGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36';
const LIMIT_ALL = -1; const LIMIT_ALL = -1;
@ -169,7 +169,7 @@ function import_feeds($content)
} }
function import_feed($url) function import_feed($url, $grabber = false)
{ {
$reader = new Reader; $reader = new Reader;
$resource = $reader->download($url, '', '', HTTP_TIMEOUT, HTTP_USERAGENT); $resource = $reader->download($url, '', '', HTTP_TIMEOUT, HTTP_USERAGENT);
@ -178,6 +178,7 @@ function import_feed($url)
if ($parser !== false) { if ($parser !== false) {
$parser->grabber = $grabber;
$feed = $parser->execute(); $feed = $parser->execute();
if ($feed === false || ! $feed->title || ! $feed->url) { if ($feed === false || ! $feed->title || ! $feed->url) {
@ -193,13 +194,14 @@ function import_feed($url)
$rs = $db->table('feeds')->save(array( $rs = $db->table('feeds')->save(array(
'title' => $feed->title, 'title' => $feed->title,
'site_url' => $feed->url, 'site_url' => $feed->url,
'feed_url' => $reader->getUrl() 'feed_url' => $reader->getUrl(),
'download_content' => $grabber ? 1 : 0
)); ));
if ($rs) { if ($rs) {
$feed_id = $db->getConnection()->getLastId(); $feed_id = $db->getConnection()->getLastId();
update_items($feed_id, $feed->items); update_items($feed_id, $feed->items, $grabber);
write_debug(); write_debug();
return (int) $feed_id; return (int) $feed_id;
@ -255,12 +257,25 @@ function update_feed($feed_id)
if ($parser !== false) { if ($parser !== false) {
$feed = $parser->execute(); if ($feed['download_content']) {
if ($feed !== false) { // Don't fetch previous items, only new one
$parser->grabber_ignore_urls = \PicoTools\singleton('db')
->table('items')
->eq('feed_id', $feed_id)
->findAllByColumn('url');
$parser->grabber = true;
$parser->grabber_timeout = HTTP_TIMEOUT;
$parser->grabber_user_agent = HTTP_FAKE_USERAGENT;
}
$result = $parser->execute();
if ($result !== false) {
update_feed_cache_infos($feed_id, $resource->getLastModified(), $resource->getEtag()); update_feed_cache_infos($feed_id, $resource->getLastModified(), $resource->getEtag());
update_items($feed_id, $feed->items); update_items($feed_id, $result->items, $parser->grabber);
write_debug(); write_debug();
return true; return true;
@ -349,52 +364,82 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag)
} }
function download_item($item_id) function parse_content_with_readability($content, $url)
{ {
require_once 'vendor/Readability/Readability.php'; require_once 'vendor/Readability/Readability.php';
require_once 'vendor/PicoFeed/Encoding.php';
$item = get_item($item_id);
$client = \PicoFeed\Client::create();
$client->url = $item['url'];
$client->timeout = HTTP_TIMEOUT;
$client->user_agent = HTTP_USERAGENT;
$client->execute();
$content = $client->getContent();
if (! empty($content)) { if (! empty($content)) {
$content = \PicoFeed\Encoding::toUTF8($content); $content = \PicoFeed\Encoding::toUTF8($content);
$readability = new \Readability($content, $url);
$readability = new \Readability($content, $item['url']);
if ($readability->init()) { if ($readability->init()) {
return $readability->getContent()->innerHTML;
// Get relevant content
$content = $readability->getContent()->innerHTML;
// Filter content
$filter = new \PicoFeed\Filter($content, $item['url']);
$content = $filter->execute();
$nocontent = (bool) get_config_value('nocontent');
if ($nocontent === false) {
// Save content
\PicoTools\singleton('db')
->table('items')
->eq('id', $item['id'])
->save(array('content' => $content));
}
return array(
'result' => true,
'content' => $content
);
} }
} }
return '';
}
/**
 * Download a web page and extract its main content.
 *
 * The page is fetched with the browser-like user agent (HTTP_FAKE_USERAGENT),
 * then the PicoFeed grabber is tried first and Readability is used as a
 * fallback when the grabber yields nothing. The extracted markup is finally
 * passed through the PicoFeed filter.
 *
 * @param  string  $url  Address of the page to fetch
 * @return string        Filtered content, or an empty string when nothing could be fetched or extracted
 */
function download_content($url)
{
    require_once 'vendor/PicoFeed/Grabber.php';

    $client = \PicoFeed\Client::create();
    $client->url = $url;
    $client->timeout = HTTP_TIMEOUT;
    $client->user_agent = HTTP_FAKE_USERAGENT;
    $client->execute();

    $html = $client->getContent();

    if (empty($html)) {
        return '';
    }

    // Always defined, even when the grabber finds nothing
    $content = '';

    // Try first with PicoFeed grabber and with Readability after
    $grabber = new \PicoFeed\Grabber($url);
    $grabber->html = $html;

    if ($grabber->parse()) {
        $content = $grabber->content;
    }

    if (empty($content)) {
        $content = parse_content_with_readability($html, $url);
    }

    // Both extractors came back empty: there is nothing to filter
    // (same guard as Parser::filterHtml applies before filtering)
    if (empty($content)) {
        return '';
    }

    // Filter content
    $filter = new \PicoFeed\Filter($content, $url);
    return $filter->execute();
}
function download_item($item_id)
{
$item = get_item($item_id);
$content = download_content($item['url']);
if (! empty($content)) {
if (! get_config_value('nocontent')) {
// Save content
\PicoTools\singleton('db')
->table('items')
->eq('id', $item['id'])
->save(array('content' => $content));
}
return array(
'result' => true,
'content' => $content
);
}
return array( return array(
'result' => false, 'result' => false,
'content' => '' 'content' => ''
@ -427,6 +472,18 @@ function disable_feed($feed_id)
} }
// Set the download_content flag to 1 for the given feed.
// Returns the result of the table save() call, which the caller uses as a success indicator.
function enable_grabber_feed($feed_id)
{
    $feeds = \PicoTools\singleton('db')->table('feeds');

    return $feeds
        ->eq('id', $feed_id)
        ->save(array('download_content' => 1));
}
// Set the download_content flag to 0 for the given feed.
// Returns the result of the table save() call, which the caller uses as a success indicator.
function disable_grabber_feed($feed_id)
{
    $feeds = \PicoTools\singleton('db')->table('feeds');

    return $feeds
        ->eq('id', $feed_id)
        ->save(array('download_content' => 0));
}
function get_items($status, $offset = null, $limit = null) function get_items($status, $offset = null, $limit = null)
{ {
return \PicoTools\singleton('db') return \PicoTools\singleton('db')
@ -727,7 +784,7 @@ function autoflush()
} }
function update_items($feed_id, array $items) function update_items($feed_id, array $items, $grabber = false)
{ {
$nocontent = (bool) get_config_value('nocontent'); $nocontent = (bool) get_config_value('nocontent');
@ -744,6 +801,10 @@ function update_items($feed_id, array $items)
// Insert only new item // Insert only new item
if ($db->table('items')->eq('id', $item->id)->count() !== 1) { if ($db->table('items')->eq('id', $item->id)->count() !== 1) {
if (! $item->content && ! $nocontent && $grabber) {
$item->content = download_content($item->url);
}
$db->table('items')->save(array( $db->table('items')->save(array(
'id' => $item->id, 'id' => $item->id,
'title' => $item->title, 'title' => $item->title,

View File

@ -3,6 +3,12 @@
namespace Schema; namespace Schema;
// Schema migration 15: add the per-feed download_content flag (integer, 0 by default)
function version_15($pdo)
{
    $sql = 'ALTER TABLE feeds ADD COLUMN download_content INTEGER DEFAULT 0';

    $pdo->exec($sql);
}
function version_14($pdo) function version_14($pdo)
{ {
$pdo->exec('ALTER TABLE config ADD COLUMN feed_token TEXT DEFAULT "'.\Model\generate_token().'"'); $pdo->exec('ALTER TABLE config ADD COLUMN feed_token TEXT DEFAULT "'.\Model\generate_token().'"');

View File

@ -10,6 +10,8 @@
<form method="post" action="?action=add" autocomplete="off"> <form method="post" action="?action=add" autocomplete="off">
<?= Helper\form_label(t('Website or Feed URL'), 'url') ?> <?= Helper\form_label(t('Website or Feed URL'), 'url') ?>
<?= Helper\form_text('url', $values, array(), array('required', 'autofocus', 'placeholder="'.t('http://website/').'"')) ?> <?= Helper\form_text('url', $values, array(), array('required', 'autofocus', 'placeholder="'.t('http://website/').'"')) ?>
<?= Helper\form_checkbox('download_content', t('Download full content'), 1, isset($values['download_content']) ? $values['download_content'] : false) ?><br/>
<p class="form-help"><?= t('Downloading full content is slower because Miniflux grab the content from the original website. You should use that for subscriptions that display only a summary. This feature doesn\'t work with all websites.') ?></p>
<div class="form-actions"> <div class="form-actions">
<button type="submit" class="btn btn-blue"><?= t('Add') ?></button> <button type="submit" class="btn btn-blue"><?= t('Add') ?></button>
</div> </div>

View File

@ -28,7 +28,7 @@
<span id="loading-feed-<?= $feed['id'] ?>"></span> <span id="loading-feed-<?= $feed['id'] ?>"></span>
<?php endif ?> <?php endif ?>
<a href="<?= $feed['site_url'] ?>" rel="noreferrer" target="_blank"><?= Helper\escape($feed['title']) ?></a> <a href="?action=feed-items&amp;feed_id=<?= $feed['id'] ?>"><?= Helper\escape($feed['title']) ?></a>
<?php if ($feed['enabled']): ?> <?php if ($feed['enabled']): ?>
<?php if ($feed['last_checked']): ?> <?php if ($feed['last_checked']): ?>
@ -47,14 +47,18 @@
<span class="hide-mobile"><a href="?action=confirm-remove-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('remove') ?></a> |</span> <span class="hide-mobile"><a href="?action=confirm-remove-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('remove') ?></a> |</span>
<?php if ($feed['enabled']): ?> <?php if ($feed['download_content']): ?>
<span class="hide-mobile"><a href="?action=confirm-disable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('disable') ?></a> |</span> <span class="hide-mobile"><a href="?action=disable-grabber-feed&amp;feed_id=<?= $feed['id'] ?>"><strong><?= t('disable full content') ?></strong></a> |</span>
<a href="?action=refresh-feed&amp;feed_id=<?= $feed['id'] ?>" data-feed-id="<?= $feed['id'] ?>" data-action="refresh-feed"><?= t('refresh') ?></a> |
<?php else: ?> <?php else: ?>
<span class="hide-mobile"><a href="?action=enable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('enable') ?></a> |</span> <span class="hide-mobile"><a href="?action=enable-grabber-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('enable full content') ?></a> |</span>
<?php endif ?> <?php endif ?>
<span class="hide-mobile"><a href="?action=feed-items&amp;feed_id=<?= $feed['id'] ?>"><?= t('items') ?></a></span> <?php if ($feed['enabled']): ?>
<span class="hide-mobile"><a href="?action=confirm-disable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('disable') ?></a> |</span>
<a href="?action=refresh-feed&amp;feed_id=<?= $feed['id'] ?>" data-feed-id="<?= $feed['id'] ?>" data-action="refresh-feed"><?= t('refresh') ?></a>
<?php else: ?>
<span class="hide-mobile"><a href="?action=enable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('enable') ?></a></span>
<?php endif ?>
</p> </p>
</article> </article>
<?php endforeach ?> <?php endforeach ?>

View File

@ -138,6 +138,7 @@ class Table
public function findAllByColumn($column) public function findAllByColumn($column)
{ {
$this->columns = array($column);
$rq = $this->db->execute($this->buildSelectQuery(), $this->values); $rq = $this->db->execute($this->buildSelectQuery(), $this->values);
if (false === $rq) return false; if (false === $rq) return false;

View File

@ -64,6 +64,8 @@ class Curl extends \PicoFeed\Client
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates... curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates...
curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody')); curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody'));
curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders')); curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders'));
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
curl_exec($ch); curl_exec($ch);
Logging::log(\get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME)); Logging::log(\get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME));

View File

@ -454,4 +454,14 @@ class Filter
{ {
return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data); return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data);
} }
/**
 * Remove the XML declaration from the beginning of a document.
 *
 * When the string contains an XML declaration opener, everything up to and
 * including the LAST processing-instruction closing marker is dropped.
 * If the opener is present but no closing marker exists, the data is
 * returned untouched instead of losing its first two characters.
 *
 * @param  string  $data  Raw document
 * @return string         Document without the leading XML declaration
 */
public static function stripXmlTag($data)
{
    if (strpos($data, '<?xml') !== false) {

        $end = strrpos($data, '?>');

        // strrpos() returns false for an unterminated declaration;
        // "false + 2" would silently become offset 2
        if ($end !== false) {
            $data = substr($data, $end + 2);
        }
    }

    return $data;
}
} }

241
vendor/PicoFeed/Grabber.php vendored Normal file
View File

@ -0,0 +1,241 @@
<?php
namespace PicoFeed;
require_once __DIR__.'/Client.php';
require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Logging.php';
/**
 * Extract the main content of a web page.
 *
 * Two strategies are available:
 *  - site specific rules (XPath queries) loaded from the Rules/ directory,
 *  - a generic lookup based on well-known tag names and id/class values.
 *
 * Typical usage: create the object with the page URL, fill $html (directly or
 * with download()), call parse() and read $content.
 */
class Grabber
{
    /**
     * Extracted content (XML/HTML fragment), empty when nothing was found
     */
    public $content = '';

    /**
     * Raw HTML of the page to analyse
     */
    public $html = '';

    /**
     * URL of the page, used to download it and to select the rules file
     */
    public $url = '';

    /**
     * id/class values looked up on <div> elements to find the content.
     * Order is important: the first candidate that matches wins.
     */
    public $candidatesAttributes = array(
        'article',
        'articleBody',
        'articlebody',
        'articleContent',
        'articlecontent',
        'articlePage',
        'post-content',
        'content',
        'main',
    );

    /**
     * Elements whose class or id contains one of these values are removed
     * from the content found by the generic lookup
     */
    public $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
    );

    /**
     * Tags removed from the content found by the generic lookup
     */
    public $stripTags = array(
        'script',
        'style',
        'nav',
        'header',
        'footer',
        'aside',
    );

    /**
     * @param  string  $url  URL of the page to grab
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Extract the content from $this->html into $this->content.
     *
     * Site rules are used when a rules file exists for the host, otherwise
     * the generic candidates lookup is applied and the result is cleaned up.
     *
     * @return boolean  True when some content was extracted
     */
    public function parse()
    {
        if ($this->html) {

            Logging::log(\get_called_class().' HTML fetched');

            $rules = $this->getRules();

            \libxml_use_internal_errors(true);
            $dom = new \DOMDocument;
            $dom->loadHTML($this->html);

            // Internal errors are buffered by libxml: drop them, otherwise
            // they pile up in memory page after page
            \libxml_clear_errors();

            if (is_array($rules)) {
                Logging::log(\get_called_class().' Parse content with rules');
                $this->parseContentWithRules($dom, $rules);
            }
            else {
                Logging::log(\get_called_class().' Parse content with candidates');
                $this->parseContentWithCandidates($dom);

                if (strlen($this->content) < 50) {
                    Logging::log(\get_called_class().' No enought content fetched, get the full body');

                    // NOTE(review): after loadHTML() the first child of the document is
                    // presumably the DOCTYPE node rather than the <html> element, so this
                    // may not return the body as the log message says. Switching to
                    // documentElement would change which extractor wins for callers that
                    // fall back on an empty result, so confirm the intent before changing it.
                    $this->content = $dom->saveXML($dom->firstChild);
                }

                Logging::log(\get_called_class().' Strip garbage');
                $this->stripGarbage();
            }
        }
        else {
            Logging::log(\get_called_class().' No content fetched');
        }

        Logging::log(\get_called_class().' Grabber done');

        return $this->content !== '';
    }

    /**
     * Download the page and store the response body in $this->html.
     *
     * @param  integer  $timeout     Value assigned to the HTTP client timeout
     * @param  string   $user_agent  User agent sent with the request
     * @return string                Downloaded HTML
     */
    public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')
    {
        $client = Client::create();
        $client->url = $this->url;
        $client->timeout = $timeout;
        $client->user_agent = $user_agent;
        $client->execute();
        $this->html = $client->getContent();

        return $this->html;
    }

    /**
     * Load the rules file that matches the host of the URL.
     *
     * Lookup order inside the Rules/ directory:
     *  1. full hostname (www.example.org.php)
     *  2. hostname without the leading "www." (example.org.php)
     *  3. suffix starting at the first dot (.example.org.php)
     *
     * @return mixed  Value returned by the rules file (an array for the bundled rules), false when no file matches
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // No usable host (relative or malformed URL): nothing to look up
        if (! is_string($hostname) || $hostname === '') {
            return false;
        }

        // The host comes from feed data and is used to build an include path:
        // refuse anything that contains a directory separator
        if (strpos($hostname, '/') !== false || strpos($hostname, '\\') !== false) {
            return false;
        }

        $files = array($hostname);

        if (substr($hostname, 0, 4) == 'www.') $files[] = substr($hostname, 4);
        if (($pos = strpos($hostname, '.')) !== false) $files[] = substr($hostname, $pos);

        foreach ($files as $file) {

            $filename = __DIR__.'/Rules/'.$file.'.php';

            if (file_exists($filename)) {
                return include $filename;
            }
        }

        return false;
    }

    /**
     * Extract the content with site specific rules.
     *
     * Supported keys:
     *  - strip: XPath queries, matching nodes are removed first
     *  - strip_id_or_class: values searched in class and id attributes, matching nodes are removed
     *  - body: XPath queries, every matching node is appended to $this->content
     *
     * @param  \DOMDocument  $dom    Parsed page
     * @param  array         $rules  Rules returned by getRules()
     */
    public function parseContentWithRules($dom, array $rules)
    {
        $xpath = new \DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {
                $this->removeNodes($xpath->query($pattern));
            }
        }

        if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) {

            foreach ($rules['strip_id_or_class'] as $pattern) {

                // Quotes are dropped because the value is embedded in a quoted XPath literal
                $pattern = strtr($pattern, array("'" => '', '"' => ''));

                $this->removeNodes($xpath->query("//*[contains(@class, '$pattern') or contains(@id, '$pattern')]"));
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Generic content lookup: first <article> element, otherwise the first
     * <div> matching one of the candidates (in order).
     *
     * @param  \DOMDocument  $dom  Parsed page
     */
    public function parseContentWithCandidates($dom)
    {
        $xpath = new \DOMXPath($dom);

        // Try to fetch <article/>
        $nodes = $xpath->query('//article');

        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
            return;
        }

        // Try to lookup in each <div/>
        foreach ($this->candidatesAttributes as $candidate) {

            $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                return;
            }
        }
    }

    /**
     * Remove unwanted tags ($stripTags) and elements whose class or id contains
     * one of $stripAttributes from $this->content.
     *
     * The content is reparsed as XML; when it cannot be parsed the resulting
     * document has no nodes and $this->content ends up empty.
     */
    public function stripGarbage()
    {
        // Nothing to clean (loadXML() refuses an empty source)
        if ((string) $this->content === '') {
            return;
        }

        \libxml_use_internal_errors(true);
        $dom = new \DOMDocument;
        $dom->loadXML($this->content);
        \libxml_clear_errors();

        $xpath = new \DOMXPath($dom);

        foreach ($this->stripTags as $tag) {
            $this->removeNodes($xpath->query('//'.$tag));
        }

        foreach ($this->stripAttributes as $attribute) {
            $this->removeNodes($xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'));
        }

        $this->content = '';

        foreach($dom->childNodes as $node) {
            $this->content .= $dom->saveXML($node);
        }
    }

    /**
     * Detach every node of an XPath result from its parent.
     *
     * @param  mixed  $nodes  Result of DOMXPath::query(), false when the query failed
     */
    private function removeNodes($nodes)
    {
        if ($nodes === false) {
            return;
        }

        foreach ($nodes as $node) {
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }
}

View File

@ -5,6 +5,7 @@ namespace PicoFeed;
require_once __DIR__.'/Logging.php'; require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Filter.php'; require_once __DIR__.'/Filter.php';
require_once __DIR__.'/Encoding.php'; require_once __DIR__.'/Encoding.php';
require_once __DIR__.'/Grabber.php';
abstract class Parser abstract class Parser
{ {
@ -15,6 +16,10 @@ abstract class Parser
public $title = ''; public $title = '';
public $updated = ''; public $updated = '';
public $items = array(); public $items = array();
public $grabber = false;
public $grabber_ignore_urls = array();
public $grabber_timeout = 5;
public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
abstract public function execute(); abstract public function execute();
@ -23,7 +28,7 @@ abstract class Parser
public function __construct($content) public function __construct($content)
{ {
// Strip XML tag to avoid multiple encoding/decoding in next XML processing // Strip XML tag to avoid multiple encoding/decoding in next XML processing
$this->content = $this->stripXmlTag($content); $this->content = Filter::stripXmlTag($content);
// Encode everything in UTF-8 // Encode everything in UTF-8
$this->content = Encoding::toUTF8($this->content); $this->content = Encoding::toUTF8($this->content);
@ -33,13 +38,19 @@ abstract class Parser
} }
public function filterHtml($str, $item_url) public function filterHtml($item_content, $item_url)
{ {
$content = ''; $content = '';
if ($str) { if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
$grabber = new Grabber($item_url);
$grabber->download($this->grabber_timeout, $this->grabber_user_agent);
$grabber->parse();
if ($grabber->content) $item_content = $grabber->content;
}
$filter = new Filter($str, $item_url); if ($item_content) {
$filter = new Filter($item_content, $item_url);
$content = $filter->execute(); $content = $filter->execute();
} }
@ -72,17 +83,6 @@ abstract class Parser
} }
public function stripXmlTag($data)
{
if (strpos($data, '<?xml') !== false) {
$data = substr($data, strrpos($data, '?>') + 2);
}
return $data;
}
// Trim whitespace from the begining, the end and inside a string and don't break utf-8 string // Trim whitespace from the begining, the end and inside a string and don't break utf-8 string
public function stripWhiteSpace($value) public function stripWhiteSpace($value)
{ {

View File

@ -5,6 +5,7 @@ namespace PicoFeed;
require_once __DIR__.'/Logging.php'; require_once __DIR__.'/Logging.php';
require_once __DIR__.'/Parser.php'; require_once __DIR__.'/Parser.php';
require_once __DIR__.'/Client.php'; require_once __DIR__.'/Client.php';
require_once __DIR__.'/Filter.php';
class Reader class Reader
{ {
@ -59,25 +60,20 @@ class Reader
$data = preg_replace('/<!--(.{0,5000}?)-->/Uis', '', $data); $data = preg_replace('/<!--(.{0,5000}?)-->/Uis', '', $data);
/* Strip Doctype: /* Strip Doctype:
* Doctype needs to be within the first 500 characters. (Ideally the first!) * Doctype needs to be within the first 100 characters. (Ideally the first!)
* If it's not found by then, we need to stop looking to prevent PREG * If it's not found by then, we need to stop looking to prevent PREG
* from reaching max backtrack depth and crashing. * from reaching max backtrack depth and crashing.
*/ */
$data = preg_replace('/^.{0,500}<!DOCTYPE([^>]*)>/Uis', '', $data); $data = preg_replace('/^.{0,100}<!DOCTYPE([^>]*)>/Uis', '', $data);
// Find <?xml version.... // Strip <?xml version....
if (strpos($data, '<?xml') !== false) { $data = Filter::stripXmlTag($data);
$data = substr($data, strrpos($data, '?>') + 2); // Find the first tag
$open_tag = strpos($data, '<');
$close_tag = strpos($data, '>');
// Find the first tag return substr($data, $open_tag, $close_tag);
$open_tag = strpos($data, '<');
$close_tag = strpos($data, '>');
return substr($data, $open_tag, $close_tag);
}
return $data;
} }

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules, presumably for *.blog.lemonde.fr (host taken from test_url; file name not visible here — TODO confirm).
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries; matching nodes are removed from the document before 'body' is evaluated.
// 'test_url': sample article address; not read by the Grabber class itself.
return array(
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
'body' => array(
'//div[@class="entry-content"]',
),
'strip' => array(
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
)
);

View File

@ -0,0 +1,13 @@
<?php
// Content grabber rules, presumably for *.blogs.nytimes.com (host taken from test_url; file name not visible here — TODO confirm).
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries; matching nodes are removed from the document before 'body' is evaluated.
// 'title' and 'test_url' are not read by the Grabber class itself.
//
// A PHP array keeps only the last value of a repeated key, so a single 'test_url' entry
// is declared. Other sample articles that were listed under the same key:
//   http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/
//   http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/
return array(
    'title' => '//header/h1',
    'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
    'body' => array(
        '//div[@class="postContent"]',
    ),
    'strip' => array(
        '//*[@class="shareToolsBox"]',
    ),
);

View File

@ -0,0 +1,8 @@
<?php
// Content grabber rules, presumably for www.nytimes.com (host taken from test_url; file name not visible here — TODO confirm).
// 'body': XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'title' and 'test_url' are not read by the Grabber class itself.
return array(
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
'title' => '//h1[@class="articleHeadline"]',
'body' => array(
'//div[@class="articleBody"]',
),
);

16
vendor/PicoFeed/Rules/.slate.com.php vendored Normal file
View File

@ -0,0 +1,16 @@
<?php
// Content grabber rules for hosts ending with .slate.com (Grabber::getRules() also tries the host suffix starting at its first dot).
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries; matching nodes are removed from the document before 'body' is evaluated.
// 'test_url': sample article address; not read by the Grabber class itself.
return array(
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
'body' => array(
'//div[@class="sl-art-body"]',
),
'strip' => array(
'//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools") or contains(@class, "sl-art-pag")]',
'//*[@id="mys_slate_logged_in"]',
'//*[@id="sl_article_tools_myslate_bottom"]',
'//*[@id="mys_myslate"]',
'//*[@class="sl-viral-container"]',
'//*[@class="sl-art-creds-cntr"]',
'//*[@class="sl-art-ad-midflex"]',
)
);

11
vendor/PicoFeed/Rules/.wsj.com.php vendored Normal file
View File

@ -0,0 +1,11 @@
<?php
// Content grabber rules for hosts ending with .wsj.com (Grabber::getRules() also tries the host suffix starting at its first dot).
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries; matching nodes are removed from the document before 'body' is evaluated.
// 'test_url': sample article address; not read by the Grabber class itself.
return array(
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
'body' => array(
'//div[@class="articlePage"]',
),
'strip' => array(
'//*[@id="articleThumbnail_2"]',
'//*[@class="socialByline"]',
)
);

View File

@ -0,0 +1,9 @@
<?php
// Content grabber rules, presumably for Rue89 articles served through feedsportal.com (inferred from test_url; file name not visible here — TODO confirm).
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries for nodes to remove before 'body' is evaluated; empty here, so nothing is removed.
// 'test_url': sample article address; not read by the Grabber class itself.
return array(
'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm',
'body' => array(
'//*[@id="article"]/div[contains(@class, "content")]',
),
'strip' => array(
)
);

20
vendor/PicoFeed/Rules/www.bbc.co.uk.php vendored Normal file
View File

@ -0,0 +1,20 @@
<?php
// Content grabber rules for www.bbc.co.uk.
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries; matching nodes are removed from the document before 'body' is evaluated.
// 'test_url': sample article address; not read by the Grabber class itself.
return array(
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
'body' => array(
'//div[@class="story-body"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="story-date"]',
'//*[@class="story-header"]',
'//*[@class="story-related"]',
'//*[contains(@class, "byline")]',
'//*[contains(@class, "story-feature")]',
'//*[@id="video-carousel-container"]',
'//*[@id="also-related-links"]',
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
)
);

8
vendor/PicoFeed/Rules/www.cnn.com.php vendored Normal file
View File

@ -0,0 +1,8 @@
<?php
// Content grabber rules for www.cnn.com.
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'test_url': sample article address; not read by the Grabber class itself.
//
// Each predicate must be closed by exactly one bracket: a doubled closing bracket
// makes the expression invalid, the query then fails and no content is extracted.
return array(
    'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
    'body' => array(
        '//*[contains(@class, "cnn_storypgraphtxt")]',
        '//*[contains(@class, "cnnvideo_wrapper")]',
    ),
);

View File

@ -0,0 +1,8 @@
<?php
// Content grabber rules, presumably for www.egscomics.com (host taken from test_url; file name not visible here — TODO confirm).
// 'body': XPath queries; Grabber::parseContentWithRules() appends every matching node to the content
//         (here the single <img> element whose id is "comic").
// 'title' and 'test_url' are not read by the Grabber class itself.
return array(
'test_url' => 'http://www.egscomics.com/index.php?id=1690',
'title' => '/html/head/title',
'body' => array(
'//img[@id="comic"]'
)
);

View File

@ -0,0 +1,9 @@
<?php
// Content grabber rules, presumably for www.lemonde.fr (host taken from test_url; file name not visible here — TODO confirm).
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries for nodes to remove before 'body' is evaluated; empty here, so nothing is removed.
// 'test_url': sample article address; not read by the Grabber class itself.
return array(
'test_url' => 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html',
'body' => array(
'//div[@id="articleBody"]',
),
'strip' => array(
),
);

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules, presumably for www.numerama.com (host taken from test_url; file name not visible here — TODO confirm).
// 'body':     XPath queries evaluated in order; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries for nodes to remove before 'body' is evaluated; empty here, so nothing is removed.
// 'test_url': sample article address; not read by the Grabber class itself.
// NOTE(review): the first body query is a positional path containing "tbody"; it only matches when the
// parsed document really has that element at that position — verify against the live page.
return array(
'test_url' => 'http://www.numerama.com/magazine/26857-bientot-des-robots-dans-les-cuisines-de-mcdo.html',
'body' => array(
'//*[@id="general_content"]/table/tbody/tr/td[1]/div/div/div[6]/h2',
'//div[@id="newstext"]',
),
'strip' => array(
)
);

17
vendor/PicoFeed/Rules/www.slate.fr.php vendored Normal file
View File

@ -0,0 +1,17 @@
<?php
// Content grabber rules for www.slate.fr.
// 'body':     XPath queries; Grabber::parseContentWithRules() appends every matching node to the content.
// 'strip':    XPath queries; matching nodes are removed from the document before 'body' is evaluated.
// 'test_url': sample article address; not read by the Grabber class itself.
return array(
'test_url' => 'http://www.slate.fr/monde/77034/allemagne-2013-couacs-campagne',
'body' => array(
'//div[@class="article_content"]',
),
'strip' => array(
'//script',
'//style',
'//*[@id="slate_associated_bn"]',
'//*[@id="ligatus-article"]',
'//*[@id="article_sidebar"]',
'//div[contains(@id, "reseaux")]',
'//*[contains(@class, "smart") or contains(@class, "article_tags") or contains(@class, "article_reactions")]',
'//*[contains(@class, "OUTBRAIN") or contains(@class, "related_item") or contains(@class, "share")]',
)
);