Improve content grabber: add rules for specific websites and add automatic download for feeds

2013-08-31 11:05:45 -04:00 · 2013-08-31 11:05:45 -04:00 · e77b785263
commit e77b785263
parent 1429c2f44e
25 changed files with 581 additions and 82 deletions
--- a/common.php
+++ b/common.php
@ -12,7 +12,7 @@ require 'model.php';
 if (file_exists('config.php')) require 'config.php';

 defined('APP_VERSION') or define('APP_VERSION', 'master');
-defined('HTTP_TIMEOUT') or define('HTTP_TIMEOUT', 10);
+defined('HTTP_TIMEOUT') or define('HTTP_TIMEOUT', 20);
 defined('DB_FILENAME') or define('DB_FILENAME', 'data/db.sqlite');
 defined('DEBUG') or define('DEBUG', true);
 defined('DEBUG_FILENAME') or define('DEBUG_FILENAME', 'data/debug.log');
--- a/index.php
+++ b/index.php
@ -322,6 +322,38 @@ Router\get_action('refresh-all', function() {
 });


+// Turn off the content grabber for one subscription, then go back to the list of feeds
+Router\get_action('disable-grabber-feed', function() {

+    $feed_id = Request\int_param('feed_id');

+    // The model is only called when a feed id was provided
+    $disabled = $feed_id && Model\disable_grabber_feed($feed_id);

+    if (! $disabled) {
+        Session\flash_error(t('Unable to disable the content grabber for this subscription.'));
+    }
+    else {
+        Session\flash(t('The content grabber is disabled successfully.'));
+    }

+    Response\redirect('?action=feeds');
+});
+
+
+// Turn on the content grabber for one subscription, then go back to the list of feeds
+Router\get_action('enable-grabber-feed', function() {

+    $feed_id = Request\int_param('feed_id');

+    // The model is only called when a feed id was provided
+    $enabled = $feed_id && Model\enable_grabber_feed($feed_id);

+    if (! $enabled) {
+        Session\flash_error(t('Unable to activate the content grabber for this subscription.'));
+    }
+    else {
+        Session\flash(t('The content grabber is enabled successfully.'));
+    }

+    Response\redirect('?action=feeds');
+});
+
+
 // Confirmation box to disable a feed
 Router\get_action('confirm-disable-feed', function() {

@ -467,7 +499,7 @@ Router\get_action('add', function() {
 // Add the feed
 Router\post_action('add', function() {

-    $result = Model\import_feed(trim($_POST['url']));
+    $result = Model\import_feed(trim($_POST['url']), isset($_POST['download_content']) && $_POST['download_content'] == 1);

    if ($result) {

@ -590,11 +622,9 @@ Router\post_action('config', function() {
    if ($valid) {

        if (Model\save_config($values)) {
-
            Session\flash(t('Your preferences are updated.'));
        }
        else {
-
            Session\flash_error(t('Unable to update your preferences.'));
        }

--- a/locales/fr_FR/translations.php
+++ b/locales/fr_FR/translations.php
@ -1,6 +1,13 @@
 <?php

 return array(
+    'The content grabber is enabled successfully.' => 'Le téléchargement de contenu est activé avec succès.',
+    'Unable to activate the content grabber for this subscription.' => 'Impossible d\'activer le téléchargement de contenu pour cet abonnement.',
+    'enable full content' => 'télécharger le contenu complet',
+    'disable full content' => 'désactiver le téléchargement du contenu',
+    'Download full content' => 'Télécharger le contenu complet',
+    'Downloading full content is slower because Miniflux grab the content from the original website. You should use that for subscriptions that display only a summary. This feature doesn\'t work with all websites.' =>
+    'Le téléchargement complet du contenu est plus lent car Miniflux va chercher le contenu sur le site original. Vous devriez utiliser cela uniquement pour les abonnements qui affichent seulement un résumé. Cette fonctionnalité ne marche pas avec tous les sites web.',
    'No message' => 'Aucun message',
    'flush messages' => 'supprimer les messages',
    'API endpoint:' => 'URL de l\'API : ',
--- a/model.php
+++ b/model.php
@ -2,7 +2,6 @@

 namespace Model;

-require_once 'vendor/PicoFeed/Encoding.php';
 require_once 'vendor/PicoFeed/Filter.php';
 require_once 'vendor/PicoFeed/Client.php';
 require_once 'vendor/PicoFeed/Export.php';
@ -25,8 +24,9 @@ use PicoFeed\Reader;
 use PicoFeed\Export;


-const DB_VERSION     = 14;
+const DB_VERSION     = 15;
 const HTTP_USERAGENT = 'Miniflux - http://miniflux.net';
+const HTTP_FAKE_USERAGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36';
 const LIMIT_ALL      = -1;


@ -169,7 +169,7 @@ function import_feeds($content)
 }


-function import_feed($url)
+function import_feed($url, $grabber = false)
 {
    $reader = new Reader;
    $resource = $reader->download($url, '', '', HTTP_TIMEOUT, HTTP_USERAGENT);
@ -178,6 +178,7 @@ function import_feed($url)

    if ($parser !== false) {

+        $parser->grabber = $grabber;
        $feed = $parser->execute();

        if ($feed === false || ! $feed->title || ! $feed->url) {
@ -193,13 +194,14 @@ function import_feed($url)
            $rs = $db->table('feeds')->save(array(
                'title' => $feed->title,
                'site_url' => $feed->url,
-                'feed_url' => $reader->getUrl()
+                'feed_url' => $reader->getUrl(),
+                'download_content' => $grabber ? 1 : 0
            ));

            if ($rs) {

                $feed_id = $db->getConnection()->getLastId();
-                update_items($feed_id, $feed->items);
+                update_items($feed_id, $feed->items, $grabber);
                write_debug();

                return (int) $feed_id;
@ -255,12 +257,25 @@ function update_feed($feed_id)

    if ($parser !== false) {

-        $feed = $parser->execute();
+        if ($feed['download_content']) {

-        if ($feed !== false) {
+            // Don't fetch previous items, only new one
+            $parser->grabber_ignore_urls = \PicoTools\singleton('db')
+                                                ->table('items')
+                                                ->eq('feed_id', $feed_id)
+                                                ->findAllByColumn('url');
+
+            $parser->grabber = true;
+            $parser->grabber_timeout = HTTP_TIMEOUT;
+            $parser->grabber_user_agent = HTTP_FAKE_USERAGENT;
+        }
+
+        $result = $parser->execute();
+
+        if ($result !== false) {

            update_feed_cache_infos($feed_id, $resource->getLastModified(), $resource->getEtag());
-            update_items($feed_id, $feed->items);
+            update_items($feed_id, $result->items, $parser->grabber);
            write_debug();

            return true;
@ -349,52 +364,82 @@ function update_feed_cache_infos($feed_id, $last_modified, $etag)
 }


-function download_item($item_id)
+function parse_content_with_readability($content, $url)
 {
    require_once 'vendor/Readability/Readability.php';
-
-    $item = get_item($item_id);
-
-    $client = \PicoFeed\Client::create();
-    $client->url = $item['url'];
-    $client->timeout = HTTP_TIMEOUT;
-    $client->user_agent = HTTP_USERAGENT;
-    $client->execute();
-
-    $content = $client->getContent();
+    require_once 'vendor/PicoFeed/Encoding.php';

    if (! empty($content)) {

        $content = \PicoFeed\Encoding::toUTF8($content);
-
-        $readability = new \Readability($content, $item['url']);
+        $readability = new \Readability($content, $url);

        if ($readability->init()) {
-
-            // Get relevant content
-            $content = $readability->getContent()->innerHTML;
-
-            // Filter content
-            $filter = new \PicoFeed\Filter($content, $item['url']);
-            $content = $filter->execute();
-
-            $nocontent = (bool) get_config_value('nocontent');
-            if ($nocontent === false) {
-
-                // Save content
-                \PicoTools\singleton('db')
-                    ->table('items')
-                    ->eq('id', $item['id'])
-                    ->save(array('content' => $content));
-            }
-
-            return array(
-                'result' => true,
-                'content' => $content
-            );
+            return $readability->getContent()->innerHTML;
        }
    }

+    return '';
+}
+
+
+/**
+ * Download a web page and extract its main content
+ *
+ * The page is fetched with HTTP_TIMEOUT and the browser-like HTTP_FAKE_USERAGENT.
+ * The PicoFeed grabber is tried first; when it yields nothing, Readability is
+ * used as a fallback. The extracted markup then goes through the PicoFeed filter.
+ *
+ * @param  string  $url  Address of the page to fetch
+ * @return string        Filtered content, or an empty string when nothing was fetched or extracted
+ */
+function download_content($url)
+{
+    require_once 'vendor/PicoFeed/Grabber.php';

+    $client = \PicoFeed\Client::create();
+    $client->url = $url;
+    $client->timeout = HTTP_TIMEOUT;
+    $client->user_agent = HTTP_FAKE_USERAGENT;
+    $client->execute();

+    $html = $client->getContent();

+    if (empty($html)) {
+        return '';
+    }

+    $content = '';

+    // Try first with the PicoFeed grabber...
+    $grabber = new \PicoFeed\Grabber($url);
+    $grabber->html = $html;

+    if ($grabber->parse()) {
+        $content = $grabber->content;
+    }

+    // ...and fall back to Readability when the grabber found nothing
+    if (empty($content)) {
+        $content = parse_content_with_readability($html, $url);
+    }

+    // Nothing could be extracted: do not run the filter on an empty document
+    if (empty($content)) {
+        return '';
+    }

+    // Filter content
+    $filter = new \PicoFeed\Filter($content, $url);
+    return $filter->execute();
+}
+
+
+function download_item($item_id)
+{
+    $item = get_item($item_id);
+    $content = download_content($item['url']);
+
+    if (! empty($content)) {
+
+        if (! get_config_value('nocontent')) {
+
+            // Save content
+            \PicoTools\singleton('db')
+                ->table('items')
+                ->eq('id', $item['id'])
+                ->save(array('content' => $content));
+        }
+
+        return array(
+            'result' => true,
+            'content' => $content
+        );
+    }
+
    return array(
        'result' => false,
        'content' => ''
@ -427,6 +472,18 @@ function disable_feed($feed_id)
 }


+// Set the "download_content" flag of a feed to 1 and return the result of the save
+function enable_grabber_feed($feed_id)
+{
+    $feeds = \PicoTools\singleton('db')->table('feeds');
+    $values = array('download_content' => 1);

+    return $feeds->eq('id', $feed_id)->save($values);
+}
+
+
+// Set the "download_content" flag of a feed to 0 and return the result of the save
+function disable_grabber_feed($feed_id)
+{
+    $feeds = \PicoTools\singleton('db')->table('feeds');
+    $values = array('download_content' => 0);

+    return $feeds->eq('id', $feed_id)->save($values);
+}
+
+
 function get_items($status, $offset = null, $limit = null)
 {
    return \PicoTools\singleton('db')
@ -727,7 +784,7 @@ function autoflush()
 }


-function update_items($feed_id, array $items)
+function update_items($feed_id, array $items, $grabber = false)
 {
    $nocontent = (bool) get_config_value('nocontent');

@ -744,6 +801,10 @@ function update_items($feed_id, array $items)
            // Insert only new item
            if ($db->table('items')->eq('id', $item->id)->count() !== 1) {

+                if (! $item->content && ! $nocontent && $grabber) {
+                    $item->content = download_content($item->url);
+                }
+
                $db->table('items')->save(array(
                    'id' => $item->id,
                    'title' => $item->title,
--- a/schema.php
+++ b/schema.php
@ -3,6 +3,12 @@
 namespace Schema;


+// Schema version 15: add the "download_content" column (default 0) to the feeds table
+function version_15($pdo)
+{
+    $sql = 'ALTER TABLE feeds ADD COLUMN download_content INTEGER DEFAULT 0';

+    $pdo->exec($sql);
+}
+
+
 function version_14($pdo)
 {
    $pdo->exec('ALTER TABLE config ADD COLUMN feed_token TEXT DEFAULT "'.\Model\generate_token().'"');
--- a/templates/add.php
+++ b/templates/add.php
@ -10,6 +10,8 @@
 <form method="post" action="?action=add" autocomplete="off">
    <?= Helper\form_label(t('Website or Feed URL'), 'url') ?>
    <?= Helper\form_text('url', $values, array(), array('required', 'autofocus', 'placeholder="'.t('http://website/').'"')) ?>
+    <?= Helper\form_checkbox('download_content', t('Download full content'), 1, isset($values['download_content']) ? $values['download_content'] : false) ?><br/>
+    <p class="form-help"><?= t('Downloading full content is slower because Miniflux grab the content from the original website. You should use that for subscriptions that display only a summary. This feature doesn\'t work with all websites.') ?></p>
    <div class="form-actions">
        <button type="submit" class="btn btn-blue"><?= t('Add') ?></button>
    </div>
--- a/templates/feeds.php
+++ b/templates/feeds.php
@ -28,7 +28,7 @@
                    <span id="loading-feed-<?= $feed['id'] ?>"></span>
                <?php endif ?>

-                <a href="<?= $feed['site_url'] ?>" rel="noreferrer" target="_blank"><?= Helper\escape($feed['title']) ?></a>
+                <a href="?action=feed-items&amp;feed_id=<?= $feed['id'] ?>"><?= Helper\escape($feed['title']) ?></a>

                <?php if ($feed['enabled']): ?>
                    <?php if ($feed['last_checked']): ?>
@ -47,14 +47,18 @@

                <span class="hide-mobile"><a href="?action=confirm-remove-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('remove') ?></a> |</span>

-                <?php if ($feed['enabled']): ?>
-                    <span class="hide-mobile"><a href="?action=confirm-disable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('disable') ?></a> |</span>
-                    <a href="?action=refresh-feed&amp;feed_id=<?= $feed['id'] ?>" data-feed-id="<?= $feed['id'] ?>" data-action="refresh-feed"><?= t('refresh') ?></a> |
+                <?php if ($feed['download_content']): ?>
+                    <span class="hide-mobile"><a href="?action=disable-grabber-feed&amp;feed_id=<?= $feed['id'] ?>"><strong><?= t('disable full content') ?></strong></a> |</span>
                <?php else: ?>
-                    <span class="hide-mobile"><a href="?action=enable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('enable') ?></a> |</span>
+                    <span class="hide-mobile"><a href="?action=enable-grabber-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('enable full content') ?></a> |</span>
                <?php endif ?>

-                <span class="hide-mobile"><a href="?action=feed-items&amp;feed_id=<?= $feed['id'] ?>"><?= t('items') ?></a></span>
+                <?php if ($feed['enabled']): ?>
+                    <span class="hide-mobile"><a href="?action=confirm-disable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('disable') ?></a> |</span>
+                    <a href="?action=refresh-feed&amp;feed_id=<?= $feed['id'] ?>" data-feed-id="<?= $feed['id'] ?>" data-action="refresh-feed"><?= t('refresh') ?></a>
+                <?php else: ?>
+                    <span class="hide-mobile"><a href="?action=enable-feed&amp;feed_id=<?= $feed['id'] ?>"><?= t('enable') ?></a></span>
+                <?php endif ?>
            </p>
        </article>
    <?php endforeach ?>
--- a/vendor/PicoDb/Table.php
+++ b/vendor/PicoDb/Table.php
@ -138,6 +138,7 @@ class Table

    public function findAllByColumn($column)
    {
+        $this->columns = array($column);
        $rq = $this->db->execute($this->buildSelectQuery(), $this->values);
        if (false === $rq) return false;

--- a/vendor/PicoFeed/Clients/Curl.php
+++ b/vendor/PicoFeed/Clients/Curl.php
@ -64,6 +64,8 @@ class Curl extends \PicoFeed\Client
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // For auto-signed certificates...
        curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'readBody'));
        curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders'));
+        curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
+        curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
        curl_exec($ch);

        Logging::log(\get_called_class().' cURL total time: '.curl_getinfo($ch, CURLINFO_TOTAL_TIME));
--- a/vendor/PicoFeed/Filter.php
+++ b/vendor/PicoFeed/Filter.php
@ -454,4 +454,14 @@ class Filter
    {
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data);
    }
+
+
+    /**
+     * Strip the XML declaration from a document
+     *
+     * When the data contains an XML declaration opening, everything up to and
+     * including the last processing-instruction closing marker is removed.
+     * Data without a declaration, or without any closing marker, is returned untouched.
+     *
+     * @param  string  $data  Raw document
+     * @return string
+     */
+    public static function stripXmlTag($data)
+    {
+        if (strpos($data, '<?xml') !== false) {

+            $end = strrpos($data, '?>');

+            // No closing marker: leave the data alone (false + 2 used to drop the first two bytes)
+            if ($end !== false) {

+                // Cast because substr() may return false on old PHP versions when nothing is left
+                $data = (string) substr($data, $end + 2);
+            }
+        }

+        return $data;
+    }
 }
--- a/vendor/PicoFeed/Grabber.php
+++ b/vendor/PicoFeed/Grabber.php
@ -0,0 +1,241 @@
+<?php
+
+namespace PicoFeed;
+
+require_once __DIR__.'/Client.php';
+require_once __DIR__.'/Encoding.php';
+require_once __DIR__.'/Logging.php';
+
+/**
+ * Content grabber: extract the main content of a web page
+ *
+ * Site-specific XPath rules (files of the Rules directory, selected by hostname)
+ * are used when one exists for the page. Otherwise the content is looked up
+ * with a list of common tag, class and id names and then cleaned up.
+ */
+class Grabber
+{
+    // Extracted content, empty when nothing was found
+    public $content = '';

+    // Raw HTML of the page to parse
+    public $html = '';

+    // Address of the page (declared explicitly: it used to be created on the fly by the constructor)
+    public $url = '';

+    // Class or id values of <div/> elements that may hold the content
+    // Order is important
+    public $candidatesAttributes = array(
+        'article',
+        'articleBody',
+        'articlebody',
+        'articleContent',
+        'articlecontent',
+        'articlePage',
+        'post-content',
+        'content',
+        'main',
+    );

+    // Elements whose class or id contains one of these values are removed by stripGarbage()
+    public $stripAttributes = array(
+        'comment',
+        'share',
+        'links',
+        'toolbar',
+        'fb',
+        'footer',
+        'credit',
+        'bottom',
+        'nav',
+        'header',
+        'social',
+    );

+    // Tags removed by stripGarbage()
+    public $stripTags = array(
+        'script',
+        'style',
+        'nav',
+        'header',
+        'footer',
+        'aside',
+    );


+    /**
+     * @param  string  $url  Address of the page to grab
+     */
+    public function __construct($url)
+    {
+        $this->url = $url;
+    }


+    /**
+     * Extract the content from $this->html into $this->content
+     *
+     * Rules found by getRules() are applied when available, otherwise the
+     * candidates lookup is used and the result is cleaned by stripGarbage().
+     *
+     * @return boolean  True when $this->content is not an empty string
+     */
+    public function parse()
+    {
+        if ($this->html) {

+            Logging::log(\get_called_class().' HTML fetched');

+            $rules = $this->getRules();

+            \libxml_use_internal_errors(true);
+            $dom = new \DOMDocument;
+            $dom->loadHTML($this->html);

+            if (is_array($rules)) {
+                Logging::log(\get_called_class().' Parse content with rules');
+                $this->parseContentWithRules($dom, $rules);
+            }
+            else {

+                Logging::log(\get_called_class().' Parse content with candidates');
+                $this->parseContentWithCandidates($dom);

+                if (strlen($this->content) < 50) {
+                    Logging::log(\get_called_class().' No enought content fetched, get the full body');
+                    // NOTE(review): firstChild may be the doctype node rather than the <html/> element - confirm this fallback returns what is intended
+                    $this->content = $dom->saveXML($dom->firstChild);
+                }

+                Logging::log(\get_called_class().' Strip garbage');
+                $this->stripGarbage();
+            }
+        }
+        else {

+            Logging::log(\get_called_class().' No content fetched');
+        }

+        Logging::log(\get_called_class().' Grabber done');

+        return $this->content !== '';
+    }


+    /**
+     * Download the page located at $this->url and store it in $this->html
+     *
+     * @param  integer  $timeout     Timeout given to the HTTP client
+     * @param  string   $user_agent  User agent given to the HTTP client
+     * @return string                Downloaded HTML
+     */
+    public function download($timeout = 5, $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36')
+    {
+        $client = Client::create();
+        $client->url = $this->url;
+        $client->timeout = $timeout;
+        $client->user_agent = $user_agent;
+        $client->execute();
+        $this->html = $client->getContent();

+        return $this->html;
+    }


+    /**
+     * Load the rules file matching the hostname of $this->url
+     *
+     * Files are looked up in this order: the full hostname, the hostname
+     * without its "www." prefix, then the hostname cut at its first dot
+     * (by example ".example.org").
+     *
+     * @return mixed  Value returned by the first rules file found, false otherwise
+     */
+    public function getRules()
+    {
+        $hostname = parse_url($this->url, PHP_URL_HOST);

+        // The hostname comes from the URL and ends up in an include path:
+        // refuse a missing host and anything that is not a plain host name
+        if (! is_string($hostname) || ! preg_match('/^[A-Za-z0-9._-]+$/', $hostname)) {
+            return false;
+        }

+        $files = array($hostname);

+        if (substr($hostname, 0, 4) == 'www.') $files[] = substr($hostname, 4);
+        if (($pos = strpos($hostname, '.')) !== false) $files[] = substr($hostname, $pos);

+        foreach ($files as $file) {

+            $filename = __DIR__.'/Rules/'.$file.'.php';

+            if (file_exists($filename)) {
+                return include $filename;
+            }
+        }

+        return false;
+    }


+    /**
+     * Extract the content with site-specific rules
+     *
+     * "strip" is a list of XPath expressions and "strip_id_or_class" a list of
+     * class/id fragments; matching nodes are removed first. Nodes matching
+     * the XPath expressions of "body" are then appended to $this->content.
+     *
+     * @param  \DOMDocument  $dom    Parsed page
+     * @param  array         $rules  Rules as returned by getRules()
+     */
+    public function parseContentWithRules($dom, array $rules)
+    {
+        $xpath = new \DOMXPath($dom);

+        if (isset($rules['strip']) && is_array($rules['strip'])) {

+            foreach ($rules['strip'] as $pattern) {
+                $this->removeNodes($xpath->query($pattern));
+            }
+        }

+        if (isset($rules['strip_id_or_class']) && is_array($rules['strip_id_or_class'])) {

+            foreach ($rules['strip_id_or_class'] as $pattern) {

+                // Quotes are dropped because the value is embedded in a quoted XPath literal
+                $pattern = strtr($pattern, array("'" => '', '"' => ''));
+                $this->removeNodes($xpath->query("//*[contains(@class, '$pattern') or contains(@id, '$pattern')]"));
+            }
+        }

+        if (isset($rules['body']) && is_array($rules['body'])) {

+            foreach ($rules['body'] as $pattern) {

+                $nodes = $xpath->query($pattern);

+                if ($nodes !== false && $nodes->length > 0) {
+                    foreach ($nodes as $node) {
+                        $this->content .= $dom->saveXML($node);
+                    }
+                }
+            }
+        }
+    }


+    /**
+     * Extract the content without rules
+     *
+     * The first <article/> is used when there is one, otherwise the first
+     * <div/> matching one of the candidates. $this->content is left untouched
+     * when nothing matches.
+     *
+     * @param  \DOMDocument  $dom  Parsed page
+     */
+    public function parseContentWithCandidates($dom)
+    {
+        $xpath = new \DOMXPath($dom);

+        // Try to fetch <article/>
+        $nodes = $xpath->query('//article');

+        if ($nodes !== false && $nodes->length > 0) {
+            $this->content = $dom->saveXML($nodes->item(0));
+            return;
+        }

+        // Try to lookup in each <div/>
+        foreach ($this->candidatesAttributes as $candidate) {

+            $nodes = $xpath->query('//div[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

+            if ($nodes !== false && $nodes->length > 0) {
+                $this->content = $dom->saveXML($nodes->item(0));
+                return;
+            }
+        }
+    }


+    /**
+     * Remove unwanted tags and elements from $this->content
+     *
+     * The content is parsed as XML, the tags of $stripTags and the elements
+     * whose class or id contains a value of $stripAttributes are removed,
+     * and the content is rebuilt from what is left.
+     */
+    public function stripGarbage()
+    {
+        // Nothing to clean, and DOMDocument::loadXML() rejects an empty string
+        if ((string) $this->content === '') {
+            $this->content = '';
+            return;
+        }

+        \libxml_use_internal_errors(true);
+        $dom = new \DOMDocument;
+        $dom->loadXML($this->content);
+        $xpath = new \DOMXPath($dom);

+        foreach ($this->stripTags as $tag) {
+            $this->removeNodes($xpath->query('//'.$tag));
+        }

+        foreach ($this->stripAttributes as $attribute) {
+            $this->removeNodes($xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'));
+        }

+        $this->content = '';

+        foreach($dom->childNodes as $node) {
+            $this->content .= $dom->saveXML($node);
+        }
+    }


+    /**
+     * Detach every node of an XPath result from its parent
+     *
+     * @param  mixed  $nodes  Value returned by DOMXPath::query(), false is ignored
+     */
+    protected function removeNodes($nodes)
+    {
+        if ($nodes === false || $nodes->length === 0) {
+            return;
+        }

+        foreach ($nodes as $node) {
+            if ($node->parentNode !== null) {
+                $node->parentNode->removeChild($node);
+            }
+        }
+    }
+}
--- a/vendor/PicoFeed/Parser.php
+++ b/vendor/PicoFeed/Parser.php
@ -5,6 +5,7 @@ namespace PicoFeed;
 require_once __DIR__.'/Logging.php';
 require_once __DIR__.'/Filter.php';
 require_once __DIR__.'/Encoding.php';
+require_once __DIR__.'/Grabber.php';

 abstract class Parser
 {
@ -15,6 +16,10 @@ abstract class Parser
    public $title = '';
    public $updated = '';
    public $items = array();
+    public $grabber = false;
+    public $grabber_ignore_urls = array();
+    public $grabber_timeout = 5;
+    public $grabber_user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';


    abstract public function execute();
@ -23,7 +28,7 @@ abstract class Parser
    public function __construct($content)
    {
        // Strip XML tag to avoid multiple encoding/decoding in next XML processing
-        $this->content = $this->stripXmlTag($content);
+        $this->content = Filter::stripXmlTag($content);

        // Encode everything in UTF-8
        $this->content = Encoding::toUTF8($this->content);
@ -33,13 +38,19 @@ abstract class Parser
    }


-    public function filterHtml($str, $item_url)
+    public function filterHtml($item_content, $item_url)
    {
        $content = '';

-        if ($str) {
+        if ($this->grabber && ! in_array($item_url, $this->grabber_ignore_urls)) {
+            $grabber = new Grabber($item_url);
+            $grabber->download($this->grabber_timeout, $this->grabber_user_agent);
+            $grabber->parse();
+            if ($grabber->content) $item_content = $grabber->content;
+        }

-            $filter = new Filter($str, $item_url);
+        if ($item_content) {
+            $filter = new Filter($item_content, $item_url);
            $content = $filter->execute();
        }

@ -72,17 +83,6 @@ abstract class Parser
    }


-    public function stripXmlTag($data)
-    {
-        if (strpos($data, '<?xml') !== false) {
-
-            $data = substr($data, strrpos($data, '?>') + 2);
-        }
-
-        return $data;
-    }
-
-
    // Trim whitespace from the begining, the end and inside a string and don't break utf-8 string
    public function stripWhiteSpace($value)
    {
--- a/vendor/PicoFeed/Reader.php
+++ b/vendor/PicoFeed/Reader.php
@ -5,6 +5,7 @@ namespace PicoFeed;
 require_once __DIR__.'/Logging.php';
 require_once __DIR__.'/Parser.php';
 require_once __DIR__.'/Client.php';
+require_once __DIR__.'/Filter.php';

 class Reader
 {
@ -59,25 +60,20 @@ class Reader
        $data = preg_replace('/<!--(.{0,5000}?)-->/Uis', '', $data);

        /* Strip Doctype:
-         * Doctype needs to be within the first 500 characters. (Ideally the first!)
+         * Doctype needs to be within the first 100 characters. (Ideally the first!)
         * If it's not found by then, we need to stop looking to prevent PREG
         * from reaching max backtrack depth and crashing.
         */
-        $data = preg_replace('/^.{0,500}<!DOCTYPE([^>]*)>/Uis', '', $data);
+        $data = preg_replace('/^.{0,100}<!DOCTYPE([^>]*)>/Uis', '', $data);

-        // Find <?xml version....
-        if (strpos($data, '<?xml') !== false) {
+        // Strip <?xml version....
+        $data = Filter::stripXmlTag($data);

-            $data = substr($data, strrpos($data, '?>') + 2);
+        // Find the first tag
+        $open_tag = strpos($data, '<');
+        $close_tag = strpos($data, '>');

-            // Find the first tag
-            $open_tag = strpos($data, '<');
-            $close_tag = strpos($data, '>');
-
-            return substr($data, $open_tag, $close_tag);
-        }
-
-        return $data;
+        return substr($data, $open_tag, $close_tag);
    }


--- a/vendor/PicoFeed/Rules/.blog.lemonde.fr.php
+++ b/vendor/PicoFeed/Rules/.blog.lemonde.fr.php
@ -0,0 +1,10 @@
+<?php
+return array(
+    'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
+    'body' => array(
+        '//div[@class="entry-content"]',
+    ),
+    'strip' => array(
+        '//*[contains(@class, "fb-like") or contains(@class, "social")]'
+    )
+);
--- a/vendor/PicoFeed/Rules/.blogs.nytimes.com.php
+++ b/vendor/PicoFeed/Rules/.blogs.nytimes.com.php
@ -0,0 +1,13 @@
+<?php
+// Grabber rules for the blogs hosted on blogs.nytimes.com
+return array(
+    'title' => '//header/h1',
+    // A PHP array keeps a single value per key, so only the last 'test_url' was ever kept.
+    // The other sample pages are listed here for reference:
+    //   http://opinionator.blogs.nytimes.com/2011/02/03/lost-and-gone-forever/
+    //   http://krugman.blogs.nytimes.com/2012/09/12/a-vote-of-confidence/
+    'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
+    'body' => array(
+         '//div[@class="postContent"]',
+    ),
+    'strip' => array(
+         '//*[@class="shareToolsBox"]',
+    ),
+);
--- a/vendor/PicoFeed/Rules/.nytimes.com.php
+++ b/vendor/PicoFeed/Rules/.nytimes.com.php
@ -0,0 +1,8 @@
+<?php
+return array(
+    'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
+    'title' => '//h1[@class="articleHeadline"]',
+    'body' => array(
+        '//div[@class="articleBody"]',
+    ),
+);
--- a/vendor/PicoFeed/Rules/.slate.com.php
+++ b/vendor/PicoFeed/Rules/.slate.com.php
@ -0,0 +1,16 @@
+<?php
+return array(
+    'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
+    'body' => array(
+        '//div[@class="sl-art-body"]',
+    ),
+    'strip' => array(
+        '//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools")  or contains(@class, "sl-art-pag")]',
+        '//*[@id="mys_slate_logged_in"]',
+        '//*[@id="sl_article_tools_myslate_bottom"]',
+        '//*[@id="mys_myslate"]',
+        '//*[@class="sl-viral-container"]',
+        '//*[@class="sl-art-creds-cntr"]',
+        '//*[@class="sl-art-ad-midflex"]',
+    )
+);
--- a/vendor/PicoFeed/Rules/.wsj.com.php
+++ b/vendor/PicoFeed/Rules/.wsj.com.php
@ -0,0 +1,11 @@
+<?php
+return array(
+    'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
+    'body' => array(
+        '//div[@class="articlePage"]',
+    ),
+    'strip' => array(
+        '//*[@id="articleThumbnail_2"]',
+        '//*[@class="socialByline"]',
+    )
+);
--- a/vendor/PicoFeed/Rules/rue89.feedsportal.com.php
+++ b/vendor/PicoFeed/Rules/rue89.feedsportal.com.php
@ -0,0 +1,9 @@
+<?php
+return array(
+    'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm',
+    'body' => array(
+        '//*[@id="article"]/div[contains(@class, "content")]',
+    ),
+    'strip' => array(
+    )
+);
--- a/vendor/PicoFeed/Rules/www.bbc.co.uk.php
+++ b/vendor/PicoFeed/Rules/www.bbc.co.uk.php
@ -0,0 +1,20 @@
+<?php
+return array(
+    'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
+    'body' => array(
+        '//div[@class="story-body"]',
+    ),
+    'strip' => array(
+        '//script',
+        '//form',
+        '//style',
+        '//*[@class="story-date"]',
+        '//*[@class="story-header"]',
+        '//*[@class="story-related"]',
+        '//*[contains(@class, "byline")]',
+        '//*[contains(@class, "story-feature")]',
+        '//*[@id="video-carousel-container"]',
+        '//*[@id="also-related-links"]',
+        '//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
+    )
+);
--- a/vendor/PicoFeed/Rules/www.cnn.com.php
+++ b/vendor/PicoFeed/Rules/www.cnn.com.php
@ -0,0 +1,8 @@
+<?php
+// Grabber rules for www.cnn.com: the content is made of the story paragraphs and the video wrappers
+return array(
+    'test_url' => 'http://www.cnn.com/2013/08/31/world/meast/syria-civil-war/index.html?hpt=hp_t1',
+    'body' => array(
+        // Each expression used to end with a stray "]", which made it invalid XPath
+        '//*[contains(@class, "cnn_storypgraphtxt")]',
+        '//*[contains(@class, "cnnvideo_wrapper")]',
+    ),
+);
--- a/vendor/PicoFeed/Rules/www.egscomics.com.php
+++ b/vendor/PicoFeed/Rules/www.egscomics.com.php
@ -0,0 +1,8 @@
+<?php
+return array(
+    'test_url' => 'http://www.egscomics.com/index.php?id=1690',
+    'title' => '/html/head/title',
+    'body' => array(
+        '//img[@id="comic"]'
+    )
+);
--- a/vendor/PicoFeed/Rules/www.lemonde.fr.php
+++ b/vendor/PicoFeed/Rules/www.lemonde.fr.php
@ -0,0 +1,9 @@
+<?php
+return array(
+    'test_url' => 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html',
+    'body' => array(
+        '//div[@id="articleBody"]',
+    ),
+    'strip' => array(
+    ),
+);
--- a/vendor/PicoFeed/Rules/www.numerama.com.php
+++ b/vendor/PicoFeed/Rules/www.numerama.com.php
@ -0,0 +1,10 @@
+<?php
+return array(
+    'test_url' => 'http://www.numerama.com/magazine/26857-bientot-des-robots-dans-les-cuisines-de-mcdo.html',
+    'body' => array(
+        '//*[@id="general_content"]/table/tbody/tr/td[1]/div/div/div[6]/h2',
+        '//div[@id="newstext"]',
+    ),
+    'strip' => array(
+    )
+);
--- a/vendor/PicoFeed/Rules/www.slate.fr.php
+++ b/vendor/PicoFeed/Rules/www.slate.fr.php
@ -0,0 +1,17 @@
+<?php
+return array(
+    'test_url' => 'http://www.slate.fr/monde/77034/allemagne-2013-couacs-campagne',
+    'body' => array(
+        '//div[@class="article_content"]',
+    ),
+    'strip' => array(
+        '//script',
+        '//style',
+        '//*[@id="slate_associated_bn"]',
+        '//*[@id="ligatus-article"]',
+        '//*[@id="article_sidebar"]',
+        '//div[contains(@id, "reseaux")]',
+        '//*[contains(@class, "smart") or contains(@class, "article_tags") or contains(@class, "article_reactions")]',
+        '//*[contains(@class, "OUTBRAIN") or contains(@class, "related_item") or contains(@class, "share")]',
+    )
+);