From 56c03c16b0b9c26473429411441ee899f9bc5cc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Tue, 15 Apr 2014 18:15:31 -0400 Subject: [PATCH] Improve content grabber --- README.markdown | 3 ++- vendor/PicoFeed/Filter.php | 15 ++++++++++++--- vendor/PicoFeed/Grabber.php | 2 +- vendor/PicoFeed/Rules/www.nextinpact.com.php | 9 +++++++++ vendor/PicoFeed/Rules/www.pseudo-sciences.org.php | 3 +++ 5 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 vendor/PicoFeed/Rules/www.nextinpact.com.php diff --git a/README.markdown b/README.markdown index 44b8aa3..e2d103e 100644 --- a/README.markdown +++ b/README.markdown @@ -38,7 +38,8 @@ Features ### Awesome features - Keyboard shortcuts -- Full article download for feeds that display only a summary (website scraper based on Xpath rules) +- Full article download for feeds that display only a summary +- Enclosure support (videos and podcasts) - Feed updates via a cronjob or with the user interface with one click ### More diff --git a/vendor/PicoFeed/Filter.php b/vendor/PicoFeed/Filter.php index 6253bdf..bbf5b7c 100644 --- a/vendor/PicoFeed/Filter.php +++ b/vendor/PicoFeed/Filter.php @@ -711,16 +711,25 @@ class Filter } /** - * Strip meta tags from the HTML content + * Strip head tag from the HTML content * * @static * @access public * @param string $data Input data * @return string */ - public static function stripMetaTags($data) + public static function stripHeadTags($data) { - return preg_replace('/<meta\s.*?\/>/is', '', $data); + $start = strpos($data, '<head>'); + $end = strpos($data, '</head>'); + + if ($start !== false && $end !== false) { + $before = substr($data, 0, $start); + $after = substr($data, $end + 7); + $data = $before.$after; + } + + return $data; } /** diff --git a/vendor/PicoFeed/Grabber.php b/vendor/PicoFeed/Grabber.php index dde60d8..329d291 100644 --- a/vendor/PicoFeed/Grabber.php +++ b/vendor/PicoFeed/Grabber.php @@ -83,7 +83,7 @@ class Grabber Logging::log(\get_called_class().' 
Fix encoding'); Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"'); - $this->html = Filter::stripMetaTags($this->html); + $this->html = Filter::stripHeadTags($this->html); if ($this->encoding == 'windows-1251') { $this->html = Encoding::cp1251ToUtf8($this->html); diff --git a/vendor/PicoFeed/Rules/www.nextinpact.com.php b/vendor/PicoFeed/Rules/www.nextinpact.com.php new file mode 100644 index 0000000..fc45ef2 --- /dev/null +++ b/vendor/PicoFeed/Rules/www.nextinpact.com.php @@ -0,0 +1,9 @@ +<?php +return array( + 'test_url' => 'http://www.pcinpact.com/news/85954-air-france-ne-vous-demande-plus-deteindre-vos-appareils-electroniques.htm?utm_source=PCi_RSS_Feed&utm_medium=news&utm_campaign=pcinpact', + 'body' => array( + '//div[contains(@id, "actu_content")]', + ), + 'strip' => array( + ), +); \ No newline at end of file diff --git a/vendor/PicoFeed/Rules/www.pseudo-sciences.org.php b/vendor/PicoFeed/Rules/www.pseudo-sciences.org.php index 11073a5..bfb9303 100644 --- a/vendor/PicoFeed/Rules/www.pseudo-sciences.org.php +++ b/vendor/PicoFeed/Rules/www.pseudo-sciences.org.php @@ -5,5 +5,8 @@ return array( '//div[@id="art_main"]', ), 'strip' => array( + '//div[@id="art_print"]', + '//div[@id="art_chapo"]', + '//img[@class="puce"]', ), );