Improve content grabber

This commit is contained in:
Frédéric Guillot 2014-04-15 18:15:31 -04:00
parent 5cfe2cd839
commit 56c03c16b0
5 changed files with 27 additions and 5 deletions

View File

@ -38,7 +38,8 @@ Features
### Awesome features
- Keyboard shortcuts
- Full article download for feeds that display only a summary (website scraper based on XPath rules)
- Full article download for feeds that display only a summary
- Enclosure support (videos and podcasts)
- Feed updates via a cron job or with one click from the user interface
### More

View File

@ -711,16 +711,25 @@ class Filter
}
/**
 * Strip meta tags from the HTML content
 *
 * Only `<meta ... />` sequences are removed. NOTE(review): because the
 * pattern is lazy up to the next `/>`, a `<meta ...>` tag that is not
 * self-closed makes the match run on to the next `/>` found in the document.
 *
 * @static
 * @access public
 * @param  string  $data  Input data
 * @return string
 */
public static function stripMetaTags($data)
{
    return preg_replace('/<meta\s.*?\/>/is', '', $data);
}

/**
 * Strip the head element (opening tag, content and closing tag) from the HTML content
 *
 * The opening tag is matched case-insensitively and may carry attributes
 * (`<head>`, `<HEAD>`, `<head lang="fr">`); tags that merely start with
 * "head" such as `<header>` are left alone. The input is returned untouched
 * when no opening tag is found or when no `</head>` follows it. Only the
 * first head element is removed.
 *
 * @static
 * @access public
 * @param  string  $data  Input data
 * @return string
 */
public static function stripHeadTags($data)
{
    // Locate the opening tag together with its byte offset.
    if (! preg_match('/<head(?:\s[^>]*)?>/i', $data, $match, PREG_OFFSET_CAPTURE)) {
        return $data;
    }

    $start = $match[0][1];

    // Search for the closing tag only after the opening one, so a stray
    // "</head>" earlier in the document cannot produce a garbled splice.
    $end = stripos($data, '</head>', $start);

    if ($end === false) {
        return $data;
    }

    // 7 === strlen('</head>')
    return substr($data, 0, $start).substr($data, $end + 7);
}
/**

View File

@ -83,7 +83,7 @@ class Grabber
Logging::log(\get_called_class().' Fix encoding');
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');
$this->html = Filter::stripMetaTags($this->html);
$this->html = Filter::stripHeadTags($this->html);
if ($this->encoding == 'windows-1251') {
$this->html = Encoding::cp1251ToUtf8($this->html);

View File

@ -0,0 +1,9 @@
<?php
// Content-grabber rule set returned as a plain array; the sample URL below
// points at www.pcinpact.com.
return array(
// Sample article URL. NOTE(review): presumably used to exercise these rules - confirm against the grabber.
'test_url' => 'http://www.pcinpact.com/news/85954-air-france-ne-vous-demande-plus-deteindre-vos-appareils-electroniques.htm?utm_source=PCi_RSS_Feed&utm_medium=news&utm_campaign=pcinpact',
// XPath expressions. NOTE(review): presumably select the node(s) holding the article content - confirm.
'body' => array(
'//div[contains(@id, "actu_content")]',
),
// XPath expressions. NOTE(review): presumably nodes to remove from the extracted content; none are defined here.
'strip' => array(
),
);

View File

@ -5,5 +5,8 @@ return array(
'//div[@id="art_main"]',
),
'strip' => array(
'//div[@id="art_print"]',
'//div[@id="art_chapo"]',
'//img[@class="puce"]',
),
);