Improve content grabber
This commit is contained in:
parent
5cfe2cd839
commit
56c03c16b0
@ -38,7 +38,8 @@ Features
|
||||
### Awesome features
|
||||
|
||||
- Keyboard shortcuts
|
||||
- Full article download for feeds that display only a summary (website scraper based on Xpath rules)
|
||||
- Full article download for feeds that display only a summary
|
||||
- Enclosure support (videos and podcasts)
|
||||
- Feed updates via a cronjob or with the user interface with one click
|
||||
|
||||
### More
|
||||
|
15
vendor/PicoFeed/Filter.php
vendored
15
vendor/PicoFeed/Filter.php
vendored
@ -711,16 +711,25 @@ class Filter
|
||||
}
|
||||
|
||||
/**
 * Strip the head element from the HTML content
 *
 * The opening tag is matched case-insensitively and may carry attributes
 * (for example <HEAD profile="...">). Only the first head element is
 * removed, and only when a closing tag is found after the opening one;
 * in every other case the input is returned unchanged.
 *
 * @static
 * @access public
 * @param  string  $data  Input data
 * @return string
 */
public static function stripHeadTags($data)
{
    // No "u" modifier on purpose: the patterns are pure ASCII and must keep
    // working on input that is not valid UTF-8.
    // "(?:\s[^>]*)?" accepts attributes while refusing look-alike tags such as <header>.
    if (! preg_match('/<head(?:\s[^>]*)?>/i', $data, $open, PREG_OFFSET_CAPTURE)) {
        return $data;
    }

    $start = $open[0][1];

    // Search for the closing tag only after the opening one, so a stray
    // "</head>" earlier in the document cannot produce an inverted range.
    $search_from = $start + strlen($open[0][0]);

    if (! preg_match('/<\/head\s*>/i', $data, $close, PREG_OFFSET_CAPTURE, $search_from)) {
        return $data;
    }

    // Offsets from PREG_OFFSET_CAPTURE are byte offsets, matching substr().
    $end = $close[0][1] + strlen($close[0][0]);

    return substr($data, 0, $start).substr($data, $end);
}
|
||||
|
||||
/**
|
||||
|
2
vendor/PicoFeed/Grabber.php
vendored
2
vendor/PicoFeed/Grabber.php
vendored
@ -83,7 +83,7 @@ class Grabber
|
||||
Logging::log(\get_called_class().' Fix encoding');
|
||||
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');
|
||||
|
||||
$this->html = Filter::stripMetaTags($this->html);
|
||||
$this->html = Filter::stripHeadTags($this->html);
|
||||
|
||||
if ($this->encoding == 'windows-1251') {
|
||||
$this->html = Encoding::cp1251ToUtf8($this->html);
|
||||
|
9
vendor/PicoFeed/Rules/www.nextinpact.com.php
vendored
Normal file
9
vendor/PicoFeed/Rules/www.nextinpact.com.php
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
<?php
|
||||
return array(
|
||||
'test_url' => 'http://www.pcinpact.com/news/85954-air-france-ne-vous-demande-plus-deteindre-vos-appareils-electroniques.htm?utm_source=PCi_RSS_Feed&utm_medium=news&utm_campaign=pcinpact',
|
||||
'body' => array(
|
||||
'//div[contains(@id, "actu_content")]',
|
||||
),
|
||||
'strip' => array(
|
||||
),
|
||||
);
|
@ -5,5 +5,8 @@ return array(
|
||||
'//div[@id="art_main"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//div[@id="art_print"]',
|
||||
'//div[@id="art_chapo"]',
|
||||
'//img[@class="puce"]',
|
||||
),
|
||||
);
|
||||
|
Loading…
Reference in New Issue
Block a user