Improve content grabber
This commit is contained in:
parent
5cfe2cd839
commit
56c03c16b0
@ -38,7 +38,8 @@ Features
|
|||||||
### Awesome features
|
### Awesome features
|
||||||
|
|
||||||
- Keyboard shortcuts
|
- Keyboard shortcuts
|
||||||
- Full article download for feeds that display only a summary (website scraper based on Xpath rules)
|
- Full article download for feeds that display only a summary
|
||||||
|
- Enclosure support (videos and podcasts)
|
||||||
- Feed updates via a cronjob or with the user interface with one click
|
- Feed updates via a cronjob or with the user interface with one click
|
||||||
|
|
||||||
### More
|
### More
|
||||||
|
15
vendor/PicoFeed/Filter.php
vendored
15
vendor/PicoFeed/Filter.php
vendored
@ -711,16 +711,25 @@ class Filter
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Strip meta tags from the HTML content
|
* Strip the <head>...</head> section (the tags and everything between them) from the HTML content
|
||||||
*
|
*
|
||||||
* @static
|
* @static
|
||||||
* @access public
|
* @access public
|
||||||
* @param string $data Input data
|
* @param string $data Input data
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
public static function stripMetaTags($data)
|
public static function stripHeadTags($data)
|
||||||
{
|
{
|
||||||
return preg_replace('/<meta\s.*?\/>/is', '', $data);
|
$start = strpos($data, '<head>');
|
||||||
|
$end = strpos($data, '</head>');
|
||||||
|
|
||||||
|
if ($start !== false && $end !== false) {
|
||||||
|
$before = substr($data, 0, $start);
|
||||||
|
$after = substr($data, $end + 7);
|
||||||
|
$data = $before.$after;
|
||||||
|
}
|
||||||
|
|
||||||
|
return $data;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
2
vendor/PicoFeed/Grabber.php
vendored
2
vendor/PicoFeed/Grabber.php
vendored
@ -83,7 +83,7 @@ class Grabber
|
|||||||
Logging::log(\get_called_class().' Fix encoding');
|
Logging::log(\get_called_class().' Fix encoding');
|
||||||
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');
|
Logging::log(\get_called_class().': HTTP Encoding "'.$this->encoding.'"');
|
||||||
|
|
||||||
$this->html = Filter::stripMetaTags($this->html);
|
$this->html = Filter::stripHeadTags($this->html);
|
||||||
|
|
||||||
if ($this->encoding == 'windows-1251') {
|
if ($this->encoding == 'windows-1251') {
|
||||||
$this->html = Encoding::cp1251ToUtf8($this->html);
|
$this->html = Encoding::cp1251ToUtf8($this->html);
|
||||||
|
9
vendor/PicoFeed/Rules/www.nextinpact.com.php
vendored
Normal file
9
vendor/PicoFeed/Rules/www.nextinpact.com.php
vendored
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'test_url' => 'http://www.pcinpact.com/news/85954-air-france-ne-vous-demande-plus-deteindre-vos-appareils-electroniques.htm?utm_source=PCi_RSS_Feed&utm_medium=news&utm_campaign=pcinpact',
|
||||||
|
'body' => array(
|
||||||
|
'//div[contains(@id, "actu_content")]',
|
||||||
|
),
|
||||||
|
'strip' => array(
|
||||||
|
),
|
||||||
|
);
|
@ -5,5 +5,8 @@ return array(
|
|||||||
'//div[@id="art_main"]',
|
'//div[@id="art_main"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
|
'//div[@id="art_print"]',
|
||||||
|
'//div[@id="art_chapo"]',
|
||||||
|
'//img[@class="puce"]',
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
Loading…
Reference in New Issue
Block a user