parent
901a57e298
commit
f89ed85a83
@ -18,14 +18,22 @@ Especially websites that use a lot of Javascript to generate the content.
|
|||||||
How to write a grabber rules file?
|
How to write a grabber rules file?
|
||||||
----------------------------------
|
----------------------------------
|
||||||
|
|
||||||
Add a PHP file to the directory `rules`, the filename must be the domain name with the suffix `.php`:
|
Miniflux will try first to find the file in the [default bundled rules directory](https://github.com/miniflux/miniflux/tree/master/vendor/fguillot/picofeed/lib/PicoFeed/Rules), then it will try to load your custom rules.
|
||||||
|
|
||||||
Example with the BBC website, `www.bbc.co.uk.php`:
|
You can create custom rules, by adding a PHP file to the directory `rules`. The filename must be the domain name with the suffix `.php`.
|
||||||
|
|
||||||
|
Each rule has the following keys:
|
||||||
|
* **body**: An array of xpath expressions which will be extracted from the page
|
||||||
|
* **strip**: An array of xpath expressions which will be removed from the matched content
|
||||||
|
* **test_url**: A test url to a matching page to test the grabber
|
||||||
|
|
||||||
|
Example for the BBC website, `www.bbc.co.uk.php`:
|
||||||
|
|
||||||
```php
|
```php
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
|
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="story-body"]',
|
'//div[@class="story-body"]',
|
||||||
@ -43,12 +51,42 @@ return array(
|
|||||||
'//*[@id="also-related-links"]',
|
'//*[@id="also-related-links"]',
|
||||||
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
|
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
```
|
```
|
||||||
|
|
||||||
Actually, only the keys `body`, `strip` and `test_url` are supported.
|
Each rule file can contain rules for different subdivisions of a website. Those subdivisions are distinguished by their URL. The first level array key of a rule file will be matched against the full path of the URL using **preg_match**, e.g. for **http://www.bbc.co.uk/news/world-middle-east-23911833?test=1** the URL that would be matched is **/news/world-middle-east-23911833?test=1**
|
||||||
|
|
||||||
Miniflux will try first to find the file in the [default bundled rules directory](https://github.com/miniflux/miniflux/tree/master/vendor/fguillot/picofeed/lib/PicoFeed/Rules), then it will try to load your custom rules.
|
Let's say you want to extract a div with the id **video** if the article points to a URL like **http://comix.com/videos/423**, a div with the id **audio** if the article points to a URL like **http://comix.com/podcasts/5**, and the div with the id **content** for all other links to the site. The following rule file ```comix.com.php``` would fit that requirement:
|
||||||
|
|
||||||
|
```php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%^/videos.*%' => array(
|
||||||
|
'test_url' => 'http://comix.com/videos/423',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="video"]',
|
||||||
|
),
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%^/podcasts.*%' => array(
|
||||||
|
'test_url' => 'http://comix.com/podcasts/5',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="audio"]',
|
||||||
|
),
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://comix.com/blog/1',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="content"]',
|
||||||
|
),
|
||||||
|
'strip' => array()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
Sharing your custom rules with the community
|
Sharing your custom rules with the community
|
||||||
--------------------------------------------
|
--------------------------------------------
|
||||||
|
@ -6,7 +6,7 @@ use Model\Service;
|
|||||||
use Model\Config;
|
use Model\Config;
|
||||||
use PicoDb\Database;
|
use PicoDb\Database;
|
||||||
use PicoFeed\Logging\Logger;
|
use PicoFeed\Logging\Logger;
|
||||||
use PicoFeed\Client\Grabber;
|
use PicoFeed\Scraper\Scraper;
|
||||||
|
|
||||||
// Get all items without filtering
|
// Get all items without filtering
|
||||||
function get_all()
|
function get_all()
|
||||||
@ -520,12 +520,12 @@ function download_content_url($url)
|
|||||||
{
|
{
|
||||||
$content = '';
|
$content = '';
|
||||||
|
|
||||||
$grabber = new Grabber($url);
|
$grabber = new Scraper(Config\get_reader_config());
|
||||||
$grabber->setConfig(Config\get_reader_config());
|
$grabber->setUrl($url);
|
||||||
$grabber->download();
|
$grabber->execute();
|
||||||
|
|
||||||
if ($grabber->parse()) {
|
if ($grabber->hasRelevantContent()) {
|
||||||
$content = $grabber->getFilteredcontent();
|
$content = $grabber->getFilteredContent();
|
||||||
}
|
}
|
||||||
|
|
||||||
return $content;
|
return $content;
|
||||||
|
26
vendor/composer/ClassLoader.php
vendored
26
vendor/composer/ClassLoader.php
vendored
@ -54,6 +54,8 @@ class ClassLoader
|
|||||||
private $useIncludePath = false;
|
private $useIncludePath = false;
|
||||||
private $classMap = array();
|
private $classMap = array();
|
||||||
|
|
||||||
|
private $classMapAuthoritative = false;
|
||||||
|
|
||||||
public function getPrefixes()
|
public function getPrefixes()
|
||||||
{
|
{
|
||||||
if (!empty($this->prefixesPsr0)) {
|
if (!empty($this->prefixesPsr0)) {
|
||||||
@ -248,6 +250,27 @@ class ClassLoader
|
|||||||
return $this->useIncludePath;
|
return $this->useIncludePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Turns off searching the prefix and fallback directories for classes
|
||||||
|
* that have not been registered with the class map.
|
||||||
|
*
|
||||||
|
* @param bool $classMapAuthoritative
|
||||||
|
*/
|
||||||
|
public function setClassMapAuthoritative($classMapAuthoritative)
|
||||||
|
{
|
||||||
|
$this->classMapAuthoritative = $classMapAuthoritative;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should class lookup fail if not found in the current class map?
|
||||||
|
*
|
||||||
|
* @return bool
|
||||||
|
*/
|
||||||
|
public function isClassMapAuthoritative()
|
||||||
|
{
|
||||||
|
return $this->classMapAuthoritative;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Registers this instance as an autoloader.
|
* Registers this instance as an autoloader.
|
||||||
*
|
*
|
||||||
@ -299,6 +322,9 @@ class ClassLoader
|
|||||||
if (isset($this->classMap[$class])) {
|
if (isset($this->classMap[$class])) {
|
||||||
return $this->classMap[$class];
|
return $this->classMap[$class];
|
||||||
}
|
}
|
||||||
|
if ($this->classMapAuthoritative) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
$file = $this->findFileWithExtension($class, '.php');
|
$file = $this->findFileWithExtension($class, '.php');
|
||||||
|
|
||||||
|
6
vendor/composer/autoload_classmap.php
vendored
6
vendor/composer/autoload_classmap.php
vendored
@ -20,7 +20,6 @@ return array(
|
|||||||
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
|
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
|
||||||
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
|
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
|
||||||
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
|
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
|
||||||
'PicoFeed\\Client\\Grabber' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php',
|
|
||||||
'PicoFeed\\Client\\HttpHeaders' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/HttpHeaders.php',
|
'PicoFeed\\Client\\HttpHeaders' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/HttpHeaders.php',
|
||||||
'PicoFeed\\Client\\InvalidCertificateException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidCertificateException.php',
|
'PicoFeed\\Client\\InvalidCertificateException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidCertificateException.php',
|
||||||
'PicoFeed\\Client\\InvalidUrlException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidUrlException.php',
|
'PicoFeed\\Client\\InvalidUrlException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidUrlException.php',
|
||||||
@ -54,6 +53,11 @@ return array(
|
|||||||
'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php',
|
'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php',
|
||||||
'PicoFeed\\Reader\\SubscriptionNotFoundException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/SubscriptionNotFoundException.php',
|
'PicoFeed\\Reader\\SubscriptionNotFoundException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/SubscriptionNotFoundException.php',
|
||||||
'PicoFeed\\Reader\\UnsupportedFeedFormatException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/UnsupportedFeedFormatException.php',
|
'PicoFeed\\Reader\\UnsupportedFeedFormatException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/UnsupportedFeedFormatException.php',
|
||||||
|
'PicoFeed\\Scraper\\CandidateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/CandidateParser.php',
|
||||||
|
'PicoFeed\\Scraper\\ParserInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/ParserInterface.php',
|
||||||
|
'PicoFeed\\Scraper\\RuleLoader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php',
|
||||||
|
'PicoFeed\\Scraper\\RuleParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/RuleParser.php',
|
||||||
|
'PicoFeed\\Scraper\\Scraper' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php',
|
||||||
'PicoFeed\\Serialization\\Export' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Export.php',
|
'PicoFeed\\Serialization\\Export' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Export.php',
|
||||||
'PicoFeed\\Serialization\\Import' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Import.php',
|
'PicoFeed\\Serialization\\Import' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Import.php',
|
||||||
'PicoFeed\\Syndication\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Syndication/Atom.php',
|
'PicoFeed\\Syndication\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Syndication/Atom.php',
|
||||||
|
43
vendor/composer/installed.json
vendored
43
vendor/composer/installed.json
vendored
@ -45,18 +45,18 @@
|
|||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/fguillot/picoFarad.git",
|
"url": "https://github.com/fguillot/picoFarad.git",
|
||||||
"reference": "1bc48a4367adf359f3439c2e0ae20a7d299d8ccd"
|
"reference": "a5817c49ca3037829ec1509d14724be5f29c35a0"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/fguillot/picoFarad/zipball/1bc48a4367adf359f3439c2e0ae20a7d299d8ccd",
|
"url": "https://api.github.com/repos/fguillot/picoFarad/zipball/a5817c49ca3037829ec1509d14724be5f29c35a0",
|
||||||
"reference": "1bc48a4367adf359f3439c2e0ae20a7d299d8ccd",
|
"reference": "a5817c49ca3037829ec1509d14724be5f29c35a0",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
"php": ">=5.3.0"
|
"php": ">=5.3.0"
|
||||||
},
|
},
|
||||||
"time": "2015-02-01 19:40:13",
|
"time": "2015-04-14 01:53:02",
|
||||||
"type": "library",
|
"type": "library",
|
||||||
"installation-source": "dist",
|
"installation-source": "dist",
|
||||||
"autoload": {
|
"autoload": {
|
||||||
@ -66,7 +66,7 @@
|
|||||||
},
|
},
|
||||||
"notification-url": "https://packagist.org/downloads/",
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
"license": [
|
"license": [
|
||||||
"Unlicense"
|
"MIT"
|
||||||
],
|
],
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
@ -84,18 +84,18 @@
|
|||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/fguillot/simpleValidator.git",
|
"url": "https://github.com/fguillot/simpleValidator.git",
|
||||||
"reference": "41655dc7b9224395f5bb3b5623f6e428fe6d64e8"
|
"reference": "2f30078bb6e688cf123c150d58fda322792a1532"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/fguillot/simpleValidator/zipball/41655dc7b9224395f5bb3b5623f6e428fe6d64e8",
|
"url": "https://api.github.com/repos/fguillot/simpleValidator/zipball/2f30078bb6e688cf123c150d58fda322792a1532",
|
||||||
"reference": "41655dc7b9224395f5bb3b5623f6e428fe6d64e8",
|
"reference": "2f30078bb6e688cf123c150d58fda322792a1532",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
"php": ">=5.3.0"
|
"php": ">=5.3.0"
|
||||||
},
|
},
|
||||||
"time": "2015-04-05 21:44:06",
|
"time": "2015-04-14 02:03:43",
|
||||||
"type": "library",
|
"type": "library",
|
||||||
"installation-source": "dist",
|
"installation-source": "dist",
|
||||||
"autoload": {
|
"autoload": {
|
||||||
@ -109,8 +109,7 @@
|
|||||||
],
|
],
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
"name": "Frédéric Guillot",
|
"name": "Frédéric Guillot"
|
||||||
"homepage": "http://fredericguillot.com"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "The most easy to use validator library for PHP :)",
|
"description": "The most easy to use validator library for PHP :)",
|
||||||
@ -123,18 +122,18 @@
|
|||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/fguillot/JsonRPC.git",
|
"url": "https://github.com/fguillot/JsonRPC.git",
|
||||||
"reference": "29d63a09ecd450d5e29fef74f687aab221055910"
|
"reference": "1a397be7739ddabba87b07f0354655bd91087518"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/fguillot/JsonRPC/zipball/29d63a09ecd450d5e29fef74f687aab221055910",
|
"url": "https://api.github.com/repos/fguillot/JsonRPC/zipball/1a397be7739ddabba87b07f0354655bd91087518",
|
||||||
"reference": "29d63a09ecd450d5e29fef74f687aab221055910",
|
"reference": "1a397be7739ddabba87b07f0354655bd91087518",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
"php": ">=5.3.0"
|
"php": ">=5.3.0"
|
||||||
},
|
},
|
||||||
"time": "2015-04-05 21:49:38",
|
"time": "2015-04-14 01:50:16",
|
||||||
"type": "library",
|
"type": "library",
|
||||||
"installation-source": "dist",
|
"installation-source": "dist",
|
||||||
"autoload": {
|
"autoload": {
|
||||||
@ -144,7 +143,7 @@
|
|||||||
},
|
},
|
||||||
"notification-url": "https://packagist.org/downloads/",
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
"license": [
|
"license": [
|
||||||
"Unlicense"
|
"MIT"
|
||||||
],
|
],
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
@ -152,7 +151,7 @@
|
|||||||
"homepage": "http://fredericguillot.com"
|
"homepage": "http://fredericguillot.com"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "A simple Json-RPC client/server library that just works",
|
"description": "Simple Json-RPC client/server library that just works",
|
||||||
"homepage": "https://github.com/fguillot/JsonRPC"
|
"homepage": "https://github.com/fguillot/JsonRPC"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -162,12 +161,12 @@
|
|||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/fguillot/picoFeed.git",
|
"url": "https://github.com/fguillot/picoFeed.git",
|
||||||
"reference": "273c344b35b468b6c8053f635332c3a404f8c7b9"
|
"reference": "a6087e8264550891c1b8a6da77eca0cab9328709"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/273c344b35b468b6c8053f635332c3a404f8c7b9",
|
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/a6087e8264550891c1b8a6da77eca0cab9328709",
|
||||||
"reference": "273c344b35b468b6c8053f635332c3a404f8c7b9",
|
"reference": "a6087e8264550891c1b8a6da77eca0cab9328709",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
@ -181,7 +180,7 @@
|
|||||||
"suggest": {
|
"suggest": {
|
||||||
"ext-curl": "PicoFeed will use cURL if present"
|
"ext-curl": "PicoFeed will use cURL if present"
|
||||||
},
|
},
|
||||||
"time": "2015-04-11 12:46:50",
|
"time": "2015-04-27 22:22:06",
|
||||||
"bin": [
|
"bin": [
|
||||||
"picofeed"
|
"picofeed"
|
||||||
],
|
],
|
||||||
@ -194,7 +193,7 @@
|
|||||||
},
|
},
|
||||||
"notification-url": "https://packagist.org/downloads/",
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
"license": [
|
"license": [
|
||||||
"Unlicense"
|
"MIT"
|
||||||
],
|
],
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
|
21
vendor/fguillot/json-rpc/LICENSE
vendored
Normal file
21
vendor/fguillot/json-rpc/LICENSE
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Frederic Guillot
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
2
vendor/fguillot/json-rpc/README.markdown
vendored
2
vendor/fguillot/json-rpc/README.markdown
vendored
@ -11,7 +11,7 @@ Features
|
|||||||
- Authentication and IP based client restrictions
|
- Authentication and IP based client restrictions
|
||||||
- Minimalist: there is only 2 files
|
- Minimalist: there is only 2 files
|
||||||
- Fully unit tested
|
- Fully unit tested
|
||||||
- License: Unlicense http://unlicense.org/
|
- License: MIT
|
||||||
|
|
||||||
Requirements
|
Requirements
|
||||||
------------
|
------------
|
||||||
|
4
vendor/fguillot/json-rpc/composer.json
vendored
4
vendor/fguillot/json-rpc/composer.json
vendored
@ -1,9 +1,9 @@
|
|||||||
{
|
{
|
||||||
"name": "fguillot/json-rpc",
|
"name": "fguillot/json-rpc",
|
||||||
"description": "A simple Json-RPC client/server library that just works",
|
"description": "Simple Json-RPC client/server library that just works",
|
||||||
"homepage": "https://github.com/fguillot/JsonRPC",
|
"homepage": "https://github.com/fguillot/JsonRPC",
|
||||||
"type": "library",
|
"type": "library",
|
||||||
"license": "Unlicense",
|
"license": "MIT",
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
"name": "Frédéric Guillot",
|
"name": "Frédéric Guillot",
|
||||||
|
21
vendor/fguillot/picofarad/LICENCE
vendored
Normal file
21
vendor/fguillot/picofarad/LICENCE
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Frederic Guillot
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
2
vendor/fguillot/picofarad/README.md
vendored
2
vendor/fguillot/picofarad/README.md
vendored
@ -10,7 +10,7 @@ Features
|
|||||||
- No dependency
|
- No dependency
|
||||||
- Easy to use, fast and very lightweight
|
- Easy to use, fast and very lightweight
|
||||||
- Only 4 files: Request, Response, Router and Session
|
- Only 4 files: Request, Response, Router and Session
|
||||||
- License: Do what the fuck you want with that
|
- License: MIT
|
||||||
|
|
||||||
Requirements
|
Requirements
|
||||||
------------
|
------------
|
||||||
|
2
vendor/fguillot/picofarad/composer.json
vendored
2
vendor/fguillot/picofarad/composer.json
vendored
@ -3,7 +3,7 @@
|
|||||||
"description": "Minimalist micro-framework",
|
"description": "Minimalist micro-framework",
|
||||||
"homepage": "https://github.com/fguillot/picoFarad",
|
"homepage": "https://github.com/fguillot/picoFarad",
|
||||||
"type": "library",
|
"type": "library",
|
||||||
"license": "Unlicense",
|
"license": "MIT",
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
"name": "Frédéric Guillot",
|
"name": "Frédéric Guillot",
|
||||||
|
1
vendor/fguillot/picofeed/.gitignore
vendored
1
vendor/fguillot/picofeed/.gitignore
vendored
@ -1,2 +1,3 @@
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
vendor/
|
vendor/
|
||||||
|
*.py
|
21
vendor/fguillot/picofeed/.travis.yml
vendored
21
vendor/fguillot/picofeed/.travis.yml
vendored
@ -1,12 +1,19 @@
|
|||||||
language: php
|
language: php
|
||||||
|
|
||||||
php:
|
php:
|
||||||
- "5.6"
|
- 7.0
|
||||||
- "5.5"
|
- 5.6
|
||||||
- "5.4"
|
- 5.5
|
||||||
- "5.3"
|
- 5.4
|
||||||
|
- 5.3
|
||||||
|
|
||||||
before_script: wget https://phar.phpunit.de/phpunit.phar
|
matrix:
|
||||||
script:
|
fast_finish: true
|
||||||
|
allow_failures:
|
||||||
|
- php: 7.0
|
||||||
|
|
||||||
|
before_script:
|
||||||
- composer dump-autoload
|
- composer dump-autoload
|
||||||
- php phpunit.phar
|
|
||||||
|
script:
|
||||||
|
- phpunit
|
21
vendor/fguillot/picofeed/LICENSE
vendored
Normal file
21
vendor/fguillot/picofeed/LICENSE
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Frederic Guillot
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
3
vendor/fguillot/picofeed/README.markdown
vendored
3
vendor/fguillot/picofeed/README.markdown
vendored
@ -24,7 +24,7 @@ Features
|
|||||||
- Content grabber: download from the original website the full content
|
- Content grabber: download from the original website the full content
|
||||||
- Enclosure detection
|
- Enclosure detection
|
||||||
- RTL languages support
|
- RTL languages support
|
||||||
- License: Unlicense <http://unlicense.org/>
|
- License: MIT
|
||||||
|
|
||||||
Requirements
|
Requirements
|
||||||
------------
|
------------
|
||||||
@ -47,7 +47,6 @@ Authors
|
|||||||
Real world usage
|
Real world usage
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
- [AnythingNew](http://anythingnew.co)
|
|
||||||
- [Miniflux](http://miniflux.net)
|
- [Miniflux](http://miniflux.net)
|
||||||
- [Owncloud News](https://github.com/owncloud/news)
|
- [Owncloud News](https://github.com/owncloud/news)
|
||||||
|
|
||||||
|
24
vendor/fguillot/picofeed/UNLICENSE
vendored
24
vendor/fguillot/picofeed/UNLICENSE
vendored
@ -1,24 +0,0 @@
|
|||||||
This is free and unencumbered software released into the public domain.
|
|
||||||
|
|
||||||
Anyone is free to copy, modify, publish, use, compile, sell, or
|
|
||||||
distribute this software, either in source code form or as a compiled
|
|
||||||
binary, for any purpose, commercial or non-commercial, and by any
|
|
||||||
means.
|
|
||||||
|
|
||||||
In jurisdictions that recognize copyright laws, the author or authors
|
|
||||||
of this software dedicate any and all copyright interest in the
|
|
||||||
software to the public domain. We make this dedication for the benefit
|
|
||||||
of the public at large and to the detriment of our heirs and
|
|
||||||
successors. We intend this dedication to be an overt act of
|
|
||||||
relinquishment in perpetuity of all present and future rights to this
|
|
||||||
software under copyright law.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
||||||
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
|
||||||
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
||||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
|
|
||||||
For more information, please refer to <http://unlicense.org/>
|
|
2
vendor/fguillot/picofeed/composer.json
vendored
2
vendor/fguillot/picofeed/composer.json
vendored
@ -3,7 +3,7 @@
|
|||||||
"description": "Modern library to write or read feeds (RSS/Atom)",
|
"description": "Modern library to write or read feeds (RSS/Atom)",
|
||||||
"homepage": "http://fguillot.github.io/picoFeed",
|
"homepage": "http://fguillot.github.io/picoFeed",
|
||||||
"type": "library",
|
"type": "library",
|
||||||
"license": "Unlicense",
|
"license": "MIT",
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
"name": "Frédéric Guillot",
|
"name": "Frédéric Guillot",
|
||||||
|
@ -215,6 +215,27 @@ catch (PicoFeedException $e) {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Custom regex filters
|
||||||
|
--------------------
|
||||||
|
In case you want to modify the content with a simple regex, you can create a rule file named after the domain of the feed's link attribute. For the feed pointing to **http://www.twogag.com/** the file is stored under **Rules/twogag.com.php**.
|
||||||
|
|
||||||
|
For filtering, only the array with the key **filter** will be considered. The first level key is a preg_match regex that will match the sub url, e.g. to only match a feed whose link attribute points to **twogag.com/test**, the regex could look like **%/test.*%**. The second level array contains a list of search and replace strings, which will be passed to the preg\_replace function. The first string is the argument that should be matched, the second is the replacement.
|
||||||
|
|
||||||
|
To replace all occurrences of links to smaller images for twogag, the following rule can be used:
|
||||||
|
|
||||||
|
|
||||||
|
```php
|
||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'filter' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
"%http://www.twogag.com/comics-rss/([^.]+)\\.jpg%" =>
|
||||||
|
"http://www.twogag.com/comics/$1.jpg"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
Feed and item properties
|
Feed and item properties
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
|
77
vendor/fguillot/picofeed/docs/grabber.markdown
vendored
77
vendor/fguillot/picofeed/docs/grabber.markdown
vendored
@ -15,23 +15,41 @@ How the content grabber works?
|
|||||||
Standalone usage
|
Standalone usage
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
|
Fetch remote content:
|
||||||
|
|
||||||
```php
|
```php
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
use PicoFeed\Client\Grabber;
|
use PicoFeed\Config\Config;
|
||||||
|
use PicoFeed\Scraper\Scraper;
|
||||||
|
|
||||||
$grabber = new Grabber($item_url);
|
$config = new Config;
|
||||||
$grabber->download();
|
|
||||||
$grabber->parse();
|
$grabber = new Scraper($config);
|
||||||
|
$grabber->setUrl($url);
|
||||||
|
$grabber->execute();
|
||||||
|
|
||||||
// Get raw HTML content
|
// Get raw HTML content
|
||||||
echo $grabber->getRawContent();
|
echo $grabber->getRawContent();
|
||||||
|
|
||||||
// Get relevant content
|
// Get relevant content
|
||||||
echo $grabber->getContent();
|
echo $grabber->getRelevantContent();
|
||||||
|
|
||||||
// Get filtered relevant content
|
// Get filtered relevant content
|
||||||
echo $grabber->getFilteredContent();
|
echo $grabber->getFilteredContent();
|
||||||
|
|
||||||
|
// Return true if there is relevant content
|
||||||
|
var_dump($grabber->hasRelevantContent());
|
||||||
|
```
|
||||||
|
|
||||||
|
Parse HTML content:
|
||||||
|
|
||||||
|
```php
|
||||||
|
<?php
|
||||||
|
|
||||||
|
$grabber = new Scraper($config);
|
||||||
|
$grabber->setRawContent($html);
|
||||||
|
$grabber->execute();
|
||||||
```
|
```
|
||||||
|
|
||||||
Fetch full item contents during feed parsing
|
Fetch full item contents during feed parsing
|
||||||
@ -79,11 +97,11 @@ Configuration
|
|||||||
### Enable content grabber for items
|
### Enable content grabber for items
|
||||||
|
|
||||||
- Method name: `enableContentGrabber()`
|
- Method name: `enableContentGrabber()`
|
||||||
- Default value: false (content grabber is disabled by default)
|
- Default value: false (also fetch content if no rule file exists)
|
||||||
- Argument value: none
|
- Argument value: bool (true: scrape only webpages which have a rule file)
|
||||||
|
|
||||||
```php
|
```php
|
||||||
$parser->enableContentGrabber();
|
$parser->enableContentGrabber(false);
|
||||||
```
|
```
|
||||||
|
|
||||||
### Ignore item urls for the content grabber
|
### Ignore item urls for the content grabber
|
||||||
@ -106,6 +124,8 @@ Example with the BBC website, `www.bbc.co.uk.php`:
|
|||||||
```php
|
```php
|
||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
|
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="story-body"]',
|
'//div[@class="story-body"]',
|
||||||
@ -123,13 +143,52 @@ return array(
|
|||||||
'//*[@id="also-related-links"]',
|
'//*[@id="also-related-links"]',
|
||||||
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
|
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
```
|
```
|
||||||
|
Each rule file can contain multiple rules, so that links to different URLs of the same website can be handled differently. The first level key is a regex, which will be matched against the full path of the URL using **preg_match**, e.g. for **http://www.bbc.co.uk/news/world-middle-east-23911833?test=1** the part of the URL that would be matched is **/news/world-middle-east-23911833?test=1**.
|
||||||
|
|
||||||
Actually, only `body`, `strip` and `test_url` are supported.
|
Each rule has the following keys:
|
||||||
|
* **body**: An array of XPath expressions; the nodes they match will be extracted from the page
|
||||||
|
* **strip**: An array of XPath expressions; the nodes they match will be removed from the matched content
|
||||||
|
* **test_url**: The URL of a matching page, used to test the grabber
|
||||||
|
|
||||||
Don't forget to send a pull request or a ticket to share your contribution with everybody,
|
Don't forget to send a pull request or a ticket to share your contribution with everybody,
|
||||||
|
|
||||||
|
**A more complex example**:
|
||||||
|
|
||||||
|
Let's say you want to extract the div with the id **video** if the article points to a URL like **http://comix.com/videos/423**, the div with the id **audio** if the article points to a URL like **http://comix.com/podcasts/5**, and the div with the id **content** for all other links to the site. The following rule file would fit that requirement and would be stored in a file called **lib/PicoFeed/Rules/comix.com.php**:
|
||||||
|
|
||||||
|
|
||||||
|
```php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%^/videos.*%' => array(
|
||||||
|
'test_url' => 'http://comix.com/videos/423',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="video"]',
|
||||||
|
),
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%^/podcasts.*%' => array(
|
||||||
|
'test_url' => 'http://comix.com/podcasts/5',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="audio"]',
|
||||||
|
),
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://comix.com/blog/1',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@id="content"]',
|
||||||
|
),
|
||||||
|
'strip' => array()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
List of content grabber rules
|
List of content grabber rules
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
||||||
|
@ -80,7 +80,7 @@ class Curl extends Client
|
|||||||
{
|
{
|
||||||
$length = strlen($buffer);
|
$length = strlen($buffer);
|
||||||
|
|
||||||
if ($buffer === "\r\n") {
|
if ($buffer === "\r\n" || $buffer === "\n") {
|
||||||
$this->response_headers_count++;
|
$this->response_headers_count++;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -162,6 +162,7 @@ class Curl extends Client
|
|||||||
* Prepare curl proxy context
|
* Prepare curl proxy context
|
||||||
*
|
*
|
||||||
* @access private
|
* @access private
|
||||||
|
* @param resource $ch
|
||||||
* @return resource $ch
|
* @return resource $ch
|
||||||
*/
|
*/
|
||||||
private function prepareProxyContext($ch)
|
private function prepareProxyContext($ch)
|
||||||
@ -190,6 +191,7 @@ class Curl extends Client
|
|||||||
* Prepare curl auth context
|
* Prepare curl auth context
|
||||||
*
|
*
|
||||||
* @access private
|
* @access private
|
||||||
|
* @param resource $ch
|
||||||
* @return resource $ch
|
* @return resource $ch
|
||||||
*/
|
*/
|
||||||
private function prepareAuthContext($ch)
|
private function prepareAuthContext($ch)
|
||||||
@ -205,6 +207,7 @@ class Curl extends Client
|
|||||||
* Set write/header functions
|
* Set write/header functions
|
||||||
*
|
*
|
||||||
* @access private
|
* @access private
|
||||||
|
* @param resource $ch
|
||||||
* @return resource $ch
|
* @return resource $ch
|
||||||
*/
|
*/
|
||||||
private function prepareDownloadMode($ch)
|
private function prepareDownloadMode($ch)
|
||||||
@ -305,7 +308,7 @@ class Curl extends Client
|
|||||||
{
|
{
|
||||||
$this->executeContext();
|
$this->executeContext();
|
||||||
|
|
||||||
list($status, $headers) = HttpHeaders::parse(explode("\r\n", $this->response_headers[$this->response_headers_count - 1]));
|
list($status, $headers) = HttpHeaders::parse(explode("\n", $this->response_headers[$this->response_headers_count - 1]));
|
||||||
|
|
||||||
// When restricted with open_basedir
|
// When restricted with open_basedir
|
||||||
if ($this->needToHandleRedirection($follow_location, $status)) {
|
if ($this->needToHandleRedirection($follow_location, $status)) {
|
||||||
|
@ -1,592 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace PicoFeed\Client;
|
|
||||||
|
|
||||||
use DOMXPath;
|
|
||||||
use PicoFeed\Encoding\Encoding;
|
|
||||||
use PicoFeed\Logging\Logger;
|
|
||||||
use PicoFeed\Filter\Filter;
|
|
||||||
use PicoFeed\Parser\XmlParser;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Grabber class
|
|
||||||
*
|
|
||||||
* @author Frederic Guillot
|
|
||||||
* @package Client
|
|
||||||
*/
|
|
||||||
class Grabber
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* URL
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var string
|
|
||||||
*/
|
|
||||||
private $url = '';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Relevant content
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var string
|
|
||||||
*/
|
|
||||||
private $content = '';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* HTML content
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var string
|
|
||||||
*/
|
|
||||||
private $html = '';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* HTML content encoding
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var string
|
|
||||||
*/
|
|
||||||
private $encoding = '';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Flag to skip download and parsing
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var boolean
|
|
||||||
*/
|
|
||||||
private $skip_processing = false;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* List of attributes to try to get the content, order is important, generic terms at the end
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var array
|
|
||||||
*/
|
|
||||||
private $candidatesAttributes = array(
|
|
||||||
'articleBody',
|
|
||||||
'articlebody',
|
|
||||||
'article-body',
|
|
||||||
'articleContent',
|
|
||||||
'articlecontent',
|
|
||||||
'article-content',
|
|
||||||
'articlePage',
|
|
||||||
'post-content',
|
|
||||||
'post_content',
|
|
||||||
'entry-content',
|
|
||||||
'entry-body',
|
|
||||||
'main-content',
|
|
||||||
'story_content',
|
|
||||||
'storycontent',
|
|
||||||
'entryBox',
|
|
||||||
'entrytext',
|
|
||||||
'comic',
|
|
||||||
'post',
|
|
||||||
'article',
|
|
||||||
'content',
|
|
||||||
'main',
|
|
||||||
);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* List of attributes to strip
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var array
|
|
||||||
*/
|
|
||||||
private $stripAttributes = array(
|
|
||||||
'comment',
|
|
||||||
'share',
|
|
||||||
'links',
|
|
||||||
'toolbar',
|
|
||||||
'fb',
|
|
||||||
'footer',
|
|
||||||
'credit',
|
|
||||||
'bottom',
|
|
||||||
'nav',
|
|
||||||
'header',
|
|
||||||
'social',
|
|
||||||
'tag',
|
|
||||||
'metadata',
|
|
||||||
'entry-utility',
|
|
||||||
'related-posts',
|
|
||||||
'tweet',
|
|
||||||
'categories',
|
|
||||||
'post_title',
|
|
||||||
'by_line',
|
|
||||||
'byline',
|
|
||||||
'sponsors',
|
|
||||||
);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tags to remove
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var array
|
|
||||||
*/
|
|
||||||
private $stripTags = array(
|
|
||||||
'nav',
|
|
||||||
'header',
|
|
||||||
'footer',
|
|
||||||
'aside',
|
|
||||||
'form',
|
|
||||||
);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Config object
|
|
||||||
*
|
|
||||||
* @access private
|
|
||||||
* @var \PicoFeed\Config\Config
|
|
||||||
*/
|
|
||||||
private $config;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @param string $url Url
|
|
||||||
* @param string $html HTML content
|
|
||||||
* @param string $encoding Charset
|
|
||||||
*/
|
|
||||||
public function __construct($url, $html = '', $encoding = 'utf-8')
|
|
||||||
{
|
|
||||||
$this->url = $url;
|
|
||||||
$this->html = $html;
|
|
||||||
$this->encoding = $encoding;
|
|
||||||
|
|
||||||
$this->handleFiles();
|
|
||||||
$this->handleStreamingVideos();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Set config object
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @param \PicoFeed\Config\Config $config Config instance
|
|
||||||
* @return Grabber
|
|
||||||
*/
|
|
||||||
public function setConfig($config)
|
|
||||||
{
|
|
||||||
$this->config = $config;
|
|
||||||
return $this;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get URL to download.
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getUrl()
|
|
||||||
{
|
|
||||||
return $this->url;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Set URL to download and reset object to use for another grab.
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @param string $url URL
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function setUrl($url)
|
|
||||||
{
|
|
||||||
$this->url = $url;
|
|
||||||
$this->html = "";
|
|
||||||
$this->content = "";
|
|
||||||
$this->encoding = "";
|
|
||||||
|
|
||||||
$this->handleFiles();
|
|
||||||
$this->handleStreamingVideos();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get relevant content
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getContent()
|
|
||||||
{
|
|
||||||
return $this->content;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get raw content (unfiltered)
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getRawContent()
|
|
||||||
{
|
|
||||||
return $this->html;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get filtered relevant content
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getFilteredContent()
|
|
||||||
{
|
|
||||||
$filter = Filter::html($this->content, $this->url);
|
|
||||||
$filter->setConfig($this->config);
|
|
||||||
return $filter->execute();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return the Youtube embed player and skip processing
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function handleStreamingVideos()
|
|
||||||
{
|
|
||||||
if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
|
|
||||||
$this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
|
|
||||||
$this->skip_processing = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Skip processing for PDF documents
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function handleFiles()
|
|
||||||
{
|
|
||||||
if (substr($this->url, -3) === 'pdf') {
|
|
||||||
$this->skip_processing = true;
|
|
||||||
Logger::setMessage(get_called_class().': PDF document => processing skipped');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Parse the HTML content
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return bool
|
|
||||||
*/
|
|
||||||
public function parse()
|
|
||||||
{
|
|
||||||
if ($this->skip_processing) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($this->html) {
|
|
||||||
$html_encoding = XmlParser::getEncodingFromMetaTag($this->html);
|
|
||||||
|
|
||||||
// Encode everything in UTF-8
|
|
||||||
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
|
|
||||||
$this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
|
|
||||||
$this->html = Filter::stripHeadTags($this->html);
|
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
|
|
||||||
$rules = $this->getRules();
|
|
||||||
|
|
||||||
if (! empty($rules)) {
|
|
||||||
Logger::setMessage(get_called_class().': Parse content with rules');
|
|
||||||
$this->parseContentWithRules($rules);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Logger::setMessage(get_called_class().': Parse content with candidates');
|
|
||||||
$this->parseContentWithCandidates();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Logger::setMessage(get_called_class().': No content fetched');
|
|
||||||
}
|
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
|
|
||||||
Logger::setMessage(get_called_class().': Grabber done');
|
|
||||||
|
|
||||||
return $this->content !== '';
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Download the HTML content
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return HTML content
|
|
||||||
*/
|
|
||||||
public function download()
|
|
||||||
{
|
|
||||||
if (! $this->skip_processing && $this->url != '') {
|
|
||||||
|
|
||||||
try {
|
|
||||||
|
|
||||||
$client = Client::getInstance();
|
|
||||||
|
|
||||||
if ($this->config !== null) {
|
|
||||||
$client->setConfig($this->config);
|
|
||||||
$client->setTimeout($this->config->getGrabberTimeout());
|
|
||||||
$client->setUserAgent($this->config->getGrabberUserAgent());
|
|
||||||
}
|
|
||||||
|
|
||||||
$client->execute($this->url);
|
|
||||||
|
|
||||||
$this->url = $client->getUrl();
|
|
||||||
$this->html = $client->getContent();
|
|
||||||
$this->encoding = $client->getEncoding();
|
|
||||||
}
|
|
||||||
catch (ClientException $e) {
|
|
||||||
Logger::setMessage(get_called_class().': '.$e->getMessage());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $this->html;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Try to find a predefined rule
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return array
|
|
||||||
*/
|
|
||||||
public function getRules()
|
|
||||||
{
|
|
||||||
$hostname = parse_url($this->url, PHP_URL_HOST);
|
|
||||||
|
|
||||||
if ($hostname !== false) {
|
|
||||||
|
|
||||||
$files = $this->getRulesFileList($hostname);
|
|
||||||
|
|
||||||
foreach ($this->getRulesFolders() as $folder) {
|
|
||||||
$rule = $this->loadRuleFile($folder, $files);
|
|
||||||
|
|
||||||
if (! empty($rule)) {
|
|
||||||
return $rule;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return array();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the list of possible rules file names for a given hostname
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @param string $hostname Hostname
|
|
||||||
* @return array
|
|
||||||
*/
|
|
||||||
public function getRulesFileList($hostname)
|
|
||||||
{
|
|
||||||
$files = array($hostname); // subdomain.domain.tld
|
|
||||||
$parts = explode('.', $hostname);
|
|
||||||
$len = count($parts);
|
|
||||||
|
|
||||||
if ($len > 2) {
|
|
||||||
$subdomain = array_shift($parts);
|
|
||||||
$files[] = implode('.', $parts); // domain.tld
|
|
||||||
$files[] = '.'.implode('.', $parts); // .domain.tld
|
|
||||||
$files[] = $subdomain; // subdomain
|
|
||||||
}
|
|
||||||
else if ($len === 2) {
|
|
||||||
$files[] = '.'.implode('.', $parts); // .domain.tld
|
|
||||||
$files[] = $parts[0]; // domain
|
|
||||||
}
|
|
||||||
|
|
||||||
return $files;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Load a rule file from the defined folder
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @param string $folder Rule directory
|
|
||||||
* @param array $files List of possible file names
|
|
||||||
* @return array
|
|
||||||
*/
|
|
||||||
public function loadRuleFile($folder, array $files)
|
|
||||||
{
|
|
||||||
foreach ($files as $file) {
|
|
||||||
$filename = $folder.'/'.$file.'.php';
|
|
||||||
|
|
||||||
if (file_exists($filename)) {
|
|
||||||
Logger::setMessage(get_called_class().' Load rule: '.$file);
|
|
||||||
return include $filename;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return array();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the list of folders that contains rules
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @return array
|
|
||||||
*/
|
|
||||||
public function getRulesFolders()
|
|
||||||
{
|
|
||||||
$folders = array(__DIR__.'/../Rules');
|
|
||||||
|
|
||||||
if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
|
|
||||||
$folders[] = $this->config->getGrabberRulesFolder();
|
|
||||||
}
|
|
||||||
|
|
||||||
return $folders;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the relevant content with predefined rules
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @param array $rules Rules
|
|
||||||
*/
|
|
||||||
public function parseContentWithRules(array $rules)
|
|
||||||
{
|
|
||||||
// Logger::setMessage($this->html);
|
|
||||||
$dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
|
|
||||||
$xpath = new DOMXPath($dom);
|
|
||||||
|
|
||||||
if (isset($rules['strip']) && is_array($rules['strip'])) {
|
|
||||||
|
|
||||||
foreach ($rules['strip'] as $pattern) {
|
|
||||||
|
|
||||||
$nodes = $xpath->query($pattern);
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
foreach ($nodes as $node) {
|
|
||||||
$node->parentNode->removeChild($node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isset($rules['body']) && is_array($rules['body'])) {
|
|
||||||
|
|
||||||
foreach ($rules['body'] as $pattern) {
|
|
||||||
|
|
||||||
$nodes = $xpath->query($pattern);
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
foreach ($nodes as $node) {
|
|
||||||
$this->content .= $dom->saveXML($node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the relevant content with the list of potential attributes
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
*/
|
|
||||||
public function parseContentWithCandidates()
|
|
||||||
{
|
|
||||||
$dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
|
|
||||||
$xpath = new DOMXPath($dom);
|
|
||||||
|
|
||||||
// Try to lookup in each tag
|
|
||||||
foreach ($this->candidatesAttributes as $candidate) {
|
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
|
|
||||||
|
|
||||||
$nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
|
||||||
Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to fetch <article/>
|
|
||||||
if (strlen($this->content) < 200) {
|
|
||||||
|
|
||||||
$nodes = $xpath->query('//article');
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
|
||||||
Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get everything
|
|
||||||
if (strlen($this->content) < 50) {
|
|
||||||
|
|
||||||
$nodes = $xpath->query('//body');
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
Logger::setMessage(get_called_class().' No enought content fetched, get //body');
|
|
||||||
$this->content = $dom->saveXML($nodes->item(0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Logger::setMessage(get_called_class().': Strip garbage');
|
|
||||||
$this->stripGarbage();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Strip useless tags
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
*/
|
|
||||||
public function stripGarbage()
|
|
||||||
{
|
|
||||||
$dom = XmlParser::getDomDocument($this->content);
|
|
||||||
|
|
||||||
if ($dom !== false) {
|
|
||||||
|
|
||||||
$xpath = new DOMXPath($dom);
|
|
||||||
|
|
||||||
foreach ($this->stripTags as $tag) {
|
|
||||||
|
|
||||||
$nodes = $xpath->query('//'.$tag);
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
|
|
||||||
foreach ($nodes as $node) {
|
|
||||||
$node->parentNode->removeChild($node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach ($this->stripAttributes as $attribute) {
|
|
||||||
|
|
||||||
$nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
|
|
||||||
|
|
||||||
if ($nodes !== false && $nodes->length > 0) {
|
|
||||||
Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
|
|
||||||
foreach ($nodes as $node) {
|
|
||||||
if ($this->shouldRemove($dom, $node)) {
|
|
||||||
$node->parentNode->removeChild($node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->content = $dom->saveXML($dom->documentElement);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return false if the node should not be removed
|
|
||||||
*
|
|
||||||
* @access public
|
|
||||||
* @param DomDocument $dom
|
|
||||||
* @param DomNode $node
|
|
||||||
* @return boolean
|
|
||||||
*/
|
|
||||||
public function shouldRemove($dom, $node)
|
|
||||||
{
|
|
||||||
$document_length = strlen($dom->textContent);
|
|
||||||
$node_length = strlen($node->textContent);
|
|
||||||
|
|
||||||
if ($document_length === 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
$ratio = $node_length * 100 / $document_length;
|
|
||||||
|
|
||||||
if ($ratio >= 90) {
|
|
||||||
Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
@ -235,6 +235,7 @@ class Attribute
|
|||||||
'filterProtocolUrlAttribute',
|
'filterProtocolUrlAttribute',
|
||||||
'rewriteImageProxyUrl',
|
'rewriteImageProxyUrl',
|
||||||
'secureIframeSrc',
|
'secureIframeSrc',
|
||||||
|
'removeYouTubeAutoplay'
|
||||||
);
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -404,6 +405,25 @@ class Attribute
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removes YouTube autoplay from iframes
|
||||||
|
*
|
||||||
|
* @access public
|
||||||
|
* @param string $tag Tag name
|
||||||
|
* @param string $attribute Attribute name
|
||||||
|
* @param string $value Attribute value
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
public function removeYouTubeAutoplay($tag, $attribute, &$value)
|
||||||
|
{
|
||||||
|
$regex = '%^(https://(?:www\.)?youtube.com/.*\?.*autoplay=)(1)(.*)%i';
|
||||||
|
if ($tag === 'iframe' && $attribute === 'src' && preg_match($regex, $value)) {
|
||||||
|
$value = preg_replace($regex, '${1}0$3', $value);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Rewrite image url to use with a proxy
|
* Rewrite image url to use with a proxy
|
||||||
*
|
*
|
||||||
|
@ -2,7 +2,9 @@
|
|||||||
|
|
||||||
namespace PicoFeed\Filter;
|
namespace PicoFeed\Filter;
|
||||||
|
|
||||||
|
use PicoFeed\Config\Config;
|
||||||
use PicoFeed\Client\Url;
|
use PicoFeed\Client\Url;
|
||||||
|
use PicoFeed\Scraper\RuleLoader;
|
||||||
use PicoFeed\Parser\XmlParser;
|
use PicoFeed\Parser\XmlParser;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -69,6 +71,14 @@ class Html
|
|||||||
*/
|
*/
|
||||||
public $attribute = '';
|
public $attribute = '';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The website to filter
|
||||||
|
*
|
||||||
|
* @access private
|
||||||
|
* @var string
|
||||||
|
*/
|
||||||
|
private $website;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the filter, all inputs data must be encoded in UTF-8 before
|
* Initialize the filter, all inputs data must be encoded in UTF-8 before
|
||||||
*
|
*
|
||||||
@ -81,6 +91,7 @@ class Html
|
|||||||
$this->input = XmlParser::HtmlToXml($html);
|
$this->input = XmlParser::HtmlToXml($html);
|
||||||
$this->output = '';
|
$this->output = '';
|
||||||
$this->tag = new Tag;
|
$this->tag = new Tag;
|
||||||
|
$this->website = $website;
|
||||||
$this->attribute = new Attribute(new Url($website));
|
$this->attribute = new Attribute(new Url($website));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -155,9 +166,45 @@ class Html
|
|||||||
public function postFilter()
|
public function postFilter()
|
||||||
{
|
{
|
||||||
$this->output = $this->tag->removeEmptyTags($this->output);
|
$this->output = $this->tag->removeEmptyTags($this->output);
|
||||||
|
$this->output = $this->filterRules($this->output);
|
||||||
|
$this->output = $this->tag->removeMultipleBreakTags($this->output);
|
||||||
$this->output = trim($this->output);
|
$this->output = trim($this->output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Called after XML parsing
|
||||||
|
* @param string $content the content that should be filtered
|
||||||
|
*
|
||||||
|
* @access public
|
||||||
|
*/
|
||||||
|
public function filterRules($content)
|
||||||
|
{
|
||||||
|
// the constructor should require a config, then this if can be removed
|
||||||
|
if ($this->config === null) {
|
||||||
|
$config = new Config;
|
||||||
|
} else {
|
||||||
|
$config = $this->config;
|
||||||
|
}
|
||||||
|
|
||||||
|
$loader = new RuleLoader($config);
|
||||||
|
$rules = $loader->getRules($this->website);
|
||||||
|
|
||||||
|
$url = new Url($this->website);
|
||||||
|
$sub_url = $url->getFullPath();
|
||||||
|
|
||||||
|
if (isset($rules['filter'])) {
|
||||||
|
foreach ($rules['filter'] as $pattern => $rule) {
|
||||||
|
if (preg_match($pattern, $sub_url)) {
|
||||||
|
foreach($rule as $search => $replace) {
|
||||||
|
$content = preg_replace($search, $replace, $content);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return $content;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse opening tag
|
* Parse opening tag
|
||||||
*
|
*
|
||||||
|
@ -194,7 +194,7 @@ class Tag
|
|||||||
* @param string $data Input data
|
* @param string $data Input data
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
public function removeMultipleTags($data)
|
public function removeMultipleBreakTags($data)
|
||||||
{
|
{
|
||||||
return preg_replace("/(<br\s*\/?>\s*)+/", "<br/>", $data);
|
return preg_replace("/(<br\s*\/?>\s*)+/", "<br/>", $data);
|
||||||
}
|
}
|
||||||
|
@ -3,11 +3,11 @@
|
|||||||
namespace PicoFeed\Parser;
|
namespace PicoFeed\Parser;
|
||||||
|
|
||||||
use SimpleXMLElement;
|
use SimpleXMLElement;
|
||||||
|
use PicoFeed\Client\Url;
|
||||||
use PicoFeed\Encoding\Encoding;
|
use PicoFeed\Encoding\Encoding;
|
||||||
use PicoFeed\Filter\Filter;
|
use PicoFeed\Filter\Filter;
|
||||||
use PicoFeed\Logging\Logger;
|
use PicoFeed\Logging\Logger;
|
||||||
use PicoFeed\Client\Url;
|
use PicoFeed\Scraper\Scraper;
|
||||||
use PicoFeed\Client\Grabber;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Base parser class
|
* Base parser class
|
||||||
@ -81,6 +81,14 @@ abstract class Parser
|
|||||||
*/
|
*/
|
||||||
private $enable_grabber = false;
|
private $enable_grabber = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Restrict the content grabber to pages that have a rule file
|
||||||
|
*
|
||||||
|
* @access private
|
||||||
|
* @var bool
|
||||||
|
*/
|
||||||
|
private $grabber_needs_rule_file = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Ignore those urls for the content scraper
|
* Ignore those urls for the content scraper
|
||||||
*
|
*
|
||||||
@ -237,11 +245,16 @@ abstract class Parser
|
|||||||
{
|
{
|
||||||
if ($this->enable_grabber && ! in_array($item->getUrl(), $this->grabber_ignore_urls)) {
|
if ($this->enable_grabber && ! in_array($item->getUrl(), $this->grabber_ignore_urls)) {
|
||||||
|
|
||||||
$grabber = new Grabber($item->getUrl());
|
$grabber = new Scraper($this->config);
|
||||||
$grabber->setConfig($this->config);
|
$grabber->setUrl($item->getUrl());
|
||||||
$grabber->download();
|
|
||||||
|
|
||||||
if ($grabber->parse()) {
|
if ($this->grabber_needs_rule_file) {
|
||||||
|
$grabber->disableCandidateParser();
|
||||||
|
}
|
||||||
|
|
||||||
|
$grabber->execute();
|
||||||
|
|
||||||
|
if ($grabber->hasRelevantContent()) {
|
||||||
$item->content = $grabber->getFilteredContent();
|
$item->content = $grabber->getFilteredContent();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -270,7 +283,6 @@ abstract class Parser
|
|||||||
* Generate a unique id for an entry (hash all arguments)
|
* Generate a unique id for an entry (hash all arguments)
|
||||||
*
|
*
|
||||||
* @access public
|
* @access public
|
||||||
* @param string $args Pieces of data to hash
|
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
public function generateId()
|
public function generateId()
|
||||||
@ -383,11 +395,14 @@ abstract class Parser
|
|||||||
* Enable the content grabber
|
* Enable the content grabber
|
||||||
*
|
*
|
||||||
* @access public
|
* @access public
|
||||||
|
* @param bool $needs_rule_file true if only pages with rule files should be
|
||||||
|
* scraped
|
||||||
* @return \PicoFeed\Parser\Parser
|
* @return \PicoFeed\Parser\Parser
|
||||||
*/
|
*/
|
||||||
public function enableContentGrabber()
|
public function enableContentGrabber($needs_rule_file = false)
|
||||||
{
|
{
|
||||||
$this->enable_grabber = true;
|
$this->enable_grabber = true;
|
||||||
|
$this->grabber_needs_rule_file = $needs_rule_file;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
|
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="entry-content"]',
|
'//div[@class="entry-content"]',
|
||||||
@ -7,4 +9,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
|
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'title' => '//header/h1',
|
'title' => '//header/h1',
|
||||||
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
|
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
@ -8,4 +10,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//*[@class="shareToolsBox"]',
|
'//*[@class="shareToolsBox"]',
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.igen.fr/ailleurs/2014/05/nvidia-va-delaisser-les-smartphones-grand-public-86031',
|
'test_url' => 'http://www.igen.fr/ailleurs/2014/05/nvidia-va-delaisser-les-smartphones-grand-public-86031',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "field-name-body")]'
|
'//div[contains(@class, "field-name-body")]'
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,8 +1,11 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
|
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
|
||||||
'title' => '//h1[@class="articleHeadline"]',
|
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="articleBody"]',
|
'//div[@class="articleBody"]',
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1',
|
'test_url' => 'http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="KonaBody"]',
|
'//div[@class="KonaBody"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array()
|
||||||
|
)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
|
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="sl-art-body"]',
|
'//div[@class="sl-art-body"]',
|
||||||
@ -13,4 +15,6 @@ return array(
|
|||||||
'//*[@class="sl-art-creds-cntr"]',
|
'//*[@class="sl-art-creds-cntr"]',
|
||||||
'//*[@class="sl-art-ad-midflex"]',
|
'//*[@class="sl-art-ad-midflex"]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.theguardian.com/sustainable-business/2015/feb/02/2015-hyper-transparency-global-business',
|
'test_url' => 'http://www.theguardian.com/sustainable-business/2015/feb/02/2015-hyper-transparency-global-business',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "content__main-column--article")]',
|
'//div[contains(@class, "content__main-column--article")]',
|
||||||
@ -7,4 +9,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//div[contains(@class, "meta-container")]',
|
'//div[contains(@class, "meta-container")]',
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
|
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@id="bodyContent"]',
|
'//div[@id="bodyContent"]',
|
||||||
@ -22,4 +24,6 @@ return array(
|
|||||||
"//*[contains(@class, 'error')]",
|
"//*[contains(@class, 'error')]",
|
||||||
"//span[@title='pronunciation:']",
|
"//span[@title='pronunciation:']",
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.wired.com/gamelife/2013/09/ouya-free-the-games/',
|
'test_url' => 'http://www.wired.com/gamelife/2013/09/ouya-free-the-games/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="entry"]',
|
'//div[@class="entry"]',
|
||||||
@ -14,4 +16,6 @@ return array(
|
|||||||
'//img[contains(@src, "1x1")]',
|
'//img[contains(@src, "1x1")]',
|
||||||
'//a[contains(@href, "creativecommons")]',
|
'//a[contains(@href, "creativecommons")]',
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
|
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="articlePage"]',
|
'//div[@class="articlePage"]',
|
||||||
@ -8,4 +10,6 @@ return array(
|
|||||||
'//*[@id="articleThumbnail_2"]',
|
'//*[@id="articleThumbnail_2"]',
|
||||||
'//*[@class="socialByline"]',
|
'//*[@class="socialByline"]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.01net.com/editorial/624550/twitter-rachete-madbits-un-specialiste-francais-de-lanalyse-dimages/',
|
'test_url' => 'http://www.01net.com/editorial/624550/twitter-rachete-madbits-un-specialiste-francais-de-lanalyse-dimages/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="article_ventre_box"]',
|
'//div[@class="article_ventre_box"]',
|
||||||
@ -11,4 +13,6 @@ return array(
|
|||||||
'//*[contains(@class, "article_toolbarMain")]',
|
'//*[contains(@class, "article_toolbarMain")]',
|
||||||
'//*[contains(@class, "article_imagehaute_box")]'
|
'//*[contains(@class, "article_imagehaute_box")]'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.alainonline.net/news_details.php?lang=arabic&sid=18907',
|
'test_url' => 'http://www.alainonline.net/news_details.php?lang=arabic&sid=18907',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="news_details"]'
|
'//div[@class="news_details"]'
|
||||||
@ -7,4 +9,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//div[@class="news_details"]/div/div[last()]',
|
'//div[@class="news_details"]/div/div[last()]',
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.allgemeine-zeitung.de/lokales/polizei/mainz-gonsenheim-unbekannte-rauben-esso-tankstelle-in-kurt-schumacher-strasse-aus_14913147.htm',
|
'test_url' => 'http://www.allgemeine-zeitung.de/lokales/polizei/mainz-gonsenheim-unbekannte-rauben-esso-tankstelle-in-kurt-schumacher-strasse-aus_14913147.htm',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "article")][1]',
|
'//div[contains(@class, "article")][1]',
|
||||||
@ -17,4 +18,6 @@ return array(
|
|||||||
'//span[@class="nd address"]',
|
'//span[@class="nd address"]',
|
||||||
'//a[contains(@href, "abo-und-services")]'
|
'//a[contains(@href, "abo-und-services")]'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/areadvd.de.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/areadvd.de.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://www.areadvd.de/news/daily-deals-angebote-bei-lautsprecher-teufel-3/',
|
||||||
|
'body' => array('//div[contains(@class,"entry")]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/awkwardzombie.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/awkwardzombie.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%/index.php.*comic=.*%' => array(
|
||||||
|
'test_url' => 'http://www.awkwardzombie.com/index.php?comic=041315',
|
||||||
|
'body' => array('//*[@id="comic"]/img'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://blog.fefe.de/?ts=ad706a73',
|
'test_url' => 'http://blog.fefe.de/?ts=ad706a73',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'/html/body/ul'
|
'/html/body/ul'
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.bunicomic.com/comic/buni-623/',
|
'test_url' => 'http://www.bunicomic.com/comic/buni-623/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="comic-table"]',
|
'//div[@class="comic-table"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/cad-comic.com.php
vendored
Normal file
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/cad-comic.com.php
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%/cad/.+%' => array(
|
||||||
|
'test_url' => 'http://www.cad-comic.com/cad/20150417',
|
||||||
|
'body' => array(
|
||||||
|
'//*[@id="content"]/img'
|
||||||
|
),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/chaoslife.findchaos.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/chaoslife.findchaos.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://chaoslife.findchaos.com/pets-in-the-wild',
|
||||||
|
'body' => array('//div[@id="comic"]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/cliquerefresh.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/cliquerefresh.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%/comic.*%' => array(
|
||||||
|
'test_url' => 'http://cliquerefresh.com/comic/078-stating-the-obvious/',
|
||||||
|
'body' => array('//div[@class="comicImg"]/img | //div[@class="comicImg"]/a/img'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://consomac.fr/news-2430-l-iphone-6-toujours-un-secret-bien-garde.html',
|
'test_url' => 'http://consomac.fr/news-2430-l-iphone-6-toujours-un-secret-bien-garde.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@id, "newscontent")]',
|
'//div[contains(@id, "newscontent")]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
4
vendor/fguillot/picofeed/lib/PicoFeed/Rules/dailyjs.com.php
vendored
Executable file → Normal file
4
vendor/fguillot/picofeed/lib/PicoFeed/Rules/dailyjs.com.php
vendored
Executable file → Normal file
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://dailyjs.com/2014/08/07/p5js/',
|
'test_url' => 'http://dailyjs.com/2014/08/07/p5js/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@id="post"]',
|
'//div[@id="post"]',
|
||||||
@ -12,4 +14,6 @@ return array(
|
|||||||
'//*[@class="navigation small"]',
|
'//*[@class="navigation small"]',
|
||||||
'//*[@id="related"]',
|
'//*[@id="related"]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.degroupnews.com/medias/vodsvod/amazon-concurrence-la-chromecast-de-google-avec-fire-tv-stick',
|
'test_url' => 'http://www.degroupnews.com/medias/vodsvod/amazon-concurrence-la-chromecast-de-google-avec-fire-tv-stick',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="contenu"]',
|
'//div[@class="contenu"]',
|
||||||
@ -7,4 +9,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//div[contains(@class, "a2a")]'
|
'//div[contains(@class, "a2a")]'
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://derstandard.at/2000010267354/The-Witcher-3-Hohe-Hardware-Anforderungen-fuer-PC-Spieler?ref=rss',
|
'test_url' => 'http://derstandard.at/2000010267354/The-Witcher-3-Hohe-Hardware-Anforderungen-fuer-PC-Spieler?ref=rss',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="copytext"]',
|
'//div[@class="copytext"]',
|
||||||
@ -7,4 +9,6 @@ return array(
|
|||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://distrowatch.com/?newsid=08355',
|
'test_url' => 'http://distrowatch.com/?newsid=08355',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//td[@class="NewsText"][1]',
|
'//td[@class="NewsText"][1]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://dozodomo.com/bento/2014/03/04/lart-des-maki-de-takayo-kiyota/',
|
'test_url' => 'http://dozodomo.com/bento/2014/03/04/lart-des-maki-de-takayo-kiyota/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="joke"]',
|
'//div[@class="joke"]',
|
||||||
@ -8,4 +10,6 @@ return array(
|
|||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/engadget.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/engadget.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://www.engadget.com/2015/04/20/dark-matter-discovery/?ncid=rss_truncated',
|
||||||
|
'body' => array('//div[@class="article-content"]/p[not(@class="read-more")] | //div[@class="article-content"]/div[@style="text-align: center;"]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
42
vendor/fguillot/picofeed/lib/PicoFeed/Rules/escapistmagazine.com.php
vendored
Normal file
42
vendor/fguillot/picofeed/lib/PicoFeed/Rules/escapistmagazine.com.php
vendored
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%/articles/view/comicsandcosplay/comics/critical-miss.*%' => array(
|
||||||
|
'body' => array('//*[@class="body"]/span/img | //div[@class="folder_nav_links"]/following::p'),
|
||||||
|
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/critical-miss/13776-Critical-Miss-on-Framerates?utm_source=rss&utm_medium=rss&utm_campaign=articles',
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%/articles/view/comicsandcosplay/comics/namegame.*%' => array(
|
||||||
|
'body' => array('//*[@class="body"]/span/p/img[@height != "120"]'),
|
||||||
|
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/namegame/9759-Leaving-the-Nest?utm_source=rss&utm_medium=rss&utm_campaign=articles',
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%/articles/view/comicsandcosplay/comics/stolen-pixels.*%' => array(
|
||||||
|
'body' => array('//*[@class="body"]/span/p[2]/img'),
|
||||||
|
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/stolen-pixels/8866-Stolen-Pixels-258-Where-the-Boys-Are?utm_source=rss&utm_medium=rss&utm_campaign=articles',
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%/articles/view/comicsandcosplay/comics/bumhugparade.*%' => array(
|
||||||
|
'body' => array('//*[@class="body"]/span/p[2]/img'),
|
||||||
|
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/bumhugparade/8262-Bumhug-Parade-13?utm_source=rss&utm_medium=rss&utm_campaign=articles',
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%/articles/view/comicsandcosplay.*/comics/escapistradiotheater%' => array(
|
||||||
|
'body' => array('//*[@class="body"]/span/p[2]/img'),
|
||||||
|
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/escapistradiotheater/8265-The-Escapist-Radio-Theater-13?utm_source=rss&utm_medium=rss&utm_campaign=articles',
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%/articles/view/comicsandcosplay/comics/paused.*%' => array(
|
||||||
|
'body' => array('//*[@class="body"]/span/p[2]/img | //*[@class="body"]/span/div/img'),
|
||||||
|
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/paused/8263-Paused-16?utm_source=rss&utm_medium=rss&utm_campaign=articles',
|
||||||
|
'strip' => array()
|
||||||
|
),
|
||||||
|
'%/articles/view/comicsandcosplay/comics/fraughtwithperil.*%' => array(
|
||||||
|
'body' => array('//*[@class="body"]'),
|
||||||
|
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/fraughtwithperil/12166-The-Escapist-Presents-Escapist-Comics-Critical-Miss-B-lyeh-Fhlop?utm_source=rss&utm_medium=rss&utm_campaign=articles',
|
||||||
|
'strip' => array()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://explosm.net/comics/3803/',
|
'test_url' => 'http://explosm.net/comics/3803/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@id="comic-container"]',
|
'//div[@id="comic-container"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.fastcodesign.com/3026548/exposure/peek-inside-the-worlds-forbidden-subway-tunnels',
|
'test_url' => 'http://www.fastcodesign.com/3026548/exposure/peek-inside-the-worlds-forbidden-subway-tunnels',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//article[contains(@class, "body prose")]',
|
'//article[contains(@class, "body prose")]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.fastcoexist.com/3026114/take-a-seat-on-this-gates-funded-future-toilet-that-will-change-how-we-think-about-poop',
|
'test_url' => 'http://www.fastcoexist.com/3026114/take-a-seat-on-this-gates-funded-future-toilet-that-will-change-how-we-think-about-poop',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//article[contains(@class, "body prose")]',
|
'//article[contains(@class, "body prose")]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.fastcompany.com/3026712/fast-feed/elon-musk-an-apple-tesla-merger-is-very-unlikely',
|
'test_url' => 'http://www.fastcompany.com/3026712/fast-feed/elon-musk-an-apple-tesla-merger-is-very-unlikely',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//article[contains(@class, "body prose")]',
|
'//article[contains(@class, "body prose")]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.ffworld.com/?rub=news&page=voir&id=2709',
|
'test_url' => 'http://www.ffworld.com/?rub=news&page=voir&id=2709',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="news_body"]',
|
'//div[@class="news_body"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/fowllanguagecomics.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/fowllanguagecomics.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'body' => array('//*[@id="comic"] | //*[@class="post-image"]'),
|
||||||
|
'strip' => array(),
|
||||||
|
'test_url' => 'http://www.fowllanguagecomics.com/comic/working-out/'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'https://github.com/audreyr/favicon-cheat-sheet',
|
'test_url' => 'https://github.com/audreyr/favicon-cheat-sheet',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//article[contains(@class, "entry-content")]',
|
'//article[contains(@class, "entry-content")]',
|
||||||
@ -7,4 +9,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//h1'
|
'//h1'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,8 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.golem.de/news/breko-telekom-verzoegert-gezielt-den-vectoring-ausbau-1311-102974.html',
|
'test_url' => 'http://www.golem.de/news/breko-telekom-verzoegert-gezielt-den-vectoring-ausbau-1311-102974.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//header[@class="cluster-header"]',
|
'//header[@class="cluster-header"]',
|
||||||
'//div[@class="formatted"]'
|
'//div[@class="formatted"]'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,7 +1,11 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.heise.de/security/meldung/BND-300-Millionen-Euro-fuer-Fruehwarnsystem-gegen-Cyber-Attacken-2192237.html',
|
'test_url' => 'http://www.heise.de/security/meldung/BND-300-Millionen-Euro-fuer-Fruehwarnsystem-gegen-Cyber-Attacken-2192237.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="meldung_wrapper"]'
|
'//div[@class="meldung_wrapper"]'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.huffingtonpost.com/2014/02/20/centscere-social-media-syracuse_n_4823848.html',
|
'test_url' => 'http://www.huffingtonpost.com/2014/02/20/centscere-social-media-syracuse_n_4823848.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//article[@class="content"]',
|
'//article[@class="content"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,8 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://ing.dk/artikel/smart-husisolering-og-styring-skal-mindske-japans-energikrise-164517',
|
'test_url' => 'http://ing.dk/artikel/smart-husisolering-og-styring-skal-mindske-japans-energikrise-164517',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//section[contains(@class, "teaser")]',
|
'//section[contains(@class, "teaser")]',
|
||||||
'//section[contains(@class, "body")]',
|
'//section[contains(@class, "body")]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,7 +1,11 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.journaldugeek.com/2014/05/20/le-playstation-now-arrive-en-beta-fermee-aux-etats-unis/',
|
'test_url' => 'http://www.journaldugeek.com/2014/05/20/le-playstation-now-arrive-en-beta-fermee-aux-etats-unis/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="post-content"]',
|
'//div[@class="post-content"]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.kanpai.fr/japon/comment-donner-lheure-en-japonais.html',
|
'test_url' => 'http://www.kanpai.fr/japon/comment-donner-lheure-en-japonais.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="single-left"]',
|
'//div[@class="single-left"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,8 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://karriere.jobfinder.dk/artikel/dansk-professor-skal-lede-smart-grid-forskning-20-millioner-dollars-763',
|
'test_url' => 'http://karriere.jobfinder.dk/artikel/dansk-professor-skal-lede-smart-grid-forskning-20-millioner-dollars-763',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//section[contains(@class, "teaser")]',
|
'//section[contains(@class, "teaser")]',
|
||||||
'//section[contains(@class, "body")]',
|
'//section[contains(@class, "body")]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://lejapon.fr/guide-voyage-japon/5223/tokyo-sous-la-neige.htm',
|
'test_url' => 'http://lejapon.fr/guide-voyage-japon/5223/tokyo-sous-la-neige.htm',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="entry"]'
|
'//div[@class="entry"]'
|
||||||
@ -10,4 +12,6 @@ return array(
|
|||||||
'//*[@class="navigation small"]',
|
'//*[@class="navigation small"]',
|
||||||
'//*[@id="related"]',
|
'//*[@id="related"]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://lesjoiesducode.fr/post/75576211207/quand-lappli-ne-fonctionne-plus-sans-aucune-raison',
|
'test_url' => 'http://lesjoiesducode.fr/post/75576211207/quand-lappli-ne-fonctionne-plus-sans-aucune-raison',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="blog-post-content"]',
|
'//div[@class="blog-post-content"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
13
vendor/fguillot/picofeed/lib/PicoFeed/Rules/lfg.co.php
vendored
Normal file
13
vendor/fguillot/picofeed/lib/PicoFeed/Rules/lfg.co.php
vendored
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://www.lfg.co/page/871/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+LookingForGroup+%28Looking+For+Group%29&utm_content=FeedBurner',
|
||||||
|
'body' => array(
|
||||||
|
'//*[@id="comic"]/img | //*[@class="content"]'
|
||||||
|
),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://lifehacker.com/bring-water-bottle-caps-into-concerts-to-protect-your-d-1269334973',
|
'test_url' => 'http://lifehacker.com/bring-water-bottle-caps-into-concerts-to-protect-your-d-1269334973',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "row")]/img',
|
'//div[contains(@class, "row")]/img',
|
||||||
@ -11,4 +13,6 @@ return array(
|
|||||||
'//h1',
|
'//h1',
|
||||||
'//aside',
|
'//aside',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://lists.freebsd.org/pipermail/freebsd-announce/2013-September/001504.html',
|
'test_url' => 'http://lists.freebsd.org/pipermail/freebsd-announce/2013-September/001504.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//pre',
|
'//pre',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/loadingartist.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/loadingartist.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%/comic.*%' => array(
|
||||||
|
'test_url' => 'http://www.loadingartist.com/comic/lifted-spirits/',
|
||||||
|
'body' => array('//div[@class="comic"]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/loldwell.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/loldwell.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://loldwell.com/?comic=food-math-101',
|
||||||
|
'body' => array('//*[@id="comic"]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.macg.co//logiciels/2014/05/feedly-sameliore-un-petit-peu-sur-mac-82205',
|
'test_url' => 'http://www.macg.co//logiciels/2014/05/feedly-sameliore-un-petit-peu-sur-mac-82205',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "field-name-body")]'
|
'//div[contains(@class, "field-name-body")]'
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://marc.info/?l=openbsd-misc&m=141987113202061&w=2',
|
'test_url' => 'http://marc.info/?l=openbsd-misc&m=141987113202061&w=2',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//pre',
|
'//pre',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/maximumble.thebookofbiff.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/maximumble.thebookofbiff.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://maximumble.thebookofbiff.com/2015/04/20/1084-change/',
|
||||||
|
'body' => array('//div[@id="comic"]/div/a/img'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'https://medium.com/lessons-learned/917b8b63ae3e',
|
'test_url' => 'https://medium.com/lessons-learned/917b8b63ae3e',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "post-field body")]',
|
'//div[contains(@class, "post-field body")]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/metronieuws.nl.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/metronieuws.nl.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://www.metronieuws.nl/sport/2015/04/broer-fellaini-zorgde-bijna-voor-paniek-bij-mourinho',
|
||||||
|
'body' => array('//div[contains(@class,"article-top")]/div[contains(@class,"image-component")] | //div[@class="article-full-width"]/div[1]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/mokepon.smackjeeves.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/mokepon.smackjeeves.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://mokepon.smackjeeves.com/comics/2120096/chapter-9-page-68/',
|
||||||
|
'body' => array('//*[@id="comic_area_inner"]/img | //*[@id="comic_area_inner"]/a/img'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.monwindowsphone.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html',
|
'test_url' => 'http://www.monwindowsphone.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="blog-post-body"]'
|
'//div[@class="blog-post-body"]'
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/neustadt-ticker.de.php
vendored
Normal file
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/neustadt-ticker.de.php
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://www.neustadt-ticker.de/36480/aktuell/nachrichten/buergerbuero-neustadt-ab-heute-wieder-geoeffnet',
|
||||||
|
'body' => array('//div[contains(@class,"article")]/div[@class="PostContent" and *[not(contains(@class, "navigation"))]]'),
|
||||||
|
'strip' => array(
|
||||||
|
'//*[@id="wp_rp_first"]'
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/niceteethcomic.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/niceteethcomic.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%/archives.*%' => array(
|
||||||
|
'test_url' => 'http://niceteethcomic.com/archives/page119/',
|
||||||
|
'body' => array('//*[@class="comicpane"]/a/img'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
8
vendor/fguillot/picofeed/lib/PicoFeed/Rules/nichtlustig.de.php
vendored
Normal file
8
vendor/fguillot/picofeed/lib/PicoFeed/Rules/nichtlustig.de.php
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'filter' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'%.*static.nichtlustig.de/comics/full/(\\d+).*%s' => '<img src="http://static.nichtlustig.de/comics/full/$1.jpg" />'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'https://www.openrightsgroup.org/blog/2014/3-days-to-go-till-orgcon2014',
|
'test_url' => 'https://www.openrightsgroup.org/blog/2014/3-days-to-go-till-orgcon2014',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "content")]/div',
|
'//div[contains(@class, "content")]/div',
|
||||||
@ -13,4 +15,6 @@ return array(
|
|||||||
'//h1[@class="pageTitle"]',
|
'//h1[@class="pageTitle"]',
|
||||||
'//p[@class="bookmarkThis"]',
|
'//p[@class="bookmarkThis"]',
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://pastebin.com/ed1pP9Ak',
|
'test_url' => 'http://pastebin.com/ed1pP9Ak',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="text"]',
|
'//div[@class="text"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
21
vendor/fguillot/picofeed/lib/PicoFeed/Rules/penny-arcade.com.php
vendored
Normal file
21
vendor/fguillot/picofeed/lib/PicoFeed/Rules/penny-arcade.com.php
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%/news/.*%' => array(
|
||||||
|
'test_url' => 'http://penny-arcade.com/news/post/2015/04/15/101-part-two',
|
||||||
|
'body' => array(
|
||||||
|
'//*[@class="postBody"]/*',
|
||||||
|
),
|
||||||
|
'strip' => array(
|
||||||
|
)
|
||||||
|
),
|
||||||
|
'%/comic/.*%' => array(
|
||||||
|
'test_url' => 'http://penny-arcade.com/comic/2015/04/15',
|
||||||
|
'body' => array(
|
||||||
|
'//*[@id="comicFrame"]/a/img',
|
||||||
|
),
|
||||||
|
'strip' => array(
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,7 +1,11 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'https://plus.google.com/+LarryPage/posts/Lh8SKC6sED1',
|
'test_url' => 'https://plus.google.com/+LarryPage/posts/Lh8SKC6sED1',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@role="article"]/div[contains(@class, "eE")]',
|
'//div[@role="article"]/div[contains(@class, "eE")]',
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
4
vendor/fguillot/picofeed/lib/PicoFeed/Rules/putaindecode.fr.php
vendored
Executable file → Normal file
4
vendor/fguillot/picofeed/lib/PicoFeed/Rules/putaindecode.fr.php
vendored
Executable file → Normal file
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://putaindecode.fr/posts/js/etat-lieux-js-modulaire-front/',
|
'test_url' => 'http://putaindecode.fr/posts/js/etat-lieux-js-modulaire-front/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//*[@class="putainde-Post-md"]',
|
'//*[@class="putainde-Post-md"]',
|
||||||
@ -9,4 +11,6 @@ return array(
|
|||||||
'//*[contains(@class, "comment-respond")]',
|
'//*[contains(@class, "comment-respond")]',
|
||||||
'//header'
|
'//header'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm',
|
'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//*[@id="article"]/div[contains(@class, "content")]',
|
'//*[@id="article"]/div[contains(@class, "content")]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/satwcomic.com.php
vendored
Normal file
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/satwcomic.com.php
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://satwcomic.com/day-at-the-beach',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@class="container"]/center/a/img'
|
||||||
|
),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/scrumalliance.org.php
vendored
Normal file
12
vendor/fguillot/picofeed/lib/PicoFeed/Rules/scrumalliance.org.php
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'https://www.scrumalliance.org/community/articles/2015/march/an-introduction-to-agile-project-intake?feed=articles',
|
||||||
|
'body' => array(
|
||||||
|
'//div[@class="article_content"]',
|
||||||
|
),
|
||||||
|
'strip' => array()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,9 +1,13 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.sitepoint.com/creating-hello-world-app-swift/',
|
'test_url' => 'http://www.sitepoint.com/creating-hello-world-app-swift/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//section[@class="article_body"]',
|
'//section[@class="article_body"]',
|
||||||
),
|
),
|
||||||
'strip' => array(
|
'strip' => array(
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
11
vendor/fguillot/picofeed/lib/PicoFeed/Rules/slashdot.org.php
vendored
Normal file
11
vendor/fguillot/picofeed/lib/PicoFeed/Rules/slashdot.org.php
vendored
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://science.slashdot.org/story/15/04/20/0528253/pull-top-can-tabs-at-50-reach-historic-archaeological-status',
|
||||||
|
'body' => array(
|
||||||
|
'//article/div[@class="body"] | //article[@class="layout-article"]/div[@class="elips"]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://smallhousebliss.com/2013/08/29/house-g-by-lode-architecture/',
|
'test_url' => 'http://smallhousebliss.com/2013/08/29/house-g-by-lode-architecture/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[@class="post-content"]',
|
'//div[@class="post-content"]',
|
||||||
@ -12,4 +14,6 @@ return array(
|
|||||||
'//*[contains(@class, "postitle")]',
|
'//*[contains(@class, "postitle")]',
|
||||||
'//*[@id="nav-below"]',
|
'//*[@id="nav-below"]',
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/smarthomewelt.de.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/smarthomewelt.de.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://smarthomewelt.de/apple-tv-amazon-echo-smart-home/',
|
||||||
|
'body' => array('//div[@class="entry-inner"]/p | //div[@class="entry-inner"]/div[contains(@class,"wp-caption")]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/smashingmagazine.com.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/smashingmagazine.com.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://www.smashingmagazine.com/2015/04/17/using-sketch-for-responsive-web-design-case-study/',
|
||||||
|
'body' => array('//article[contains(@class,"post")]/p'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,7 +1,11 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://www.spiegel.de/politik/ausland/afrika-angola-geht-gegen-islam-vor-und-schliesst-moscheen-a-935788.html',
|
'test_url' => 'http://www.spiegel.de/politik/ausland/afrika-angola-geht-gegen-islam-vor-und-schliesst-moscheen-a-935788.html',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "article-section")]'
|
'//div[contains(@class, "article-section")]'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/sz.de.php
vendored
Normal file
10
vendor/fguillot/picofeed/lib/PicoFeed/Rules/sz.de.php
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?php
|
||||||
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
|
'test_url' => 'http://sz.de/1.2443161',
|
||||||
|
'body' => array('//article[@id="sitecontent"]/section[@class="topenrichment"]//img | //article[@id="sitecontent"]/section[@class="body"]/section[@class="authors"]/preceding-sibling::*[not(contains(@class, "ad"))]'),
|
||||||
|
'strip' => array(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
@ -1,5 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
return array(
|
return array(
|
||||||
|
'grabber' => array(
|
||||||
|
'%.*%' => array(
|
||||||
'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/',
|
'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/',
|
||||||
'body' => array(
|
'body' => array(
|
||||||
'//div[contains(@class, "media-container")]',
|
'//div[contains(@class, "media-container")]',
|
||||||
@ -8,4 +10,6 @@ return array(
|
|||||||
'strip' => array(
|
'strip' => array(
|
||||||
'//*[contains(@class, "module-crunchbase")]'
|
'//*[contains(@class, "module-crunchbase")]'
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
);
|
);
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user