update libraries

fixes #365, #367
This commit is contained in:
Mathias Kresin 2015-04-28 18:08:42 +02:00
parent 901a57e298
commit f89ed85a83
145 changed files with 2748 additions and 1444 deletions

View File

@ -18,37 +18,75 @@ Especially websites that use a lot of Javascript to generate the content.
How to write a grabber rules file? How to write a grabber rules file?
---------------------------------- ----------------------------------
Add a PHP file to the directory `rules`, the filename must be the domain name with the suffix `.php`: Miniflux will try first to find the file in the [default bundled rules directory](https://github.com/miniflux/miniflux/tree/master/vendor/fguillot/picofeed/lib/PicoFeed/Rules), then it will try to load your custom rules.
Example with the BBC website, `www.bbc.co.uk.php`: You can create custom rules, by adding a PHP file to the directory `rules`. The filename must be the domain name with the suffix `.php`.
Each rule has the following keys:
* **body**: An array of xpath expressions which will be extracted from the page
* **strip**: An array of xpath expressions which will be removed from the matched content
* **test_url**: A test url to a matching page to test the grabber
Example for the BBC website, `www.bbc.co.uk.php`:
```php ```php
<?php <?php
return array( return array(
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="story-body"]', 'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
), 'body' => array(
'strip' => array( '//div[@class="story-body"]',
'//script', ),
'//form', 'strip' => array(
'//style', '//script',
'//*[@class="story-date"]', '//form',
'//*[@class="story-header"]', '//style',
'//*[@class="story-related"]', '//*[@class="story-date"]',
'//*[contains(@class, "byline")]', '//*[@class="story-header"]',
'//*[contains(@class, "story-feature")]', '//*[@class="story-related"]',
'//*[@id="video-carousel-container"]', '//*[contains(@class, "byline")]',
'//*[@id="also-related-links"]', '//*[contains(@class, "story-feature")]',
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]', '//*[@id="video-carousel-container"]',
'//*[@id="also-related-links"]',
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
)
)
) )
); );
``` ```
Actually, only the keys `body`, `strip` and `test_url` are supported. Each rule file can contain rules for different subdivisions of a website. Those subdivisions are distinguished by their URL. The first level array key of a rule file will be matched against the full path of the URL using **preg_match**, e.g. for **http://www.bbc.co.uk/news/world-middle-east-23911833?test=1** the URL that would be matched is **/news/world-middle-east-23911833?test=1**
Miniflux will try first to find the file in the [default bundled rules directory](https://github.com/miniflux/miniflux/tree/master/vendor/fguillot/picofeed/lib/PicoFeed/Rules), then it will try to load your custom rules. Let's say you want to extract a div with the id **video** if the article points to an URL like **http://comix.com/videos/423**, **audio** if the article points to an URL like **http://comix.com/podcasts/5** and all other links to the page should instead take the div with the id **content**. The following rulefile ```comix.com.php``` would fit that requirement:
```php
return array(
'grabber' => array(
'%^/videos.*%' => array(
'test_url' => 'http://comix.com/videos/423',
'body' => array(
'//div[@id="video"]',
),
'strip' => array()
),
'%^/podcasts.*%' => array(
'test_url' => 'http://comix.com/podcasts/5',
'body' => array(
'//div[@id="audio"]',
),
'strip' => array()
),
'%.*%' => array(
'test_url' => 'http://comix.com/blog/1',
'body' => array(
'//div[@id="content"]',
),
'strip' => array()
)
)
);
```
Sharing your custom rules with the community Sharing your custom rules with the community
-------------------------------------------- --------------------------------------------
@ -59,4 +97,4 @@ That will be merged in the Miniflux code base.
List of content grabber rules List of content grabber rules
----------------------------- -----------------------------
[List of rules included by default](https://github.com/miniflux/miniflux/tree/master/vendor/fguillot/picofeed/lib/PicoFeed/Rules). [List of rules included by default](https://github.com/miniflux/miniflux/tree/master/vendor/fguillot/picofeed/lib/PicoFeed/Rules).

View File

@ -6,7 +6,7 @@ use Model\Service;
use Model\Config; use Model\Config;
use PicoDb\Database; use PicoDb\Database;
use PicoFeed\Logging\Logger; use PicoFeed\Logging\Logger;
use PicoFeed\Client\Grabber; use PicoFeed\Scraper\Scraper;
// Get all items without filtering // Get all items without filtering
function get_all() function get_all()
@ -520,12 +520,12 @@ function download_content_url($url)
{ {
$content = ''; $content = '';
$grabber = new Grabber($url); $grabber = new Scraper(Config\get_reader_config());
$grabber->setConfig(Config\get_reader_config()); $grabber->setUrl($url);
$grabber->download(); $grabber->execute();
if ($grabber->parse()) { if ($grabber->hasRelevantContent()) {
$content = $grabber->getFilteredcontent(); $content = $grabber->getFilteredContent();
} }
return $content; return $content;

View File

@ -54,6 +54,8 @@ class ClassLoader
private $useIncludePath = false; private $useIncludePath = false;
private $classMap = array(); private $classMap = array();
private $classMapAuthoritative = false;
public function getPrefixes() public function getPrefixes()
{ {
if (!empty($this->prefixesPsr0)) { if (!empty($this->prefixesPsr0)) {
@ -248,6 +250,27 @@ class ClassLoader
return $this->useIncludePath; return $this->useIncludePath;
} }
/**
* Turns off searching the prefix and fallback directories for classes
* that have not been registered with the class map.
*
* @param bool $classMapAuthoritative
*/
public function setClassMapAuthoritative($classMapAuthoritative)
{
$this->classMapAuthoritative = $classMapAuthoritative;
}
/**
* Should class lookup fail if not found in the current class map?
*
* @return bool
*/
public function isClassMapAuthoritative()
{
return $this->classMapAuthoritative;
}
/** /**
* Registers this instance as an autoloader. * Registers this instance as an autoloader.
* *
@ -299,6 +322,9 @@ class ClassLoader
if (isset($this->classMap[$class])) { if (isset($this->classMap[$class])) {
return $this->classMap[$class]; return $this->classMap[$class];
} }
if ($this->classMapAuthoritative) {
return false;
}
$file = $this->findFileWithExtension($class, '.php'); $file = $this->findFileWithExtension($class, '.php');

View File

@ -20,7 +20,6 @@ return array(
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php', 'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php', 'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php', 'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
'PicoFeed\\Client\\Grabber' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php',
'PicoFeed\\Client\\HttpHeaders' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/HttpHeaders.php', 'PicoFeed\\Client\\HttpHeaders' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/HttpHeaders.php',
'PicoFeed\\Client\\InvalidCertificateException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidCertificateException.php', 'PicoFeed\\Client\\InvalidCertificateException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidCertificateException.php',
'PicoFeed\\Client\\InvalidUrlException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidUrlException.php', 'PicoFeed\\Client\\InvalidUrlException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/InvalidUrlException.php',
@ -54,6 +53,11 @@ return array(
'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php', 'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php',
'PicoFeed\\Reader\\SubscriptionNotFoundException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/SubscriptionNotFoundException.php', 'PicoFeed\\Reader\\SubscriptionNotFoundException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/SubscriptionNotFoundException.php',
'PicoFeed\\Reader\\UnsupportedFeedFormatException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/UnsupportedFeedFormatException.php', 'PicoFeed\\Reader\\UnsupportedFeedFormatException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/UnsupportedFeedFormatException.php',
'PicoFeed\\Scraper\\CandidateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/CandidateParser.php',
'PicoFeed\\Scraper\\ParserInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/ParserInterface.php',
'PicoFeed\\Scraper\\RuleLoader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php',
'PicoFeed\\Scraper\\RuleParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/RuleParser.php',
'PicoFeed\\Scraper\\Scraper' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php',
'PicoFeed\\Serialization\\Export' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Export.php', 'PicoFeed\\Serialization\\Export' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Export.php',
'PicoFeed\\Serialization\\Import' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Import.php', 'PicoFeed\\Serialization\\Import' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Serialization/Import.php',
'PicoFeed\\Syndication\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Syndication/Atom.php', 'PicoFeed\\Syndication\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Syndication/Atom.php',

View File

@ -45,18 +45,18 @@
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/fguillot/picoFarad.git", "url": "https://github.com/fguillot/picoFarad.git",
"reference": "1bc48a4367adf359f3439c2e0ae20a7d299d8ccd" "reference": "a5817c49ca3037829ec1509d14724be5f29c35a0"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFarad/zipball/1bc48a4367adf359f3439c2e0ae20a7d299d8ccd", "url": "https://api.github.com/repos/fguillot/picoFarad/zipball/a5817c49ca3037829ec1509d14724be5f29c35a0",
"reference": "1bc48a4367adf359f3439c2e0ae20a7d299d8ccd", "reference": "a5817c49ca3037829ec1509d14724be5f29c35a0",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
"php": ">=5.3.0" "php": ">=5.3.0"
}, },
"time": "2015-02-01 19:40:13", "time": "2015-04-14 01:53:02",
"type": "library", "type": "library",
"installation-source": "dist", "installation-source": "dist",
"autoload": { "autoload": {
@ -66,7 +66,7 @@
}, },
"notification-url": "https://packagist.org/downloads/", "notification-url": "https://packagist.org/downloads/",
"license": [ "license": [
"Unlicense" "MIT"
], ],
"authors": [ "authors": [
{ {
@ -84,18 +84,18 @@
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/fguillot/simpleValidator.git", "url": "https://github.com/fguillot/simpleValidator.git",
"reference": "41655dc7b9224395f5bb3b5623f6e428fe6d64e8" "reference": "2f30078bb6e688cf123c150d58fda322792a1532"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/fguillot/simpleValidator/zipball/41655dc7b9224395f5bb3b5623f6e428fe6d64e8", "url": "https://api.github.com/repos/fguillot/simpleValidator/zipball/2f30078bb6e688cf123c150d58fda322792a1532",
"reference": "41655dc7b9224395f5bb3b5623f6e428fe6d64e8", "reference": "2f30078bb6e688cf123c150d58fda322792a1532",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
"php": ">=5.3.0" "php": ">=5.3.0"
}, },
"time": "2015-04-05 21:44:06", "time": "2015-04-14 02:03:43",
"type": "library", "type": "library",
"installation-source": "dist", "installation-source": "dist",
"autoload": { "autoload": {
@ -109,8 +109,7 @@
], ],
"authors": [ "authors": [
{ {
"name": "Frédéric Guillot", "name": "Frédéric Guillot"
"homepage": "http://fredericguillot.com"
} }
], ],
"description": "The most easy to use validator library for PHP :)", "description": "The most easy to use validator library for PHP :)",
@ -123,18 +122,18 @@
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/fguillot/JsonRPC.git", "url": "https://github.com/fguillot/JsonRPC.git",
"reference": "29d63a09ecd450d5e29fef74f687aab221055910" "reference": "1a397be7739ddabba87b07f0354655bd91087518"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/fguillot/JsonRPC/zipball/29d63a09ecd450d5e29fef74f687aab221055910", "url": "https://api.github.com/repos/fguillot/JsonRPC/zipball/1a397be7739ddabba87b07f0354655bd91087518",
"reference": "29d63a09ecd450d5e29fef74f687aab221055910", "reference": "1a397be7739ddabba87b07f0354655bd91087518",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
"php": ">=5.3.0" "php": ">=5.3.0"
}, },
"time": "2015-04-05 21:49:38", "time": "2015-04-14 01:50:16",
"type": "library", "type": "library",
"installation-source": "dist", "installation-source": "dist",
"autoload": { "autoload": {
@ -144,7 +143,7 @@
}, },
"notification-url": "https://packagist.org/downloads/", "notification-url": "https://packagist.org/downloads/",
"license": [ "license": [
"Unlicense" "MIT"
], ],
"authors": [ "authors": [
{ {
@ -152,7 +151,7 @@
"homepage": "http://fredericguillot.com" "homepage": "http://fredericguillot.com"
} }
], ],
"description": "A simple Json-RPC client/server library that just works", "description": "Simple Json-RPC client/server library that just works",
"homepage": "https://github.com/fguillot/JsonRPC" "homepage": "https://github.com/fguillot/JsonRPC"
}, },
{ {
@ -162,12 +161,12 @@
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/fguillot/picoFeed.git", "url": "https://github.com/fguillot/picoFeed.git",
"reference": "273c344b35b468b6c8053f635332c3a404f8c7b9" "reference": "a6087e8264550891c1b8a6da77eca0cab9328709"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/273c344b35b468b6c8053f635332c3a404f8c7b9", "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/a6087e8264550891c1b8a6da77eca0cab9328709",
"reference": "273c344b35b468b6c8053f635332c3a404f8c7b9", "reference": "a6087e8264550891c1b8a6da77eca0cab9328709",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -181,7 +180,7 @@
"suggest": { "suggest": {
"ext-curl": "PicoFeed will use cURL if present" "ext-curl": "PicoFeed will use cURL if present"
}, },
"time": "2015-04-11 12:46:50", "time": "2015-04-27 22:22:06",
"bin": [ "bin": [
"picofeed" "picofeed"
], ],
@ -194,7 +193,7 @@
}, },
"notification-url": "https://packagist.org/downloads/", "notification-url": "https://packagist.org/downloads/",
"license": [ "license": [
"Unlicense" "MIT"
], ],
"authors": [ "authors": [
{ {

21
vendor/fguillot/json-rpc/LICENSE vendored Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Frederic Guillot
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@ -11,7 +11,7 @@ Features
- Authentication and IP based client restrictions - Authentication and IP based client restrictions
- Minimalist: there is only 2 files - Minimalist: there is only 2 files
- Fully unit tested - Fully unit tested
- License: Unlicense http://unlicense.org/ - License: MIT
Requirements Requirements
------------ ------------

View File

@ -1,9 +1,9 @@
{ {
"name": "fguillot/json-rpc", "name": "fguillot/json-rpc",
"description": "A simple Json-RPC client/server library that just works", "description": "Simple Json-RPC client/server library that just works",
"homepage": "https://github.com/fguillot/JsonRPC", "homepage": "https://github.com/fguillot/JsonRPC",
"type": "library", "type": "library",
"license": "Unlicense", "license": "MIT",
"authors": [ "authors": [
{ {
"name": "Frédéric Guillot", "name": "Frédéric Guillot",

21
vendor/fguillot/picofarad/LICENCE vendored Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Frederic Guillot
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@ -10,7 +10,7 @@ Features
- No dependency - No dependency
- Easy to use, fast and very lightweight - Easy to use, fast and very lightweight
- Only 4 files: Request, Response, Router and Session - Only 4 files: Request, Response, Router and Session
- License: Do what the fuck you want with that - License: MIT
Requirements Requirements
------------ ------------

View File

@ -3,7 +3,7 @@
"description": "Minimalist micro-framework", "description": "Minimalist micro-framework",
"homepage": "https://github.com/fguillot/picoFarad", "homepage": "https://github.com/fguillot/picoFarad",
"type": "library", "type": "library",
"license": "Unlicense", "license": "MIT",
"authors": [ "authors": [
{ {
"name": "Frédéric Guillot", "name": "Frédéric Guillot",

View File

@ -1,2 +1,3 @@
.DS_Store .DS_Store
vendor/ vendor/
*.py

View File

@ -1,12 +1,19 @@
language: php language: php
php: php:
- "5.6" - 7.0
- "5.5" - 5.6
- "5.4" - 5.5
- "5.3" - 5.4
- 5.3
matrix:
fast_finish: true
allow_failures:
- php: 7.0
before_script:
- composer dump-autoload
before_script: wget https://phar.phpunit.de/phpunit.phar
script: script:
- composer dump-autoload - phpunit
- php phpunit.phar

21
vendor/fguillot/picofeed/LICENSE vendored Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Frederic Guillot
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@ -24,7 +24,7 @@ Features
- Content grabber: download from the original website the full content - Content grabber: download from the original website the full content
- Enclosure detection - Enclosure detection
- RTL languages support - RTL languages support
- License: Unlicense <http://unlicense.org/> - License: MIT
Requirements Requirements
------------ ------------
@ -47,7 +47,6 @@ Authors
Real world usage Real world usage
---------------- ----------------
- [AnythingNew](http://anythingnew.co)
- [Miniflux](http://miniflux.net) - [Miniflux](http://miniflux.net)
- [Owncloud News](https://github.com/owncloud/news) - [Owncloud News](https://github.com/owncloud/news)

View File

@ -1,24 +0,0 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

View File

@ -3,7 +3,7 @@
"description": "Modern library to write or read feeds (RSS/Atom)", "description": "Modern library to write or read feeds (RSS/Atom)",
"homepage": "http://fguillot.github.io/picoFeed", "homepage": "http://fguillot.github.io/picoFeed",
"type": "library", "type": "library",
"license": "Unlicense", "license": "MIT",
"authors": [ "authors": [
{ {
"name": "Frédéric Guillot", "name": "Frédéric Guillot",

View File

@ -215,6 +215,27 @@ catch (PicoFeedException $e) {
} }
``` ```
Custom regex filters
--------------------
In case you want to modify the content with a simple regex, you can create a rule file named after the domain of the feed's link attribute. For the feed pointing to **http://www.twogag.com/**, the file is stored under **Rules/twogag.com.php**.
For filtering, only the array with the key **filter** will be considered. The first level key is a preg_match regex that will match the sub url, e.g. to only match a feed whose link attribute points to **twogag.com/test**, the regex could look like **%/test.*%**. The second level array contains a list of search and replace strings, which will be passed to the preg\_replace function. The first string is the argument that should be matched, the second is the replacement.
To replace all occurrences of links to smaller images for twogag, the following rule can be used:
```php
<?php
return array(
'filter' => array(
'%.*%' => array(
"%http://www.twogag.com/comics-rss/([^.]+)\\.jpg%" =>
"http://www.twogag.com/comics/$1.jpg"
)
)
);
```
Feed and item properties Feed and item properties
------------------------ ------------------------

View File

@ -15,23 +15,41 @@ How the content grabber works?
Standalone usage Standalone usage
---------------- ----------------
Fetch remote content:
```php ```php
<?php <?php
use PicoFeed\Client\Grabber; use PicoFeed\Config\Config;
use PicoFeed\Scraper\Scraper;
$grabber = new Grabber($item_url); $config = new Config;
$grabber->download();
$grabber->parse(); $grabber = new Scraper($config);
$grabber->setUrl($url);
$grabber->execute();
// Get raw HTML content // Get raw HTML content
echo $grabber->getRawContent(); echo $grabber->getRawContent();
// Get relevant content // Get relevant content
echo $grabber->getContent(); echo $grabber->getRelevantContent();
// Get filtered relevant content // Get filtered relevant content
echo $grabber->getFilteredContent(); echo $grabber->getFilteredContent();
// Return true if there is relevant content
var_dump($grabber->hasRelevantContent());
```
Parse HTML content:
```php
<?php
$grabber = new Scraper($config);
$grabber->setRawContent($html);
$grabber->execute();
``` ```
Fetch full item contents during feed parsing Fetch full item contents during feed parsing
@ -79,11 +97,11 @@ Configuration
### Enable content grabber for items ### Enable content grabber for items
- Method name: `enableContentGrabber()` - Method name: `enableContentGrabber()`
- Default value: false (content grabber is disabled by default) - Default value: false (also fetch content if no rule file exists)
- Argument value: none - Argument value: bool (if true, scrape only webpages that have a rule file)
```php ```php
$parser->enableContentGrabber(); $parser->enableContentGrabber(false);
``` ```
### Ignore item urls for the content grabber ### Ignore item urls for the content grabber
@ -106,30 +124,71 @@ Example with the BBC website, `www.bbc.co.uk.php`:
```php ```php
<?php <?php
return array( return array(
'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="story-body"]', 'test_url' => 'http://www.bbc.co.uk/news/world-middle-east-23911833',
), 'body' => array(
'strip' => array( '//div[@class="story-body"]',
'//script', ),
'//form', 'strip' => array(
'//style', '//script',
'//*[@class="story-date"]', '//form',
'//*[@class="story-header"]', '//style',
'//*[@class="story-related"]', '//*[@class="story-date"]',
'//*[contains(@class, "byline")]', '//*[@class="story-header"]',
'//*[contains(@class, "story-feature")]', '//*[@class="story-related"]',
'//*[@id="video-carousel-container"]', '//*[contains(@class, "byline")]',
'//*[@id="also-related-links"]', '//*[contains(@class, "story-feature")]',
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]', '//*[@id="video-carousel-container"]',
'//*[@id="also-related-links"]',
'//*[contains(@class, "share") or contains(@class, "hidden") or contains(@class, "hyper")]',
)
)
) )
); );
``` ```
Each rule file can contain multiple rules, so links to different website URLs can be handled differently. The first level key is a regex, which will be matched against the full path of the URL using **preg_match**, e.g. for **http://www.bbc.co.uk/news/world-middle-east-23911833?test=1** the URL that would be matched is **/news/world-middle-east-23911833?test=1**.
Actually, only `body`, `strip` and `test_url` are supported. Each rule has the following keys:
* **body**: An array of xpath expressions which will be extracted from the page
* **strip**: An array of xpath expressions which will be removed from the matched content
* **test_url**: A test url to a matching page to test the grabber
Don't forget to send a pull request or a ticket to share your contribution with everybody, Don't forget to send a pull request or a ticket to share your contribution with everybody,
**A more complex example**:
Let's say you wanted to extract a div with the id **video** if the article points to a URL like **http://comix.com/videos/423**, **audio** if the article points to a URL like **http://comix.com/podcasts/5** and all other links to the page should instead take the div with the id **content**. The following rule file would fit that requirement and would be stored in a file called **lib/PicoFeed/Rules/comix.com.php**:
```php
return array(
'grabber' => array(
'%^/videos.*%' => array(
'test_url' => 'http://comix.com/videos/423',
'body' => array(
'//div[@id="video"]',
),
'strip' => array()
),
'%^/podcasts.*%' => array(
'test_url' => 'http://comix.com/podcasts/5',
'body' => array(
'//div[@id="audio"]',
),
'strip' => array()
),
'%.*%' => array(
'test_url' => 'http://comix.com/blog/1',
'body' => array(
'//div[@id="content"]',
),
'strip' => array()
)
)
);
```
List of content grabber rules List of content grabber rules
----------------------------- -----------------------------

View File

@ -80,7 +80,7 @@ class Curl extends Client
{ {
$length = strlen($buffer); $length = strlen($buffer);
if ($buffer === "\r\n") { if ($buffer === "\r\n" || $buffer === "\n") {
$this->response_headers_count++; $this->response_headers_count++;
} }
else { else {
@ -162,6 +162,7 @@ class Curl extends Client
* Prepare curl proxy context * Prepare curl proxy context
* *
* @access private * @access private
* @param resource $ch
* @return resource $ch * @return resource $ch
*/ */
private function prepareProxyContext($ch) private function prepareProxyContext($ch)
@ -190,6 +191,7 @@ class Curl extends Client
* Prepare curl auth context * Prepare curl auth context
* *
* @access private * @access private
* @param resource $ch
* @return resource $ch * @return resource $ch
*/ */
private function prepareAuthContext($ch) private function prepareAuthContext($ch)
@ -205,6 +207,7 @@ class Curl extends Client
* Set write/header functions * Set write/header functions
* *
* @access private * @access private
* @param resource $ch
* @return resource $ch * @return resource $ch
*/ */
private function prepareDownloadMode($ch) private function prepareDownloadMode($ch)
@ -305,7 +308,7 @@ class Curl extends Client
{ {
$this->executeContext(); $this->executeContext();
list($status, $headers) = HttpHeaders::parse(explode("\r\n", $this->response_headers[$this->response_headers_count - 1])); list($status, $headers) = HttpHeaders::parse(explode("\n", $this->response_headers[$this->response_headers_count - 1]));
// When restricted with open_basedir // When restricted with open_basedir
if ($this->needToHandleRedirection($follow_location, $status)) { if ($this->needToHandleRedirection($follow_location, $status)) {

View File

@ -1,592 +0,0 @@
<?php
namespace PicoFeed\Client;
use DOMXPath;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Logging\Logger;
use PicoFeed\Filter\Filter;
use PicoFeed\Parser\XmlParser;
/**
* Grabber class
*
* @author Frederic Guillot
* @package Client
*/
class Grabber
{
/**
* URL
*
* @access private
* @var string
*/
private $url = '';
/**
* Relevant content
*
* @access private
* @var string
*/
private $content = '';
/**
* HTML content
*
* @access private
* @var string
*/
private $html = '';
/**
* HTML content encoding
*
* @access private
* @var string
*/
private $encoding = '';
/**
* Flag to skip download and parsing
*
* @access private
* @var boolean
*/
private $skip_processing = false;
/**
* List of attributes to try to get the content, order is important, generic terms at the end
*
* @access private
* @var array
*/
private $candidatesAttributes = array(
'articleBody',
'articlebody',
'article-body',
'articleContent',
'articlecontent',
'article-content',
'articlePage',
'post-content',
'post_content',
'entry-content',
'entry-body',
'main-content',
'story_content',
'storycontent',
'entryBox',
'entrytext',
'comic',
'post',
'article',
'content',
'main',
);
/**
* List of attributes to strip
*
* @access private
* @var array
*/
private $stripAttributes = array(
'comment',
'share',
'links',
'toolbar',
'fb',
'footer',
'credit',
'bottom',
'nav',
'header',
'social',
'tag',
'metadata',
'entry-utility',
'related-posts',
'tweet',
'categories',
'post_title',
'by_line',
'byline',
'sponsors',
);
/**
* Tags to remove
*
* @access private
* @var array
*/
private $stripTags = array(
'nav',
'header',
'footer',
'aside',
'form',
);
/**
* Config object
*
* @access private
* @var \PicoFeed\Config\Config
*/
private $config;
/**
 * Build a grabber for one page
 *
 * The URL, the optional already fetched HTML and its charset are stored,
 * then the URL is checked by handleFiles() and handleStreamingVideos(),
 * which may flag the page to skip download and parsing.
 *
 * @access public
 * @param  string   $url        Url
 * @param  string   $html       HTML content
 * @param  string   $encoding   Charset
 */
public function __construct($url, $html = '', $encoding = 'utf-8')
{
    $this->encoding = $encoding;
    $this->html = $html;
    $this->url = $url;

    $this->handleFiles();
    $this->handleStreamingVideos();
}
/**
* Set the config object
*
* The config is read by download() (timeout, user agent), by
* getRulesFolders() (custom rules folder) and by getFilteredContent().
*
* @access public
* @param \PicoFeed\Config\Config $config Config instance
* @return Grabber Current instance
*/
public function setConfig($config)
{
$this->config = $config;
return $this;
}
/**
* Get the URL of the page to grab.
*
* download() replaces this value with the URL reported by the HTTP client
* once the request has been executed.
*
* @access public
* @return string
*/
public function getUrl()
{
return $this->url;
}
/**
 * Set the URL to download and reset the object to use it for another grab
 *
 * The HTML, the extracted content, the encoding and the skip flag left by
 * the previous URL are cleared, then the PDF and streaming video checks
 * are executed against the new URL.
 *
 * @access public
 * @param  string   $url   URL
 */
public function setUrl($url)
{
    $this->url = $url;
    $this->html = "";
    $this->content = "";
    $this->encoding = "";

    // Without this reset, one PDF or video URL would make download() and
    // parse() skip every URL set afterwards on the same instance
    $this->skip_processing = false;

    $this->handleFiles();
    $this->handleStreamingVideos();
}
/**
* Get the relevant content (not filtered)
*
* This is the markup extracted by parse(), or the embed player set by
* handleStreamingVideos(). Empty string when nothing was extracted.
*
* @access public
* @return string
*/
public function getContent()
{
return $this->content;
}
/**
* Get the raw HTML of the page (unfiltered)
*
* This is the HTML given to the constructor or fetched by download().
* parse() converts its encoding and strips the head tags in place.
*
* @access public
* @return string
*/
public function getRawContent()
{
return $this->html;
}
/**
 * Get the relevant content after the HTML filter
 *
 * The extracted content is passed to the HTML filter together with the
 * page URL and the config of the grabber.
 *
 * @access public
 * @return string
 */
public function getFilteredContent()
{
    $html_filter = Filter::html($this->content, $this->url);
    $html_filter->setConfig($this->config);

    return $html_filter->execute();
}
/**
 * Replace the content by the Youtube embed player and skip processing
 *
 * When the URL contains an 11 characters video id preceded by "v=", "v/",
 * "vi=", "vi/" or "youtu.be/", the content becomes an iframe that points to
 * the embed player and the page is flagged to skip download and parsing.
 *
 * NOTE(review): the pattern is not restricted to Youtube hostnames, any URL
 * with a matching "v=" parameter is handled as a video.
 *
 * @access public
 */
public function handleStreamingVideos()
{
    // The dot of "youtu.be" is escaped to match the hostname literally
    if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
        $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
        $this->skip_processing = true;
    }
}
/**
 * Flag PDF documents to skip download and parsing
 *
 * A URL is handled as a PDF document when its last three characters
 * are "pdf".
 *
 * @access public
 */
public function handleFiles()
{
    if (substr($this->url, -3) !== 'pdf') {
        return;
    }

    $this->skip_processing = true;
    Logger::setMessage(get_called_class().': PDF document => processing skipped');
}
/**
 * Extract the relevant content from the HTML
 *
 * The HTML is converted with the encoding found in the meta tag (or the
 * HTTP one when there is none) and its head tags are stripped. The content
 * is then extracted with the rules found for the URL or, when there are
 * no rules, with the list of candidate attributes.
 *
 * @access public
 * @return bool   True when processing is skipped or when some content was extracted
 */
public function parse()
{
    if ($this->skip_processing) {
        return true;
    }

    $caller = get_called_class();

    if (! $this->html) {
        Logger::setMessage($caller.': No content fetched');
    }
    else {
        $meta_encoding = XmlParser::getEncodingFromMetaTag($this->html);

        // Encode everything in UTF-8
        Logger::setMessage($caller.': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$meta_encoding.'"');
        $this->html = Encoding::convert($this->html, $meta_encoding ?: $this->encoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage($caller.': Content length: '.strlen($this->html).' bytes');

        $rules = $this->getRules();

        if (empty($rules)) {
            Logger::setMessage($caller.': Parse content with candidates');
            $this->parseContentWithCandidates();
        }
        else {
            Logger::setMessage($caller.': Parse content with rules');
            $this->parseContentWithRules($rules);
        }
    }

    Logger::setMessage($caller.': Content length: '.strlen($this->content).' bytes');
    Logger::setMessage($caller.': Grabber done');

    return $this->content !== '';
}
/**
 * Download the HTML content
 *
 * Nothing is fetched when processing is skipped or when the URL is empty.
 * Once the request is executed, the URL, the HTML and the encoding are
 * replaced by the values reported by the HTTP client. A ClientException is
 * logged and not propagated.
 *
 * @access public
 * @return string   HTML content
 */
public function download()
{
    if ($this->skip_processing || $this->url == '') {
        return $this->html;
    }

    try {
        $http = Client::getInstance();

        if ($this->config !== null) {
            $http->setConfig($this->config);
            $http->setTimeout($this->config->getGrabberTimeout());
            $http->setUserAgent($this->config->getGrabberUserAgent());
        }

        $http->execute($this->url);

        $this->url = $http->getUrl();
        $this->html = $http->getContent();
        $this->encoding = $http->getEncoding();
    }
    catch (ClientException $exception) {
        Logger::setMessage(get_called_class().': '.$exception->getMessage());
    }

    return $this->html;
}
/**
 * Try to find a predefined rule
 *
 * The hostname of the URL gives the list of possible file names, each
 * rules folder is searched in order and the first non empty rule wins.
 *
 * @access public
 * @return array   Rule content, or an empty array when no rule is found
 */
public function getRules()
{
    $hostname = parse_url($this->url, PHP_URL_HOST);

    // parse_url() returns null when the URL has no host and false when the
    // URL is seriously malformed, there is nothing to look up in both cases
    if (! is_string($hostname) || $hostname === '') {
        return array();
    }

    $files = $this->getRulesFileList($hostname);

    foreach ($this->getRulesFolders() as $folder) {
        $rule = $this->loadRuleFile($folder, $files);

        if (! empty($rule)) {
            return $rule;
        }
    }

    return array();
}
/**
 * Build the list of possible rules file names for a hostname
 *
 * The full hostname always comes first. With three labels or more, the
 * parent domain, the parent domain prefixed by a dot and the first label
 * are added. With exactly two labels, the dotted hostname and the first
 * label are added.
 *
 * @access public
 * @param  string   $hostname   Hostname
 * @return array
 */
public function getRulesFileList($hostname)
{
    $labels = explode('.', $hostname);
    $count = count($labels);
    $candidates = array($hostname);                                // subdomain.domain.tld

    if ($count === 2) {
        $candidates[] = '.'.implode('.', $labels);                 // .domain.tld
        $candidates[] = $labels[0];                                // domain
    }
    else if ($count > 2) {
        $parent = implode('.', array_slice($labels, 1));

        array_push($candidates, $parent, '.'.$parent, $labels[0]); // domain.tld, .domain.tld, subdomain
    }

    return $candidates;
}
/**
 * Load the first rule file available in a folder
 *
 * Each name is tried in order with the suffix ".php", the value returned
 * by the first existing file is the result.
 *
 * @access public
 * @param  string   $folder   Rule directory
 * @param  array    $files    List of possible file names
 * @return array              Empty array when none of the files exists
 */
public function loadRuleFile($folder, array $files)
{
    // Local names are kept as is: the included file shares the scope of this method
    foreach ($files as $file) {
        $filename = $folder.'/'.$file.'.php';

        if (! file_exists($filename)) {
            continue;
        }

        Logger::setMessage(get_called_class().' Load rule: '.$file);

        return include $filename;
    }

    return array();
}
/**
 * Get the list of folders that contain rules
 *
 * The bundled "Rules" folder always comes first, the folder defined in the
 * config is added after it when there is one.
 *
 * @access public
 * @return array
 */
public function getRulesFolders()
{
    $folders = array(__DIR__.'/../Rules');

    if ($this->config === null || $this->config->getGrabberRulesFolder() === null) {
        return $folders;
    }

    $folders[] = $this->config->getGrabberRulesFolder();

    return $folders;
}
/**
 * Get the relevant content with predefined rules
 *
 * The nodes matched by the "strip" expressions are removed from the
 * document first, then the nodes matched by the "body" expressions are
 * serialized and appended to the content. Expressions without result
 * are ignored.
 *
 * @access public
 * @param  array   $rules   Rules
 */
public function parseContentWithRules(array $rules)
{
    $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
    $finder = new DOMXPath($document);

    $strip = (isset($rules['strip']) && is_array($rules['strip'])) ? $rules['strip'] : array();
    $body = (isset($rules['body']) && is_array($rules['body'])) ? $rules['body'] : array();

    foreach ($strip as $expression) {
        $matches = $finder->query($expression);

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        foreach ($matches as $match) {
            $match->parentNode->removeChild($match);
        }
    }

    foreach ($body as $expression) {
        $matches = $finder->query($expression);

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        foreach ($matches as $match) {
            $this->content .= $document->saveXML($match);
        }
    }
}
/**
 * Get the relevant content with the list of potential attributes
 *
 * Three attempts are made in this order: the first node matching one of
 * the candidate attributes, the first article tag when the content is
 * shorter than 200 bytes, the body tag when the content is shorter than
 * 50 bytes. The result is cleaned up by stripGarbage().
 *
 * @access public
 */
public function parseContentWithCandidates()
{
    $caller = get_called_class();
    $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
    $finder = new DOMXPath($document);

    // Try to lookup in each tag
    foreach ($this->candidatesAttributes as $candidate) {

        Logger::setMessage($caller.': Try this candidate: "'.$candidate.'"');

        $found = $finder->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

        if ($found === false || $found->length <= 0) {
            continue;
        }

        $this->content = $document->saveXML($found->item(0));
        Logger::setMessage($caller.': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
        break;
    }

    // Try to fetch <article/>
    if (strlen($this->content) < 200) {

        $found = $finder->query('//article');

        if ($found !== false && $found->length > 0) {
            $this->content = $document->saveXML($found->item(0));
            Logger::setMessage($caller.': Find <article/> tag ('.strlen($this->content).' bytes)');
        }
    }

    // Get everything
    if (strlen($this->content) < 50) {

        $found = $finder->query('//body');

        if ($found !== false && $found->length > 0) {
            Logger::setMessage($caller.' No enought content fetched, get //body');
            $this->content = $document->saveXML($found->item(0));
        }
    }

    Logger::setMessage($caller.': Strip garbage');
    $this->stripGarbage();
}
/**
 * Strip useless tags and blocks from the content
 *
 * The tags listed in stripTags are always removed. The nodes whose class
 * or id contains one of the stripAttributes are removed only when
 * shouldRemove() agrees. The content is left untouched when it cannot be
 * loaded as a document.
 *
 * @access public
 */
public function stripGarbage()
{
    $document = XmlParser::getDomDocument($this->content);

    if ($document === false) {
        return;
    }

    $caller = get_called_class();
    $finder = new DOMXPath($document);

    foreach ($this->stripTags as $tag) {

        $found = $finder->query('//'.$tag);

        if ($found === false || $found->length <= 0) {
            continue;
        }

        Logger::setMessage($caller.': Strip tag: "'.$tag.'"');

        foreach ($found as $element) {
            $element->parentNode->removeChild($element);
        }
    }

    foreach ($this->stripAttributes as $attribute) {

        $found = $finder->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

        if ($found === false || $found->length <= 0) {
            continue;
        }

        Logger::setMessage($caller.': Strip attribute: "'.$attribute.'"');

        foreach ($found as $element) {
            if ($this->shouldRemove($document, $element)) {
                $element->parentNode->removeChild($element);
            }
        }
    }

    $this->content = $document->saveXML($document->documentElement);
}
/**
 * Tell if a node can be removed from the document
 *
 * A node is kept when its text is at least 90% of the text of the whole
 * document. Everything can be removed from a document without text.
 *
 * @access public
 * @param  DomDocument   $dom
 * @param  DomNode       $node
 * @return boolean       False if the node should not be removed
 */
public function shouldRemove($dom, $node)
{
    $total = strlen($dom->textContent);

    if ($total === 0) {
        return true;
    }

    $ratio = strlen($node->textContent) * 100 / $total;

    if ($ratio < 90) {
        return true;
    }

    Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');

    return false;
}
}

View File

@ -235,6 +235,7 @@ class Attribute
'filterProtocolUrlAttribute', 'filterProtocolUrlAttribute',
'rewriteImageProxyUrl', 'rewriteImageProxyUrl',
'secureIframeSrc', 'secureIframeSrc',
'removeYouTubeAutoplay'
); );
/** /**
@ -404,6 +405,25 @@ class Attribute
return true; return true;
} }
/**
 * Removes YouTube autoplay from iframes
 *
 * Only the "src" attribute of an iframe is rewritten: when it is a https
 * youtube.com URL with "autoplay=1" in the query string, the value becomes
 * "autoplay=0". The attribute is never rejected.
 *
 * @access public
 * @param  string    $tag         Tag name
 * @param  string    $attribute   Attribute name
 * @param  string    $value       Attribute value (modified in place)
 * @return boolean                Always true
 */
public function removeYouTubeAutoplay($tag, $attribute, &$value)
{
    // The dot of "youtube.com" is escaped to match the hostname literally
    $regex = '%^(https://(?:www\.)?youtube\.com/.*\?.*autoplay=)(1)(.*)%i';

    if ($tag === 'iframe' && $attribute === 'src') {
        $rewritten = preg_replace($regex, '${1}0$3', $value);

        // preg_replace() returns the subject as is without match and null on error
        if ($rewritten !== null) {
            $value = $rewritten;
        }
    }

    return true;
}
/** /**
* Rewrite image url to use with a proxy * Rewrite image url to use with a proxy
* *

View File

@ -2,7 +2,9 @@
namespace PicoFeed\Filter; namespace PicoFeed\Filter;
use PicoFeed\Config\Config;
use PicoFeed\Client\Url; use PicoFeed\Client\Url;
use PicoFeed\Scraper\RuleLoader;
use PicoFeed\Parser\XmlParser; use PicoFeed\Parser\XmlParser;
/** /**
@ -69,6 +71,14 @@ class Html
*/ */
public $attribute = ''; public $attribute = '';
/**
* The website to filter
*
* @access private
* @var string
*/
private $website;
/** /**
* Initialize the filter, all inputs data must be encoded in UTF-8 before * Initialize the filter, all inputs data must be encoded in UTF-8 before
* *
@ -81,6 +91,7 @@ class Html
$this->input = XmlParser::HtmlToXml($html); $this->input = XmlParser::HtmlToXml($html);
$this->output = ''; $this->output = '';
$this->tag = new Tag; $this->tag = new Tag;
$this->website = $website;
$this->attribute = new Attribute(new Url($website)); $this->attribute = new Attribute(new Url($website));
} }
@ -155,9 +166,45 @@ class Html
public function postFilter() public function postFilter()
{ {
$this->output = $this->tag->removeEmptyTags($this->output); $this->output = $this->tag->removeEmptyTags($this->output);
$this->output = $this->filterRules($this->output);
$this->output = $this->tag->removeMultipleBreakTags($this->output);
$this->output = trim($this->output); $this->output = trim($this->output);
} }
/**
 * Apply the search and replace rules defined for the website
 *
 * The "filter" section of the rules loaded for the website maps URL
 * patterns to lists of regex => replacement pairs. Each list whose pattern
 * matches the value returned by getFullPath() for the website URL is
 * applied to the content.
 *
 * @access public
 * @param  string   $content   The content that should be filtered
 * @return string              The filtered content
 */
public function filterRules($content)
{
    // the constructor should require a config, then this if can be removed
    if ($this->config === null) {
        $config = new Config;
    } else {
        $config = $this->config;
    }

    $loader = new RuleLoader($config);
    $rules = $loader->getRules($this->website);

    $url = new Url($this->website);
    $sub_url = $url->getFullPath();

    if (isset($rules['filter'])) {
        foreach ($rules['filter'] as $pattern => $rule) {
            if (preg_match($pattern, $sub_url)) {
                foreach ($rule as $search => $replace) {
                    $filtered = preg_replace($search, $replace, $content);

                    // preg_replace() returns null on error (invalid regex, backtrack limit),
                    // the content is kept instead of being wiped
                    if ($filtered !== null) {
                        $content = $filtered;
                    }
                }
            }
        }
    }

    return $content;
}
/** /**
* Parse opening tag * Parse opening tag
* *

View File

@ -194,7 +194,7 @@ class Tag
* @param string $data Input data * @param string $data Input data
* @return string * @return string
*/ */
public function removeMultipleTags($data) public function removeMultipleBreakTags($data)
{ {
return preg_replace("/(<br\s*\/?>\s*)+/", "<br/>", $data); return preg_replace("/(<br\s*\/?>\s*)+/", "<br/>", $data);
} }

View File

@ -3,11 +3,11 @@
namespace PicoFeed\Parser; namespace PicoFeed\Parser;
use SimpleXMLElement; use SimpleXMLElement;
use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding; use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter; use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger; use PicoFeed\Logging\Logger;
use PicoFeed\Client\Url; use PicoFeed\Scraper\Scraper;
use PicoFeed\Client\Grabber;
/** /**
* Base parser class * Base parser class
@ -81,6 +81,14 @@ abstract class Parser
*/ */
private $enable_grabber = false; private $enable_grabber = false;
/**
* Restrict the content grabber to the pages that have a rule file
*
* @access private
* @var bool
*/
private $grabber_needs_rule_file = false;
/** /**
* Ignore those urls for the content scraper * Ignore those urls for the content scraper
* *
@ -237,11 +245,16 @@ abstract class Parser
{ {
if ($this->enable_grabber && ! in_array($item->getUrl(), $this->grabber_ignore_urls)) { if ($this->enable_grabber && ! in_array($item->getUrl(), $this->grabber_ignore_urls)) {
$grabber = new Grabber($item->getUrl()); $grabber = new Scraper($this->config);
$grabber->setConfig($this->config); $grabber->setUrl($item->getUrl());
$grabber->download();
if ($grabber->parse()) { if ($this->grabber_needs_rule_file) {
$grabber->disableCandidateParser();
}
$grabber->execute();
if ($grabber->hasRelevantContent()) {
$item->content = $grabber->getFilteredContent(); $item->content = $grabber->getFilteredContent();
} }
} }
@ -270,7 +283,6 @@ abstract class Parser
* Generate a unique id for an entry (hash all arguments) * Generate a unique id for an entry (hash all arguments)
* *
* @access public * @access public
* @param string $args Pieces of data to hash
* @return string * @return string
*/ */
public function generateId() public function generateId()
@ -383,11 +395,14 @@ abstract class Parser
* Enable the content grabber * Enable the content grabber
* *
* @access public * @access public
* @param bool $needs_rule_file true if only pages with rule files should be
* scraped
* @return \PicoFeed\Parser\Parser * @return \PicoFeed\Parser\Parser
*/ */
public function enableContentGrabber() public function enableContentGrabber($needs_rule_file = false)
{ {
$this->enable_grabber = true; $this->enable_grabber = true;
$this->grabber_needs_rule_file = $needs_rule_file;
} }
/** /**

View File

@ -1,10 +1,14 @@
<?php <?php
return array( return array(
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="entry-content"]', 'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
), 'body' => array(
'strip' => array( '//div[@class="entry-content"]',
'//*[contains(@class, "fb-like") or contains(@class, "social")]' ),
), 'strip' => array(
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
),
)
)
); );

View File

@ -1,11 +1,15 @@
<?php <?php
return array( return array(
'title' => '//header/h1', 'grabber' => array(
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/', '%.*%' => array(
'body' => array( 'title' => '//header/h1',
'//div[@class="postContent"]', 'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
), 'body' => array(
'strip' => array( '//div[@class="postContent"]',
'//*[@class="shareToolsBox"]', ),
), 'strip' => array(
'//*[@class="shareToolsBox"]',
),
)
)
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.igen.fr/ailleurs/2014/05/nvidia-va-delaisser-les-smartphones-grand-public-86031', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "field-name-body")]' 'test_url' => 'http://www.igen.fr/ailleurs/2014/05/nvidia-va-delaisser-les-smartphones-grand-public-86031',
), 'body' => array(
'strip' => array( '//div[contains(@class, "field-name-body")]'
), ),
'strip' => array(
),
)
)
); );

View File

@ -1,8 +1,11 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html', 'grabber' => array(
'title' => '//h1[@class="articleHeadline"]', '%.*%' => array(
'body' => array( 'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
'//div[@class="articleBody"]', 'body' => array(
), '//div[@class="articleBody"]',
),
)
)
); );

View File

@ -1,9 +1,12 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="KonaBody"]', 'test_url' => 'http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1',
), 'body' => array(
'strip' => array( '//div[@class="KonaBody"]',
),
'strip' => array()
)
) )
); );

View File

@ -1,16 +1,20 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="sl-art-body"]', 'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
), 'body' => array(
'strip' => array( '//div[@class="sl-art-body"]',
'//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools") or contains(@class, "sl-art-pag")]', ),
'//*[@id="mys_slate_logged_in"]', 'strip' => array(
'//*[@id="sl_article_tools_myslate_bottom"]', '//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools") or contains(@class, "sl-art-pag")]',
'//*[@id="mys_myslate"]', '//*[@id="mys_slate_logged_in"]',
'//*[@class="sl-viral-container"]', '//*[@id="sl_article_tools_myslate_bottom"]',
'//*[@class="sl-art-creds-cntr"]', '//*[@id="mys_myslate"]',
'//*[@class="sl-art-ad-midflex"]', '//*[@class="sl-viral-container"]',
'//*[@class="sl-art-creds-cntr"]',
'//*[@class="sl-art-ad-midflex"]',
)
)
) )
); );

View File

@ -1,10 +1,14 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.theguardian.com/sustainable-business/2015/feb/02/2015-hyper-transparency-global-business', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "content__main-column--article")]', 'test_url' => 'http://www.theguardian.com/sustainable-business/2015/feb/02/2015-hyper-transparency-global-business',
), 'body' => array(
'strip' => array( '//div[contains(@class, "content__main-column--article")]',
'//div[contains(@class, "meta-container")]', ),
), 'strip' => array(
'//div[contains(@class, "meta-container")]',
),
)
)
); );

View File

@ -1,25 +1,29 @@
<?php <?php
return array( return array(
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@id="bodyContent"]', 'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
), 'body' => array(
'strip' => array( '//div[@id="bodyContent"]',
"//div[@id='toc']", ),
"//div[@id='catlinks']", 'strip' => array(
"//div[@id='jump-to-nav']", "//div[@id='toc']",
"//div[@class='thumbcaption']//div[@class='magnify']", "//div[@id='catlinks']",
"//table[@class='navbox']", "//div[@id='jump-to-nav']",
"//table[contains(@class, 'infobox')]", "//div[@class='thumbcaption']//div[@class='magnify']",
"//div[@class='dablink']", "//table[@class='navbox']",
"//div[@id='contentSub']", "//table[contains(@class, 'infobox')]",
"//div[@id='siteSub']", "//div[@class='dablink']",
"//table[@id='persondata']", "//div[@id='contentSub']",
"//table[contains(@class, 'metadata')]", "//div[@id='siteSub']",
"//*[contains(@class, 'noprint')]", "//table[@id='persondata']",
"//*[contains(@class, 'printfooter')]", "//table[contains(@class, 'metadata')]",
"//*[contains(@class, 'editsection')]", "//*[contains(@class, 'noprint')]",
"//*[contains(@class, 'error')]", "//*[contains(@class, 'printfooter')]",
"//span[@title='pronunciation:']", "//*[contains(@class, 'editsection')]",
), "//*[contains(@class, 'error')]",
"//span[@title='pronunciation:']",
),
)
)
); );

View File

@ -1,17 +1,21 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.wired.com/gamelife/2013/09/ouya-free-the-games/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="entry"]', 'test_url' => 'http://www.wired.com/gamelife/2013/09/ouya-free-the-games/',
), 'body' => array(
'strip' => array( '//div[@class="entry"]',
'//*[@id="linker_widget"]', ),
'//*[contains(@class, "bio")]', 'strip' => array(
'//*[contains(@class, "entry-footer")]', '//*[@id="linker_widget"]',
'//*[contains(@class, "mobify_backtotop_link")]', '//*[contains(@class, "bio")]',
'//*[contains(@class, "gallery-navigation")]', '//*[contains(@class, "entry-footer")]',
'//*[contains(@class, "gallery-thumbnail")]', '//*[contains(@class, "mobify_backtotop_link")]',
'//img[contains(@src, "1x1")]', '//*[contains(@class, "gallery-navigation")]',
'//a[contains(@href, "creativecommons")]', '//*[contains(@class, "gallery-thumbnail")]',
), '//img[contains(@src, "1x1")]',
'//a[contains(@href, "creativecommons")]',
),
)
)
); );

View File

@ -1,11 +1,15 @@
<?php <?php
return array( return array(
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="articlePage"]', 'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
), 'body' => array(
'strip' => array( '//div[@class="articlePage"]',
'//*[@id="articleThumbnail_2"]', ),
'//*[@class="socialByline"]', 'strip' => array(
'//*[@id="articleThumbnail_2"]',
'//*[@class="socialByline"]',
)
)
) )
); );

View File

@ -1,14 +1,18 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.01net.com/editorial/624550/twitter-rachete-madbits-un-specialiste-francais-de-lanalyse-dimages/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="article_ventre_box"]', 'test_url' => 'http://www.01net.com/editorial/624550/twitter-rachete-madbits-un-specialiste-francais-de-lanalyse-dimages/',
), 'body' => array(
'strip' => array( '//div[@class="article_ventre_box"]',
'//link', ),
'//*[contains(@class, "article_navigation")]', 'strip' => array(
'//h1', '//link',
'//*[contains(@class, "article_toolbarMain")]', '//*[contains(@class, "article_navigation")]',
'//*[contains(@class, "article_imagehaute_box")]' '//h1',
'//*[contains(@class, "article_toolbarMain")]',
'//*[contains(@class, "article_imagehaute_box")]'
)
)
) )
); );

View File

@ -1,10 +1,14 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.alainonline.net/news_details.php?lang=arabic&sid=18907', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="news_details"]' 'test_url' => 'http://www.alainonline.net/news_details.php?lang=arabic&sid=18907',
), 'body' => array(
'strip' => array( '//div[@class="news_details"]'
'//div[@class="news_details"]/div/div[last()]', ),
), 'strip' => array(
'//div[@class="news_details"]/div/div[last()]',
),
)
)
); );

View File

@ -1,20 +1,23 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.allgemeine-zeitung.de/lokales/polizei/mainz-gonsenheim-unbekannte-rauben-esso-tankstelle-in-kurt-schumacher-strasse-aus_14913147.htm', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "article")][1]', 'test_url' => 'http://www.allgemeine-zeitung.de/lokales/polizei/mainz-gonsenheim-unbekannte-rauben-esso-tankstelle-in-kurt-schumacher-strasse-aus_14913147.htm',
), 'body' => array(
'strip' => array( '//div[contains(@class, "article")][1]',
'//read/h1', ),
'//*[@id="t-map"]', 'strip' => array(
'//*[contains(@class, "modules")]', '//read/h1',
'//*[contains(@class, "adsense")]', '//*[@id="t-map"]',
'//*[contains(@class, "linkbox")]', '//*[contains(@class, "modules")]',
'//*[contains(@class, "info")]', '//*[contains(@class, "adsense")]',
'//*[@class="skip"]', '//*[contains(@class, "linkbox")]',
'//*[@class="funcs"]', '//*[contains(@class, "info")]',
'//span[@class="nd address"]', '//*[@class="skip"]',
'//a[contains(@href, "abo-und-services")]' '//*[@class="funcs"]',
'//span[@class="nd address"]',
'//a[contains(@href, "abo-und-services")]'
)
)
) )
); );

View File

@ -0,0 +1,10 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.areadvd.de/news/daily-deals-angebote-bei-lautsprecher-teufel-3/',
'body' => array('//div[contains(@class,"entry")]'),
'strip' => array(),
)
)
);

View File

@ -0,0 +1,10 @@
<?php
return array(
'grabber' => array(
'%/index.php.*comic=.*%' => array(
'test_url' => 'http://www.awkwardzombie.com/index.php?comic=041315',
'body' => array('//*[@id="comic"]/img'),
'strip' => array(),
)
)
);

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://blog.fefe.de/?ts=ad706a73', 'grabber' => array(
'body' => array( '%.*%' => array(
'/html/body/ul' 'test_url' => 'http://blog.fefe.de/?ts=ad706a73',
), 'body' => array(
'strip' => array( '/html/body/ul'
), ),
'strip' => array(
),
)
)
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.bunicomic.com/comic/buni-623/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="comic-table"]', 'test_url' => 'http://www.bunicomic.com/comic/buni-623/',
), 'body' => array(
'strip' => array( '//div[@class="comic-table"]',
), ),
'strip' => array(
),
)
)
); );

View File

@ -0,0 +1,12 @@
<?php
return array(
'grabber' => array(
'%/cad/.+%' => array(
'test_url' => 'http://www.cad-comic.com/cad/20150417',
'body' => array(
'//*[@id="content"]/img'
),
'strip' => array(),
)
)
);

View File

@ -0,0 +1,10 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://chaoslife.findchaos.com/pets-in-the-wild',
'body' => array('//div[@id="comic"]'),
'strip' => array(),
)
)
);

View File

@ -0,0 +1,10 @@
<?php
return array(
'grabber' => array(
'%/comic.*%' => array(
'test_url' => 'http://cliquerefresh.com/comic/078-stating-the-obvious/',
'body' => array('//div[@class="comicImg"]/img | //div[@class="comicImg"]/a/img'),
'strip' => array(),
)
)
);

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://consomac.fr/news-2430-l-iphone-6-toujours-un-secret-bien-garde.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@id, "newscontent")]', 'test_url' => 'http://consomac.fr/news-2430-l-iphone-6-toujours-un-secret-bien-garde.html',
), 'body' => array(
'strip' => array( '//div[contains(@id, "newscontent")]',
), ),
'strip' => array(
),
)
)
); );

26
vendor/fguillot/picofeed/lib/PicoFeed/Rules/dailyjs.com.php vendored Executable file → Normal file
View File

@ -1,15 +1,19 @@
<?php <?php
return array( return array(
'test_url' => 'http://dailyjs.com/2014/08/07/p5js/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@id="post"]', 'test_url' => 'http://dailyjs.com/2014/08/07/p5js/',
), 'body' => array(
'strip' => array( '//div[@id="post"]',
'//h2[@class="post"]', ),
'//div[@class="meta"]', 'strip' => array(
'//*[contains(@class, "addthis_toolbox")]', '//h2[@class="post"]',
'//*[contains(@class, "addthis_default_style")]', '//div[@class="meta"]',
'//*[@class="navigation small"]', '//*[contains(@class, "addthis_toolbox")]',
'//*[@id="related"]', '//*[contains(@class, "addthis_default_style")]',
'//*[@class="navigation small"]',
'//*[@id="related"]',
)
)
) )
); );

View File

@ -1,10 +1,14 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.degroupnews.com/medias/vodsvod/amazon-concurrence-la-chromecast-de-google-avec-fire-tv-stick', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="contenu"]', 'test_url' => 'http://www.degroupnews.com/medias/vodsvod/amazon-concurrence-la-chromecast-de-google-avec-fire-tv-stick',
), 'body' => array(
'strip' => array( '//div[@class="contenu"]',
'//div[contains(@class, "a2a")]' ),
), 'strip' => array(
'//div[contains(@class, "a2a")]'
),
)
)
); );

View File

@ -1,10 +1,14 @@
<?php <?php
return array( return array(
'test_url' => 'http://derstandard.at/2000010267354/The-Witcher-3-Hohe-Hardware-Anforderungen-fuer-PC-Spieler?ref=rss', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="copytext"]', 'test_url' => 'http://derstandard.at/2000010267354/The-Witcher-3-Hohe-Hardware-Anforderungen-fuer-PC-Spieler?ref=rss',
'//ul[@id="media-list"]', 'body' => array(
), '//div[@class="copytext"]',
'strip' => array( '//ul[@id="media-list"]',
), ),
'strip' => array(
),
)
)
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://distrowatch.com/?newsid=08355', 'grabber' => array(
'body' => array( '%.*%' => array(
'//td[@class="NewsText"][1]', 'test_url' => 'http://distrowatch.com/?newsid=08355',
), 'body' => array(
'strip' => array( '//td[@class="NewsText"][1]',
),
'strip' => array(
)
)
) )
); );

View File

@ -1,11 +1,15 @@
<?php <?php
return array( return array(
'test_url' => 'http://dozodomo.com/bento/2014/03/04/lart-des-maki-de-takayo-kiyota/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="joke"]', 'test_url' => 'http://dozodomo.com/bento/2014/03/04/lart-des-maki-de-takayo-kiyota/',
'//div[@class="story-cover"]', 'body' => array(
'//div[@class="story-content"]', '//div[@class="joke"]',
), '//div[@class="story-cover"]',
'strip' => array( '//div[@class="story-content"]',
),
'strip' => array(
)
)
) )
); );

View File

@ -0,0 +1,10 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.engadget.com/2015/04/20/dark-matter-discovery/?ncid=rss_truncated',
'body' => array('//div[@class="article-content"]/p[not(@class="read-more")] | //div[@class="article-content"]/div[@style="text-align: center;"]'),
'strip' => array(),
)
)
);

View File

@ -0,0 +1,42 @@
<?php
// Content grabber rules; all sample pages below are hosted on www.escapistmagazine.com.
//
// Each comic series has its own pattern because the strip image sits at a
// different place in the markup of each series.
// NOTE(review): presumably the patterns are matched against the article URL - confirm.
//
// Per-rule keys:
//   body     - XPath expressions selecting the content to extract
//   test_url - a matching page used to test the grabber
//   strip    - XPath expressions removed from the extracted content (none here)
//
// The test URLs use plain "&" query separators. They previously carried the
// HTML entity "&amp;", which is only valid inside markup; in a raw URL it
// renames the parameters to "amp;utm_medium" and "amp;utm_campaign".
return array(
    'grabber' => array(
        '%/articles/view/comicsandcosplay/comics/critical-miss.*%' => array(
            // Comic image plus every paragraph following the folder navigation links.
            'body' => array('//*[@class="body"]/span/img | //div[@class="folder_nav_links"]/following::p'),
            'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/critical-miss/13776-Critical-Miss-on-Framerates?utm_source=rss&utm_medium=rss&utm_campaign=articles',
            'strip' => array(),
        ),
        '%/articles/view/comicsandcosplay/comics/namegame.*%' => array(
            // Images whose height attribute is not "120".
            'body' => array('//*[@class="body"]/span/p/img[@height != "120"]'),
            'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/namegame/9759-Leaving-the-Nest?utm_source=rss&utm_medium=rss&utm_campaign=articles',
            'strip' => array(),
        ),
        '%/articles/view/comicsandcosplay/comics/stolen-pixels.*%' => array(
            // Image inside the second paragraph of the body.
            'body' => array('//*[@class="body"]/span/p[2]/img'),
            'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/stolen-pixels/8866-Stolen-Pixels-258-Where-the-Boys-Are?utm_source=rss&utm_medium=rss&utm_campaign=articles',
            'strip' => array(),
        ),
        '%/articles/view/comicsandcosplay/comics/bumhugparade.*%' => array(
            'body' => array('//*[@class="body"]/span/p[2]/img'),
            'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/bumhugparade/8262-Bumhug-Parade-13?utm_source=rss&utm_medium=rss&utm_campaign=articles',
            'strip' => array(),
        ),
        // Unlike its siblings, this pattern allows arbitrary text between
        // "comicsandcosplay" and "/comics/"; kept as-is to preserve what it matches.
        '%/articles/view/comicsandcosplay.*/comics/escapistradiotheater%' => array(
            'body' => array('//*[@class="body"]/span/p[2]/img'),
            'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/escapistradiotheater/8265-The-Escapist-Radio-Theater-13?utm_source=rss&utm_medium=rss&utm_campaign=articles',
            'strip' => array(),
        ),
        '%/articles/view/comicsandcosplay/comics/paused.*%' => array(
            // The image is either in the second paragraph or directly inside a <div>.
            'body' => array('//*[@class="body"]/span/p[2]/img | //*[@class="body"]/span/div/img'),
            'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/paused/8263-Paused-16?utm_source=rss&utm_medium=rss&utm_campaign=articles',
            'strip' => array(),
        ),
        '%/articles/view/comicsandcosplay/comics/fraughtwithperil.*%' => array(
            // The whole body element is kept for this series.
            'body' => array('//*[@class="body"]'),
            'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/fraughtwithperil/12166-The-Escapist-Presents-Escapist-Comics-Critical-Miss-B-lyeh-Fhlop?utm_source=rss&utm_medium=rss&utm_campaign=articles',
            'strip' => array(),
        ),
    ),
);

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://explosm.net/comics/3803/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@id="comic-container"]', 'test_url' => 'http://explosm.net/comics/3803/',
), 'body' => array(
'strip' => array( '//div[@id="comic-container"]',
), ),
'strip' => array(
),
)
)
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.fastcodesign.com/3026548/exposure/peek-inside-the-worlds-forbidden-subway-tunnels', 'grabber' => array(
'body' => array( '%.*%' => array(
'//article[contains(@class, "body prose")]', 'test_url' => 'http://www.fastcodesign.com/3026548/exposure/peek-inside-the-worlds-forbidden-subway-tunnels',
), 'body' => array(
'strip' => array( '//article[contains(@class, "body prose")]',
),
'strip' => array(
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.fastcoexist.com/3026114/take-a-seat-on-this-gates-funded-future-toilet-that-will-change-how-we-think-about-poop', 'grabber' => array(
'body' => array( '%.*%' => array(
'//article[contains(@class, "body prose")]', 'test_url' => 'http://www.fastcoexist.com/3026114/take-a-seat-on-this-gates-funded-future-toilet-that-will-change-how-we-think-about-poop',
), 'body' => array(
'strip' => array( '//article[contains(@class, "body prose")]',
),
'strip' => array(
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.fastcompany.com/3026712/fast-feed/elon-musk-an-apple-tesla-merger-is-very-unlikely', 'grabber' => array(
'body' => array( '%.*%' => array(
'//article[contains(@class, "body prose")]', 'test_url' => 'http://www.fastcompany.com/3026712/fast-feed/elon-musk-an-apple-tesla-merger-is-very-unlikely',
), 'body' => array(
'strip' => array( '//article[contains(@class, "body prose")]',
),
'strip' => array(
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.ffworld.com/?rub=news&page=voir&id=2709', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="news_body"]', 'test_url' => 'http://www.ffworld.com/?rub=news&page=voir&id=2709',
), 'body' => array(
'strip' => array( '//div[@class="news_body"]',
),
'strip' => array(
)
)
) )
); );

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on www.fowllanguagecomics.com.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Extract the element with id "comic" and any element classed "post-image".
            'body' => array(
                '//*[@id="comic"] | //*[@class="post-image"]',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
            // Page used to test the grabber.
            'test_url' => 'http://www.fowllanguagecomics.com/comic/working-out/',
        ),
    ),
);

View File

@ -1,10 +1,14 @@
<?php <?php
return array( return array(
'test_url' => 'https://github.com/audreyr/favicon-cheat-sheet', 'grabber' => array(
'body' => array( '%.*%' => array(
'//article[contains(@class, "entry-content")]', 'test_url' => 'https://github.com/audreyr/favicon-cheat-sheet',
), 'body' => array(
'strip' => array( '//article[contains(@class, "entry-content")]',
'//h1' ),
'strip' => array(
'//h1'
)
)
) )
); );

View File

@ -1,8 +1,12 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.golem.de/news/breko-telekom-verzoegert-gezielt-den-vectoring-ausbau-1311-102974.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//header[@class="cluster-header"]', 'test_url' => 'http://www.golem.de/news/breko-telekom-verzoegert-gezielt-den-vectoring-ausbau-1311-102974.html',
'//div[@class="formatted"]' 'body' => array(
'//header[@class="cluster-header"]',
'//div[@class="formatted"]'
)
)
) )
); );

View File

@ -1,7 +1,11 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.heise.de/security/meldung/BND-300-Millionen-Euro-fuer-Fruehwarnsystem-gegen-Cyber-Attacken-2192237.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="meldung_wrapper"]' 'test_url' => 'http://www.heise.de/security/meldung/BND-300-Millionen-Euro-fuer-Fruehwarnsystem-gegen-Cyber-Attacken-2192237.html',
'body' => array(
'//div[@class="meldung_wrapper"]'
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.huffingtonpost.com/2014/02/20/centscere-social-media-syracuse_n_4823848.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//article[@class="content")]', 'test_url' => 'http://www.huffingtonpost.com/2014/02/20/centscere-social-media-syracuse_n_4823848.html',
), 'body' => array(
'strip' => array( '//article[@class="content")]',
),
'strip' => array(
)
)
) )
); );

View File

@ -1,8 +1,12 @@
<?php <?php
return array( return array(
'test_url' => 'http://ing.dk/artikel/smart-husisolering-og-styring-skal-mindske-japans-energikrise-164517', 'grabber' => array(
'body' => array( '%.*%' => array(
'//section[contains(@class, "teaser")]', 'test_url' => 'http://ing.dk/artikel/smart-husisolering-og-styring-skal-mindske-japans-energikrise-164517',
'//section[contains(@class, "body")]', 'body' => array(
'//section[contains(@class, "teaser")]',
'//section[contains(@class, "body")]',
)
)
) )
); );

View File

@ -1,7 +1,11 @@
<?php <?php
return array( return array(
'test_url' => 'http://www./2014/05/20/le-playstation-now-arrive-en-beta-fermee-aux-etats-unis/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="post-content"]', 'test_url' => 'http://www./2014/05/20/le-playstation-now-arrive-en-beta-fermee-aux-etats-unis/',
'body' => array(
'//div[@class="post-content"]',
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.kanpai.fr/japon/comment-donner-lheure-en-japonais.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="single-left"]', 'test_url' => 'http://www.kanpai.fr/japon/comment-donner-lheure-en-japonais.html',
), 'body' => array(
'strip' => array( '//div[@class="single-left"]',
),
'strip' => array(
)
)
) )
); );

View File

@ -1,8 +1,12 @@
<?php <?php
return array( return array(
'test_url' => 'http://karriere.jobfinder.dk/artikel/dansk-professor-skal-lede-smart-grid-forskning-20-millioner-dollars-763', 'grabber' => array(
'body' => array( '%.*%' => array(
'//section[contains(@class, "teaser")]', 'test_url' => 'http://karriere.jobfinder.dk/artikel/dansk-professor-skal-lede-smart-grid-forskning-20-millioner-dollars-763',
'//section[contains(@class, "body")]', 'body' => array(
'//section[contains(@class, "teaser")]',
'//section[contains(@class, "body")]',
)
)
) )
); );

View File

@ -1,13 +1,17 @@
<?php <?php
return array( return array(
'test_url' => 'http://lejapon.fr/guide-voyage-japon/5223/tokyo-sous-la-neige.htm', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="entry"]' 'test_url' => 'http://lejapon.fr/guide-voyage-japon/5223/tokyo-sous-la-neige.htm',
), 'body' => array(
'strip' => array( '//div[@class="entry"]'
'//*[contains(@class, "addthis_toolbox")]', ),
'//*[contains(@class, "addthis_default_style")]', 'strip' => array(
'//*[@class="navigation small"]', '//*[contains(@class, "addthis_toolbox")]',
'//*[@id="related"]', '//*[contains(@class, "addthis_default_style")]',
'//*[@class="navigation small"]',
'//*[@id="related"]',
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://lesjoiesducode.fr/post/75576211207/quand-lappli-ne-fonctionne-plus-sans-aucune-raison', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="blog-post-content"]', 'test_url' => 'http://lesjoiesducode.fr/post/75576211207/quand-lappli-ne-fonctionne-plus-sans-aucune-raison',
), 'body' => array(
'strip' => array( '//div[@class="blog-post-content"]',
),
'strip' => array(
)
)
) )
); );

View File

@ -0,0 +1,13 @@
<?php
// Content grabber rules; the sample page below is hosted on www.lfg.co.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber (a FeedBurner-tagged link).
            'test_url' => 'http://www.lfg.co/page/871/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+LookingForGroup+%28Looking+For+Group%29&utm_content=FeedBurner',
            // Extract the image directly under #comic and any element classed "content".
            'body' => array('//*[@id="comic"]/img | //*[@class="content"]'),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,14 +1,18 @@
<?php <?php
return array( return array(
'test_url' => 'http://lifehacker.com/bring-water-bottle-caps-into-concerts-to-protect-your-d-1269334973', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "row")/img', 'test_url' => 'http://lifehacker.com/bring-water-bottle-caps-into-concerts-to-protect-your-d-1269334973',
'//div[contains(@class, "content-column")]', 'body' => array(
), '//div[contains(@class, "row")/img',
'strip' => array( '//div[contains(@class, "content-column")]',
'//*[contains(@class, "meta")]', ),
'//span[contains(@class, "icon")]', 'strip' => array(
'//h1', '//*[contains(@class, "meta")]',
'//aside', '//span[contains(@class, "icon")]',
'//h1',
'//aside',
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://lists.freebsd.org/pipermail/freebsd-announce/2013-September/001504.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//pre', 'test_url' => 'http://lists.freebsd.org/pipermail/freebsd-announce/2013-September/001504.html',
), 'body' => array(
'strip' => array( '//pre',
),
'strip' => array(
)
)
) )
); );

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on www.loadingartist.com.
return array(
    'grabber' => array(
        // Only inputs containing "/comic" select this rule.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%/comic.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://www.loadingartist.com/comic/lifted-spirits/',
            // Extract the <div> classed "comic".
            'body' => array(
                '//div[@class="comic"]',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on loldwell.com.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://loldwell.com/?comic=food-math-101',
            // Extract the element with id "comic".
            'body' => array(
                '//*[@id="comic"]',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.macg.co//logiciels/2014/05/feedly-sameliore-un-petit-peu-sur-mac-82205', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "field-name-body")]' 'test_url' => 'http://www.macg.co//logiciels/2014/05/feedly-sameliore-un-petit-peu-sur-mac-82205',
), 'body' => array(
'strip' => array( '//div[contains(@class, "field-name-body")]'
), ),
'strip' => array(
),
)
)
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://marc.info/?l=openbsd-misc&m=141987113202061&w=2', 'grabber' => array(
'body' => array( '%.*%' => array(
'//pre', 'test_url' => 'http://marc.info/?l=openbsd-misc&m=141987113202061&w=2',
), 'body' => array(
'strip' => array( '//pre',
), ),
'strip' => array(
),
)
)
); );

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on maximumble.thebookofbiff.com.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://maximumble.thebookofbiff.com/2015/04/20/1084-change/',
            // Extract the linked image nested under the <div> with id "comic".
            'body' => array(
                '//div[@id="comic"]/div/a/img',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'https://medium.com/lessons-learned/917b8b63ae3e', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "post-field body")]', 'test_url' => 'https://medium.com/lessons-learned/917b8b63ae3e',
), 'body' => array(
'strip' => array( '//div[contains(@class, "post-field body")]',
),
'strip' => array(
)
)
) )
); );

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on www.metronieuws.nl.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://www.metronieuws.nl/sport/2015/04/broer-fellaini-zorgde-bijna-voor-paniek-bij-mourinho',
            // Extract the "image-component" block of the article top section and
            // the first <div> child of the full-width article container.
            'body' => array(
                '//div[contains(@class,"article-top")]/div[contains(@class,"image-component")] | //div[@class="article-full-width"]/div[1]',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on mokepon.smackjeeves.com.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://mokepon.smackjeeves.com/comics/2120096/chapter-9-page-68/',
            // Extract the comic image whether or not it is wrapped in a link.
            'body' => array(
                '//*[@id="comic_area_inner"]/img | //*[@id="comic_area_inner"]/a/img',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.monwindowsphone.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="blog-post-body"]' 'test_url' => 'http://www.monwindowsphone.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html',
), 'body' => array(
'strip' => array( '//div[@class="blog-post-body"]'
), ),
'strip' => array(
),
)
)
); );

View File

@ -0,0 +1,12 @@
<?php
// Content grabber rules; the sample page below is hosted on www.neustadt-ticker.de.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://www.neustadt-ticker.de/36480/aktuell/nachrichten/buergerbuero-neustadt-ab-heute-wieder-geoeffnet',
            // Extract the "PostContent" <div> of the article, provided it has at
            // least one child whose class does not contain "navigation".
            'body' => array(
                '//div[contains(@class,"article")]/div[@class="PostContent" and *[not(contains(@class, "navigation"))]]',
            ),
            // Remove the element with id "wp_rp_first" from the extracted content.
            'strip' => array('//*[@id="wp_rp_first"]'),
        ),
    ),
);

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on niceteethcomic.com.
return array(
    'grabber' => array(
        // Only inputs containing "/archives" select this rule.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%/archives.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://niceteethcomic.com/archives/page119/',
            // Extract the linked image inside the element classed "comicpane".
            'body' => array(
                '//*[@class="comicpane"]/a/img',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -0,0 +1,8 @@
<?php
// Filter rules (no grabber section) referring to images on static.nichtlustig.de.
// NOTE(review): presumably each inner entry is a "regex => replacement" pair
// applied to the feed item content - confirm against the PicoFeed filter code.
return array(
    'filter' => array(
        // '.*' accepts any input, so this single rule set is always the one selected.
        '%.*%' => array(
            // Captures the numeric comic id from a ".../comics/full/<id>" reference;
            // the leading and trailing ".*" (with the "s" modifier, so newlines are
            // included) make the pattern span the entire input. The replacement is
            // a lone <img> tag pointing at the full-size JPEG for that id.
            '%.*static.nichtlustig.de/comics/full/(\\d+).*%s'
                => '<img src="http://static.nichtlustig.de/comics/full/$1.jpg" />',
        ),
    ),
);

View File

@ -1,16 +1,20 @@
<?php <?php
return array( return array(
'test_url' => 'https://www.openrightsgroup.org/blog/2014/3-days-to-go-till-orgcon2014', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "content")]/div', 'test_url' => 'https://www.openrightsgroup.org/blog/2014/3-days-to-go-till-orgcon2014',
), 'body' => array(
'strip' => array( '//div[contains(@class, "content")]/div',
'//h2[1]', ),
'//div[@class="info"]', 'strip' => array(
'//div[@class="tags"]', '//h2[1]',
'//div[@class="comments"]', '//div[@class="info"]',
'//div[@class="breadcrumbs"]', '//div[@class="tags"]',
'//h1[@class="pageTitle"]', '//div[@class="comments"]',
'//p[@class="bookmarkThis"]', '//div[@class="breadcrumbs"]',
), '//h1[@class="pageTitle"]',
'//p[@class="bookmarkThis"]',
),
)
)
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://pastebin.com/ed1pP9Ak', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="text"]', 'test_url' => 'http://pastebin.com/ed1pP9Ak',
), 'body' => array(
'strip' => array( '//div[@class="text"]',
),
'strip' => array(
)
)
) )
); );

View File

@ -0,0 +1,21 @@
<?php
// Content grabber rules; the sample pages below are hosted on penny-arcade.com.
// News posts and comic pages use different markup, so each gets its own rule.
// NOTE(review): presumably the patterns are matched against the article URL - confirm.
return array(
    'grabber' => array(
        // News posts: keep every child of the element classed "postBody".
        '%/news/.*%' => array(
            'test_url' => 'http://penny-arcade.com/news/post/2015/04/15/101-part-two',
            'body' => array('//*[@class="postBody"]/*'),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
        // Comic pages: keep only the linked image inside #comicFrame.
        '%/comic/.*%' => array(
            'test_url' => 'http://penny-arcade.com/comic/2015/04/15',
            'body' => array('//*[@id="comicFrame"]/a/img'),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,7 +1,11 @@
<?php <?php
return array( return array(
'test_url' => 'https://plus.google.com/+LarryPage/posts/Lh8SKC6sED1', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@role="article"]/div[contains(@class, "eE")]', 'test_url' => 'https://plus.google.com/+LarryPage/posts/Lh8SKC6sED1',
), 'body' => array(
'//div[@role="article"]/div[contains(@class, "eE")]',
),
)
)
); );

20
vendor/fguillot/picofeed/lib/PicoFeed/Rules/putaindecode.fr.php vendored Executable file → Normal file
View File

@ -1,12 +1,16 @@
<?php <?php
return array( return array(
'test_url' => 'http://putaindecode.fr/posts/js/etat-lieux-js-modulaire-front/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//*[@class="putainde-Post-md"]', 'test_url' => 'http://putaindecode.fr/posts/js/etat-lieux-js-modulaire-front/',
), 'body' => array(
'strip' => array( '//*[@class="putainde-Post-md"]',
'//*[contains(@class, "inlineimg")]', ),
'//*[contains(@class, "comment-respond")]', 'strip' => array(
'//header' '//*[contains(@class, "inlineimg")]',
'//*[contains(@class, "comment-respond")]',
'//header'
)
)
) )
); );

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm', 'grabber' => array(
'body' => array( '%.*%' => array(
'//*[@id="article"]/div[contains(@class, "content")]', 'test_url' => 'http://rue89.feedsportal.com/c/33822/f/608948/s/30999fa0/sc/24/l/0L0Srue890N0C20A130C0A80C30A0Cfaisait0Eboris0Eboillon0Eex0Esarko0Eboy0E350A0E0A0A0A0Eeuros0Egare0Enord0E245315/story01.htm',
), 'body' => array(
'strip' => array( '//*[@id="article"]/div[contains(@class, "content")]',
),
'strip' => array(
)
)
) )
); );

View File

@ -0,0 +1,12 @@
<?php
// Content grabber rules; the sample page below is hosted on satwcomic.com.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://satwcomic.com/day-at-the-beach',
            // Extract the linked image inside the centered part of the container <div>.
            'body' => array('//div[@class="container"]/center/a/img'),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -0,0 +1,12 @@
<?php
// Content grabber rules; the sample page below is hosted on www.scrumalliance.org.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'https://www.scrumalliance.org/community/articles/2015/march/an-introduction-to-agile-project-intake?feed=articles',
            // Extract the <div> classed "article_content".
            'body' => array('//div[@class="article_content"]'),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,9 +1,13 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.sitepoint.com/creating-hello-world-app-swift/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//section[@class="article_body"]', 'test_url' => 'http://www.sitepoint.com/creating-hello-world-app-swift/',
), 'body' => array(
'strip' => array( '//section[@class="article_body"]',
), ),
'strip' => array(
),
)
)
); );

View File

@ -0,0 +1,11 @@
<?php
// Content grabber rules; the sample page below is hosted on science.slashdot.org.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://science.slashdot.org/story/15/04/20/0528253/pull-top-can-tabs-at-50-reach-historic-archaeological-status',
            // Extract the "body" <div> of an article, or the "elips" <div> of an
            // article classed "layout-article".
            'body' => array(
                '//article/div[@class="body"] | //article[@class="layout-article"]/div[@class="elips"]',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,15 +1,19 @@
<?php <?php
return array( return array(
'test_url' => 'http://smallhousebliss.com/2013/08/29/house-g-by-lode-architecture/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[@class="post-content"]', 'test_url' => 'http://smallhousebliss.com/2013/08/29/house-g-by-lode-architecture/',
), 'body' => array(
'strip' => array( '//div[@class="post-content"]',
'//*[contains(@class, "gallery")]', ),
'//*[contains(@class, "share")]', 'strip' => array(
'//*[contains(@class, "wpcnt")]', '//*[contains(@class, "gallery")]',
'//*[contains(@class, "meta")]', '//*[contains(@class, "share")]',
'//*[contains(@class, "postitle")]', '//*[contains(@class, "wpcnt")]',
'//*[@id="nav-below"]', '//*[contains(@class, "meta")]',
'//*[contains(@class, "postitle")]',
'//*[@id="nav-below"]',
)
)
) )
); );

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on smarthomewelt.de.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://smarthomewelt.de/apple-tv-amazon-echo-smart-home/',
            // Extract the paragraphs and the "wp-caption" blocks that are direct
            // children of the "entry-inner" <div>.
            'body' => array(
                '//div[@class="entry-inner"]/p | //div[@class="entry-inner"]/div[contains(@class,"wp-caption")]',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is hosted on www.smashingmagazine.com.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://www.smashingmagazine.com/2015/04/17/using-sketch-for-responsive-web-design-case-study/',
            // Extract the paragraphs that are direct children of the post <article>.
            'body' => array(
                '//article[contains(@class,"post")]/p',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,7 +1,11 @@
<?php <?php
return array( return array(
'test_url' => 'http://www.spiegel.de/politik/ausland/afrika-angola-geht-gegen-islam-vor-und-schliesst-moscheen-a-935788.html', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "article-section")]' 'test_url' => 'http://www.spiegel.de/politik/ausland/afrika-angola-geht-gegen-islam-vor-und-schliesst-moscheen-a-935788.html',
'body' => array(
'//div[contains(@class, "article-section")]'
)
)
) )
); );

View File

@ -0,0 +1,10 @@
<?php
// Content grabber rules; the sample page below is a sz.de short link.
return array(
    'grabber' => array(
        // '.*' accepts any input, so this single rule is always the one selected.
        // NOTE(review): presumably the pattern is matched against the article URL - confirm.
        '%.*%' => array(
            // Page used to test the grabber.
            'test_url' => 'http://sz.de/1.2443161',
            // Extract every image in the "topenrichment" section, plus each sibling
            // that precedes the "authors" section of the body and whose class does
            // not contain "ad".
            'body' => array(
                '//article[@id="sitecontent"]/section[@class="topenrichment"]//img | //article[@id="sitecontent"]/section[@class="body"]/section[@class="authors"]/preceding-sibling::*[not(contains(@class, "ad"))]',
            ),
            // No elements are removed from the extracted content.
            'strip' => array(),
        ),
    ),
);

View File

@ -1,11 +1,15 @@
<?php <?php
return array( return array(
'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/', 'grabber' => array(
'body' => array( '%.*%' => array(
'//div[contains(@class, "media-container")]', 'test_url' => 'http://techcrunch.com/2013/08/31/indias-visa-maze/',
'//div[@class="body-copy"]', 'body' => array(
), '//div[contains(@class, "media-container")]',
'strip' => array( '//div[@class="body-copy"]',
'//*[contains(@class, "module-crunchbase")]' ),
'strip' => array(
'//*[contains(@class, "module-crunchbase")]'
)
)
) )
); );

Some files were not shown because too many files have changed in this diff Show More