Update PicoFeed to have ContentGenerator
This commit is contained in:
parent
b8a9c91e79
commit
165acb0342
@ -15,7 +15,7 @@
|
||||
"fguillot/simple-validator": "v1.0.0",
|
||||
"fguillot/json-rpc": "v1.0.2",
|
||||
"fguillot/picodb": "v1.0.2",
|
||||
"fguillot/picofeed": "v0.1.19"
|
||||
"fguillot/picofeed": "v0.1.20"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "4.8.3",
|
||||
|
9
vendor/composer/autoload_classmap.php
vendored
9
vendor/composer/autoload_classmap.php
vendored
@ -25,6 +25,7 @@ return array(
|
||||
'PicoDb\\SQLException' => $vendorDir . '/fguillot/picodb/lib/PicoDb/SQLException.php',
|
||||
'PicoDb\\Schema' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Schema.php',
|
||||
'PicoDb\\Table' => $vendorDir . '/fguillot/picodb/lib/PicoDb/Table.php',
|
||||
'PicoFeed\\Base' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Base.php',
|
||||
'PicoFeed\\Client\\Client' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Client.php',
|
||||
'PicoFeed\\Client\\ClientException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/ClientException.php',
|
||||
'PicoFeed\\Client\\Curl' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Client/Curl.php',
|
||||
@ -42,6 +43,9 @@ return array(
|
||||
'PicoFeed\\Filter\\Filter' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php',
|
||||
'PicoFeed\\Filter\\Html' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Html.php',
|
||||
'PicoFeed\\Filter\\Tag' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Filter/Tag.php',
|
||||
'PicoFeed\\Generator\\ContentGeneratorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php',
|
||||
'PicoFeed\\Generator\\FileContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/FileContentGenerator.php',
|
||||
'PicoFeed\\Generator\\YoutubeContentGenerator' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php',
|
||||
'PicoFeed\\Logging\\Logger' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Logging/Logger.php',
|
||||
'PicoFeed\\Parser\\Atom' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php',
|
||||
'PicoFeed\\Parser\\DateParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php',
|
||||
@ -57,6 +61,11 @@ return array(
|
||||
'PicoFeed\\Parser\\XmlEntityException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlEntityException.php',
|
||||
'PicoFeed\\Parser\\XmlParser' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php',
|
||||
'PicoFeed\\PicoFeedException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/PicoFeedException.php',
|
||||
'PicoFeed\\Processor\\ContentFilterProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php',
|
||||
'PicoFeed\\Processor\\ContentGeneratorProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php',
|
||||
'PicoFeed\\Processor\\ItemPostProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php',
|
||||
'PicoFeed\\Processor\\ItemProcessorInterface' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php',
|
||||
'PicoFeed\\Processor\\ScraperProcessor' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Processor/ScraperProcessor.php',
|
||||
'PicoFeed\\Reader\\Favicon' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Favicon.php',
|
||||
'PicoFeed\\Reader\\Reader' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/Reader.php',
|
||||
'PicoFeed\\Reader\\ReaderException' => $vendorDir . '/fguillot/picofeed/lib/PicoFeed/Reader/ReaderException.php',
|
||||
|
12
vendor/composer/installed.json
vendored
12
vendor/composer/installed.json
vendored
@ -163,17 +163,17 @@
|
||||
},
|
||||
{
|
||||
"name": "fguillot/picofeed",
|
||||
"version": "v0.1.19",
|
||||
"version_normalized": "0.1.19.0",
|
||||
"version": "v0.1.20",
|
||||
"version_normalized": "0.1.20.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/fguillot/picoFeed.git",
|
||||
"reference": "c270ef4474a2460d857f99c84612025c5f9975f2"
|
||||
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/c270ef4474a2460d857f99c84612025c5f9975f2",
|
||||
"reference": "c270ef4474a2460d857f99c84612025c5f9975f2",
|
||||
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
|
||||
"reference": "d6bbdd248fa4a3eef7831ffaae0491a2ea58f897",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
@ -188,7 +188,7 @@
|
||||
"suggest": {
|
||||
"ext-curl": "PicoFeed will use cURL if present"
|
||||
},
|
||||
"time": "2016-02-11 19:52:02",
|
||||
"time": "2016-03-24 12:09:56",
|
||||
"bin": [
|
||||
"picofeed"
|
||||
],
|
||||
|
34
vendor/fguillot/picofeed/lib/PicoFeed/Base.php
vendored
Normal file
34
vendor/fguillot/picofeed/lib/PicoFeed/Base.php
vendored
Normal file
@ -0,0 +1,34 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed;
|
||||
|
||||
use PicoFeed\Config\Config;
|
||||
use PicoFeed\Logging\Logger;
|
||||
|
||||
/**
 * Base class
 *
 * Common parent of the PicoFeed components that work with a Config object.
 *
 * @package PicoFeed
 * @author  Frederic Guillot
 */
abstract class Base
{
    /**
     * Configuration available to every subclass
     *
     * @access protected
     * @var \PicoFeed\Config\Config
     */
    protected $config;

    /**
     * Constructor.
     *
     * Keeps the given configuration, or a freshly created Config when none
     * is supplied, then hands the configured timezone to the Logger.
     *
     * @param \PicoFeed\Config\Config $config Config class instance (optional)
     */
    public function __construct(Config $config = null)
    {
        if ($config === null) {
            $config = new Config();
        }

        $this->config = $config;

        Logger::setTimezone($this->config->getTimezone());
    }
}
|
@ -2,24 +2,17 @@
|
||||
|
||||
namespace PicoFeed\Filter;
|
||||
|
||||
use DOMXpath;
|
||||
use DOMXPath;
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Parser\XmlParser;
|
||||
use PicoFeed\Config\Config;
|
||||
|
||||
/**
|
||||
* Tag Filter class.
|
||||
*
|
||||
* @author Frederic Guillot
|
||||
*/
|
||||
class Tag
|
||||
class Tag extends Base
|
||||
{
|
||||
/**
|
||||
* Config object.
|
||||
*
|
||||
* @var \PicoFeed\Config\Config
|
||||
*/
|
||||
private $config;
|
||||
|
||||
/**
|
||||
* Tags blacklist (Xpath expressions).
|
||||
*
|
||||
@ -76,11 +69,6 @@ class Tag
|
||||
'q',
|
||||
);
|
||||
|
||||
public function __construct(Config $config)
|
||||
{
|
||||
$this->config = $config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the tag is allowed and is not a pixel tracker.
|
||||
*
|
||||
|
23
vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php
vendored
Normal file
23
vendor/fguillot/picofeed/lib/PicoFeed/Generator/ContentGeneratorInterface.php
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Generator;
|
||||
|
||||
use PicoFeed\Parser\Item;
|
||||
|
||||
/**
 * Content Generator Interface
 *
 * Contract for classes able to build the content of a feed item themselves.
 *
 * @package PicoFeed\Generator
 * @author  Frederic Guillot
 */
interface ContentGeneratorInterface
{
    /**
     * Execute Content Generator
     *
     * Implementations set the item content when they recognise the item and
     * report whether they did so.
     *
     * @access public
     * @param  Item $item Item whose content may be replaced
     * @return boolean True when content was generated for the item
     */
    public function execute(Item $item);
}
|
36
vendor/fguillot/picofeed/lib/PicoFeed/Generator/FileContentGenerator.php
vendored
Normal file
36
vendor/fguillot/picofeed/lib/PicoFeed/Generator/FileContentGenerator.php
vendored
Normal file
@ -0,0 +1,36 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Generator;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Parser\Item;
|
||||
|
||||
/**
 * File Content Generator
 *
 * Replaces the content of items whose URL ends with a known file extension
 * by a plain link to that file.
 *
 * @package PicoFeed\Generator
 * @author  Frederic Guillot
 */
class FileContentGenerator extends Base implements ContentGeneratorInterface
{
    /**
     * URL suffixes handled by this generator
     *
     * @access private
     * @var array
     */
    private $extensions = array('pdf');

    /**
     * Execute Content Generator
     *
     * @access public
     * @param  Item $item
     * @return boolean True when the item content was replaced by a link
     */
    public function execute(Item $item)
    {
        $url = $item->getUrl();

        foreach ($this->extensions as $extension) {
            if (substr($url, - strlen($extension)) === $extension) {
                // The URL comes from the feed: escape it before embedding it in
                // markup so quotes or angle brackets cannot break out of the
                // attribute or the link text.
                $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');

                $item->setContent('<a href="'.$safeUrl.'" target="_blank">'.$safeUrl.'</a>');
                return true;
            }
        }

        return false;
    }
}
|
67
vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php
vendored
Normal file
67
vendor/fguillot/picofeed/lib/PicoFeed/Generator/YoutubeContentGenerator.php
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Generator;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Parser\Item;
|
||||
|
||||
/**
 * Youtube Content Generator
 *
 * Replaces the content of items that reference a YouTube video by an
 * embedded player iframe.
 *
 * @package PicoFeed\Generator
 * @author  Frederic Guillot
 */
class YoutubeContentGenerator extends Base implements ContentGeneratorInterface
{
    /**
     * Execute Content Generator
     *
     * Items declaring the "yt" XML namespace are resolved from their
     * yt:videoId tag, all other items from their URL.
     *
     * @access public
     * @param  Item $item
     * @return boolean True when the item content was replaced by a player
     */
    public function execute(Item $item)
    {
        if ($item->hasNamespace('yt')) {
            return $this->generateHtmlFromXml($item);
        }

        return $this->generateHtmlFromUrl($item);
    }

    /**
     * Generate HTML from the yt:videoId tag of the item
     *
     * @access private
     * @param  Item $item
     * @return boolean False when the tag is missing or empty
     */
    private function generateHtmlFromXml(Item $item)
    {
        $videoId = $item->getTag('yt:videoId');

        if (! empty($videoId)) {
            $item->setContent($this->buildPlayer($videoId[0]));
            return true;
        }

        return false;
    }

    /**
     * Generate HTML from item URL
     *
     * Only the video identifier itself is captured: additional query
     * parameters or a fragment after it must not end up in the embed URL.
     * The "v" parameter may appear anywhere in the query string.
     *
     * @access public
     * @param  Item $item
     * @return bool False when the URL is not a YouTube watch URL
     */
    public function generateHtmlFromUrl(Item $item)
    {
        if (preg_match('/youtube\.com\/watch\?(?:[^#]*?&)?v=([A-Za-z0-9_-]+)/', $item->getUrl(), $matches)) {
            $item->setContent($this->buildPlayer($matches[1]));
            return true;
        }

        return false;
    }

    /**
     * Build the iframe markup for a video identifier
     *
     * The identifier originates from the feed, so it is escaped before being
     * placed inside the src attribute.
     *
     * @access private
     * @param  string $videoId
     * @return string
     */
    private function buildPlayer($videoId)
    {
        $safeId = htmlspecialchars((string) $videoId, ENT_QUOTES, 'UTF-8');

        return '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$safeId.'" frameborder="0"></iframe>';
    }
}
|
@ -150,7 +150,7 @@ class Atom extends Parser
|
||||
$updated = XmlParser::getXPathResult($xml, 'atom:updated', $this->namespaces)
|
||||
?: XmlParser::getXPathResult($xml, 'updated');
|
||||
|
||||
$feed->date = $this->date->getDateTime((string) current($updated));
|
||||
$feed->date = $this->getDateParser()->getDateTime((string) current($updated));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -168,8 +168,8 @@ class Atom extends Parser
|
||||
$updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces)
|
||||
?: XmlParser::getXPathResult($entry, 'updated');
|
||||
|
||||
$published = !empty($published) ? $this->date->getDateTime((string) current($published)) : null;
|
||||
$updated = !empty($updated) ? $this->date->getDateTime((string) current($updated)) : null;
|
||||
$published = !empty($published) ? $this->getDateParser()->getDateTime((string) current($published)) : null;
|
||||
$updated = !empty($updated) ? $this->getDateParser()->getDateTime((string) current($updated)) : null;
|
||||
|
||||
if ($published === null && $updated === null) {
|
||||
$item->date = $feed->getDate(); // We use the feed date if there is no date for the item
|
||||
|
@ -4,20 +4,22 @@ namespace PicoFeed\Parser;
|
||||
|
||||
use DateTime;
|
||||
use DateTimeZone;
|
||||
use PicoFeed\Base;
|
||||
|
||||
/**
|
||||
* Date Parser.
|
||||
*
|
||||
* @author Frederic Guillot
|
||||
*/
|
||||
class DateParser
|
||||
class DateParser extends Base
|
||||
{
|
||||
/**
|
||||
* Timezone used to parse feed dates.
|
||||
*
|
||||
* @access private
|
||||
* @var string
|
||||
*/
|
||||
public $timezone = 'UTC';
|
||||
private $timezone = 'UTC';
|
||||
|
||||
/**
|
||||
* Supported formats [ 'format' => length ].
|
||||
@ -88,7 +90,7 @@ class DateParser
|
||||
*/
|
||||
public function getValidDate($format, $value)
|
||||
{
|
||||
$date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));
|
||||
$date = DateTime::createFromFormat($format, $value, $this->getTimeZone());
|
||||
|
||||
if ($date !== false) {
|
||||
$errors = DateTime::getLastErrors();
|
||||
@ -108,6 +110,17 @@ class DateParser
|
||||
*/
|
||||
public function getCurrentDateTime()
|
||||
{
|
||||
return new DateTime('now', new DateTimeZone($this->timezone));
|
||||
return new DateTime('now', $this->getTimeZone());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get DateTimeZone instance
|
||||
*
|
||||
* @access public
|
||||
* @return DateTimeZone
|
||||
*/
|
||||
public function getTimeZone()
|
||||
{
|
||||
return new DateTimeZone($this->config->getTimezone() ?: $this->timezone);
|
||||
}
|
||||
}
|
||||
|
@ -102,6 +102,18 @@ class Item
|
||||
*/
|
||||
public $namespaces = array();
|
||||
|
||||
/**
|
||||
* Check if a XML namespace exists
|
||||
*
|
||||
* @access public
|
||||
* @param string $namespace
|
||||
* @return bool
|
||||
*/
|
||||
public function hasNamespace($namespace)
|
||||
{
|
||||
return array_key_exists($namespace, $this->namespaces);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get specific XML tag or attribute value.
|
||||
*
|
||||
@ -112,12 +124,10 @@ class Item
|
||||
*/
|
||||
public function getTag($tag, $attribute = '')
|
||||
{
|
||||
// convert to xPath attribute query
|
||||
if ($attribute !== '') {
|
||||
$attribute = '/@'.$attribute;
|
||||
}
|
||||
|
||||
// construct query
|
||||
$query = './/'.$tag.$attribute;
|
||||
$elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);
|
||||
|
||||
@ -155,13 +165,29 @@ class Item
|
||||
}
|
||||
|
||||
/**
|
||||
* Get url.
|
||||
* Get URL
|
||||
*
|
||||
* @access public
|
||||
* @return string
|
||||
*/
|
||||
public function getUrl()
|
||||
{
|
||||
return $this->url;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set URL
|
||||
*
|
||||
* @access public
|
||||
* @param string $url
|
||||
* @return Item
|
||||
*/
|
||||
public function setUrl($url)
|
||||
{
|
||||
$this->url = $url;
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get id.
|
||||
*/
|
||||
@ -186,6 +212,19 @@ class Item
|
||||
return $this->content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set content
|
||||
*
|
||||
* @access public
|
||||
* @param string $value
|
||||
* @return Item
|
||||
*/
|
||||
public function setContent($value)
|
||||
{
|
||||
$this->content = $value;
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get enclosure url.
|
||||
*/
|
||||
|
@ -2,12 +2,15 @@
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use PicoFeed\Processor\ContentFilterProcessor;
|
||||
use PicoFeed\Processor\ContentGeneratorProcessor;
|
||||
use PicoFeed\Processor\ItemPostProcessor;
|
||||
use PicoFeed\Processor\ScraperProcessor;
|
||||
use SimpleXMLElement;
|
||||
use PicoFeed\Client\Url;
|
||||
use PicoFeed\Encoding\Encoding;
|
||||
use PicoFeed\Filter\Filter;
|
||||
use PicoFeed\Logging\Logger;
|
||||
use PicoFeed\Scraper\Scraper;
|
||||
|
||||
/**
|
||||
* Base parser class.
|
||||
@ -28,7 +31,7 @@ abstract class Parser
|
||||
*
|
||||
* @var \PicoFeed\Parser\DateParser
|
||||
*/
|
||||
protected $date;
|
||||
private $dateParser;
|
||||
|
||||
/**
|
||||
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
|
||||
@ -66,32 +69,12 @@ abstract class Parser
|
||||
protected $used_namespaces = array();
|
||||
|
||||
/**
|
||||
* Enable the content filtering.
|
||||
* Item Post Processor instance
|
||||
*
|
||||
* @var bool
|
||||
* @access private
|
||||
* @var ItemPostProcessor
|
||||
*/
|
||||
private $enable_filter = true;
|
||||
|
||||
/**
|
||||
* Enable the content grabber.
|
||||
*
|
||||
* @var bool
|
||||
*/
|
||||
private $enable_grabber = false;
|
||||
|
||||
/**
|
||||
* Enable the content grabber on all pages.
|
||||
*
|
||||
* @var bool
|
||||
*/
|
||||
private $grabber_needs_rule_file = false;
|
||||
|
||||
/**
|
||||
* Ignore those urls for the content scraper.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $grabber_ignore_urls = array();
|
||||
private $itemPostProcessor;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
@ -102,7 +85,6 @@ abstract class Parser
|
||||
*/
|
||||
public function __construct($content, $http_encoding = '', $fallback_url = '')
|
||||
{
|
||||
$this->date = new DateParser();
|
||||
$this->fallback_url = $fallback_url;
|
||||
$xml_encoding = XmlParser::getEncodingFromXmlTag($content);
|
||||
|
||||
@ -112,6 +94,10 @@ abstract class Parser
|
||||
// Encode everything in UTF-8
|
||||
Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
|
||||
$this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
|
||||
|
||||
$this->itemPostProcessor = new ItemPostProcessor($this->config);
|
||||
$this->itemPostProcessor->register(new ContentGeneratorProcessor($this->config));
|
||||
$this->itemPostProcessor->register(new ContentFilterProcessor($this->config));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -173,15 +159,11 @@ abstract class Parser
|
||||
|
||||
// Id generation can use the item url/title/content (order is important)
|
||||
$this->findItemId($entry, $item, $feed);
|
||||
|
||||
$this->findItemDate($entry, $item, $feed);
|
||||
$this->findItemEnclosure($entry, $item, $feed);
|
||||
$this->findItemLanguage($entry, $item, $feed);
|
||||
|
||||
// Order is important (avoid double filtering)
|
||||
$this->filterItemContent($feed, $item);
|
||||
$this->scrapWebsite($item);
|
||||
|
||||
$this->itemPostProcessor->execute($feed, $item);
|
||||
$feed->items[] = $item;
|
||||
}
|
||||
|
||||
@ -230,43 +212,29 @@ abstract class Parser
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch item content with the content grabber.
|
||||
* Get Item Post Processor instance
|
||||
*
|
||||
* @param Item $item Item object
|
||||
* @access public
|
||||
* @return ItemPostProcessor
|
||||
*/
|
||||
public function scrapWebsite(Item $item)
|
||||
public function getItemPostProcessor()
|
||||
{
|
||||
if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) {
|
||||
$grabber = new Scraper($this->config);
|
||||
$grabber->setUrl($item->getUrl());
|
||||
|
||||
if ($this->grabber_needs_rule_file) {
|
||||
$grabber->disableCandidateParser();
|
||||
}
|
||||
|
||||
$grabber->execute();
|
||||
|
||||
if ($grabber->hasRelevantContent()) {
|
||||
$item->content = $grabber->getFilteredContent();
|
||||
}
|
||||
}
|
||||
return $this->itemPostProcessor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter HTML for entry content.
|
||||
* Get DateParser instance
|
||||
*
|
||||
* @param Feed $feed Feed object
|
||||
* @param Item $item Item object
|
||||
* @access public
|
||||
* @return DateParser
|
||||
*/
|
||||
public function filterItemContent(Feed $feed, Item $item)
|
||||
public function getDateParser()
|
||||
{
|
||||
if ($this->isFilteringEnabled()) {
|
||||
$filter = Filter::html($item->getContent(), $feed->getSiteUrl());
|
||||
$filter->setConfig($this->config);
|
||||
$item->content = $filter->execute();
|
||||
} else {
|
||||
Logger::setMessage(get_called_class().': Content filtering disabled');
|
||||
if ($this->dateParser === null) {
|
||||
return new DateParser($this->config);
|
||||
}
|
||||
|
||||
return $this->dateParser;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -316,31 +284,11 @@ abstract class Parser
|
||||
* Set Hash algorithm used for id generation.
|
||||
*
|
||||
* @param string $algo Algorithm name
|
||||
*
|
||||
* @return \PicoFeed\Parser\Parser
|
||||
*/
|
||||
public function setHashAlgo($algo)
|
||||
{
|
||||
$this->hash_algo = $algo ?: $this->hash_algo;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a different timezone.
|
||||
*
|
||||
* @see http://php.net/manual/en/timezones.php
|
||||
*
|
||||
* @param string $timezone Timezone
|
||||
*
|
||||
* @return \PicoFeed\Parser\Parser
|
||||
*/
|
||||
public function setTimezone($timezone)
|
||||
{
|
||||
if ($timezone) {
|
||||
$this->date->timezone = $timezone;
|
||||
}
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
@ -354,7 +302,6 @@ abstract class Parser
|
||||
public function setConfig($config)
|
||||
{
|
||||
$this->config = $config;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
@ -365,21 +312,8 @@ abstract class Parser
|
||||
*/
|
||||
public function disableContentFiltering()
|
||||
{
|
||||
$this->enable_filter = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the content filtering is enabled.
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function isFilteringEnabled()
|
||||
{
|
||||
if ($this->config === null) {
|
||||
return $this->enable_filter;
|
||||
}
|
||||
|
||||
return $this->config->getContentFiltering($this->enable_filter);
|
||||
$this->itemPostProcessor->unregister('PicoFeed\Processor\ContentFilterProcessor');
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -392,8 +326,14 @@ abstract class Parser
|
||||
*/
|
||||
public function enableContentGrabber($needs_rule_file = false)
|
||||
{
|
||||
$this->enable_grabber = true;
|
||||
$this->grabber_needs_rule_file = $needs_rule_file;
|
||||
$processor = new ScraperProcessor($this->config);
|
||||
|
||||
if ($needs_rule_file) {
|
||||
$processor->getScraper()->disableCandidateParser();
|
||||
}
|
||||
|
||||
$this->itemPostProcessor->register($processor);
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -405,7 +345,8 @@ abstract class Parser
|
||||
*/
|
||||
public function setGrabberIgnoreUrls(array $urls)
|
||||
{
|
||||
$this->grabber_ignore_urls = $urls;
|
||||
$this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor')->ignoreUrls($urls);
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -149,7 +149,7 @@ class Rss10 extends Parser
|
||||
$date = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces)
|
||||
?: XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
|
||||
|
||||
$feed->date = $this->date->getDateTime((string) current($date));
|
||||
$feed->date = $this->getDateParser()->getDateTime((string) current($date));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -163,7 +163,7 @@ class Rss10 extends Parser
|
||||
{
|
||||
$date = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);
|
||||
|
||||
$item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date));
|
||||
$item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -139,11 +139,11 @@ class Rss20 extends Parser
|
||||
$publish_date = XmlParser::getXPathResult($xml, 'channel/pubDate');
|
||||
$update_date = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');
|
||||
|
||||
$published = !empty($publish_date) ? $this->date->getDateTime((string) current($publish_date)) : null;
|
||||
$updated = !empty($update_date) ? $this->date->getDateTime((string) current($update_date)) : null;
|
||||
$published = !empty($publish_date) ? $this->getDateParser()->getDateTime((string) current($publish_date)) : null;
|
||||
$updated = !empty($update_date) ? $this->getDateParser()->getDateTime((string) current($update_date)) : null;
|
||||
|
||||
if ($published === null && $updated === null) {
|
||||
$feed->date = $this->date->getCurrentDateTime(); // We use the current date if there is no date for the feed
|
||||
$feed->date = $this->getDateParser()->getCurrentDateTime(); // We use the current date if there is no date for the feed
|
||||
} elseif ($published !== null && $updated !== null) {
|
||||
$feed->date = max($published, $updated); // We use the most recent date between published and updated
|
||||
} else {
|
||||
@ -162,7 +162,7 @@ class Rss20 extends Parser
|
||||
{
|
||||
$date = XmlParser::getXPathResult($entry, 'pubDate');
|
||||
|
||||
$item->date = empty($date) ? $feed->getDate() : $this->date->getDateTime((string) current($date));
|
||||
$item->date = empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime((string) current($date));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -38,7 +38,7 @@ class XmlParser
|
||||
*
|
||||
* @param string $input XML content
|
||||
*
|
||||
* @return \DOMNDocument
|
||||
* @return \DOMDocument
|
||||
*/
|
||||
public static function getDomDocument($input)
|
||||
{
|
||||
|
37
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php
vendored
Normal file
37
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentFilterProcessor.php
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Processor;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Filter\Filter;
|
||||
use PicoFeed\Logging\Logger;
|
||||
use PicoFeed\Parser\Feed;
|
||||
use PicoFeed\Parser\Item;
|
||||
|
||||
/**
 * Item Content Filter
 *
 * Item processor that runs the HTML filter over the item content.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ContentFilterProcessor extends Base implements ItemProcessorInterface
{
    /**
     * Execute Item Processor
     *
     * When content filtering is enabled in the configuration, the item
     * content is filtered (using the feed site URL) and written back to the
     * item; otherwise a message is logged and the item is left untouched.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool Always false
     */
    public function execute(Feed $feed, Item $item)
    {
        // NOTE(review): the argument is presumably the default used when the
        // option is not set in the config - confirm against Config.
        if ($this->config->getContentFiltering(true)) {
            $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
            $filter->setConfig($this->config);
            $item->setContent($filter->execute());
        } else {
            Logger::setMessage(get_called_class().': Content filtering disabled');
        }

        // Return the boolean promised by ItemProcessorInterface instead of
        // falling off the end of the method (null). It stays falsy on purpose:
        // ItemPostProcessor::execute() stops at the first truthy result.
        return false;
    }
}
|
49
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php
vendored
Normal file
49
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ContentGeneratorProcessor.php
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Processor;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Parser\Feed;
|
||||
use PicoFeed\Parser\Item;
|
||||
|
||||
/**
 * Item Content Generator
 *
 * Item processor that offers each item to the known content generators.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ContentGeneratorProcessor extends Base implements ItemProcessorInterface
{
    /**
     * Short names of the generators, tried in this order
     *
     * Each name is turned into \PicoFeed\Generator\<Name>ContentGenerator.
     *
     * @access protected
     * @var array
     */
    protected $generators = array(
        'youtube',
        'file',
    );

    /**
     * Execute Item Processor
     *
     * Instantiates the generators one after the other and stops as soon as
     * one of them reports that it handled the item.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool True when a generator handled the item, false otherwise
     */
    public function execute(Feed $feed, Item $item)
    {
        foreach ($this->generators as $name) {
            $class = '\PicoFeed\Generator\\'.ucfirst($name).'ContentGenerator';
            $instance = new $class($this->config);

            if (! $instance->execute($item)) {
                continue;
            }

            return true;
        }

        return false;
    }
}
|
84
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php
vendored
Normal file
84
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemPostProcessor.php
vendored
Normal file
@ -0,0 +1,84 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Processor;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Parser\Feed;
|
||||
use PicoFeed\Parser\Item;
|
||||
|
||||
/**
 * Item Post Processor
 *
 * Registry of item processors, run against each item in registration order.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ItemPostProcessor extends Base
{
    /**
     * Registered processors, indexed by their class name
     *
     * @access private
     * @var array
     */
    private $processors = array();

    /**
     * Execute the registered processors
     *
     * Processors are called in registration order; the first one returning
     * a truthy value ends the run and the remaining ones are not called.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool True when a processor ended the run, false otherwise
     */
    public function execute(Feed $feed, Item $item)
    {
        foreach ($this->processors as $current) {
            $handled = $current->execute($feed, $item);

            if ($handled) {
                return true;
            }
        }

        return false;
    }

    /**
     * Register a new Item post-processor
     *
     * A processor of the same class registered earlier is replaced.
     *
     * @access public
     * @param  ItemProcessorInterface $processor
     * @return ItemPostProcessor
     */
    public function register(ItemProcessorInterface $processor)
    {
        $key = get_class($processor);
        $this->processors[$key] = $processor;

        return $this;
    }

    /**
     * Remove Processor instance
     *
     * Unknown class names are ignored.
     *
     * @access public
     * @param  string $class Class name the processor was registered under
     * @return ItemPostProcessor
     */
    public function unregister($class)
    {
        if (! isset($this->processors[$class])) {
            return $this;
        }

        unset($this->processors[$class]);

        return $this;
    }

    /**
     * Get Processor instance
     *
     * @access public
     * @param  string $class Class name the processor was registered under
     * @return ItemProcessorInterface|null Null when no such processor is registered
     */
    public function getProcessor($class)
    {
        if (isset($this->processors[$class])) {
            return $this->processors[$class];
        }

        return null;
    }
}
|
25
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php
vendored
Normal file
25
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ItemProcessorInterface.php
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Processor;
|
||||
|
||||
use PicoFeed\Parser\Feed;
|
||||
use PicoFeed\Parser\Item;
|
||||
|
||||
/**
 * Item Processor Interface
 *
 * Contract for the processing steps applied to a parsed feed item.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
interface ItemProcessorInterface
{
    /**
     * Execute Item Processor
     *
     * @access public
     * @param  Feed $feed Feed the item belongs to
     * @param  Item $item Item to process
     * @return bool
     */
    public function execute(Feed $feed, Item $item);
}
|
71
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ScraperProcessor.php
vendored
Normal file
71
vendor/fguillot/picofeed/lib/PicoFeed/Processor/ScraperProcessor.php
vendored
Normal file
@ -0,0 +1,71 @@
|
||||
<?php
|
||||
|
||||
namespace PicoFeed\Processor;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Parser\Feed;
|
||||
use PicoFeed\Parser\Item;
|
||||
use PicoFeed\Scraper\Scraper;
|
||||
|
||||
/**
 * Scraper Processor
 *
 * Item processor that replaces the item content by the content scraped
 * from the item URL.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ScraperProcessor extends Base implements ItemProcessorInterface
{
    /**
     * URLs that must not be scraped
     *
     * @access private
     * @var array
     */
    private $ignoredUrls = array();

    /**
     * Scraper instance, created on first use
     *
     * @access private
     * @var Scraper
     */
    private $scraper;

    /**
     * Execute Item Processor
     *
     * Items whose URL is on the ignore list are left untouched. For the
     * others the scraper is run on the item URL, and the item content is
     * replaced only when the scraper reports relevant content.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool Always false
     */
    public function execute(Feed $feed, Item $item)
    {
        $url = $item->getUrl();

        if (in_array($url, $this->ignoredUrls)) {
            return false;
        }

        $worker = $this->getScraper();
        $worker->setUrl($url);
        $worker->execute();

        if ($worker->hasRelevantContent()) {
            $item->setContent($worker->getFilteredContent());
        }

        return false;
    }

    /**
     * Ignore list of URLs
     *
     * Replaces the current ignore list.
     *
     * @access public
     * @param  array $urls
     * @return $this
     */
    public function ignoreUrls(array $urls)
    {
        $this->ignoredUrls = $urls;

        return $this;
    }

    /**
     * Returns Scraper instance
     *
     * The instance is built with the processor configuration on the first
     * call and reused afterwards.
     *
     * @access public
     * @return Scraper
     */
    public function getScraper()
    {
        if ($this->scraper !== null) {
            return $this->scraper;
        }

        $this->scraper = new Scraper($this->config);

        return $this->scraper;
    }
}
|
@ -2,11 +2,11 @@
|
||||
|
||||
namespace PicoFeed\Reader;
|
||||
|
||||
use DOMXpath;
|
||||
use DOMXPath;
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Client\Client;
|
||||
use PicoFeed\Client\ClientException;
|
||||
use PicoFeed\Client\Url;
|
||||
use PicoFeed\Config\Config;
|
||||
use PicoFeed\Logging\Logger;
|
||||
use PicoFeed\Parser\XmlParser;
|
||||
|
||||
@ -17,7 +17,7 @@ use PicoFeed\Parser\XmlParser;
|
||||
*
|
||||
* @author Frederic Guillot
|
||||
*/
|
||||
class Favicon
|
||||
class Favicon extends Base
|
||||
{
|
||||
/**
|
||||
* Valid types for favicon (supported by browsers).
|
||||
@ -33,13 +33,6 @@ class Favicon
|
||||
'image/svg+xml'
|
||||
);
|
||||
|
||||
/**
|
||||
* Config class instance.
|
||||
*
|
||||
* @var \PicoFeed\Config\Config
|
||||
*/
|
||||
private $config;
|
||||
|
||||
/**
|
||||
* Icon binary content.
|
||||
*
|
||||
@ -54,16 +47,6 @@ class Favicon
|
||||
*/
|
||||
private $content_type = '';
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param \PicoFeed\Config\Config $config Config class instance
|
||||
*/
|
||||
public function __construct(Config $config = null)
|
||||
{
|
||||
$this->config = $config ?: new Config();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the icon file content (available only after the download).
|
||||
*
|
||||
|
@ -3,7 +3,7 @@
|
||||
namespace PicoFeed\Reader;
|
||||
|
||||
use DOMXPath;
|
||||
use PicoFeed\Config\Config;
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Client\Client;
|
||||
use PicoFeed\Client\Url;
|
||||
use PicoFeed\Logging\Logger;
|
||||
@ -14,7 +14,7 @@ use PicoFeed\Parser\XmlParser;
|
||||
*
|
||||
* @author Frederic Guillot
|
||||
*/
|
||||
class Reader
|
||||
class Reader extends Base
|
||||
{
|
||||
/**
|
||||
* Feed formats for detection.
|
||||
@ -29,24 +29,6 @@ class Reader
|
||||
'Rss10' => '//rdf',
|
||||
);
|
||||
|
||||
/**
|
||||
* Config class instance.
|
||||
*
|
||||
* @var \PicoFeed\Config\Config
|
||||
*/
|
||||
private $config;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param \PicoFeed\Config\Config $config Config class instance
|
||||
*/
|
||||
public function __construct(Config $config = null)
|
||||
{
|
||||
$this->config = $config ?: new Config();
|
||||
Logger::setTimezone($this->config->getTimezone());
|
||||
}
|
||||
|
||||
/**
|
||||
* Download a feed (no discovery).
|
||||
*
|
||||
@ -163,7 +145,6 @@ class Reader
|
||||
|
||||
$parser = new $className($content, $encoding, $url);
|
||||
$parser->setHashAlgo($this->config->getParserHashAlgo());
|
||||
$parser->setTimezone($this->config->getTimezone());
|
||||
$parser->setConfig($this->config);
|
||||
|
||||
return $parser;
|
||||
|
11
vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php
vendored
Normal file
11
vendor/fguillot/picofeed/lib/PicoFeed/Rules/.over-blog.com.php
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
<?php
|
||||
// Scraper rules for *.over-blog.com.
// 'grabber' maps a URL pattern (PCRE with "%" delimiters) to its rule set;
// "%.*%" matches any URL.
return array(
    'grabber' => array(
        '%.*%' => array(
            // Sample article URL (presumably used to test this rule)
            'test_url' => 'http://eliascarpe.over-blog.com/2015/12/re-upload-projets-d-avenir.html',
            // XPath: every <div> whose class list contains the token "ob-section"
            'body' => array(
                '//div[contains(concat(" ", normalize-space(@class), " "), " ob-section ")]',
            ),
        ),
    ),
);
|
13
vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php
vendored
Normal file
13
vendor/fguillot/picofeed/lib/PicoFeed/Rules/encyclopedie.naheulbeuk.com.php
vendored
Normal file
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
// Scraper rules for encyclopedie.naheulbeuk.com.
// 'grabber' maps a URL pattern (PCRE with "%" delimiters) to its rule set;
// "%.*%" matches any URL.
return array(
    'grabber' => array(
        '%.*%' => array(
            // Sample article URL (presumably used to test this rule)
            'test_url' => 'http://encyclopedie.naheulbeuk.com/article.php3?id_article=352',
            // XPath expressions, in order: title, over-title and text nodes
            // located inside a table cell
            'body' => array(
                '//td//h1[@class="titre-texte"]',
                '//td//div[@class="surtitre"]',
                '//td//div[@class="texte"]',
            ),
        ),
    ),
);
|
@ -1,12 +0,0 @@
|
||||
<?php
|
||||
|
||||
return array(
|
||||
'grabber' => array(
|
||||
'%/joyoftech/.*%' => array(
|
||||
'body' => array(
|
||||
'//img[@width="640"]',
|
||||
),
|
||||
'test_url' => 'http://www.geekculture.com/joyoftech/joyarchives/2235.html',
|
||||
),
|
||||
),
|
||||
);
|
@ -1,18 +0,0 @@
|
||||
<?php
|
||||
return array(
|
||||
'grabber' => array(
|
||||
'%.*%' => array(
|
||||
'test_url' => 'http://news.sciencemag.org/biology/2015/09/genetic-engineering-turns-common-plant-cancer-fighter',
|
||||
'body' => array(
|
||||
'//div[@class="content"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//h1[@class="snews-article__headline"]',
|
||||
'//div[contains(@class,"easy_social_box")]',
|
||||
'//div[@class="author-teaser"]',
|
||||
'//div[@class="article-byline"]',
|
||||
),
|
||||
),
|
||||
)
|
||||
);
|
||||
|
20
vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php
vendored
Normal file
20
vendor/fguillot/picofeed/lib/PicoFeed/Rules/rugbyrama.fr.php
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
<?php
|
||||
// Scraper rules for rugbyrama.fr.
// 'grabber' maps a URL pattern (PCRE with "%" delimiters) to its rule set;
// "%.*%" matches any URL.
return array(
    'grabber' => array(
        '%.*%' => array(
            // Sample article URL (presumably used to test this rule)
            'test_url' => 'http://www.rugbyrama.fr/rugby/top-14/2015-2016/top-14-hayman-coupe-du-monde-finale-2012-lutte.-voici-levan-chilachava-toulon_sto5283863/story.shtml',
            // XPath expression for the story container
            'body' => array(
                '//div[@class="story-simple-content"]',
            ),
            // XPath expressions for scripts, forms, styles, share widgets,
            // mobile-only blocks and the tracking image (presumably removed
            // from the selected body)
            'strip' => array(
                '//script',
                '//form',
                '//style',
                '//*[@class="share-buttons"]',
                '//*[@class="show-mobile-block"]',
                '//*[@class="hide-desktop"]',
                '//*[@id="tracking_img"]',
            ),
        ),
    ),
);
|
@ -4,8 +4,7 @@ return array(
|
||||
'%.*%' => array(
|
||||
'test_url' => 'http://www.franceculture.fr/emission-culture-eco-la-finance-aime-toujours-la-france-2016-01-08',
|
||||
'body' => array(
|
||||
'//div[@class="heading"]/*/*/div[contains(@class,"player-inline")]',
|
||||
'//article/div[@class="text-zone"]',
|
||||
'//div[@class="text-zone"]',
|
||||
),
|
||||
'strip' => array(
|
||||
'//ul[@class="tags"]',
|
||||
|
11
vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php
vendored
Normal file
11
vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.monsieur-le-chien.fr.php
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
<?php
|
||||
// Scraper rules for www.monsieur-le-chien.fr.
// 'grabber' maps a URL pattern (PCRE with "%" delimiters) to its rule set;
// "%.*%" matches any URL.
return array(
    'grabber' => array(
        '%.*%' => array(
            // Sample page URL (presumably used to test this rule)
            'test_url' => 'http://www.monsieur-le-chien.fr/index.php?planche=672',
            // XPath: images whose src starts with "i/planches/"
            'body' => array(
                '//img[starts-with(@src, "i/planches/")]',
            ),
        ),
    ),
);
|
16
vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php
vendored
Normal file
16
vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.sciencemag.org.php
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
<?php
|
||||
// Scraper rules for www.sciencemag.org.
// 'grabber' maps a URL pattern (PCRE with "%" delimiters) to its rule set;
// "%.*%" matches any URL.
return array(
    'grabber' => array(
        '%.*%' => array(
            // NOTE(review): this literal was committed truncated
            // ("...could-bright-foamy-wak$") with no closing quote, which made
            // the whole file a parse error. The URL tail below is a
            // reconstruction of the article slug - confirm it still resolves.
            'test_url' => 'http://www.sciencemag.org/news/2016/01/could-bright-foamy-wakes-ocean-ships-combat-global-warming',
            // XPath expressions for the hero row and the primary article
            'body' => array(
                '//div[@class="row--hero"]',
                '//article[contains(@class,"primary")]',
            ),
            // XPath expressions for the article header and footer
            // (presumably removed from the selected body)
            'strip' => array(
                '//header[@class="article__header"]',
                '//footer[@class="article__foot"]',
            ),
        ),
    ),
);
|
@ -2,8 +2,8 @@
|
||||
|
||||
namespace PicoFeed\Scraper;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Logging\Logger;
|
||||
use PicoFeed\Config\Config;
|
||||
|
||||
/**
|
||||
* RuleLoader class.
|
||||
@ -11,25 +11,8 @@ use PicoFeed\Config\Config;
|
||||
* @author Frederic Guillot
|
||||
* @author Bernhard Posselt
|
||||
*/
|
||||
class RuleLoader
|
||||
class RuleLoader extends Base
|
||||
{
|
||||
/**
|
||||
* Config object.
|
||||
*
|
||||
* @var \PicoFeed\Config\Config
|
||||
*/
|
||||
private $config;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param \PicoFeed\Config\Config $config Config class instance
|
||||
*/
|
||||
public function __construct(Config $config)
|
||||
{
|
||||
$this->config = $config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the rules for an URL.
|
||||
*
|
||||
@ -111,12 +94,14 @@ class RuleLoader
|
||||
*/
|
||||
public function getRulesFolders()
|
||||
{
|
||||
$folders = array(__DIR__.'/../Rules');
|
||||
$folders = array();
|
||||
|
||||
if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
|
||||
$folders[] = $this->config->getGrabberRulesFolder();
|
||||
}
|
||||
|
||||
$folders[] = __DIR__ . '/../Rules';
|
||||
|
||||
return $folders;
|
||||
}
|
||||
}
|
||||
|
@ -2,10 +2,10 @@
|
||||
|
||||
namespace PicoFeed\Scraper;
|
||||
|
||||
use PicoFeed\Base;
|
||||
use PicoFeed\Client\Client;
|
||||
use PicoFeed\Client\ClientException;
|
||||
use PicoFeed\Client\Url;
|
||||
use PicoFeed\Config\Config;
|
||||
use PicoFeed\Encoding\Encoding;
|
||||
use PicoFeed\Filter\Filter;
|
||||
use PicoFeed\Logging\Logger;
|
||||
@ -16,7 +16,7 @@ use PicoFeed\Parser\XmlParser;
|
||||
*
|
||||
* @author Frederic Guillot
|
||||
*/
|
||||
class Scraper
|
||||
class Scraper extends Base
|
||||
{
|
||||
/**
|
||||
* URL.
|
||||
@ -53,24 +53,6 @@ class Scraper
|
||||
*/
|
||||
private $enableCandidateParser = true;
|
||||
|
||||
/**
|
||||
* Config object.
|
||||
*
|
||||
* @var \PicoFeed\Config\Config
|
||||
*/
|
||||
private $config;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param \PicoFeed\Config\Config $config Config class instance
|
||||
*/
|
||||
public function __construct(Config $config)
|
||||
{
|
||||
$this->config = $config;
|
||||
Logger::setTimezone($this->config->getTimezone());
|
||||
}
|
||||
|
||||
/**
|
||||
* Disable candidates parsing.
|
||||
*
|
||||
@ -79,7 +61,6 @@ class Scraper
|
||||
public function disableCandidateParser()
|
||||
{
|
||||
$this->enableCandidateParser = false;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
@ -227,9 +208,11 @@ class Scraper
|
||||
*/
|
||||
public function execute()
|
||||
{
|
||||
$this->download();
|
||||
$this->content = '';
|
||||
$this->html = '';
|
||||
$this->encoding = '';
|
||||
|
||||
if (!$this->skipProcessing()) {
|
||||
$this->download();
|
||||
$this->prepareHtml();
|
||||
|
||||
$parser = $this->getParser();
|
||||
@ -239,34 +222,6 @@ class Scraper
|
||||
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the parsing must be skipped.
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function skipProcessing()
|
||||
{
|
||||
$handlers = array(
|
||||
'detectStreamingVideos',
|
||||
'detectPdfFiles',
|
||||
);
|
||||
|
||||
foreach ($handlers as $handler) {
|
||||
if ($this->$handler()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (empty($this->html)) {
|
||||
Logger::setMessage(get_called_class().': Raw HTML is empty');
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the parser.
|
||||
@ -287,17 +242,14 @@ class Scraper
|
||||
|
||||
if (preg_match($pattern, $sub_url)) {
|
||||
Logger::setMessage(get_called_class().': Matched url '.$sub_url);
|
||||
|
||||
return new RuleParser($this->html, $rule);
|
||||
}
|
||||
}
|
||||
} elseif ($this->enableCandidateParser) {
|
||||
Logger::setMessage(get_called_class().': Parse content with candidates');
|
||||
|
||||
return new CandidateParser($this->html);
|
||||
}
|
||||
|
||||
return;
|
||||
return new CandidateParser($this->html);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -312,30 +264,4 @@ class Scraper
|
||||
|
||||
Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the Youtube embed player and skip processing.
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function detectStreamingVideos()
|
||||
{
|
||||
if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
|
||||
$this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip processing for PDF documents.
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function detectPdfFiles()
|
||||
{
|
||||
return substr($this->url, -3) === 'pdf';
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user