Fix a bug in the feed parser and add a grabber rule for distrowatch.com

This commit is contained in:
Frédéric Guillot 2014-03-29 19:48:29 -04:00
parent 7a3e8512b6
commit 61c660d81b
3 changed files with 359 additions and 42 deletions

View File

@ -2,16 +2,69 @@
namespace PicoFeed; namespace PicoFeed;
/**
* Filter class
*
* @author Frederic Guillot
* @package parser
*/
class Filter class Filter
{ {
/**
* Filtered XML data
*
* @access private
* @var string
*/
private $data = ''; private $data = '';
/**
* Site URL (used to build absolute URL)
*
* @access private
* @var string
*/
private $url = ''; private $url = '';
/**
* Unfiltered XML data
*
* @access private
* @var string
*/
private $input = ''; private $input = '';
/**
* List of empty tags
*
* @access private
* @var array
*/
private $empty_tags = array(); private $empty_tags = array();
/**
* Flag to remove the content of a tag
*
* @access private
* @var boolean
*/
private $strip_content = false; private $strip_content = false;
/**
* Flag to remember if the current payload is a source code <pre/>
*
* @access private
* @var boolean
*/
private $is_code = false; private $is_code = false;
// Allow only these tags and attributes /**
* Tags and attribute whitelist
*
* @static
* @access public
* @var array
*/
public static $whitelist_tags = array( public static $whitelist_tags = array(
'audio' => array('controls', 'src'), 'audio' => array('controls', 'src'),
'video' => array('poster', 'controls', 'height', 'width', 'src'), 'video' => array('poster', 'controls', 'height', 'width', 'src'),
@ -53,13 +106,25 @@ class Filter
'q' => array('cite') 'q' => array('cite')
); );
// Strip content of these tags /**
* Tags blacklist, strip the content of those tags
*
* @static
* @access public
* @var array
*/
public static $blacklist_tags = array( public static $blacklist_tags = array(
'script' 'script'
); );
// Allowed URI scheme /**
// For a complete list go to http://en.wikipedia.org/wiki/URI_scheme * Scheme whitelist
* For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
*
* @static
* @access public
* @var array
*/
public static $scheme_whitelist = array( public static $scheme_whitelist = array(
'//', '//',
'data:image/png;base64,', 'data:image/png;base64,',
@ -96,14 +161,26 @@ class Filter
'tel:', 'tel:',
); );
// Attributes used for external resources /**
* Attributes used for external resources
*
* @static
* @access public
* @var array
*/
public static $media_attributes = array( public static $media_attributes = array(
'src', 'src',
'href', 'href',
'poster', 'poster',
); );
// Blacklisted resources /**
* Blacklisted resources
*
* @static
* @access public
* @var array
*/
public static $media_blacklist = array( public static $media_blacklist = array(
'feeds.feedburner.com', 'feeds.feedburner.com',
'share.feedsportal.com', 'share.feedsportal.com',
@ -129,7 +206,13 @@ class Filter
'www.gstatic.com/images/icons/gplus-64.png', 'www.gstatic.com/images/icons/gplus-64.png',
); );
// Mandatory attributes for specified tags /**
* Mandatory attributes for specified tags
*
* @static
* @access public
* @var array
*/
public static $required_attributes = array( public static $required_attributes = array(
'a' => array('href'), 'a' => array('href'),
'img' => array('src'), 'img' => array('src'),
@ -138,19 +221,37 @@ class Filter
'source' => array('src'), 'source' => array('src'),
); );
// Add attributes to specified tags /**
* Add attributes to specified tags
*
* @static
* @access public
* @var array
*/
public static $add_attributes = array( public static $add_attributes = array(
'a' => 'rel="noreferrer" target="_blank"' 'a' => 'rel="noreferrer" target="_blank"'
); );
// Attributes that must be integer /**
* Attributes that must be integer
*
* @static
* @access public
* @var array
*/
public static $integer_attributes = array( public static $integer_attributes = array(
'width', 'width',
'height', 'height',
'frameborder', 'frameborder',
); );
// Iframe source whitelist, everything else is ignored /**
* Iframe source whitelist, everything else is ignored
*
* @static
* @access public
* @var array
*/
public static $iframe_whitelist = array( public static $iframe_whitelist = array(
'//www.youtube.com', '//www.youtube.com',
'http://www.youtube.com', 'http://www.youtube.com',
@ -161,8 +262,13 @@ class Filter
'https://www.dailymotion.com', 'https://www.dailymotion.com',
); );
/**
// All inputs data must be encoded in UTF-8 before * Initialize the filter, all inputs data must be encoded in UTF-8 before
*
* @access public
* @param string $data XML content
* @param string $site_url Site URL (used to build absolute URL)
*/
public function __construct($data, $site_url) public function __construct($data, $site_url)
{ {
$this->url = $site_url; $this->url = $site_url;
@ -175,7 +281,12 @@ class Filter
$this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0)); $this->input = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
} }
/**
* Run tags/attributes filtering
*
* @access public
* @return string
*/
public function execute() public function execute()
{ {
$parser = xml_parser_create(); $parser = xml_parser_create();
@ -192,7 +303,14 @@ class Filter
return $this->data; return $this->data;
} }
/**
* Parse opening tag
*
* @access public
* @param resource $parser XML parser
* @param string $name Tag name
* @param array $attributes Tag attributes
*/
public function startTag($parser, $name, $attributes) public function startTag($parser, $name, $attributes)
{ {
$empty_tag = false; $empty_tag = false;
@ -288,7 +406,13 @@ class Filter
$this->empty_tags[] = $empty_tag; $this->empty_tags[] = $empty_tag;
} }
/**
* Parse closing tag
*
* @access public
* @param resource $parser XML parser
* @param string $name Tag name
*/
public function endTag($parser, $name) public function endTag($parser, $name)
{ {
if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) { if (! array_pop($this->empty_tags) && $this->isAllowedTag($name)) {
@ -298,7 +422,13 @@ class Filter
if ($this->is_code && $name === 'pre') $this->is_code = false; if ($this->is_code && $name === 'pre') $this->is_code = false;
} }
/**
* Parse tag content
*
* @access public
* @param resource $parser XML parser
* @param string $content Tag content
*/
public function dataTag($parser, $content) public function dataTag($parser, $content)
{ {
$content = str_replace("\xc2\xa0", ' ', $content); // Replace &nbsp; with normal space $content = str_replace("\xc2\xa0", ' ', $content); // Replace &nbsp; with normal space
@ -314,13 +444,26 @@ class Filter
} }
} }
/**
* Escape HTML content
*
* @static
* @access public
* @param string $content Content to escape
* @return string
*/
public static function escape($content) public static function escape($content)
{ {
return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false); return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
} }
/**
* Get the absolute url for a relative link
*
* @access public
* @param string $path Relative path
* @param string $url Site base url
* @return string
*/
public static function getAbsoluteUrl($path, $url) public static function getAbsoluteUrl($path, $url)
{ {
$components = parse_url($url); $components = parse_url($url);
@ -365,32 +508,63 @@ class Filter
} }
} }
/**
* Check if an url is relative
*
* @access public
* @param string $value Attribute value
* @return boolean
*/
public static function isRelativePath($value) public static function isRelativePath($value)
{ {
if (strpos($value, 'data:') === 0) return false; if (strpos($value, 'data:') === 0) return false;
return strpos($value, '://') === false && strpos($value, '//') !== 0; return strpos($value, '://') === false && strpos($value, '//') !== 0;
} }
/**
* Check if a tag is on the whitelist
*
* @access public
* @param string $name Tag name
* @return boolean
*/
public function isAllowedTag($name) public function isAllowedTag($name)
{ {
return isset(self::$whitelist_tags[$name]); return isset(self::$whitelist_tags[$name]);
} }
/**
* Check if an attribute is allowed for a given tag
*
* @access public
* @param string $tag Tag name
* @param string $attribute Attribute name
* @return boolean
*/
public function isAllowedAttribute($tag, $attribute) public function isAllowedAttribute($tag, $attribute)
{ {
return in_array($attribute, self::$whitelist_tags[$tag]); return in_array($attribute, self::$whitelist_tags[$tag]);
} }
/**
* Check if an attribute name is an external resource
*
* @access public
* @param string $attribute Attribute name
* @return boolean
*/
public function isResource($attribute) public function isResource($attribute)
{ {
return in_array($attribute, self::$media_attributes); return in_array($attribute, self::$media_attributes);
} }
/**
* Check if an iframe url is allowed
*
* @access public
* @param string $value Attribute value
* @return boolean
*/
public function isAllowedIframeResource($value) public function isAllowedIframeResource($value)
{ {
foreach (self::$iframe_whitelist as $url) { foreach (self::$iframe_whitelist as $url) {
@ -403,7 +577,13 @@ class Filter
return false; return false;
} }
/**
* Detect if the protocol is allowed or not
*
* @access public
* @param string $value Attribute value
* @return boolean
*/
public function isAllowedProtocol($value) public function isAllowedProtocol($value)
{ {
foreach (self::$scheme_whitelist as $protocol) { foreach (self::$scheme_whitelist as $protocol) {
@ -416,7 +596,13 @@ class Filter
return false; return false;
} }
/**
* Detect if an url is blacklisted
*
* @access public
* @param string $resource Attribute value (URL)
* @return boolean
*/
public function isBlacklistedMedia($resource) public function isBlacklistedMedia($resource)
{ {
foreach (self::$media_blacklist as $name) { foreach (self::$media_blacklist as $name) {
@ -429,7 +615,14 @@ class Filter
return false; return false;
} }
/**
* Detect if an image tag is a pixel tracker
*
* @access public
* @param string $tag Tag name
* @param array $attributes Tag attributes
* @return boolean
*/
public function isPixelTracker($tag, array $attributes) public function isPixelTracker($tag, array $attributes)
{ {
return $tag === 'img' && return $tag === 'img' &&
@ -437,7 +630,14 @@ class Filter
$attributes['height'] == 1 && $attributes['width'] == 1; $attributes['height'] == 1 && $attributes['width'] == 1;
} }
/**
* Check if an attribute value is integer
*
* @access public
* @param string $attribute Attribute name
* @param string $value Attribute value
* @return boolean
*/
public function validateAttributeValue($attribute, $value) public function validateAttributeValue($attribute, $value)
{ {
if (in_array($attribute, self::$integer_attributes)) { if (in_array($attribute, self::$integer_attributes)) {
@ -447,29 +647,53 @@ class Filter
return true; return true;
} }
/**
* Replace consecutive <br/> tags by a single one
*
* @access public
* @param string $data Input data
* @return string
*/
public function removeMultipleTags($data) public function removeMultipleTags($data)
{ {
// Replace <br/><br/> by only one
return preg_replace("/(<br\s*\/?>\s*)+/", "<br/>", $data); return preg_replace("/(<br\s*\/?>\s*)+/", "<br/>", $data);
} }
/**
* Remove empty tags
*
* @access public
* @param string $data Input data
* @return string
*/
public function removeEmptyTags($data) public function removeEmptyTags($data)
{ {
return preg_replace('/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', '', $data); return preg_replace('/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', '', $data);
} }
/**
* Remove the doctype and the html, head and body tags
*
* @access public
* @param string $data Input data
* @return string
*/
public function removeHTMLTags($data) public function removeHTMLTags($data)
{ {
return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data); return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data);
} }
/**
* Remove the XML tag from a document
*
* @static
* @access public
* @param string $data Input data
* @return string
*/
public static function stripXmlTag($data) public static function stripXmlTag($data)
{ {
if (strpos($data, '<?xml ') !== false) { if (strpos($data, '<?xml') !== false) {
$data = ltrim(substr($data, strpos($data, '?>') + 2)); $data = ltrim(substr($data, strpos($data, '?>') + 2));
} }
@ -486,13 +710,27 @@ class Filter
return $data; return $data;
} }
/**
* Strip meta tags from the HTML content
*
* @static
* @access public
* @param string $data Input data
* @return string
*/
public static function stripMetaTags($data) public static function stripMetaTags($data)
{ {
return preg_replace('/<meta\s.*?\/>/is', '', $data); return preg_replace('/<meta\s.*?\/>/is', '', $data);
} }
/**
* Get the encoding from a xml tag
*
* @static
* @access public
* @param string $data Input data
* @return string
*/
public static function getEncodingFromXmlTag($data) public static function getEncodingFromXmlTag($data)
{ {
$encoding = ''; $encoding = '';

View File

@ -7,13 +7,46 @@ require_once __DIR__.'/Parser.php';
require_once __DIR__.'/Client.php'; require_once __DIR__.'/Client.php';
require_once __DIR__.'/Filter.php'; require_once __DIR__.'/Filter.php';
/**
* Reader class
*
* @author Frederic Guillot
* @package parser
*/
class Reader class Reader
{ {
/**
* Feed or site URL
*
* @access private
* @var string
*/
private $url = ''; private $url = '';
/**
* Feed content
*
* @access private
* @var string
*/
private $content = ''; private $content = '';
/**
* HTTP encoding
*
* @access private
* @var string
*/
private $encoding = ''; private $encoding = '';
/**
* Constructor
*
* @access public
* @param string $content Feed content
* @param string $encoding Feed encoding
* @return Reader
*/
public function __construct($content = '', $encoding = '') public function __construct($content = '', $encoding = '')
{ {
$this->content = $content; $this->content = $content;
@ -21,7 +54,17 @@ class Reader
return $this; return $this;
} }
/**
* Download a feed
*
* @access public
* @param string $url Feed URL
* @param string $last_modified Last modified HTTP header
* @param string $etag Etag HTTP header
* @param integer $timeout Client connection timeout
* @param string $user_agent HTTP user-agent
* @return Client
*/
public function download($url, $last_modified = '', $etag = '', $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)') public function download($url, $last_modified = '', $etag = '', $timeout = 5, $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)')
{ {
if (strpos($url, 'http') !== 0) { if (strpos($url, 'http') !== 0) {
@ -44,19 +87,35 @@ class Reader
return $client; return $client;
} }
/**
* Get the downloaded content
*
* @access public
* @return string
*/
public function getContent() public function getContent()
{ {
return $this->content; return $this->content;
} }
/**
* Get the final URL
*
* @access public
* @return string
*/
public function getUrl() public function getUrl()
{ {
return $this->url; return $this->url;
} }
/**
* Get the first XML tag
*
* @access public
* @param string $data Feed content
* @return string
*/
public function getFirstTag($data) public function getFirstTag($data)
{ {
// Strip HTML comments (max of 5,000 characters long to prevent crashing) // Strip HTML comments (max of 5,000 characters long to prevent crashing)
@ -79,7 +138,13 @@ class Reader
return substr($data, $open_tag, $close_tag); return substr($data, $open_tag, $close_tag);
} }
/**
* Discover feed format and return a parser instance
*
* @access public
* @param boolean $discover Enable feed autodiscovery in HTML document
* @return mixed False on failure or Parser instance
*/
public function getParser($discover = false) public function getParser($discover = false)
{ {
$first_tag = $this->getFirstTag($this->content); $first_tag = $this->getFirstTag($this->content);
@ -140,7 +205,12 @@ class Reader
return false; return false;
} }
/**
* Discover feed url inside a HTML document and download the feed
*
* @access public
* @return boolean
*/
public function discover() public function discover()
{ {
if (! $this->content) { if (! $this->content) {

View File

@ -0,0 +1,9 @@
<?php
// Grabber rule for distrowatch.com: this file returns a plain configuration array.
// NOTE(review): the exact meaning of each key is defined by the code that loads
// this file, which is not visible here — confirm against the grabber implementation.
return array(
// Sample distrowatch.com news URL; presumably used to exercise this rule — TODO confirm.
'test_url' => 'http://distrowatch.com/?newsid=08355',
// XPath expression(s) matching <td> elements whose class attribute equals "NewsText",
// restricted by the position predicate [1]; presumably the nodes kept as article body — confirm.
'body' => array(
'//td[@class="NewsText"][1]',
),
// Intentionally empty list; presumably XPath expressions of nodes to remove from the body — confirm.
'strip' => array(
)
);