2014-05-20 14:20:27 -04:00
|
|
|
<?php
|
|
|
|
|
2014-12-23 21:28:26 -05:00
|
|
|
namespace PicoFeed\Parser;
|
2014-05-20 14:20:27 -04:00
|
|
|
|
2016-07-28 21:14:51 -04:00
|
|
|
use DOMDocument;
|
|
|
|
use SimpleXMLElement;
|
|
|
|
use ZendXml\Exception\RuntimeException;
|
2015-12-15 19:26:15 -05:00
|
|
|
use ZendXml\Security;
|
2014-05-20 14:20:27 -04:00
|
|
|
|
|
|
|
/**
 * XML parser class.
 *
 * Collection of static helpers to load XML/HTML documents, detect their
 * encoding and run XPath queries on them.
 *
 * XML input is loaded through ZendXml\Security::scan(), which checks for
 * XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks.
 *
 * @package PicoFeed\Parser
 * @author  Frederic Guillot
 */
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false.
     *
     * Empty input is rejected up front (same guard as getDomDocument())
     * instead of being handed to the scanner.
     *
     * @static
     * @param  string $input XML content
     * @throws XmlEntityException When the scanner rejects the document
     * @return mixed  Result of the scan (see scan()), or false for empty input
     */
    public static function getSimpleXml($input)
    {
        if (empty($input)) {
            return false;
        }

        return self::scan($input);
    }

    /**
     * Get a DomDocument instance or return false.
     *
     * @static
     * @param  string $input XML content
     * @throws XmlEntityException When the scanner rejects the document
     * @return DOMDocument|boolean False for empty input, a failed scan or an empty document
     */
    public static function getDomDocument($input)
    {
        if (empty($input)) {
            return false;
        }

        $dom = self::scan($input, new DOMDocument());

        // The document is empty, there is probably some parsing errors
        if ($dom && $dom->childNodes->length === 0) {
            return false;
        }

        return $dom;
    }

    /**
     * Small wrapper around ZendXml to turn their exceptions into PicoFeed exceptions
     *
     * @static
     * @access private
     * @param  string      $input
     * @param  DOMDocument $dom    Optional document instance forwarded to the scanner
     * @throws XmlEntityException
     * @return SimpleXMLElement|DomDocument|boolean
     */
    private static function scan($input, $dom = null)
    {
        try {
            return Security::scan($input, $dom);
        } catch (RuntimeException $e) {
            // Only the message is carried over to the PicoFeed exception
            throw new XmlEntityException($e->getMessage());
        }
    }

    /**
     * Load HTML document by using a DomDocument instance.
     *
     * A DOMDocument is always returned: it is left empty when the input is
     * empty, and the return value of loadHTML() is not checked.
     *
     * @static
     * @access public
     * @param  string $input HTML content
     * @return DOMDocument
     */
    public static function getHtmlDocument($input)
    {
        $dom = new DOMDocument();

        if (empty($input)) {
            return $dom;
        }

        // Collect libxml errors internally (readable through getErrors())
        // instead of emitting PHP warnings for malformed markup.
        // NOTE(review): this global libxml setting is not restored afterwards.
        libxml_use_internal_errors(true);

        // The options argument of loadHTML() is only passed on PHP >= 5.4
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        } else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Convert a HTML document to XML.
     *
     * The markup is loaded with getHtmlDocument() and the first <body>
     * element is serialized back as XML.
     *
     * @static
     * @access public
     * @param  string $html HTML document
     * @return string
     */
    public static function htmlToXml($html)
    {
        // The XML declaration prefix is presumably there to make libxml read
        // the markup as UTF-8 -- TODO confirm.
        $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);

        // item(0) is null when there is no <body>; saveXML() then receives null
        return $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    }

    /**
     * Get XML parser errors.
     *
     * Formats every entry of libxml_get_errors() and joins them with ", ".
     *
     * @static
     * @access public
     * @return string Empty string when libxml reports no error
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {
            $errors[] = sprintf(
                'XML error: %s (Line: %d - Column: %d - Code: %d)',
                $error->message,
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }

    /**
     * Get the encoding from a xml tag.
     *
     * Only the XML declaration itself ("<?xml ... ?>") is inspected, so an
     * "encoding" attribute found later in the document is never reported.
     * Single and double quoted values are both accepted, as well as
     * whitespace around the equals sign.
     *
     * @static
     * @access public
     * @param  string $data Input data
     * @return string Lowercased encoding name, or an empty string when not found
     */
    public static function getEncodingFromXmlTag($data)
    {
        // [^>]*? keeps the search inside the declaration (it ends with "?>"),
        // \1 requires the closing quote to match the opening one.
        $pattern = '/<\?xml\s[^>]*?\bencoding\s*=\s*(["\'])([^"\']*)\1/';

        if (preg_match($pattern, (string) $data, $match) === 1) {
            return strtolower($match[2]);
        }

        return '';
    }

    /**
     * Get the charset from a meta tag.
     *
     * @static
     * @access public
     * @param  string $data Input data
     * @return string Lowercased charset, or an empty string when not found
     */
    public static function getEncodingFromMetaTag($data)
    {
        $encoding = '';

        if (preg_match('/<meta.*?charset\s*=\s*["\']?\s*([^"\'\s\/>;]+)/i', $data, $match) === 1) {
            $encoding = strtolower($match[1]);
        }

        return $encoding;
    }

    /**
     * Rewrite XPath query to use namespace-uri and local-name derived from prefix.
     *
     * Every "prefix:name" step is replaced by
     * *[namespace-uri()="URI" and local-name()="name"]. Names may contain
     * "-", "_" and "." (e.g. "itunes:new-feed-url"). The reserved "xml"
     * prefix is left untouched and a prefix missing from the mapping is
     * rewritten with an empty namespace URI.
     *
     * @static
     * @access public
     * @param  string $query XPath query
     * @param  array  $ns    Prefix to namespace URI mapping
     * @return string
     */
    public static function replaceXPathPrefixWithNamespaceURI($query, array $ns)
    {
        return preg_replace_callback(
            '/([A-Z0-9_][A-Z0-9_.-]*):([A-Z0-9_][A-Z0-9_.-]*)/iu',
            function ($matches) use ($ns) {
                list($qualifiedName, $prefix, $localName) = $matches;

                // don't try to map the special prefix XML
                if (strtolower($prefix) === 'xml') {
                    return $qualifiedName;
                }

                // Unknown prefix: use an empty URI without raising a notice
                $uri = isset($ns[$prefix]) ? $ns[$prefix] : '';

                return '*[namespace-uri()="'.$uri.'" and local-name()="'.$localName.'"]';
            },
            $query
        );
    }

    /**
     * Get the result elements of a XPath query.
     *
     * When a prefix mapping is given, the query is first rewritten with
     * replaceXPathPrefixWithNamespaceURI().
     *
     * @static
     * @access public
     * @param  SimpleXMLElement $xml   XML element
     * @param  string           $query XPath query
     * @param  array            $ns    Prefix to namespace URI mapping
     * @return SimpleXMLElement[] Empty array when the query fails
     */
    public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
    {
        if (!empty($ns)) {
            $query = static::replaceXPathPrefixWithNamespaceURI($query, $ns);
        }

        $result = $xml->xpath($query);

        // xpath() does not return an array on error; honour the documented type
        return is_array($result) ? $result : array();
    }

    /**
     * Get the first Xpath result or SimpleXMLElement value
     *
     * The value is cast to string and trimmed in both cases.
     *
     * @static
     * @access public
     * @param  mixed $value Array of results or a SimpleXMLElement
     * @return string Empty string for any other kind of value
     */
    public static function getValue($value)
    {
        $result = '';

        if (is_array($value) && count($value) > 0) {
            // reset() gives the first element whatever its key is
            $result = (string) reset($value);
        } elseif ($value instanceof SimpleXMLElement) {
            $result = (string) $value;
        }

        return trim($result);
    }
}
|