Update PicoFeed to version 0.1.27

This commit is contained in:
Frederic Guillot 2016-12-26 17:32:18 -05:00
parent 533bba270b
commit 1a85a76c5c
31 changed files with 462 additions and 88 deletions

View File

@ -15,7 +15,7 @@
"fguillot/simple-validator": "v1.0.0",
"fguillot/json-rpc": "v1.2.3",
"fguillot/picodb": "v1.0.14 ",
"fguillot/picofeed": "v0.1.25",
"fguillot/picofeed": "v0.1.27",
"pda/pheanstalk": "v3.1.0",
"ircmaxell/password-compat": "^1.0.4"
},

View File

@ -222,56 +222,6 @@
"password"
]
},
{
"name": "fguillot/picofeed",
"version": "v0.1.25",
"version_normalized": "0.1.25.0",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "2bf5bc40361e788eda6b1bd5d444630986721e69"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/2bf5bc40361e788eda6b1bd5d444630986721e69",
"reference": "2bf5bc40361e788eda6b1bd5d444630986721e69",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-iconv": "*",
"ext-libxml": "*",
"ext-simplexml": "*",
"ext-xml": "*",
"php": ">=5.3.0",
"zendframework/zendxml": "^1.0"
},
"suggest": {
"ext-curl": "PicoFeed will use cURL if present"
},
"time": "2016-08-30 01:33:18",
"bin": [
"picofeed"
],
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-0": {
"PicoFeed": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Frédéric Guillot"
}
],
"description": "Modern library to handle RSS/Atom feeds",
"homepage": "https://github.com/fguillot/picoFeed"
},
{
"name": "fguillot/json-rpc",
"version": "v1.2.3",
@ -312,5 +262,60 @@
],
"description": "Simple Json-RPC client/server library that just works",
"homepage": "https://github.com/fguillot/JsonRPC"
},
{
"name": "fguillot/picofeed",
"version": "v0.1.27",
"version_normalized": "0.1.27.0",
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
"reference": "41924841d3cd0480364ca9bcb90abe095d744457"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fguillot/picoFeed/zipball/41924841d3cd0480364ca9bcb90abe095d744457",
"reference": "41924841d3cd0480364ca9bcb90abe095d744457",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-iconv": "*",
"ext-libxml": "*",
"ext-simplexml": "*",
"ext-xml": "*",
"php": ">=5.3.0",
"zendframework/zendxml": "^1.0"
},
"require-dev": {
"phpdocumentor/reflection-docblock": "2.0.4",
"phpunit/phpunit": "4.8.26",
"symfony/yaml": "2.8.7"
},
"suggest": {
"ext-curl": "PicoFeed will use cURL if present"
},
"time": "2016-12-26 22:25:33",
"bin": [
"picofeed"
],
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-0": {
"PicoFeed": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Frédéric Guillot"
}
],
"description": "Modern library to handle RSS/Atom feeds",
"homepage": "https://github.com/fguillot/picoFeed"
}
]

View File

@ -11,6 +11,8 @@ use PicoFeed\Logging\Logger;
*/
class Curl extends Client
{
protected $nbRedirects = 0;
/**
* HTTP response body.
*
@ -136,6 +138,7 @@ class Curl extends Client
if ($this->etag) {
$headers[] = 'If-None-Match: '.$this->etag;
$headers[] = 'A-IM: feed';
}
if ($this->last_modified) {
@ -199,6 +202,9 @@ class Curl extends Client
*/
private function prepareDownloadMode($ch)
{
$this->body = '';
$this->response_headers = array();
$this->response_headers_count = 0;
$write_function = 'readBody';
$header_function = 'readHeaders';
@ -304,12 +310,11 @@ class Curl extends Client
* Handle HTTP redirects
*
* @param string $location Redirected URL
*
* @return array
* @throws MaxRedirectException
*/
private function handleRedirection($location)
{
$nb_redirects = 0;
$result = array();
$this->url = Url::resolve($location, $this->url);
$this->body = '';
@ -318,9 +323,9 @@ class Curl extends Client
$this->response_headers_count = 0;
while (true) {
++$nb_redirects;
$this->nbRedirects++;
if ($nb_redirects >= $this->max_redirects) {
if ($this->nbRedirects >= $this->max_redirects) {
throw new MaxRedirectException('Maximum number of redirections reached');
}

View File

@ -31,6 +31,7 @@ class Stream extends Client
if ($this->etag) {
$headers[] = 'If-None-Match: '.$this->etag;
$headers[] = 'A-IM: feed';
}
if ($this->last_modified) {
@ -104,6 +105,9 @@ class Stream extends Client
* Do the HTTP request.
*
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
* @throws InvalidUrlException
* @throws MaxSizeException
* @throws TimeoutException
*/
public function doRequest()
{

View File

@ -51,6 +51,7 @@ class Attribute
'td' => array(),
'tbody' => array(),
'thead' => array(),
'h1' => array(),
'h2' => array(),
'h3' => array(),
'h4' => array(),

View File

@ -42,6 +42,7 @@ class Tag extends Base
'td',
'tbody',
'thead',
'h1',
'h2',
'h3',
'h4',
@ -67,6 +68,8 @@ class Tag extends Base
'abbr',
'iframe',
'q',
'sup',
'sub',
);
/**

View File

@ -13,7 +13,7 @@ class Feed
/**
* Feed items.
*
* @var array
* @var Item[]
*/
public $items = array();

View File

@ -222,18 +222,20 @@ abstract class Parser implements ParserInterface
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $this->findItemPublishedDate($entry, $item, $feed);
    $this->findItemUpdatedDate($entry, $item, $feed);

    if ($item->getPublishedDate() === null) {
        // Use the updated date if available, otherwise use the feed date
        $item->setPublishedDate($item->getUpdatedDate() ?: $feed->getDate());
    }

    if ($item->getUpdatedDate() === null) {
        // Use the published date as fallback
        $item->setUpdatedDate($item->getPublishedDate());
    }

    // Both dates now hold the same kind of value, so a single max() picks the
    // most recent of published and updated as the item date.
    $item->setDate(max($item->getPublishedDate(), $item->getUpdatedDate()));
}
/**

View File

@ -0,0 +1,31 @@
<?php
// Content-grabber rules; the sample URL below is on bigpicture.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://bigpicture.ru/?p=556658',
'body' => array(
'//div[@class="article container"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//h1',
'//*[@class="wp-smiley"]',
'//div[@class="ipmd"]',
'//div[@class="tags"]',
'//div[@class="social-button"]',
'//div[@class="bottom-share"]',
'//div[@class="raccoonbox"]',
'//div[@class="yndadvert"]',
'//div[@class="we-recommend"]',
'//div[@class="relap-bigpicture_ru-wrapper"]',
'//div[@id="mmail"]',
'//div[@id="mobile-ads-cut"]',
'//div[@id="liquidstorm-alt-html"]',
'//div[contains(@class, "post-tags")]',
// The quoted Russian text means "See also".
'//*[contains(text(),"Смотрите также")]',
),
),
),
);

View File

@ -0,0 +1,22 @@
<?php
// Content-grabber rules; the sample URL below is on e-w-e.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://e-w-e.ru/16-prekrasnyx-izobretenij-zhenshhin/',
'body' => array(
'//div[contains(@class, "post_text")]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="views_post"]',
'//*[@class="adman_mobile"]',
'//*[@class="adman_desctop"]',
// Selects every element whose rel attribute contains "nofollow".
'//*[contains(@rel, "nofollow")]',
'//*[contains(@class, "wp-smiley")]',
// The quoted Russian text means "Source:".
'//*[contains(text(),"Источник:")]',
),
),
),
);

View File

@ -0,0 +1,27 @@
<?php
// Content-grabber rules; the sample URL below is on www.factroom.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.factroom.ru/life/20-facts-about-oil',
'body' => array(
'//div[@class="post"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//h1',
'//div[@id="yandex_ad2"]',
'//*[@class="jp-relatedposts"]',
'//div[contains(@class, "likely-desktop")]',
'//div[contains(@class, "likely-mobile")]',
// NOTE(review): selects the last <p> child of every parent element, not only
// the final paragraph of the post - confirm this breadth is intended.
'//p[last()]',
'//div[contains(@class, "facebook")]',
'//div[contains(@class, "desktop-underpost-direct")]',
'//div[contains(@class, "source-box")]',
'//div[contains(@class, "under-likely-desktop")]',
'//div[contains(@class, "mobile-down-post")]',
),
),
),
);

View File

@ -0,0 +1,19 @@
<?php
// Content-grabber rules; the sample URL below is on fototelegraf.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://fototelegraf.ru/?p=348232',
'body' => array(
'//div[@class="post-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//div[@class="imageButtonsBlock"]',
'//div[@class="adOnPostBtwImg"]',
'//div[contains(@class, "post-tags")]',
),
),
),
);

View File

@ -6,7 +6,15 @@ return array(
'test_url' => 'http://www.golem.de/news/breko-telekom-verzoegert-gezielt-den-vectoring-ausbau-1311-102974.html',
'body' => array(
'//header[@class="cluster-header"]',
'//header[@class="paged-cluster-header"]',
'//div[@class="formatted"]',
),
'next_page' => array(
'//a[@id="atoc_next"]'
),
'strip' => array(
'//header[@class="cluster-header"]/a',
'//div[@id="iqadtile4"]',
),
),
),

View File

@ -0,0 +1,19 @@
<?php
// Content-grabber rules; the sample URL below is on gorabbit.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://gorabbit.ru/article/10-oshchushcheniy-za-rulem-kogda-tolko-poluchil-voditelskie-prava',
'body' => array(
'//div[@class="detail_text"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//div[@class="socials"]',
'//div[@id="cr_1"]',
'//div[@class="related_items"]',
),
),
),
);

View File

@ -0,0 +1,23 @@
<?php
// Content-grabber rules; the sample URL below is on hotshowlife.com.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'https://hotshowlife.com/top-10-chempionov-produktov-po-szhiganiyu-kalorij/',
'body' => array(
'//div[@class="entry-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//div[@class="ads2"]',
'//div[@class="mistape_caption"]',
'//div[contains(@class, "et_social_media_hidden")]',
'//div[contains(@class, "et_social_inline_bottom")]',
'//div[contains(@class, "avatar")]',
'//ul[contains(@class, "entry-tags")]',
'//div[contains(@class, "entry-meta")]',
),
),
),
);

View File

@ -0,0 +1,19 @@
<?php
// Content-grabber rules; the sample URL below is on justcoolidea.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://justcoolidea.ru/idealnyj-sad-samodelnye-proekty-dlya-berezhlivogo-domovladeltsa/',
'body' => array(
'//section[@class="entry-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[contains(@class, "essb_links")]',
// Selects every element whose rel attribute contains "nofollow".
'//*[contains(@rel, "nofollow")]',
// NOTE(review): contains() is a substring test, so this also selects classes
// such as "heads" or "downloads" - confirm that breadth is acceptable.
'//*[contains(@class, "ads")]',
),
),
),
);

View File

@ -0,0 +1,23 @@
<?php
// Content-grabber rules; the sample URLs below are on www.legorafi.fr.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
// Unlike most rule files, 'test_url' is a list of URLs here rather than a single string.
'test_url' => array(
'http://www.legorafi.fr/2016/12/16/gorafi-magazine-bravo-vous-avez-bientot-presque-survecu-a-2016/',
'http://www.legorafi.fr/2016/12/15/manuel-valls-promet-quune-fois-elu-il-debarrassera-la-france-de-manuel-valls/',
),
'body' => array(
'//section[@id="banner_magazine"]',
'//figure[@class="main_picture"]',
'//div[@class="content"]',
),
'strip' => array(
'//figcaption',
'//div[@class="sharebox"]',
'//div[@class="tags"]',
'//section[@class="taboola_article"]',
),
),
),
);

View File

@ -0,0 +1,22 @@
<?php
// Content-grabber rules; the sample URL below is on lifehacker.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://lifehacker.ru/2016/03/03/polymail/',
'body' => array(
'//div[@class="post-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="wp-thumbnail-caption"]',
'//*[contains(@class, "social-likes")]',
'//*[@class="jp-relatedposts"]',
'//*[contains(@class, "wpappbox")]',
'//*[contains(@class, "icon__image")]',
'//div[@id="hypercomments_widget"]',
),
),
),
);

View File

@ -0,0 +1,14 @@
<?php
// Content-grabber rules; the sample URL below is on www.monandroid.com.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.monandroid.com/blog/tutoriel-avance-activer-le-stockage-fusionne-sur-android-6-marshamallow-t12.html',
'body' => array(
'//div[@class="blog-post-body"]',
),
// Intentionally empty list: no strip expressions are defined for this site.
'strip' => array(
),
),
),
);

View File

@ -3,7 +3,7 @@
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.monwindowsphone.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html',
'test_url' => 'http://www.monwindows.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html',
'body' => array(
'//div[@class="blog-post-body"]',
),

View File

@ -0,0 +1,21 @@
<?php
// Content-grabber rules; the sample URL below is on www.moya-planeta.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.moya-planeta.ru/travel/view/chto_yaponcu_horosho_russkomu_ne_ponyat_20432/',
'body' => array(
'//div[@class="full_object"]',
),
'strip' => array(
// These @class tests are exact string matches on the whole attribute value.
'//div[@class="full_object_panel object_panel"]',
'//div[@class="full_object_panel_geo object_panel"]',
'//div[@class="full_object_title"]',
'//div[@class="full_object_social_likes"]',
'//div[@class="full_object_planeta_likes"]',
'//div[@class="full_object_go2comments"]',
'//div[@id="yandex_ad_R-163191-3"]',
'//div[@class="full_object_shop_article_recommend"]',
),
),
),
);

View File

@ -0,0 +1,11 @@
<?php
// Content-grabber rules; the sample URL below is on www.nat-geo.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// This rule defines no 'strip' key.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.nat-geo.ru/fact/868093-knidos-antichnyy-naukograd/',
'body' => array(
'//div[@class="article-inner-text"]',
),
),
),
);

View File

@ -0,0 +1,24 @@
<?php
// Content-grabber rules; the sample URL below is on www.publy.ru.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.publy.ru/post/19988',
'body' => array(
'//div[@class="singlepost"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="featured"]',
'//*[@class="toc_white no_bullets"]',
'//*[@class="toc_title"]',
'//*[@class="pba"]',
'//*[@class="comments"]',
'//*[contains(@class, "g-single")]',
'//*[@class="ts-fab-wrapper"]',
'//*[contains(@class, "wp_rp_wrap")]',
),
),
),
);

View File

@ -1,9 +1,15 @@
<?php
// Content-grabber rules; the sample URL below is on www.smbc-comics.com.
// 'body' lists XPath expressions selecting the nodes extracted as content:
// the comic itself and the block that follows it.
return array(
    'grabber' => array(
        '%.*%' => array(
            'test_url' => 'http://www.smbc-comics.com/comic/the-troll-toll',
            'body' => array(
                '//div[@id="cc-comicbody"]',
                '//div[@id="aftercomic"]',
            ),
            // Intentionally empty list: no strip expressions are defined for this site.
            'strip' => array(
            ),
        ),
    ),
);

View File

@ -0,0 +1,21 @@
<?php
// Content-grabber rules; the sample URL below is on takprosto.cc.
// '%.*%' looks like a match-all URL pattern - TODO confirm how the key is matched.
// 'body' lists XPath expressions selecting the nodes extracted as article content.
// 'strip' lists XPath expressions, presumably for nodes to remove - TODO confirm.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://takprosto.cc/kokteyl-dlya-pohudeniya-v-domashnih-usloviyah/',
'body' => array(
// NOTE(review): "entry-contentt" (double "t") will not match a class named
// "entry-content" - confirm the site really uses this spelling.
'//div[contains(@class, "entry-contentt")]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="views_post"]',
'//*[contains(@class, "mailchimp-box")]',
'//*[contains(@class, "essb_links")]',
// Selects every element whose rel attribute contains "nofollow".
'//*[contains(@rel, "nofollow")]',
// NOTE(review): contains() is a substring test, so this also selects classes
// such as "heads" or "downloads" - confirm that breadth is acceptable.
'//*[contains(@class, "ads")]',
),
),
),
);

View File

@ -2,20 +2,16 @@
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.thelocal.se/20151018/swedish-moderates-tighten-focus-on-begging-ban',
'test_url' => 'http://www.thelocal.se/20161219/this-swede-can-memorize-hundreds-of-numbers-in-only-five-minutes',
'body' => array(
'//article',
'//div[@id="article-photo"]',
'//div[@id="article-description"]',
'//div[@id="article-body"]',
),
'strip' => array(
'//p[@id="mobile-signature"]',
'//article/div[4]',
'//article/ul[1]',
'//div[@class="clr"]',
'//p[@class="small"]',
'//p[@style="font-weight: bold; font-size: 14px;"]',
'//div[@class="author"]',
'//div[@class="ad_container"]',
'//div[@id="article-info-middle"]',
)
)
)
);

View File

@ -243,6 +243,16 @@ class CandidateParser implements ParserInterface
}
}
/**
 * Find link for next page of the article.
 *
 * This parser performs no next-page detection and always returns null.
 *
 * @return null
 */
public function findNextLink()
{
return null;
}
/**
* Return false if the node should not be removed.
*

View File

@ -10,4 +10,11 @@ interface ParserInterface
* @return string
*/
public function execute();
/**
 * Find link for next page of the article.
 *
 * @return string|null Link to the next page, or null when there is none
 */
public function findNextLink();
}

View File

@ -65,7 +65,6 @@ class RuleParser implements ParserInterface
public function findContent()
{
$content = '';
if (isset($this->rules['body']) && is_array($this->rules['body'])) {
foreach ($this->rules['body'] as $pattern) {
$nodes = $this->xpath->query($pattern);
@ -80,4 +79,24 @@ class RuleParser implements ParserInterface
return $content;
}
/**
 * Fetch next link based on Xpath rules.
 *
 * Each expression of the "next_page" rule is tried in order. The href of the
 * first matched element that carries a non-empty href attribute is returned.
 *
 * @return string|null The href as written in the document, or null when no link is found
 */
public function findNextLink()
{
    if (!isset($this->rules['next_page']) || !is_array($this->rules['next_page'])) {
        return null;
    }

    foreach ($this->rules['next_page'] as $pattern) {
        $nodes = $this->xpath->query($pattern);

        // query() returns false for an expression it cannot evaluate.
        if ($nodes === false) {
            continue;
        }

        foreach ($nodes as $node) {
            // An XPath expression can select attribute or text nodes, which have
            // no getAttribute() method; only elements can carry an href.
            if (!$node instanceof \DOMElement) {
                continue;
            }

            $href = trim($node->getAttribute('href'));

            // getAttribute() yields '' when the attribute is missing. Returning it
            // would hand the caller a non-null "link" that points nowhere new.
            if ($href !== '') {
                return $href;
            }
        }
    }

    return null;
}
}

View File

@ -206,19 +206,31 @@ class Scraper extends Base
/**
 * Execute the scraper.
 *
 * Downloads the current URL and extracts its content. While the parser reports
 * a next-page link and the recursion limit is not reached, the link is followed
 * and the content of that page is appended.
 *
 * @param string $pageContent    Content gathered from the pages already visited
 * @param int    $recursionDepth Number of next-page links followed so far
 */
public function execute($pageContent = '', $recursionDepth = 0)
{
    $this->html = '';
    $this->encoding = '';
    $this->content = '';

    $this->download();
    $this->prepareHtml();

    $parser = $this->getParser();

    if ($parser === null) {
        // No parser for this page: keep what the previous pages produced
        // instead of discarding it.
        $this->content = $pageContent;
        return;
    }

    $maxRecursions = $this->config->getMaxRecursions();

    if ($maxRecursions === null) {
        // Default limit on the number of next-page links followed
        $maxRecursions = 25;
    }

    $pageContent .= $parser->execute();
    $nextLink = $parser->findNextLink();

    if ($nextLink !== null && $recursionDepth < $maxRecursions) {
        // The link may be relative: resolve it against the current page first.
        $this->setUrl(Url::resolve($nextLink, $this->url));
        $this->execute($pageContent, $recursionDepth + 1);
        return;
    }

    // Last page reached (or limit hit): publish the accumulated content once.
    $this->content = $pageContent;
    Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
}

View File

@ -36,7 +36,7 @@ class Rss20Helper
* @param DOMElement $element
* @param string $tag
* @param string $value
* @return AtomHelper
* @return $this
*/
public function buildNode(DOMElement $element, $tag, $value)
{
@ -52,7 +52,7 @@ class Rss20Helper
* @access public
* @param DOMElement $element
* @param string $title
* @return AtomHelper
* @return $this
*/
public function buildTitle(DOMElement $element, $title)
{
@ -66,7 +66,7 @@ class Rss20Helper
* @param DOMElement $element
* @param DateTime $date
* @param string $type
* @return AtomHelper
* @return $this
*/
public function buildDate(DOMElement $element, DateTime $date, $type = 'pubDate')
{
@ -79,7 +79,7 @@ class Rss20Helper
* @access public
* @param DOMElement $element
* @param string $url
* @return AtomHelper
* @return $this
*/
public function buildLink(DOMElement $element, $url)
{
@ -94,7 +94,7 @@ class Rss20Helper
* @param string $tag
* @param string $authorName
* @param string $authorEmail
* @return AtomHelper
* @return $this
*/
public function buildAuthor(DOMElement $element, $tag, $authorName, $authorEmail)
{