Skip to content
Snippets Groups Projects
Commit df1afad1 authored by twistor's avatar twistor Committed by MegaChriz
Browse files

Issue #1281496 by twistor, MegaChriz, natew: fixed prepend base url for...

Issue #1281496 by twistor, MegaChriz, natew: fixed prepend base url for relative links in entries in atom feeds.
parent 315b8fdc
No related branches found
No related tags found
No related merge requests found
......@@ -86,11 +86,7 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
"georss" => "http://www.georss.org/georss",
);
$base = $feed_XML->xpath("@base");
$base = (string) array_shift($base);
if (!valid_url($base, TRUE)) {
$base = FALSE;
}
$base = _parser_common_syndication_atom10_parse_base_url($feed_XML);
// Detect the title
$parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
......@@ -98,20 +94,13 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
$parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";
$parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
if ($base && !valid_url($parsed_source['link'], TRUE) && valid_url($parsed_source['link'])) {
$parsed_source['link'] = $base . $parsed_source['link'];
}
$parsed_source['items'] = array();
foreach ($feed_XML->entry as $news) {
$original_url = NULL;
$guid = !empty($news->id) ? "{$news->id}" : NULL;
if (valid_url($guid, TRUE)) {
$original_url = $guid;
}
$georss = (array)$news->children($ns["georss"]);
$geoname = '';
if (isset($georss['featureName'])) {
......@@ -164,13 +153,6 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
$body .= "{$news->summary}";
}
if (!empty($news->content['src'])) {
// some src elements in some valid atom feeds contained no urls at all
if (valid_url("{$news->content['src']}", TRUE)) {
$original_url = "{$news->content['src']}";
}
}
$original_author = '';
if (!empty($news->source->author->name)) {
$original_author = "{$news->source->author->name}";
......@@ -182,8 +164,6 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
$original_author = "{$feed_XML->author->name}";
}
$original_url = _parser_common_syndication_link($news->link);
$item = array();
$item['title'] = _parser_common_syndication_title($title, $body);
$item['description'] = $body;
......@@ -201,17 +181,32 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
$item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
}
$item['url'] = trim($original_url);
if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
$item['url'] = $base . $item['url'];
$item['guid'] = (string) $news->id;
$item['url'] = _parser_common_syndication_link($news->link);
if (!$item['url'] && !empty($news->content['src']) && valid_url($news->content['src'], TRUE)) {
$item['url'] = (string) $news->content['src'];
}
// Fall back on URL if GUID is empty.
if (!empty($guid)) {
$item['guid'] = $guid;
if (!strlen($item['url']) && $item['guid'] && valid_url($item['guid'], TRUE)) {
$item['url'] = $item['guid'];
}
else {
if (!valid_url($item['url'], TRUE) && valid_url($item['url'])) {
if ($item_base = _parser_common_syndication_atom10_parse_base_url($news)) {
$item['url'] = $item_base . $item['url'];
}
elseif ($base) {
$item['url'] = $base . $item['url'];
}
}
// Fall back on URL if GUID is empty.
if (!strlen($item['guid'])) {
$item['guid'] = $item['url'];
}
$item['geolocations'] = array();
if ($lat && $lon) {
$item['geolocations'] = array(
......@@ -226,9 +221,61 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
$item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
$parsed_source['items'][] = $item;
}
return $parsed_source;
}
/**
 * Finds the base URL of an Atom document or of one of its elements.
 *
 * Candidates are tried in this order:
 * - the xml:base attribute of the element or, when that is absent, an
 *   unprefixed "base" attribute, if the value is a valid absolute URL;
 * - the first child link with rel="self" whose href is a valid absolute URL,
 *   reduced by _parser_common_syndication_string_url_path();
 * - the first child link with rel="alternate" whose href is a valid absolute
 *   URL, reduced the same way.
 *
 * @param SimpleXMLElement $xml
 *   The XML element to inspect, for example the feed or a single entry.
 *
 * @return string|false
 *   The base URL, or FALSE if none of the candidates is a valid absolute URL.
 *   A base taken from a base attribute always ends in exactly one slash.
 */
function _parser_common_syndication_atom10_parse_base_url(SimpleXMLElement $xml) {
  $base = $xml->attributes('xml', TRUE)->base;
  if (!$base) {
    $base = $xml['base'];
  }
  // Convert once to a plain string instead of relying on implicit
  // SimpleXMLElement-to-string conversion in every call below.
  $base = (string) $base;
  if ($base !== '' && valid_url($base, TRUE)) {
    return rtrim($base, '/') . '/';
  }

  // Try to build a base from the self link first, then the alternate link.
  foreach (array('self', 'alternate') as $rel) {
    $links = $xml->xpath('*[local-name() = "link" and @rel="' . $rel . '" and @href]');
    // xpath() returns FALSE (or NULL) on error, which must not be iterated.
    if (!is_array($links)) {
      continue;
    }
    foreach ($links as $link) {
      $href = (string) $link['href'];
      if (valid_url($href, TRUE)) {
        return _parser_common_syndication_string_url_path($href);
      }
    }
  }

  return FALSE;
}
/**
 * Removes the path, query and fragment parts of an absolute URL.
 *
 * Everything up to the end of the authority component (the part after "//")
 * is kept and a single trailing slash is appended. For example,
 * 'http://www.example.com/feeds/a.atom' and 'http://www.example.com?x=a/b'
 * both become 'http://www.example.com/'.
 *
 * @param string $url
 *   The absolute URL.
 *
 * @return string
 *   The scheme and authority of the URL, followed by a slash.
 */
function _parser_common_syndication_string_url_path($url) {
  $url = (string) $url;

  // The authority starts right after "//". If there is none, scan from the
  // start of the string rather than from an arbitrary offset.
  $marker = strpos($url, '//');
  $start = $marker === FALSE ? 0 : $marker + 2;

  // The authority ends at the first path, query or fragment delimiter, not
  // only at "/": a slash inside a query string or fragment is not a path.
  $length = strcspn($url, '/?#', $start);

  return substr($url, 0, $start + $length) . '/';
}
/**
* Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
*
......@@ -586,7 +633,7 @@ function _parser_common_syndication_link($links) {
}
}
}
return $to_link;
return trim($to_link);
}
/**
......
......@@ -33,6 +33,7 @@ class CommonSyndicationParserTestCase extends DrupalWebTestCase {
$this->_testRSS2();
$this->_testAtomGeoRSS();
$this->_testAtomGeoRSSWithoutAuthor();
$this->_testAtomEntriesWithoutBaseUrl();
}
/**
......@@ -91,6 +92,32 @@ class CommonSyndicationParserTestCase extends DrupalWebTestCase {
$feed = common_syndication_parser_parse($string);
}
/**
 * Tests if the base url is prepended for entries without base url.
 *
 * For example, the url in the following entry should be parsed as
 * 'http://www.example.com/node/123' and not as 'node/123'.
 * @code
 * <entry>
 * <link href="node/123"/>
 * </entry>
 * @endcode
 */
protected function _testAtomEntriesWithoutBaseUrl() {
  $string = $this->readFeed('entries-without-base-url.atom');
  $feed = common_syndication_parser_parse($string);
  $items = !empty($feed['items']) ? $feed['items'] : array();

  // The urls every item should end up with, in feed order.
  $expected = array(
    'http://www.example.com/node/1281496#comment-11669575',
    'http://www.example.com/node/1281496#comment-10080648',
    'http://www.example.com/node/1281496#comment-10062564',
  );

  // Check the number of items first, so the test cannot pass vacuously when
  // the parser returns no items or fewer items than expected.
  $this->assertEqual(count($expected), count($items));

  // Assert that all items got the base url assigned. Iterating over the
  // expected urls makes a missing item fail instead of being skipped.
  foreach ($expected as $key => $url) {
    $actual = isset($items[$key]['url']) ? $items[$key]['url'] : NULL;
    $this->assertEqual($url, $actual);
  }
}
/**
* Helper to read a feed.
*/
......
<?xml version="1.0" encoding="utf-8" ?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
<title>Feeds issue #1281496</title>
<link rel="self" type="application/atom+xml" href="http://www.example.com/feeds/entries-without-base-url.atom"/>
<link rel="alternate" type="text/html" href="http://www.example.com/node/1281496"/>
<updated>2016-10-29T20:35:56Z</updated>
<author>
<name>dcotruta</name>
</author>
<id>http://www.example.com/node/1281496</id>
<entry>
<title>Re-spin the patch</title>
<link rel="alternate" type="text/html" href="node/1281496#comment-11669575" />
<id>comment-11669575</id>
<updated>2016-09-28T17:08:00Z</updated>
<summary>Re-spin the patch for feeds 7.x-2.0-beta2.</summary>
<author>
<name>natew</name>
</author>
</entry>
<entry>
<title>Thanks twistor, I just tried the latest patch</title>
<link rel="alternate" type="text/html" href="node/1281496#comment-10080648" />
<id>comment-10080648</id>
<updated>2015-07-02T19:33:00Z</updated>
<summary>Thanks twistor, I just tried the latest patch and this works for me. The feed items get imported and the proper url is set.</summary>
<author>
<name>natew</name>
</author>
</entry>
<entry>
<title>Probably missed a string cast somewhere.</title>
<link rel="alternate" type="text/html" href="node/1281496#comment-10062564" />
<id>comment-10062564</id>
<updated>2015-06-26T19:52:00Z</updated>
<summary>Probably missed a string cast somewhere.</summary>
<author>
<name>twistor</name>
</author>
</entry>
</feed>
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment