diff --git a/libraries/common_syndication_parser.inc b/libraries/common_syndication_parser.inc index 47b7850004f9761ccd98252babc92639b4337795..d2296eb34c44c08f3416c63ee689290aab79894d 100644 --- a/libraries/common_syndication_parser.inc +++ b/libraries/common_syndication_parser.inc @@ -86,11 +86,7 @@ function _parser_common_syndication_atom10_parse($feed_XML) { "georss" => "http://www.georss.org/georss", ); - $base = $feed_XML->xpath("@base"); - $base = (string) array_shift($base); - if (!valid_url($base, TRUE)) { - $base = FALSE; - } + $base = _parser_common_syndication_atom10_parse_base_url($feed_XML); // Detect the title $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : ""; @@ -98,20 +94,13 @@ function _parser_common_syndication_atom10_parse($feed_XML) { $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : ""; $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link); - if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) { + if ($base && !valid_url($parsed_source['link'], TRUE) && valid_url($parsed_source['link'])) { $parsed_source['link'] = $base . $parsed_source['link']; } $parsed_source['items'] = array(); foreach ($feed_XML->entry as $news) { - - $original_url = NULL; - $guid = !empty($news->id) ? "{$news->id}" : NULL; - if (valid_url($guid, TRUE)) { - $original_url = $guid; - } - $georss = (array)$news->children($ns["georss"]); $geoname = ''; if (isset($georss['featureName'])) { @@ -164,13 +153,6 @@ function _parser_common_syndication_atom10_parse($feed_XML) { $body .= "{$news->summary}"; } - if (!empty($news->content['src'])) { - // some src elements in some valid atom feeds contained no urls at all - if (valid_url("{$news->content['src']}", TRUE)) { - $original_url = "{$news->content['src']}"; - } - } - $original_author = ''; if (!empty($news->source->author->name)) { $original_author = "{$news->source->author->name}"; @@ -182,8 +164,6 @@ function _parser_common_syndication_atom10_parse($feed_XML) { $original_author = "{$feed_XML->author->name}"; } - $original_url = _parser_common_syndication_link($news->link); - $item = array(); $item['title'] = _parser_common_syndication_title($title, $body); $item['description'] = $body; @@ -201,17 +181,32 @@ function _parser_common_syndication_atom10_parse($feed_XML) { $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}"); } - $item['url'] = trim($original_url); - if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) { - $item['url'] = $base . $item['url']; + $item['guid'] = (string) $news->id; + + $item['url'] = _parser_common_syndication_link($news->link); + + if (!$item['url'] && !empty($news->content['src']) && valid_url($news->content['src'], TRUE)) { + $item['url'] = (string) $news->content['src']; } - // Fall back on URL if GUID is empty. - if (!empty($guid)) { - $item['guid'] = $guid; + + if (!strlen($item['url']) && $item['guid'] && valid_url($item['guid'], TRUE)) { + $item['url'] = $item['guid']; } - else { + + if (!valid_url($item['url'], TRUE) && valid_url($item['url'])) { + if ($item_base = _parser_common_syndication_atom10_parse_base_url($news)) { + $item['url'] = $item_base . $item['url']; + } + elseif ($base) { + $item['url'] = $base . $item['url']; + } + } + + // Fall back on URL if GUID is empty. + if (!strlen($item['guid'])) { $item['guid'] = $item['url']; } + $item['geolocations'] = array(); if ($lat && $lon) { $item['geolocations'] = array( @@ -226,9 +221,61 @@ function _parser_common_syndication_atom10_parse($feed_XML) { $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array(); $parsed_source['items'][] = $item; } + return $parsed_source; } +/** + * Finds the base URL of an Atom document. + * + * @param SimpleXMLElement $xml + * The XML document. + * + * @return string|false + * Returns the base URL or false on failure. + */ +function _parser_common_syndication_atom10_parse_base_url(SimpleXMLElement $xml) { + $base = $xml->attributes('xml', TRUE)->base; + if (!$base) { + $base = $xml['base']; + } + + if ($base && valid_url($base, TRUE)) { + return rtrim($base, '/') . '/'; + } + + // Try to build a base from the self link. + foreach ($xml->xpath('*[local-name() = "link" and @rel="self" and @href]') as $self) { + if (valid_url($self['href'], TRUE)) { + return _parser_common_syndication_string_url_path((string) $self['href']); + } + } + + // Try to build a base from the alternate link. + foreach ($xml->xpath('*[local-name() = "link" and @rel="alternate" and @href]') as $alternate) { + if (valid_url($alternate['href'], TRUE)) { + return _parser_common_syndication_string_url_path((string) $alternate['href']); + } + } + + return FALSE; +} + +/** + * Removes the path parts of an absolute URL. + * + * @param string $url + * The absolute URL. + * + * @return string + * The absolute URL with the path stripped. + */ +function _parser_common_syndication_string_url_path($url) { + $pos = strpos($url, '/', strpos($url, '//') + 2); + + return $pos ? substr($url, 0, $pos + 1) : $url . '/'; +} + /** * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format. * @@ -586,7 +633,7 @@ function _parser_common_syndication_link($links) { } } } - return $to_link; + return trim($to_link); } /** diff --git a/tests/common_syndication_parser.test b/tests/common_syndication_parser.test index d8eabb4cad5237f518a6004aed76e36a50975b0e..d68ca4b522f97a9f5f89e69b57778cb398051a81 100644 --- a/tests/common_syndication_parser.test +++ b/tests/common_syndication_parser.test @@ -33,6 +33,7 @@ class CommonSyndicationParserTestCase extends DrupalWebTestCase { $this->_testRSS2(); $this->_testAtomGeoRSS(); $this->_testAtomGeoRSSWithoutAuthor(); + $this->_testAtomEntriesWithoutBaseUrl(); } /** @@ -91,6 +92,32 @@ class CommonSyndicationParserTestCase extends DrupalWebTestCase { $feed = common_syndication_parser_parse($string); } + /** + * Tests if the base url is prepended for entries without base url. + * + * For example, the url in the following entry should be parsed as + * 'http://www.example.com/node/123' and not as 'node/123'. + * @code + * <entry> + * <link href="node/123"/> + * </entry> + * @endcode + */ + protected function _testAtomEntriesWithoutBaseUrl() { + $string = $this->readFeed('entries-without-base-url.atom'); + $feed = common_syndication_parser_parse($string); + + // Assert that all items got the base url assigned. + $expected = array( + 'http://www.example.com/node/1281496#comment-11669575', + 'http://www.example.com/node/1281496#comment-10080648', + 'http://www.example.com/node/1281496#comment-10062564', + ); + foreach ($feed['items'] as $key => $item) { + $this->assertEqual($expected[$key], $item['url']); + } + } + /** * Helper to read a feed. */ diff --git a/tests/feeds/entries-without-base-url.atom b/tests/feeds/entries-without-base-url.atom new file mode 100644 index 0000000000000000000000000000000000000000..043c78b6a8dc15e27dee1f7d484fe9d36eb9bf1d --- /dev/null +++ b/tests/feeds/entries-without-base-url.atom @@ -0,0 +1,45 @@ +<?xml version="1.0" encoding="utf-8" ?> +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"> + <title>Feeds issue #1281496</title> + <link rel="self" type="application/atom+xml" href="http://www.example.com/feeds/entries-without-base-url.atom"/> + <link rel="alternate" type="text/html" href="http://www.example.com/node/1281496"/> + <updated>2016-10-29T20:35:56Z</updated> + <author> + <name>dcotruta</name> + </author> + <id>http://www.example.com/node/1281496</id> + + <entry> + <title>Re-spin the patch</title> + <link rel="alternate" type="text/html" href="node/1281496#comment-11669575" /> + <id>comment-11669575</id> + <updated>2016-09-28T17:08:00Z</updated> + <summary>Re-spin the patch for feeds 7.x-2.0-beta2.</summary> + <author> + <name>natew</name> + </author> + </entry> + + <entry> + <title>Thanks twistor, I just tried the latest patch</title> + <link rel="alternate" type="text/html" href="node/1281496#comment-10080648" /> + <id>comment-10080648</id> + <updated>2015-07-02T19:33:00Z</updated> + <summary>Thanks twistor, I just tried the latest patch and this works for me. The feed items get imported and the proper url is set.</summary> + <author> + <name>natew</name> + </author> + </entry> + + <entry> + <title>Probably missed a string cast somewhere.</title> + <link rel="alternate" type="text/html" href="node/1281496#comment-10062564" /> + <id>comment-10062564</id> + <updated>2015-06-26T19:52:00Z</updated> + <summary>Probably missed a string cast somewhere.</summary> + <author> + <name>twistor</name> + </author> + </entry> + +</feed>