Issue #1281496 by twistor, MegaChriz, natew: fixed prepend base url for...

Issue #1281496 by twistor, MegaChriz, natew: fixed prepend base url for relative links in entries in atom feeds.

Issue #1281496 by twistor, MegaChriz, natew: fixed prepend base url for...
df1afad1 · twistor · MegaChriz · 315b8fdc · df1afad1 · df1afad1
Commit df1afad1 authored 8 years ago by twistor Committed by MegaChriz 8 years ago
--- a/libraries/common_syndication_parser.inc
+++ b/libraries/common_syndication_parser.inc
@@ -86,11 +86,7 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
    "georss" => "http://www.georss.org/georss",
  );

-  $base = $feed_XML->xpath("@base");
-  $base = (string) array_shift($base);
-  if (!valid_url($base, TRUE)) {
-    $base = FALSE;
-  }
+  $base = _parser_common_syndication_atom10_parse_base_url($feed_XML);

  // Detect the title
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
@@ -98,20 +94,13 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
-  if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
+  if ($base && !valid_url($parsed_source['link'], TRUE) && valid_url($parsed_source['link'])) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {
-
-    $original_url = NULL;
-    $guid = !empty($news->id) ? "{$news->id}" : NULL;
-    if (valid_url($guid, TRUE)) {
-      $original_url = $guid;
-    }
-
    $georss = (array)$news->children($ns["georss"]);
    $geoname = '';
    if (isset($georss['featureName'])) {
@@ -164,13 +153,6 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
      $body .= "{$news->summary}";
    }

-    if (!empty($news->content['src'])) {
-      // some src elements in some valid atom feeds contained no urls at all
-      if (valid_url("{$news->content['src']}", TRUE)) {
-        $original_url = "{$news->content['src']}";
-      }
-    }
-
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
@@ -182,8 +164,6 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
      $original_author = "{$feed_XML->author->name}";
    }

-    $original_url = _parser_common_syndication_link($news->link);
-
    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
@@ -201,17 +181,32 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

-    $item['url'] = trim($original_url);
-    if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
-      $item['url'] = $base . $item['url'];
+    $item['guid'] = (string) $news->id;
+
+    $item['url'] = _parser_common_syndication_link($news->link);
+
+    if (!$item['url'] && !empty($news->content['src']) && valid_url($news->content['src'], TRUE)) {
+      $item['url'] = (string) $news->content['src'];
    }
-    // Fall back on URL if GUID is empty.
-    if (!empty($guid)) {
-      $item['guid'] = $guid;
+
+    if (!strlen($item['url']) && $item['guid'] && valid_url($item['guid'], TRUE)) {
+      $item['url'] = $item['guid'];
    }
-    else {
+
+    if (!valid_url($item['url'], TRUE) && valid_url($item['url'])) {
+      if ($item_base = _parser_common_syndication_atom10_parse_base_url($news)) {
+        $item['url'] = $item_base . $item['url'];
+      }
+      elseif ($base) {
+        $item['url'] = $base . $item['url'];
+      }
+    }
+
+    // Fall back on URL if GUID is empty.
+    if (!strlen($item['guid'])) {
      $item['guid'] = $item['url'];
    }
+
    $item['geolocations'] = array();
    if ($lat && $lon) {
      $item['geolocations'] = array(
@@ -226,9 +221,61 @@ function _parser_common_syndication_atom10_parse($feed_XML) {
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }
+
  return $parsed_source;
 }

+/**
+ * Finds the base URL of an Atom feed or entry element.
+ *
+ * @param SimpleXMLElement $xml
+ *   The feed or entry element whose base attribute and links are inspected.
+ *
+ * @return string|false
+ *   The base URL, ending in a slash, or FALSE if none could be determined.
+ */
+function _parser_common_syndication_atom10_parse_base_url(SimpleXMLElement $xml) {
+  $base = $xml->attributes('xml', TRUE)->base;
+  if (!$base) {
+    $base = $xml['base'];
+  }
+
+  if ($base && valid_url($base, TRUE)) {
+    return rtrim($base, '/') . '/';
+  }
+
+  // No usable base attribute: use the root of the first absolute self link.
+  foreach ($xml->xpath('*[local-name() = "link" and @rel="self" and @href]') as $self) {
+    if (valid_url($self['href'], TRUE)) {
+      return _parser_common_syndication_string_url_path((string) $self['href']);
+    }
+  }
+
+  // Otherwise use the root of the first absolute alternate link.
+  foreach ($xml->xpath('*[local-name() = "link" and @rel="alternate" and @href]') as $alternate) {
+    if (valid_url($alternate['href'], TRUE)) {
+      return _parser_common_syndication_string_url_path((string) $alternate['href']);
+    }
+  }
+
+  return FALSE;
+}
+
+/**
+ * Cuts an absolute URL off after the first slash that follows the '//'.
+ *
+ * @param string $url
+ *   The absolute URL; callers in this file only pass URLs that validated.
+ *
+ * @return string
+ *   The URL up to and including that slash, or $url . '/' if there is none.
+ */
+function _parser_common_syndication_string_url_path($url) {
+  $pos = strpos($url, '/', strpos($url, '//') + 2);
+
+  return $pos ? substr($url, 0, $pos + 1) : $url . '/';
+}
+
 /**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
@@ -586,7 +633,7 @@ function _parser_common_syndication_link($links) {
      }
    }
  }
-  return $to_link;
+  return trim($to_link);
 }

 /**

--- a/tests/common_syndication_parser.test
+++ b/tests/common_syndication_parser.test
@@ -33,6 +33,7 @@ class CommonSyndicationParserTestCase extends DrupalWebTestCase {
    $this->_testRSS2();
    $this->_testAtomGeoRSS();
    $this->_testAtomGeoRSSWithoutAuthor();
+    $this->_testAtomEntriesWithoutBaseUrl();
  }

  /**
@@ -91,6 +92,32 @@ class CommonSyndicationParserTestCase extends DrupalWebTestCase {
    $feed = common_syndication_parser_parse($string);
  }

+  /**
+   * Tests that a base URL is prepended to relative links of Atom entries.
+   *
+   * For example, the url in the following entry should be parsed as
+   * 'http://www.example.com/node/123' and not as 'node/123'.
+   * @code
+   * <entry>
+   *   <link href="node/123"/>
+   * </entry>
+   * @endcode
+   */
+  protected function _testAtomEntriesWithoutBaseUrl() {
+    $string = $this->readFeed('entries-without-base-url.atom');
+    $feed = common_syndication_parser_parse($string);
+
+    // Compare each parsed item url, by position, to the expected absolute url.
+    $expected = array(
+      'http://www.example.com/node/1281496#comment-11669575',
+      'http://www.example.com/node/1281496#comment-10080648',
+      'http://www.example.com/node/1281496#comment-10062564',
+    );
+    foreach ($feed['items'] as $key => $item) {
+      $this->assertEqual($expected[$key], $item['url']);
+    }
+  }
+
  /**
   * Helper to read a feed.
   */

--- a/tests/feeds/entries-without-base-url.atom
+++ b/tests/feeds/entries-without-base-url.atom
+<?xml version="1.0" encoding="utf-8" ?>
+<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
+  <title>Feeds issue #1281496</title>
+  <link rel="self" type="application/atom+xml" href="http://www.example.com/feeds/entries-without-base-url.atom"/>
+  <link rel="alternate" type="text/html" href="http://www.example.com/node/1281496"/>
+  <updated>2016-10-29T20:35:56Z</updated>
+  <author>
+    <name>dcotruta</name>
+  </author>
+  <id>http://www.example.com/node/1281496</id>
+
+  <entry>
+    <title>Re-spin the patch</title>
+    <link rel="alternate" type="text/html" href="node/1281496#comment-11669575" />
+    <id>comment-11669575</id>
+    <updated>2016-09-28T17:08:00Z</updated>
+    <summary>Re-spin the patch for feeds 7.x-2.0-beta2.</summary>
+    <author>
+      <name>natew</name>
+    </author>
+  </entry>
+
+  <entry>
+    <title>Thanks twistor, I just tried the latest patch</title>
+    <link rel="alternate" type="text/html" href="node/1281496#comment-10080648" />
+    <id>comment-10080648</id>
+    <updated>2015-07-02T19:33:00Z</updated>
+    <summary>Thanks twistor, I just tried the latest patch and this works for me. The feed items get imported and the proper url is set.</summary>
+    <author>
+      <name>natew</name>
+    </author>
+  </entry>
+
+  <entry>
+    <title>Probably missed a string cast somewhere.</title>
+    <link rel="alternate" type="text/html" href="node/1281496#comment-10062564" />
+    <id>comment-10062564</id>
+    <updated>2015-06-26T19:52:00Z</updated>
+    <summary>Probably missed a string cast somewhere.</summary>
+    <author>
+      <name>twistor</name>
+    </author>
+  </entry>
+
+</feed>