From 42b9c759631f4c6ab9ebe5f7a7db959ac518d6b3 Mon Sep 17 00:00:00 2001
From: Will White <will_white@32237.no-reply.drupal.org>
Date: Tue, 30 Mar 2010 18:54:29 +0000
Subject: [PATCH] feature request #705872 by Scott Reynolds: HTTPFetcher
 autodiscovery.

---
 CHANGELOG.txt                |   1 +
 libraries/http_request.inc   | 230 ++++++++++++++++++++++++++---------
 plugins/FeedsHTTPFetcher.inc |  19 +++
 3 files changed, 191 insertions(+), 59 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 01928ae5..4551462c 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -33,6 +33,7 @@ Feeds 6.x 1.0 xxxxx xx, 2010-xx-xx
   same importer id.
 - #740962 Fix FileFetcher Attached to Feed Node, Upload Field Not Saving File
   Path.
+- #705872 Scott Reynolds: Added HTTPFetcher autodiscovery
 
 Feeds 6.x 1.0 Alpha 12, 2010-02-23
 ----------------------------------
diff --git a/libraries/http_request.inc b/libraries/http_request.inc
index b26e71d8..4ec7c9a8 100644
--- a/libraries/http_request.inc
+++ b/libraries/http_request.inc
@@ -10,16 +10,22 @@
  */
 
 /**
- * Download RSS or Atom feeds from a given URL. If document in given URL is an
- * HTML document, function attempts to discover RSS or Atom feeds and downloads
- * them.
- *
- * @todo Debug
- * @todo Cache detected rss feeds in url.
- * @todo Use exceptions, not string or false return values.
+ * PCRE for finding the link tags in HTML; group 1 is the attribute string.
+ */
+define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
+
+/**
+ * PCRE for a tag's attributes: name in group 1, value in group 2, 3 or 4.
+ */
+define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
+
+/**
+ * Discover RSS or atom feeds at the given URL. If document in given URL is an
+ * HTML document, function attempts to discover RSS or Atom feeds.
  *
  * @return
- *  string - the downloaded data, FALSE - if the URL is not reachable
+ *  string - the discovered feed, FALSE - if the URL is not reachable or there
+ *    are no feeds.
  */
 function http_request_get_common_syndication($url, $settings = NULL) {
   if (valid_url($url, TRUE)) {
@@ -36,62 +42,26 @@ function http_request_get_common_syndication($url, $settings = NULL) {
   $download = http_request_get($url, $username, $password, $accept_invalid_cert);
 
   // Cannot get the feed, return.
-  if ($download->data == FALSE) {
+  // http_request_get() always returns 200 even if it is 304.
+  if ($download->code != 200) {
     return FALSE;
   }
 
-  // Do the autodiscovery at this level, pass back the real data.
-  // Maybe it's HTML. If it's not HTML, not worth to take a look into the
-  // downloaded string.
+  // Drop the data into a separate variable so all manipulations of the html
+  // will not affect the actual object that exists in the static cache.
+  // @see http_request_get.
   $downloaded_string = $download->data;
-  if (strpos(strtolower($downloaded_string), "<html") === FALSE) {
-    return $download;
+  // If this happens to be a feed then just return the url.
+  if (http_request_is_feed($download->headers['Content-Type'], $downloaded_string)) {
+    return $url;
   }
-  else {
-    // Ugly hack to be able to retrieve the xml:base property, no way to access
-    // xml:lang inside <feed>
-    $downloaded_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
-    // Filter out strange tags.
-    $downloaded_string_filtered = preg_replace(array('@<script[^>]*?.*?</script>@si', '@<object[^>]*?.*?</object>@si', '@<embed[^>]*?.*?</embed>@si', '@<applet[^>]*?.*?</applet>@si', '@<noframes[^>]*?.*?</noframes>@si', '@<noscript[^>]*?.*?</noscript>@si', '@<noembed[^>]*?.*?</noembed>@si'), '', $downloaded_string);
-    $downloaded_string = $downloaded_string_filtered ? $downloaded_string_filtered : $downloaded_string;
-
-    $allowed_mime = array("text/xml", "application/rss+xml", "application/atom+xml", "application/rdf+xml", "application/xml");
-
-    $matches = array();
-    // Get all the links tag
-    preg_match_all('/<link\s+(.*?)\s*\/?>/si', $downloaded_string, $matches);
-    $links = $matches[1];
-    $rss_link = FALSE;
-    foreach ($links as $link) {
-      $mime = array();
-      // Get the type attribute and check if the mime type is allowed.
-      preg_match_all('/type\s*=\s*("|\')([A-Za-z\/+]*)("|\')/si', $link, $mime);
-      if (in_array(array_pop($mime[2]), $allowed_mime)) {
-        $href = array();
-        // Get the href attribute.
-        preg_match_all('/href\s*=\s*("|\')([=#\?_:.0-9A-Za-z\/+]*)("|\')/si', $link, $href);
-        $rss_link = array_pop($href[2]);
-        if (is_string($rss_link) && strlen($rss_link) > 0 && $rss_link != $url) {
-          // Handle base url related stuff.
-          $parsed_url = parse_url($rss_link);
-          if (!isset($parsed_url['host'])) {
-            // It's relative so make it absolute.
-            $base_tag = array();
-            preg_match_all('/<base href\s*=\s*("|\')([_:.0-9A-Za-z\/+]*)("|\')/si', $link, $base_tag);
-            $base_url = array_pop($base_tag[2]);
-            if (is_string($base_url) && strlen($base_url) > 0) {
-              // Get from the HTML base tag.
-              $rss_link = $base_url . $rss_link;
-            }
-            else {
-              // Guess from the original URL.
-              $original_url = parse_url($url);
-              $rss_link = $original_url['scheme'] .'://'. $original_url['host'] . (isset($original_url['port']) ? ':' : '') . $original_url['port'] . $parsed_url['path'] .'?'. $parsed_url['query']  .'#'. $parsed_url['fragment'];
-            }
-          }
-          return http_request_get_common_syndication($rss_link, $settings);
-        }
-      }
+
+  $discovered_feeds = http_request_find_feeds($downloaded_string);
+  foreach ($discovered_feeds as $feed_url) {
+    $absolute = http_request_create_absolute_url($feed_url, $url);
+    if (!empty($absolute)) {
+      // @TODO: something more intelligent?
+      return $absolute;
     }
   }
 }
@@ -249,3 +219,145 @@ function http_request_use_curl() {
 function http_request_clear_cache($url) {
   cache_clear_all('feeds_http_download_'. md5($url), 'cache');
 }
+
+/**
+ * Returns if the provided $content_type is a feed.
+ *
+ * @param string $content_type
+ *  The Content-Type header.
+ *
+ * @param string $data
+ *  The actual data from the http request.
+ *
+ * @return boolean
+ *  Returns TRUE if this is a parsable feed.
+ */
+function http_request_is_feed($content_type, $data) {
+  // Keep only the media type: drop parameters such as "; charset=utf-8".
+  $parts = explode(';', (string) $content_type);
+  $content_type = strtolower(trim($parts[0]));
+
+  // @TODO: Sometimes the content-type can be text/html but still be a valid
+  // feed ($data is not inspected yet).
+  // XHTML is XML as well, but it is a page to search for links, not a feed.
+  if ($content_type == 'application/xhtml+xml') {
+    return FALSE;
+  }
+
+  return strpos($content_type, 'xml') !== FALSE;
+}
+
+/**
+ * Finds potential feed tags in the HTML document.
+ *
+ * @param string $html
+ *  The html string to search.
+ *
+ * @return array()
+ *  An array of href to feeds.
+ */
+function http_request_find_feeds($html) {
+  $matches = array();
+  preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches);
+  // Group 1 holds the raw attribute string of every link tag found.
+  $links = isset($matches[1]) ? $matches[1] : array();
+  $valid_links = array();
+
+  foreach ($links as $link_tag) {
+    $attributes = array();
+    $candidate = array();
+    preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
+    foreach ($attributes as $attribute) {
+      // $attribute[1] is the name. The value is in [2] (double quoted), [3]
+      // (single quoted) or [4] (unquoted); the other groups are empty or unset.
+      $value = implode('', array_slice($attribute, 2));
+      if (!empty($attribute[1]) && !empty($value)) {
+        $candidate[drupal_strtolower($attribute[1])] = decode_entities($value);
+      }
+    }
+
+    // Examine candidate to see if it is a feed. Only rel and type are lower
+    // cased for the comparison: the href must keep its case to stay valid.
+    // @TODO: could/should use http_request_is_feed ??
+    if (isset($candidate['rel']) && drupal_strtolower($candidate['rel']) == 'alternate') {
+      if (isset($candidate['href']) && isset($candidate['type']) && strpos(drupal_strtolower($candidate['type']), 'xml') !== FALSE) {
+        // All tests pass, it is a valid candidate.
+        $valid_links[] = $candidate['href'];
+      }
+    }
+  }
+
+  return $valid_links;
+}
+
+/**
+ * Create an absolute url.
+ *
+ * @param string $url
+ *  The href to transform; returned as is when already absolute.
+ *
+ * @param string $base_url
+ *  The url to be used as the base for a relative $url.
+ *
+ * @return string
+ *  The absolute url, FALSE if no valid absolute url could be built.
+ */
+function http_request_create_absolute_url($url, $base_url) {
+  $url = trim($url);
+  if (valid_url($url, TRUE)) {
+    // Valid absolute url already.
+    return $url;
+  }
+
+  // Turn relative url into absolute.
+  if (valid_url($url, FALSE)) {
+    // Array with the scheme, user, pass, host, port and path of the base.
+    $parsed_url = parse_url($base_url);
+
+    // A href starting with a slash replaces the whole base path, any other
+    // href is appended to the base path up to its last slash.
+    $segments = array();
+    if (substr($url, 0, 1) != '/' && isset($parsed_url['path'])) {
+      $directory = substr($parsed_url['path'], 0, (int) strrpos($parsed_url['path'], '/'));
+      $segments = explode('/', $directory);
+    }
+
+    // Resolve with a stack so that each '..' drops the closest remaining
+    // parent, also when several '..' follow each other.
+    $cparts = array();
+    foreach (array_merge($segments, explode('/', $url)) as $part) {
+      if ($part === '..') {
+        array_pop($cparts);
+      }
+      elseif ($part !== '' && $part !== '.') {
+        $cparts[] = $part;
+      }
+    }
+    $path = implode('/', $cparts);
+
+    // Build the prefix to the path.
+    $absolute_url = '';
+    if (isset($parsed_url['scheme'])) {
+      $absolute_url = $parsed_url['scheme'] . '://';
+    }
+    if (isset($parsed_url['user'])) {
+      $absolute_url .= $parsed_url['user'];
+      if (isset($parsed_url['pass'])) {
+        $absolute_url .= ':' . $parsed_url['pass'];
+      }
+      $absolute_url .= '@';
+    }
+    if (isset($parsed_url['host'])) {
+      // Keep an explicit port of the base url instead of dropping it.
+      $port = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
+      $absolute_url .= $parsed_url['host'] . $port . '/';
+    }
+
+    $absolute_url .= $path;
+
+    if (valid_url($absolute_url, TRUE)) {
+      return $absolute_url;
+    }
+  }
+  return FALSE;
+}
diff --git a/plugins/FeedsHTTPFetcher.inc b/plugins/FeedsHTTPFetcher.inc
index a638d32f..d55d2b81 100644
--- a/plugins/FeedsHTTPFetcher.inc
+++ b/plugins/FeedsHTTPFetcher.inc
@@ -94,6 +94,7 @@ class FeedsHTTPFetcher extends FeedsFetcher {
    */
   public function configDefaults() {
     return array(
+      'auto_detect_feeds' => FALSE,
       'use_pubsubhubbub' => FALSE,
       'designated_hub' => '',
     );
@@ -104,6 +105,12 @@ class FeedsHTTPFetcher extends FeedsFetcher {
    */
   public function configForm(&$form_state) {
     $form = array();
+    $form['auto_detect_feeds'] = array(
+      '#type' => 'checkbox',
+      '#title' => t('Auto detect feeds'),
+      '#description' => t('If the supplied URL does not point to a feed but an HTML document, attempt to extract a feed URL from the document.'),
+      '#default_value' => $this->config['auto_detect_feeds'],
+    );
     $form['use_pubsubhubbub'] = array(
       '#type' => 'checkbox',
       '#title' => t('Use PubSubHubbub'),
@@ -135,6 +142,18 @@ class FeedsHTTPFetcher extends FeedsFetcher {
     return $form;
   }
 
+  /**
+   * Override parent::sourceFormValidate() - autodiscover the feed URL.
+   */
+  public function sourceFormValidate(&$values) {
+    if (!$this->config['auto_detect_feeds']) {
+      return;
+    }
+    feeds_include_library('http_request.inc', 'http_request');
+    $feed_url = http_request_get_common_syndication($values['source']);
+    $values['source'] = $feed_url ? $feed_url : $values['source'];
+  }
+
   /**
    * Override sourceSave() - subscribe to hub.
    */
-- 
GitLab