From 42b9c759631f4c6ab9ebe5f7a7db959ac518d6b3 Mon Sep 17 00:00:00 2001
From: Will White <will_white@32237.no-reply.drupal.org>
Date: Tue, 30 Mar 2010 18:54:29 +0000
Subject: [PATCH] feature request #705872 by Scott Reynolds: HTTPFetcher autodiscovery.

---
 CHANGELOG.txt                |   1 +
 libraries/http_request.inc   | 230 ++++++++++++++++++++++++++---------
 plugins/FeedsHTTPFetcher.inc |  19 +++
 3 files changed, 191 insertions(+), 59 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 01928ae5..4551462c 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -33,6 +33,7 @@ Feeds 6.x 1.0 xxxxx xx, 2010-xx-xx
   same importer id.
 - #740962 Fix FileFetcher Attached to Feed Node, Upload Field Not Saving File
   Path.
+- #705872 Scott Reynolds: Added HTTPFetcher autodiscovery
 
 Feeds 6.x 1.0 Alpha 12, 2010-02-23
 ----------------------------------
diff --git a/libraries/http_request.inc b/libraries/http_request.inc
index b26e71d8..4ec7c9a8 100644
--- a/libraries/http_request.inc
+++ b/libraries/http_request.inc
@@ -10,16 +10,22 @@
  */
 
 /**
- * Download RSS or Atom feeds from a given URL. If document in given URL is an
- * HTML document, function attempts to discover RSS or Atom feeds and downloads
- * them.
- *
- * @todo Debug
- * @todo Cache detected rss feeds in url.
- * @todo Use exceptions, not string or false return values.
+ * PCRE for finding the link tags in html.
+ */
+define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
+
+/**
+ * PCRE for matching all the attributes in a tag.
+ */
+define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
+
+/**
+ * Discover RSS or atom feeds at the given URL. If document in given URL is an
+ * HTML document, function attempts to discover RSS or Atom feeds.
  *
  * @return
- *   string - the downloaded data, FALSE - if the URL is not reachable
+ *   string - the discovered feed, FALSE - if the URL is not reachable or there
+ *   are no feeds.
  */
 function http_request_get_common_syndication($url, $settings = NULL) {
   if (valid_url($url, TRUE)) {
@@ -36,62 +42,26 @@ function http_request_get_common_syndication($url, $settings = NULL) {
     $download = http_request_get($url, $username, $password, $accept_invalid_cert);
 
     // Cannot get the feed, return.
-    if ($download->data == FALSE) {
+    // http_request_get() always returns 200 even if it's 304.
+    if ($download->code != 200) {
       return FALSE;
     }
 
-    // Do the autodiscovery at this level, pass back the real data.
-    // Maybe it's HTML. If it's not HTML, not worth to take a look into the
-    // downloaded string.
+    // Drop the data into a separate variable so all manipulations of the html
+    // will not affect the actual object that exists in the static cache.
+    // @see http_request_get.
     $downloaded_string = $download->data;
-    if (strpos(strtolower($downloaded_string), "<html") === FALSE) {
-      return $download;
+    // If this happens to be a feed then just return the url.
+    if (http_request_is_feed($download->headers['Content-Type'], $downloaded_string)) {
+      return $url;
     }
-    else {
-      // Ugly hack to be able to retrieve the xml:base property, no way to access
-      // xml:lang inside <feed>
-      $downloaded_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
-      // Filter out strange tags.
-      $downloaded_string_filtered = preg_replace(array('@<script[^>]*?.*?</script>@si', '@<object[^>]*?.*?</object>@si', '@<embed[^>]*?.*?</embed>@si', '@<applet[^>]*?.*?</applet>@si', '@<noframes[^>]*?.*?</noframes>@si', '@<noscript[^>]*?.*?</noscript>@si', '@<noembed[^>]*?.*?</noembed>@si'), '', $downloaded_string);
-      $downloaded_string = $downloaded_string_filtered ? $downloaded_string_filtered : $downloaded_string;
-
-      $allowed_mime = array("text/xml", "application/rss+xml", "application/atom+xml", "application/rdf+xml", "application/xml");
-
-      $matches = array();
-      // Get all the links tag
-      preg_match_all('/<link\s+(.*?)\s*\/?>/si', $downloaded_string, $matches);
-      $links = $matches[1];
-      $rss_link = FALSE;
-      foreach ($links as $link) {
-        $mime = array();
-        // Get the type attribute and check if the mime type is allowed.
-        preg_match_all('/type\s*=\s*("|\')([A-Za-z\/+]*)("|\')/si', $link, $mime);
-        if (in_array(array_pop($mime[2]), $allowed_mime)) {
-          $href = array();
-          // Get the href attribute.
-          preg_match_all('/href\s*=\s*("|\')([=#\?_:.0-9A-Za-z\/+]*)("|\')/si', $link, $href);
-          $rss_link = array_pop($href[2]);
-          if (is_string($rss_link) && strlen($rss_link) > 0 && $rss_link != $url) {
-            // Handle base url related stuff.
-            $parsed_url = parse_url($rss_link);
-            if (!isset($parsed_url['host'])) {
-              // It's relative so make it absolute.
-              $base_tag = array();
-              preg_match_all('/<base href\s*=\s*("|\')([_:.0-9A-Za-z\/+]*)("|\')/si', $link, $base_tag);
-              $base_url = array_pop($base_tag[2]);
-              if (is_string($base_url) && strlen($base_url) > 0) {
-                // Get from the HTML base tag.
-                $rss_link = $base_url . $rss_link;
-              }
-              else {
-                // Guess from the original URL.
-                $original_url = parse_url($url);
-                $rss_link = $original_url['scheme'] .'://'. $original_url['host'] . (isset($original_url['port']) ? ':' : '') . $original_url['port'] . $parsed_url['path'] .'?'. $parsed_url['query'] .'#'. $parsed_url['fragment'];
-              }
-            }
-            return http_request_get_common_syndication($rss_link, $settings);
-          }
-        }
+
+    $discovered_feeds = http_request_find_feeds($downloaded_string);
+    foreach ($discovered_feeds as $feed_url) {
+      $absolute = http_request_create_absolute_url($feed_url, $url);
+      if (!empty($absolute)) {
+        // @TODO: something more intelligent?
+        return $absolute;
       }
     }
   }
@@ -249,3 +219,145 @@ function http_request_use_curl() {
 function http_request_clear_cache($url) {
   cache_clear_all('feeds_http_download_'. md5($url), 'cache');
 }
+
+/**
+ * Returns if the provided $content_type is a feed.
+ *
+ * @param string $content_type
+ *   The Content-Type header.
+ *
+ * @param string $data
+ *   The actual data from the http request (currently unused).
+ *
+ * @return boolean
+ *   Returns TRUE if this is a parsable feed.
+ */
+function http_request_is_feed($content_type, $data) {
+  $pos = strpos($content_type, ';');
+  if ($pos !== FALSE) {
+    $content_type = substr($content_type, 0, $pos);
+  }
+  $content_type = strtolower($content_type);
+  if (strpos($content_type, 'xml') !== FALSE) {
+    return TRUE;
+  }
+
+  // @TODO: Sometimes the content-type can be text/html but still be a valid
+  // feed.
+  return FALSE;
+}
+
+/**
+ * Finds potential feed tags in the HTML document.
+ *
+ * @param string $html
+ *   The html string to search.
+ *
+ * @return array()
+ *   An array of href to feeds, with the href values kept verbatim.
+ */
+function http_request_find_feeds($html) {
+  $matches = array();
+  preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches);
+  $links = $matches[1];
+  $valid_links = array();
+
+  // Build up all the links information.
+  foreach ($links as $link_tag) {
+    $attributes = array();
+    $candidate = array();
+
+    preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
+    foreach ($attributes as $attribute) {
+      // attribute[1] is the key; the value lands in attribute[2], [3] or [4]
+      // for double quoted, single quoted or unquoted values respectively.
+      $value = implode('', array_slice($attribute, 2));
+      if (!empty($attribute[1]) && $value !== '') {
+        $candidate[drupal_strtolower($attribute[1])] = decode_entities($value);
+      }
+    }
+
+    // Examine candidate to see if it's a feed (href is kept case-sensitive).
+    // @TODO: could/should use http_request_is_feed ??
+    if (isset($candidate['rel']) && drupal_strtolower($candidate['rel']) == 'alternate') {
+      if (isset($candidate['href']) && isset($candidate['type']) && strpos(drupal_strtolower($candidate['type']), 'xml') !== FALSE) {
+        // All tests pass, it's a valid candidate.
+        $valid_links[] = $candidate['href'];
+      }
+    }
+  }
+
+  return $valid_links;
+}
+
+/**
+ * Create an absolute url.
+ *
+ * @param string $url
+ *   The href to transform.
+ *
+ * @param $base_url
+ *   The url to be used as the base for a relative $url.
+ *
+ * @return string
+ *   The absolute url, or FALSE if a valid one could not be built.
+ */
+function http_request_create_absolute_url($url, $base_url) {
+  $url = trim($url);
+  if (valid_url($url, TRUE)) {
+    // Valid absolute url already.
+    return $url;
+  }
+
+  // Turn relative url into absolute.
+  if (valid_url($url, FALSE)) {
+    // parse_url() returns an array: scheme, host, port, user, pass, path.
+    $parsed_url = parse_url($base_url);
+
+    $path = isset($parsed_url['path']) ? substr($parsed_url['path'], 0, (int) strrpos($parsed_url['path'], '/')) : '';
+
+    // Adding to the existing path.
+    if ($url[0] == '/') {
+      $cparts = array_filter(explode("/", $url));
+    }
+    else {
+      // Backtracking from the existing path.
+      $cparts = array();
+      $parts = array_merge(array_filter(explode("/", $path)), array_filter(explode("/", $url)));
+      foreach ($parts as $part) {
+        if ($part == '..') {
+          // Step up one directory; popping an empty stack stays at the root.
+          array_pop($cparts);
+        }
+        elseif ($part != '.') {
+          $cparts[] = $part;
+        }
+      }
+    }
+    $path = implode("/", $cparts);
+
+    // Build the prefix to the path.
+    $absolute_url = '';
+    if (isset($parsed_url['scheme'])) {
+      $absolute_url = $parsed_url['scheme'] . '://';
+    }
+
+    if (isset($parsed_url['user'])) {
+      $absolute_url .= $parsed_url['user'];
+      if (isset($parsed_url['pass'])) {
+        $absolute_url .= ':' . $parsed_url['pass'];
+      }
+      $absolute_url .= '@';
+    }
+    if (isset($parsed_url['host'])) {
+      $absolute_url .= $parsed_url['host'] . (isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '') . '/';
+    }
+
+    $absolute_url .= $path;
+
+    if (valid_url($absolute_url, TRUE)) {
+      return $absolute_url;
+    }
+  }
+  return FALSE;
+}
diff --git a/plugins/FeedsHTTPFetcher.inc b/plugins/FeedsHTTPFetcher.inc
index a638d32f..d55d2b81 100644
--- a/plugins/FeedsHTTPFetcher.inc
+++ b/plugins/FeedsHTTPFetcher.inc
@@ -94,6 +94,7 @@ class FeedsHTTPFetcher extends FeedsFetcher {
    */
   public function configDefaults() {
     return array(
+      'auto_detect_feeds' => FALSE,
       'use_pubsubhubbub' => FALSE,
       'designated_hub' => '',
     );
@@ -104,6 +105,12 @@ class FeedsHTTPFetcher extends FeedsFetcher {
    */
   public function configForm(&$form_state) {
     $form = array();
+    $form['auto_detect_feeds'] = array(
+      '#type' => 'checkbox',
+      '#title' => t('Auto detect feeds'),
+      '#description' => t('If the supplied URL does not point to a feed but an HTML document, attempt to extract a feed URL from the document.'),
+      '#default_value' => $this->config['auto_detect_feeds'],
+    );
     $form['use_pubsubhubbub'] = array(
       '#type' => 'checkbox',
       '#title' => t('Use PubSubHubbub'),
@@ -135,6 +142,18 @@ class FeedsHTTPFetcher extends FeedsFetcher {
     return $form;
   }
 
+  /**
+   * Override parent::sourceFormValidate().
+   */
+  public function sourceFormValidate(&$values) {
+    if ($this->config['auto_detect_feeds']) {
+      feeds_include_library('http_request.inc', 'http_request');
+      if ($url = http_request_get_common_syndication($values['source'])) {
+        $values['source'] = $url;
+      }
+    }
+  }
+
   /**
    * Override sourceSave() - subscribe to hub.
    */
-- 
GitLab