<?php
// $Id$

/**
 * @file
 * Download via HTTP.
 *
 * Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
 * redirects.
 */

/**
 * Download RSS or Atom feeds from a given URL.
 *
 * If the document at the given URL is an HTML document, the function attempts
 * to discover RSS or Atom feeds linked from it (<link type="..." href="...">)
 * and downloads the first usable one instead.
 *
 * @todo Cache detected rss feeds in url.
 * @todo Use exceptions, not string or false return values.
 *
 * @param $url
 *   The URL to download. If it carries "user:pass@" credentials they are
 *   passed on to http_request_get().
 * @param $settings
 *   Optional array. Only the 'accept_invalid_cert' key is read here; the
 *   array is handed on unchanged when a discovered feed is downloaded.
 *
 * @return
 *   The result object of http_request_get() for the feed document, or FALSE
 *   if nothing could be downloaded or no feed could be discovered.
 */
function http_request_get_common_syndication($url, $settings = NULL) {
  // Defined up front so they exist even when $url does not validate.
  $password = $username = NULL;
  if (valid_url($url, TRUE)) {
    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }

  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $download = http_request_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, return. Error results may not carry a data property
  // at all, hence empty() rather than a comparison.
  if (empty($download->data)) {
    return FALSE;
  }

  // Do the autodiscovery at this level, pass back the real data.
  // Maybe it's HTML. If it's not HTML, not worth to take a look into the
  // downloaded string.
  $downloaded_string = $download->data;
  if (strpos(strtolower($downloaded_string), "<html") === FALSE) {
    return $download;
  }

  // Ugly hack to be able to retrieve the xml:base property, no way to access
  // xml:lang inside <feed>
  $downloaded_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);

  // Filter out strange tags. preg_replace() yields NULL on failure, in which
  // case the unfiltered string is kept.
  $downloaded_string_filtered = preg_replace(array(
    '@<script[^>]*?.*?</script>@si',
    '@<object[^>]*?.*?</object>@si',
    '@<embed[^>]*?.*?</embed>@si',
    '@<applet[^>]*?.*?</applet>@si',
    '@<noframes[^>]*?.*?</noframes>@si',
    '@<noscript[^>]*?.*?</noscript>@si',
    '@<noembed[^>]*?.*?</noembed>@si',
  ), '', $downloaded_string);
  $downloaded_string = $downloaded_string_filtered ? $downloaded_string_filtered : $downloaded_string;

  $allowed_mime = array("text/xml", "application/rss+xml", "application/atom+xml", "application/rdf+xml", "application/xml");

  // Look up the document's <base href="..."> once. It has to be searched in
  // the whole document, not in the attributes of a <link> tag.
  $base_url = '';
  $base_tag = array();
  if (preg_match('/<base\s[^>]*?href\s*=\s*("|\')(.*?)\1/si', $downloaded_string, $base_tag)) {
    $base_url = html_entity_decode(trim($base_tag[2]));
  }

  // Get all the links tag
  $matches = array();
  preg_match_all('/<link\s+(.*?)\s*\/?>/si', $downloaded_string, $matches);

  foreach ($matches[1] as $link) {
    // Get the type attribute and check if the mime type is allowed.
    $mime = array();
    preg_match_all('/type\s*=\s*("|\')([A-Za-z\/+]*)("|\')/si', $link, $mime);
    if (!in_array(strtolower((string) array_pop($mime[2])), $allowed_mime)) {
      continue;
    }

    // Get the href attribute. Anything up to the closing quote is accepted so
    // that URLs containing "-", "&", "%" or "~" are not dropped.
    $href = array();
    preg_match_all('/href\s*=\s*("|\')(.*?)\1/si', $link, $href);
    $rss_link = array_pop($href[2]);
    if (!is_string($rss_link)) {
      continue;
    }
    // Attribute values are HTML encoded ("&amp;" in query strings).
    $rss_link = html_entity_decode(trim($rss_link));
    if (strlen($rss_link) == 0 || $rss_link == $url) {
      continue;
    }

    // Handle base url related stuff.
    $parsed_url = parse_url($rss_link);
    if ($parsed_url === FALSE) {
      continue;
    }
    if (!isset($parsed_url['host'])) {
      if (isset($parsed_url['scheme'])) {
        // Something like "mailto:" - neither absolute nor resolvable.
        continue;
      }
      // It's relative so make it absolute: prefer the HTML base tag, fall
      // back to the URL the document was downloaded from.
      $resolved = FALSE;
      if ($base_url !== '') {
        $resolved = _http_request_resolve_url($rss_link, $base_url);
      }
      if ($resolved === FALSE) {
        $resolved = _http_request_resolve_url($rss_link, $url);
      }
      if ($resolved === FALSE) {
        continue;
      }
      $rss_link = $resolved;
    }

    // A link that resolves to the page itself would recurse forever.
    if ($rss_link == $url) {
      continue;
    }
    return http_request_get_common_syndication($rss_link, $settings);
  }

  // HTML document without a usable feed link.
  return FALSE;
}

/**
 * Resolve a relative link against an absolute base URL.
 *
 * Only the scheme, host, port and path of the base are used; credentials in
 * the base URL are not carried over. Dot segments ("../") are not collapsed.
 *
 * @param $link
 *   A relative reference: "/path", "path", "?query" or "#fragment" forms.
 * @param $base
 *   The absolute URL to resolve against.
 *
 * @return
 *   The absolute URL as a string, or FALSE if $base has no scheme or host or
 *   either argument cannot be parsed.
 */
function _http_request_resolve_url($link, $base) {
  $base_parts = parse_url($base);
  $link_parts = parse_url($link);
  if (!is_array($link_parts) || empty($base_parts['scheme']) || empty($base_parts['host'])) {
    return FALSE;
  }

  $root = $base_parts['scheme'] .'://'. $base_parts['host'];
  if (isset($base_parts['port'])) {
    $root .= ':'. $base_parts['port'];
  }

  $base_path = isset($base_parts['path']) ? $base_parts['path'] : '/';
  if (!isset($link_parts['path']) || $link_parts['path'] === '') {
    // Query- or fragment-only reference: keep the base path.
    $path = $base_path;
  }
  elseif (substr($link_parts['path'], 0, 1) == '/') {
    $path = $link_parts['path'];
  }
  else {
    // Relative to the directory of the base document.
    $slash = strrpos($base_path, '/');
    $path = ($slash === FALSE ? '/' : substr($base_path, 0, $slash + 1)) . $link_parts['path'];
  }

  $resolved = $root . $path;
  if (isset($link_parts['query'])) {
    $resolved .= '?'. $link_parts['query'];
  }
  if (isset($link_parts['fragment'])) {
    $resolved .= '#'. $link_parts['fragment'];
  }
  return $resolved;
}

/**
 * Get the content from the given URL.
 *
 * Results are cached twice: in a static variable for the duration of the
 * page request, and via cache_set() so that later requests can be made
 * conditional (If-None-Match / If-Modified-Since).
 *
 * @param $url
 *   A valid URL (not only web URLs).
 * @param $username
 *   If the URL use authentication, here you can supply the username for this.
 * @param $password
 *   If the URL use authentication, here you can supply the password for this.
 * @param $accept_invalid_cert
 *   If TRUE, peer certificate verification is switched off for cURL
 *   downloads. Not used when falling back to drupal_http_request().
 *
 * @return
 *   A stdClass object that describes the data downloaded from $url. The
 *   object's data property contains the actual document at the URL; code
 *   holds the status code and headers always contains 'ETag' and
 *   'Last-Modified' keys (empty strings if the server sent none). On failure
 *   an error property is set and data is empty or absent. Results served
 *   after a 304 response have from_cache set to TRUE.
 */
function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-pagedownload cache, avoid to download the same content twice within
  // one page download (it's possible, compatible and parse calls).
  static $download_cache = array();
  // URLs for which an uncached 304 has already been retried once.
  static $retried = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }

  $curl = http_request_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  // cURL takes a list of "Name: value" strings, drupal_http_request() takes a
  // name => value map.
  $headers = array();
  if ($cache = cache_get('feeds_http_download_'. md5($url))) {
    $last_result = $cache->data;
    $last_headers = isset($last_result->headers) ? $last_result->headers : array();

    if (!empty($last_headers['ETag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: '. $last_headers['ETag'];
      }
      else {
        $headers['If-None-Match'] = $last_headers['ETag'];
      }
    }
    if (!empty($last_headers['Last-Modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: '. $last_headers['Last-Modified'];
      }
      else {
        $headers['If-Modified-Since'] = $last_headers['Last-Modified'];
      }
    }
  }
  // Credentials must be sent on every request, not only when a cached copy
  // exists. With cURL they are passed via CURLOPT_USERPWD below.
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic '. base64_encode("$username:$password");
  }

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();

    // Only download via cURL if we can validate the scheme to be either http
    // or https.
    // Validate in PHP, CURLOPT_PROTOCOLS is only supported with cURL 7.19.4
    $uri = parse_url($url);
    $scheme = (is_array($uri) && isset($uri['scheme'])) ? $uri['scheme'] : '';
    if (!in_array(strtolower($scheme), array('http', 'https'))) {
      $result->error = 'invalid schema '. $scheme;
      $result->code = -1003; // This corresponds to drupal_http_request()
    }
    else {
      $download = curl_init($url);
      curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
      // Where the installed cURL supports it, keep redirects on http(s) too.
      if (defined('CURLOPT_REDIR_PROTOCOLS') && defined('CURLPROTO_HTTP') && defined('CURLPROTO_HTTPS')) {
        curl_setopt($download, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
      }
      if (!empty($username)) {
        curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
      }
      curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
      curl_setopt($download, CURLOPT_HEADER, TRUE);
      curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
      curl_setopt($download, CURLOPT_ENCODING, '');
      if ($accept_invalid_cert) {
        curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
      }

      $result->headers = array();
      $data = curl_exec($download);
      if ($data === FALSE) {
        // Transfer failed: nothing to split into headers and body.
        $result->error = curl_error($download);
        $result->data = FALSE;
      }
      else {
        $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
        $result->data = substr($data, $header_size);

        // When redirects were followed the header section holds one block per
        // response, separated by blank lines. Only the last block belongs to
        // the document that was actually downloaded.
        $header_blocks = preg_split("/(?:\r\n|\n|\r){2,}/", trim(substr($data, 0, $header_size)));
        $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));
        array_shift($header_lines); // skip HTTP response status

        foreach ($header_lines as $line) {
          $line = trim($line);
          if ($line === '' || strpos($line, ':') === FALSE) {
            continue;
          }
          list($name, $value) = explode(':', $line, 2);
          if (isset($result->headers[$name]) && $name == 'Set-Cookie') {
            // RFC 2109: the Set-Cookie response header comprises the token
            // Set-Cookie:, followed by a comma-separated list of one or more
            // cookies.
            $result->headers[$name] .= ','. trim($value);
          }
          else {
            $result->headers[$name] = trim($value);
          }
        }
      }
      $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);
      curl_close($download);
    }
  }
  else {
    $result = drupal_http_request($url, $headers);
  }

  $result->code = isset($result->code) ? $result->code : 200;

  // In case of 304 Not Modified try to return cached data.
  if ($result->code == 304) {
    if (isset($last_result)) {
      $last_result->from_cache = TRUE;
      return $last_result;
    }
    elseif (!isset($retried[$url])) {
      // It's a tragedy, this file must exist and contain good data.
      // In this case, clear cache and repeat - once, so that a server that
      // answers 304 unconditionally cannot cause endless recursion.
      $retried[$url] = TRUE;
      cache_clear_all('feeds_http_download_'. md5($url), 'cache');
      return http_request_get($url, $username, $password, $accept_invalid_cert);
    }
  }

  // Guarantee the 'ETag' and 'Last-Modified' keys. Header names are case
  // insensitive, so pick up differently cased variants ("Etag", "etag")
  // before falling back to an empty string.
  $result->headers = (isset($result->headers) && is_array($result->headers)) ? $result->headers : array();
  $lowercase_headers = array_change_key_case($result->headers, CASE_LOWER);
  foreach (array('ETag', 'Last-Modified') as $name) {
    if (!isset($result->headers[$name])) {
      $key = strtolower($name);
      $result->headers[$name] = isset($lowercase_headers[$key]) ? $lowercase_headers[$key] : '';
    }
  }

  // Set caches.
  cache_set('feeds_http_download_'. md5($url), $result);
  $download_cache[$url] = $result;

  return $result;
}

/**
 * Decides if it's possible to use cURL or not.
 *
 * cURL is used only when the extension is loaded, safe_mode is off and no
 * open_basedir restriction is configured.
 *
 * @return
 *   TRUE if curl is available, FALSE otherwise.
 */
function http_request_use_curl() {
  $basedir = ini_get("open_basedir");
  return function_exists('curl_init') && !ini_get('safe_mode') && empty($basedir);
}

/**
 * Clear cache for a specific URL.
 *
 * Removes the entry written by http_request_get() via cache_set(). The static
 * per-request cache inside http_request_get() is not affected.
 *
 * @param $url
 *   The URL whose cached download should be dropped.
 */
function http_request_clear_cache($url) {
  cache_clear_all('feeds_http_download_'. md5($url), 'cache');
}