Commit 04d95934 authored by Konstantin Komelin's avatar Konstantin Komelin
Browse files

Issue #8407187 by konstantin.komelin: Removed simplehtmldom library from the...

Issue #8407187 by konstantin.komelin: Removed simplehtmldom library from the module repository and added support of external library.
parent 27104c92
simplehtmldom API
==========================================================================================================
The module is a gateway between simplehtmldom PHP library and Drupal.
It provides a powerful API for HTML parsing. It works fine with broken markup.
After installing this module you will be able to use simplehtmldom functions right in your code.
See more details about library usage here http://simplehtmldom.sourceforge.net/
Dependencies:
---------------
The module requires simplehtmldom library 1.11+ version.
Please copy the latest version of the simplehtmldom library from http://sourceforge.net/projects/simplehtmldom/
to your libraries folder, for example sites/all/libraries/simplehtmldom/simple_html_dom.php
Tested with simplehtmldom 1.5.
<?php
// Include the only file of the library.
// There are 5-6 functions there and their names are not prefixed. But these names are quite unlikely to cause collisions...
/**
* @file
* Include the only file of the library.
*
* There are 5-6 functions there and their names are not prefixed.
* But these names are quite unlikely to cause collisions...
*/
if (!function_exists('str_get_html')) {
  // The library is no longer bundled with the module, so it must not be
  // required from a hard-coded module sub-directory. Ask the lookup helper for
  // the external copy and include it only when one was actually found.
  $simplehtmldom_library_path = _simplhtmldom_get_library_path();
  if (!empty($simplehtmldom_library_path)) {
    require_once $simplehtmldom_library_path;
  }
}
else {
  // str_get_html() already exists, so some other code has loaded a copy of
  // the library before this module had the chance to. Report it once, using
  // the translatable message.
  drupal_set_message(t('The simplehtmldom API module found that you define its functions before it does itself! Somewhere in your drupal files there must be a copy of the simplehtmldom PHP library that you do not need if you have this drupal module... Please remove the old library and live happily. The simplehtmldom module will include the necessary php files from the lib as soon as you remove the old ones.'), 'error');
}
/**
 * Returns the path of the simplehtmldom library file.
 *
 * The Libraries API location is tried first when that module is enabled.
 * Otherwise, or when the file is not found there, a fixed list of fallback
 * directories is scanned. The outcome is remembered in drupal_static() so the
 * file system is only inspected once per request.
 *
 * @return bool|string
 *   Path to simple_html_dom.php, or FALSE when the file cannot be found.
 */
function _simplhtmldom_get_library_path() {
  // $library_path is a reference into the static cache: whatever is assigned
  // to it is what later calls get back. It must therefore never be reused as
  // a scratch or loop variable.
  $library_path = &drupal_static(__FUNCTION__);
  if (isset($library_path)) {
    return $library_path;
  }

  $file = 'simple_html_dom.php';
  $library = 'simplehtmldom';
  $library_path = FALSE;

  // Support libraries module.
  if (module_exists('libraries')) {
    $libraries_directory = libraries_get_path($library);
    if (!empty($libraries_directory) && file_exists($libraries_directory . '/' . $file)) {
      $library_path = $libraries_directory . '/' . $file;
      return $library_path;
    }
  }

  // Fallback locations, checked in order; the first existing file wins.
  $directories = array(
    'sites/all/libraries/' . $library,
    drupal_get_path('module', 'simplehtmldom') . '/' . $library,
    drupal_get_path('module', 'simplehtmldom') . '/libraries',
    'profiles/' . variable_get('install_profile', 'default') . '/libraries/' . $library,
  );
  foreach ($directories as $directory) {
    $candidate = $directory . '/' . $file;
    if (file_exists($candidate)) {
      $library_path = $candidate;
      break;
    }
  }

  return $library_path;
}
/**
 * Implements hook_libraries_info().
 *
 * Registers the simplehtmldom library under the machine name
 * "simplehtmldom", together with its single PHP file and the arguments used
 * for version detection.
 */
function simplehtmldom_libraries_info() {
  // The library consists of exactly one file; it is both the file that is
  // loaded and the file the version pattern is applied to.
  $library_file = 'simple_html_dom.php';

  $version_arguments = array(
    'file' => $library_file,
    'pattern' => '/version ([0-9\.]+)/',
    'lines' => 40,
  );

  $definition = array(
    'name' => 'PHP Simple HTML DOM Parser',
    'vendor url' => 'http://simplehtmldom.sourceforge.net',
    'download url' => 'http://sourceforge.net/projects/simplehtmldom/files/',
    'version arguments' => $version_arguments,
    'files' => array(
      'php' => array($library_file),
    ),
  );

  return array('simplehtmldom' => $definition);
}
<?php
/**
 * @file
 * Example of how to use advanced selector features.
 *
 * Three self-contained demos run against inline HTML snippets: a descendant
 * selector, nested find() calls and reading the "checked" attribute of
 * checkboxes.
 */

// Resolve the library relative to this file instead of the current working
// directory, and load it only once so its functions are never redeclared.
require_once dirname(__FILE__) . '/../simple_html_dom.php';

// -----------------------------------------------------------------------------
// descendant selector
$str = <<<HTML
<div>
<div>
<div class="foo bar">ok</div>
</div>
</div>
HTML;

$html = str_get_html($str);
echo $html->find('div div div', 0)->innertext . '<br>'; // result: "ok"

// -----------------------------------------------------------------------------
// nested selector
$str = <<<HTML
<ul id="ul1">
<li>item:<span>1</span></li>
<li>item:<span>2</span></li>
</ul>
<ul id="ul2">
<li>item:<span>3</span></li>
<li>item:<span>4</span></li>
</ul>
HTML;

$html = str_get_html($str);
// find() on an element searches only inside that element.
foreach ($html->find('ul') as $ul) {
    foreach ($ul->find('li') as $li) {
        echo $li->innertext . '<br>';
    }
}

// -----------------------------------------------------------------------------
// parsing checkbox
$str = <<<HTML
<form name="form1" method="post" action="">
<input type="checkbox" name="checkbox1" value="checkbox1" checked>item1<br>
<input type="checkbox" name="checkbox2" value="checkbox2">item2<br>
<input type="checkbox" name="checkbox3" value="checkbox3" checked>item3<br>
</form>
HTML;

$html = str_get_html($str);
foreach ($html->find('input[type=checkbox]') as $checkbox) {
    if ($checkbox->checked) {
        echo $checkbox->name . ' is checked<br>';
    }
    else {
        echo $checkbox->name . ' is not checked<br>';
    }
}
?>
\ No newline at end of file
<?php
/**
 * @file
 * Example of how to use basic selectors to retrieve HTML contents.
 */

// Resolve the library relative to this file instead of the current working
// directory, and load it only once so its functions are never redeclared.
require_once dirname(__FILE__) . '/../simple_html_dom.php';

// get DOM from URL or file
$html = file_get_html('http://www.google.com/');
if (!$html) {
    // Without a DOM object every find() call below would be a fatal error.
    exit('Unable to load http://www.google.com/');
}

// find all link
foreach ($html->find('a') as $e) {
    echo $e->href . '<br>';
}

// find all image
foreach ($html->find('img') as $e) {
    echo $e->src . '<br>';
}

// find all image with full tag
foreach ($html->find('img') as $e) {
    echo $e->outertext . '<br>';
}

// find all div tags with id=gbar
foreach ($html->find('div#gbar') as $e) {
    echo $e->innertext . '<br>';
}

// find all span tags with class=gb1
foreach ($html->find('span.gb1') as $e) {
    echo $e->outertext . '<br>';
}

// find all td tags with attribute align=center
foreach ($html->find('td[align=center]') as $e) {
    echo $e->innertext . '<br>';
}

// extract text from table; the second centered cell may not exist on the
// fetched page, so only dereference it when find() returned an element
$cell = $html->find('td[align="center"]', 1);
echo ($cell ? $cell->plaintext : '') . '<br><hr>';

// extract text from HTML
echo $html->plaintext;
?>
\ No newline at end of file
<?php
/**
 * @file
 * Example of how to rewrite elements through a callback while dumping.
 */

// Resolve the library relative to this file instead of the current working
// directory.
require_once dirname(__FILE__) . '/../simple_html_dom.php';

// 1. Write a function with parameter "$element"
/**
 * Replaces every input, img and a element by its bare tag name.
 *
 * @param object $element
 *   The element handed to the callback; its tag is read and its outertext
 *   is overwritten.
 */
function my_callback($element) {
    if (in_array($element->tag, array('input', 'img', 'a'), true)) {
        $element->outertext = $element->tag;
    }
}

// 2. create HTML Dom
$html = file_get_html('http://www.google.com/');
if (!$html) {
    // Without a DOM object set_callback() below would be a fatal error.
    exit('Unable to load http://www.google.com/');
}

// 3. Register the callback function with its function name
$html->set_callback('my_callback');

// 4. Callback function will be invoked while dumping
echo $html;
?>
\ No newline at end of file
<?php
/**
 * @file
 * Example of how to print the plain text of a remote page.
 */

// Resolve the library relative to this file instead of the current working
// directory.
require_once dirname(__FILE__) . '/../simple_html_dom.php';

$html = file_get_html('http://www.google.com/');
if (!$html) {
    // Reading ->plaintext from a failed load would be a fatal error.
    exit('Unable to load http://www.google.com/');
}
echo $html->plaintext;
?>
\ No newline at end of file
<?php
/**
 * @file
 * Example of how to modify HTML contents.
 */

// Resolve the library relative to this file instead of the current working
// directory, and load it only once so its functions are never redeclared.
require_once dirname(__FILE__) . '/../simple_html_dom.php';

// get DOM from URL or file
$html = file_get_html('http://www.google.com/');
if (!$html) {
    // Without a DOM object every find() call below would be a fatal error.
    exit('Unable to load http://www.google.com/');
}

// remove all image
foreach ($html->find('img') as $e) {
    $e->outertext = '';
}

// replace all input
foreach ($html->find('input') as $e) {
    $e->outertext = '[INPUT]';
}

// dump contents
echo $html;
?>
\ No newline at end of file
<?php
/**
 * @file
 * Example of how to scrape article summaries from digg.com.
 */

// Resolve the library relative to this file instead of the current working
// directory.
require_once dirname(__FILE__) . '/../../simple_html_dom.php';

/**
 * Collects title, details and digg count of every news summary block.
 *
 * @return array
 *   List of arrays with the keys 'title', 'details' and 'diggs'. Empty when
 *   the page could not be loaded or contains no matching blocks.
 */
function scraping_digg() {
    // Start from an empty list so callers can always iterate the result.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://digg.com/');
    if (!$html) {
        return $ret;
    }

    // get news block
    foreach ($html->find('div.news-summary') as $article) {
        // find() with an index returns no element when nothing matches, so
        // each piece falls back to an empty string instead of a fatal error.
        $title = $article->find('h3', 0);
        $details = $article->find('p', 0);
        $diggs = $article->find('li a strong', 0);

        $ret[] = array(
            'title' => $title ? trim($title->plaintext) : '',
            'details' => $details ? trim($details->plaintext) : '',
            'diggs' => $diggs ? trim($diggs->plaintext) : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}

// -----------------------------------------------------------------------------
// test it!

// "http://digg.com" will check user_agent header...
ini_set('user_agent', 'My-Application/2.5');

$ret = scraping_digg();

foreach ($ret as $v) {
    echo $v['title'].'<br>';
    echo '<ul>';
    echo '<li>'.$v['details'].'</li>';
    echo '<li>Diggs: '.$v['diggs'].'</li>';
    echo '</ul>';
}
?>
\ No newline at end of file
<?php
/**
 * @file
 * Example of how to scrape a movie overview from an IMDB title page.
 */

// Resolve the library relative to this file instead of the current working
// directory.
require_once dirname(__FILE__) . '/../../simple_html_dom.php';

/**
 * Extracts title, rating and the "info" blocks of an IMDB title page.
 *
 * @param string $url
 *   Address of the page to load.
 *
 * @return array
 *   Map with the keys 'Title' and 'Rating' plus one entry per info block,
 *   keyed by the block's h5 heading text. Empty when the page could not be
 *   loaded.
 */
function scraping_IMDB($url) {
    $ret = array();

    // create HTML DOM
    $html = file_get_html($url);
    if (!$html) {
        return $ret;
    }

    // get title
    $title = $html->find('title', 0);
    $ret['Title'] = $title ? $title->innertext : '';

    // get rating
    $rating = $html->find('div[class="general rating"] b', 0);
    $ret['Rating'] = $rating ? $rating->innertext : '';

    // get overview
    foreach ($html->find('div[class="info"]') as $div) {
        // stop at user comments; leave the loop with break (not return) so
        // the DOM is still released below
        $heading = $div->find('h5', 0);
        if ($heading && $heading->innertext == 'User Comments:') {
            break;
        }

        $key = '';
        $val = '';

        foreach ($div->find('*') as $node) {
            if ($node->tag == 'h5') {
                $key = $node->plaintext;
            }

            // Links (except the "more" link) and text nodes both contribute
            // their text, with line breaks removed, to the value.
            $is_link = ($node->tag == 'a' && $node->plaintext != 'more');
            if ($is_link || $node->tag == 'text') {
                $val .= trim(str_replace("\n", '', $node->plaintext));
            }
        }

        $ret[$key] = $val;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}

// -----------------------------------------------------------------------------
// test it!
$ret = scraping_IMDB('http://imdb.com/title/tt0335266/');

foreach ($ret as $k => $v) {
    echo '<strong>'.$k.' </strong>'.$v.'<br>';
}
?>
\ No newline at end of file
<?php
/**
 * @file
 * Example of how to scrape article titles and bodies from slashdot.org.
 */

// Resolve the library relative to this file instead of the current working
// directory.
require_once dirname(__FILE__) . '/../../simple_html_dom.php';

/**
 * Collects title and body text of every article block on the front page.
 *
 * @return array
 *   List of arrays with the keys 'title' and 'body'. Empty when the page
 *   could not be loaded or contains no matching blocks.
 */
function scraping_slashdot() {
    // Start from an empty list so callers can always iterate the result.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://slashdot.org/');
    if (!$html) {
        return $ret;
    }

    // get article block
    foreach ($html->find('div[id^=firehose-]') as $article) {
        // find() with an index returns no element when nothing matches, so
        // each piece falls back to an empty string instead of a fatal error.
        $title = $article->find('a.datitle', 0);
        $body = $article->find('div.body', 0);

        $ret[] = array(
            'title' => $title ? trim($title->plaintext) : '',
            'body' => $body ? trim($body->plaintext) : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}

// -----------------------------------------------------------------------------
// test it!
$ret = scraping_slashdot();

foreach ($ret as $v) {
    echo $v['title'].'<br>';
    echo '<ul>';
    echo '<li>'.$v['body'].'</li>';
    echo '</ul>';
}
?>
\ No newline at end of file
<?php
/**
 * @file
 * Small helper functions built on top of simple_html_dom.
 */

// Resolve the library relative to this file instead of the current working
// directory.
require_once dirname(__FILE__) . '/../simple_html_dom.php';

// -----------------------------------------------------------------------------
// remove HTML comments
/**
 * Loads a page and returns its markup with all comment nodes removed.
 *
 * @param string $url
 *   Address (or file name) handed to file_get_html().
 *
 * @return string|false
 *   The saved markup, or FALSE when the page could not be loaded.
 */
function html_no_comment($url) {
    // create HTML DOM
    $html = file_get_html($url);
    if (!$html) {
        // Nothing to clean; calling find() here would be a fatal error.
        return false;
    }

    // remove all comment elements
    foreach ($html->find('comment') as $e) {
        $e->outertext = '';
    }

    $ret = $html->save();

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
// -----------------------------------------------------------------------------
// search elements that contain a specific text
/**
 * Finds the elements matching a selector whose inner text contains a keyword.
 *
 * @param object $html
 *   Object offering a find($selector) method.
 * @param string $selector
 *   Selector passed through to find().
 * @param string $keyword
 *   Text that must occur in an element's innertext.
 * @param int $index
 *   Negative (default): return all matches. Otherwise: return only the match
 *   at this zero-based position.
 *
 * @return array|object|null
 *   All matches, the single requested match, or null when there is no match
 *   at $index.
 */
function find_contains($html, $selector, $keyword, $index=-1) {
    $matches = array();

    foreach ($html->find($selector) as $candidate) {
        if (strpos($candidate->innertext, $keyword) === false) {
            continue;
        }
        $matches[] = $candidate;
    }

    if ($index < 0) {
        return $matches;
    }

    if (isset($matches[$index])) {
        return $matches[$index];
    }
    return null;
}
?>
\ No newline at end of file
/*$Rev: 46 $*/
/* Page defaults: no outer margin or padding, 11px sans-serif text. */
body {
margin: 0;
padding: 0;
font-family: verdana,arial,helvetica,sans-serif;
font-size: 11px;
color: #4F5155;
}
/* Main content container: 20px gutter on the left and right. */
#content {
margin: 0 20px 0 20px;
line-height: 16px;
padding: 0;
}
/* Top-level heading: centered on a light background. */
h1 {
font-size: 18px;
margin: 0;
padding: 0 0 2px 0;
background-color: #D0D0D0;
text-align: center;
}
/* Section heading: white bold text on a coloured bar. */
h2 {
background-color: #727EA3;
border-right: 1px solid #D0D0D0;
border-bottom: 1px solid #D0D0D0;
color: #FFFFFF;
font-size: 14px;
font-weight: bold;
margin: 14px 0 4px 0;
padding: 1px 10px 1px 10px;
}
/* Lists use bullet.gif as marker; list-style-type: square is also declared. */
ul {
margin-top: 0;
margin-bottom: 0;
line-height:1.5em;
list-style-image:url(bullet.gif);
list-style-type:square;
}
/* Elements with class "top" float to the right. */
.top {
font-size: 11px;
float: right;
}
/* Boxed block with class "code": light background, 1px borders on all sides. */
.code {
font-size: 11px;
font-family: Monaco, Verdana, Sans-serif;
line-height: 14px;
background-color: #f6f6f6;
border-bottom: 1px solid #D0D0D0;
border-top: 1px solid #A0A0A0;
border-left: 1px solid #A0A0A0;
border-right: 1px solid #D0D0D0;
color: #002166;
display: block;
margin: 2px 0 2px 0;
padding: 2px 10px 2px 10px;
}
/* Links inside .code: bold and not underlined, except on hover. */
.code A:link {color: #002166; text-decoration: none; font-weight: bold;}
.code A:visited {color: #002166; text-decoration: none; font-weight: bold;}
.code A:active {color: #002166; text-decoration: none; font-weight: bold;}
.code A:hover {color: #0000ff; text-decoration: underline; font-weight: bold;}
/* Colour classes used inside .code: keyword, comment and var. */
.code .keyword {
color: #007700;
}
.code .comment {
font-size: 10px;
color: #888;
}
.code .var {
color: #770000;
}
/* Table header cells. */
th {
font-family: Lucida Grande, Verdana, Geneva, Sans-serif;
color: #000000;
background-color: #CFD4E6;
margin: 2px 2px 2px 2px;
padding: 2px 2px 2px 2px;
font-size: 13px;
font-weight: normal;
font-style: normal;
}
/* Table data cells. */
td {
background-color: #dddddd;
}
/* Text with class "description": 11px, first line indented by 30px. */
.description {
  font-family: Lucida Grande, Verdana, Geneva, Sans-serif;
  font-size: 11px;
  color: #333;
  /* Was "text-ident", which is not a CSS property and was ignored. */
  text-indent: 30px;
  font-style: normal;
}
/* Elements with class "returns": a 48px wide, left-floated box with
   right-aligned small monospace-first text and a 4px gap on its right. */
.returns {
  /* Box placement. */
  float: left;
  width: 48px;
  margin: 0 4px 0 0;
  text-align: right;

  /* Typography. */
  font-family: Monaco, Verdana, Sans-serif;
  font-size: 10px;
  color: #888;
}
\ No newline at end of file
/* Tab styles for the .ui-tabs-nav / .ui-tabs-panel / .ui-tabs-hide classes. */
/* Caution! Ensure accessibility in print and other media types... */
@media projection, screen { /* Use class for showing/hiding tab content, so that visibility can be better controlled in different media types... */
.ui-tabs-hide {
display: none;
}
}
/* Hide useless elements in print layouts... */
@media print {
.ui-tabs-nav {
display: none;
}
}
/* Skin */
.ui-tabs-nav, .ui-tabs-panel {
font-family: "Trebuchet MS", Trebuchet, Verdana, Helvetica, Arial, sans-serif;
font-size: 12px;
}
/* The navigation list itself: no list markers, 4px left padding. */
.ui-tabs-nav {
list-style: none;
margin: 0;
padding: 0 0 0 4px;
}
.ui-tabs-nav:after { /* clearing without presentational markup, IE gets extra treatment */
display: block;
clear: both;
content: " ";
}
/* Each list item floats left, so the items line up in a row. */
.ui-tabs-nav li {
float: left;
margin: 0 0 0 1px;
min-width: 84px; /* be nice to Opera */
}
/* Both the link and its inner span carry the ../img/tab.png background. */
.ui-tabs-nav a, .ui-tabs-nav a span {
display: block;
padding: 0 10px;
background: url(../img/tab.png) no-repeat;
}
.ui-tabs-nav a {
margin: 1px 0 0; /* position: relative makes opacity fail for disabled tab in IE */
padding-left: 0;
color: #27537a;
font-weight: bold;
line-height: 1.2;
text-align: center;
text-decoration: none;
white-space: nowrap; /* required in IE 6 */
outline: 0; /* prevent dotted border in Firefox */
}
/* Link of the selected tab: shifted down 1px and stacked with z-index 2. */
.ui-tabs-nav .ui-tabs-selected a {
position: relative;
top: 1px;
z-index: 2;
margin-top: 0;
color: #000;
}
.ui-tabs-nav a span {
width: 64px; /* IE 6 treats width as min-width */
min-width: 64px;
height: 18px; /* IE 6 treats height as min-height */
min-height: 18px;
padding-top: 6px;
padding-right: 0;
}
/* Resets the fixed size above for browsers that apply this selector. */
*>.ui-tabs-nav a span { /* hide from IE 6 */
width: auto;
height: auto;
}
.ui-tabs-nav .ui-tabs-selected a span {