Ich sollte mit Ajax eine kleine Live-Suche basteln. Ich habe dafür jQuery genutzt (Till hat mich darauf aufmerksam gemacht) und muss ehrlich sagen: jQuery ist total geil. Perfekt für solche Sachen. Ich habe mir spaßeshalber zusätzlich ein kleines Skript gebastelt, welches alle Wörter einer Website ausliest, so dass ich vielleicht eine Art Suchfunktion für Internetseiten umsetzen könnte. Die Wörter werden zwar (noch) nicht vernünftig nach Relevanz gewichtet, dennoch ist die Suchfunktion, wie ich denke, brauchbar und relativ simpel.
Hier mal der Codeschnipsel. Man kann sicherlich noch einiges verbessern.
// Announce the response as HTML encoded in UTF-8; the text processing below
// passes "UTF-8" to html_entity_decode() and mb_strtolower() as well.
header('Content-type: text/html; charset=utf-8');
/**
 * Fetches a web page over HTTP(S) and returns its body.
 *
 * Redirects are followed (up to a fixed limit) and the transfer is bounded
 * by a connect and a total timeout, so a slow or looping server cannot
 * block the script forever.
 *
 * The URL usually comes from the request, so only the http and https
 * protocols are allowed - both for the initial request and for redirects.
 * Without that restriction cURL would also read file://, gopher:// etc.
 * NOTE(review): internal HTTP hosts are still reachable; add a host
 * allow-list if this script is exposed publicly.
 *
 * @param string $url url, should start with http:// or https://
 * @return string|false response body, or false if the url is unusable or
 *                      the transfer failed
 */
function getPage($url)
{
    // e.g. ?url[]=x would hand an array to cURL
    if (!is_string($url) || $url === '') {
        return false;
    }

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    $allowed = CURLPROTO_HTTP | CURLPROTO_HTTPS;

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_PROTOCOLS, $allowed);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, $allowed);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);
    curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);
    // return the body instead of printing it
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    $output = curl_exec($ch);
    curl_close($ch);

    return $output;
}
/**
 * Reduces the html content of a page to one line of lowercase words.
 *
 * Tags are stripped, html entities are decoded and everything that is not
 * a-z, äöüß or whitespace is removed. The words in the result are
 * separated by exactly one space, without leading or trailing spaces, so
 * that the line can easily be exploded into an array later.
 *
 * Input that is not valid UTF-8 is assumed to be ISO-8859-1 and converted,
 * because the unicode-aware patterns below reject invalid UTF-8.
 *
 * @param string $content html page content
 * @return string stripped html content in one line
 */
function stripContent($content)
{
    $content = (string) $content;
    if (!mb_check_encoding($content, "UTF-8")) {
        $content = mb_convert_encoding($content, "UTF-8", "ISO-8859-1");
    }

    // We add a space before < and after > because:
    // <p>foo</p><p>bar</p>
    // would be foobar after strip_tags.
    // The search string must be "<" without a trailing space: moving the
    // space of a literal "< " in front of it would turn it into a tag
    // opener and strip_tags would swallow the text that follows.
    $content = str_replace("<", " <", $content);
    $content = str_replace(">", "> ", $content);

    // we strip all html tags
    $content = strip_tags($content);

    // let's decode htmlentities:
    $content = html_entity_decode($content, ENT_QUOTES, "UTF-8");

    // let's make everything lowercase
    $content = mb_strtolower($content, "UTF-8");

    // we replace some things with spaces, like:
    // one,two,three = one two three
    // hey!How are you = hey How are you
    // "\xC2\xA0" is the non-breaking space that &nbsp; decodes to; it
    // separates words as well and must not be deleted below.
    $toreplace = array("\n", "\r", "\t", ",", ".", "!", "?", "\x0B", "\0", "\xC2\xA0");
    $line = str_replace($toreplace, " ", $content);

    // let's make sure that only a-z, äöüß and whitespace are around -
    // everything else is not useful as keys for searching.
    // The u modifier makes the class match whole characters; matching
    // bytes would keep the first byte of e.g. "é" and produce broken UTF-8.
    $line = (string) preg_replace("/[^a-zäöüß\s]/u", '', $line);

    // Spaces were added and characters were removed, so only now can we
    // make sure that there is never more than one space in a row.
    $line = (string) preg_replace("/\s+/u", " ", $line);

    return trim($line);
}
/**
 * Counts how often each word occurs in a line.
 *
 * The line is split at single spaces. Numeric tokens and tokens shorter
 * than four characters are ignored. The length is measured in characters,
 * not bytes, so that umlauts do not make a short word look longer.
 *
 * @param string $line words separated by single spaces
 * @return array word => number of occurrences, empty if no word qualifies
 */
function getWords($line)
{
    // start with an empty result so that a line without any usable word
    // yields an empty array instead of an undefined variable
    $array = array();

    foreach (explode(" ", (string) $line) as $t) {
        if (is_numeric($t)) {
            continue;
        }
        if (mb_strlen($t, "UTF-8") < 4) {
            continue;
        }

        // word seems to be okay, let's count it:
        if (isset($array[$t])) {
            $array[$t]++;
        } else {
            $array[$t] = 1;
        }
    }

    return $array;
}
// Entry point: dump the word counts of the page given as ?url=...
// The parameter is checked first because it may be missing or an array
// (?url[]=...), in which case there is nothing to fetch.
if (isset($_GET['url']) && is_string($_GET['url']) && $_GET['url'] !== '') {
    var_dump(getWords(stripContent(getPage($_GET['url']))));
} else {
    header('HTTP/1.1 400 Bad Request');
    echo 'Missing "url" query parameter.';
}
