// Software engineers look away now unless you wish to be horrified by the amount // of coupling and cohesion in these functions. I'm warning you... it's not pretty class smd_FuzzyFind { // The last 3 parameters are for optimization only, to avoid the // surprisingly slow strlen() and substr() calls: // - $start_index = from which character of $text to start the search // - $max_len = maximum character to search (starting from $start_index) // - $text_strlen = // The return value is an array of matches: // Array( [] => , ... ) // Note: is generally NOT an exact edit distance but rather a // lower bound. This is unfortunate but the routine would be slower if // the exact error was calculate along with the matches. // The function is based on the non-deterministic automaton simulation // algorithm (without bit parallelism optimizations). function search_short($patt, $k, $text, $start_index=0, $max_len=-1, $text_strlen=-1) { if ( $text_strlen < 0 ) $text_strlen = strlen( $text ); if ( $max_len < 0 ) $max_len = $text_strlen; $start_index = max( 0, $start_index ); $n = min( $max_len, $text_strlen-$start_index ); $m = strlen( $patt ); $end_index = $start_index + $n; // If $text is shorter than $patt, use the built-in // levenshtein() instead: if ($n < $m) { $lev = levenshtein(substr($text, $start_index, $n), $patt); if ( $lev <= $k ) return Array( $start_index+$n-1 => $lev ); else return Array(); } $s = Array(); for ($i=0; $i<$m; $i++) { $c = $patt{$i}; if ( isset($s[$c])) $s[$c] = min($i, $s[$c]); else $s[$c] = $i; } if ( $end_index < $start_index ) return Array(); $matches = Array(); $da = $db = range(0, $m-$k+1); $mk = $m-$k; for ($t=$start_index; $t<$end_index; $t++) { $c = $text{$t}; $in_patt = isset($s[$c]); if ($t&1) { $d=&$da; $e=&$db; } else { $d=&$db; $e=&$da; } for ($i=1; $i<=$mk; $i++) { $g = min( $k+1, $e[$i]+1, $e[$i+1]+1 ); // TODO: optimize this with a look-up-table? if ( $in_patt ) for ($j=$e[$i-1]; ($j<$g && $j<=$mk); $j++) if ( $patt{$i+$j-1} == $c ) $g = $j; $d[$i] = $g; } if ( $d[$mk] <= $k ) { $err = $d[$mk]; $i = min( $t-$err+$k+1, $start_index+$n-1); if ( !isset($matches[$i]) || $err < $matches[$i]) $matches[$i] = $err; } } unset( $da, $db ); return $matches; } function test_short_search() { $test_text = "Olipa kerran jussi bj&xling ja kolme\n iloista ". "jussi bforling:ia mutta ei yhtaan jussi bjorling-nimista laulajaa."; $test_patt = "jussi bjorling"; assert( $this->search_short($test_patt, 4, $test_text) == Array(27=>2, 60=>1, 94=>0)); assert( $this->search_short($test_patt, 2, $test_text) == Array(27=>2, 60=>1, 94=>0)); assert( $this->search_short($test_patt, 1, $test_text) == Array(60=>1, 94=>0)); assert( $this->search_short($test_patt, 0, $test_text) == Array(94=>0)); assert( $this->search_short("bjorling", 2, $test_text, 19, 7) == Array()); assert( $this->search_short("bjorling", 2, $test_text, 19, 8) == Array(26=>2)); assert( $this->search_short("bjorling", 2, $test_text, 20, 8) == Array()); } var $patt, $patt_len, $max_err; var $parts, $n_parts, $unique_parts, $max_part_len; var $transf_patt; var $too_short_err; function smd_FuzzyFind( $pattern, $max_error ) { $this->patt = $pattern; $this->patt_len = strlen($this->patt); $this->max_err = $max_error; // Calculate pattern partition size $intpartlen = floor($this->patt_len/($this->max_err+2)); if ($intpartlen < 1) { $this->too_short_err = True; return; } else $this->too_short_err = False; // Partition the pattern for pruning $this->parts = Array(); for ($i=0; $i<$this->patt_len; $i+=$intpartlen) { if ( $i + $intpartlen*2 > $this->patt_len ) { $this->parts[] = substr( $this->patt, $i ); break; } else $this->parts[] = substr( $this->patt, $i, $intpartlen ); } $this->n_parts = count($this->parts); // The intpartlen test above should have covered this: assert( $this->n_parts >= $this->max_err+1 ); // Find maximum part length foreach( $this->parts as $p ) $this->max_part_len = max( $this->max_part_len, strlen($p)); // Make a new part array with duplicate strings removed $this->unique_parts = array_unique($this->parts); // Transform the pattern into a low resolution pruning string // by replacing parts with single characters $this->transf_patt = ""; reset( $this->parts ); while (list(,$p) = each($this->parts)) $this->transf_patt .= chr(array_search($p, $this->unique_parts)+ord("A")); // Self diagnostics $this->test_short_search(); } function search( $text ) { // Find all occurences of unique parts in the // full text. The result is an array: // Array( => , .. ) $part_map = Array(); reset( $this->unique_parts ); while (list($pi, $part_str) = each($this->unique_parts)) { $pos = strpos($text, $part_str); while ( $pos !== False ) { $part_map[$pos] = $pi; $pos = strpos($text, $part_str, $pos+1); } } ksort( $part_map ); // Sort by string index // The following code does several things simultaneously: // 1) Divide the indices into groups using gaps // larger than $this->max_err as boundaries. // 2) Translate the groups into strings so that // part# 0 = 'A', part# 1 = 'B' etc. to make // a low resolution approximate search possible later // 3) Save the string indices in the full string // that correspond to characters in the translated string. // 4) Discard groups (=part sequences) that are too // short to contain the approximate pattern. // The format of resulting array: // Array( // Array( "", // Array( => , ... ) ), // ... ) $transf = Array(); $transf_text = ""; $transf_pos = Array(); $last_end = 0; $group_len = 0; reset( $part_map ); while (list($i,$p) = each($part_map)) { if ( $i-$last_end > $this->max_part_len+$this->max_err ) { if ( $group_len >= ($this->n_parts-$this->max_err)) $transf[] = Array( $transf_text, $transf_pos ); $transf_text = ""; $transf_pos = Array(); $group_len = 0; } $transf_text .= chr($p + ord("A")); $transf_pos[] = $i; $group_len++; $last_end = $i + strlen($this->parts[$p]); } if ( strlen( $transf_text ) >= ($this->n_parts-$this->max_err)) $transf[] = Array( $transf_text, $transf_pos ); unset( $transf_text, $transf_pos ); if ( current($transf) === False ) return Array(); // Filter the remaining groups ("approximate anagrams" // of the pattern) and leave only the ones that have enough // parts in correct order. You can think of this last step of the // algorithm as a *low resolution* approximate string search. // The result is an array of candidate text spans to be scanned: // Array( Array(, ), ... ) $part_positions = Array(); while (list(,list($str, $pos_map)) = each($transf)) { // print "|$transf_patt| - |$str|\n"; $lores_matches = $this->search_short( $this->transf_patt, $this->max_err, $str ); while (list($tr_end, ) = each($lores_matches)) { $tr_start = max(0, $tr_end - $this->n_parts); if ( $tr_end >= $tr_start ) { $median_pos = $pos_map[ (int)(($tr_start+$tr_end)/2) ]; $start = $median_pos - ($this->patt_len/2+1) - $this->max_err - $this->max_part_len; $end = $median_pos + ($this->patt_len/2+1) + $this->max_err + $this->max_part_len; // print "#" . strtr(substr( $text, $start, $end-$start ), "\n\r", "$$") . "#\n"; // print_r( $this->search_short( &$this->patt, $this->max_err, &$text, $start, $end-$start )); $part_positions[] = Array($start, $end); } } unset( $lores_matches ); } unset( $transf ); if ( current($part_positions) === False ) return Array(); // Scan the final candidates and put the matches in a new array: $matches = Array(); $text_len = strlen($text); while (list(, list($start, $end)) = each($part_positions)) { $m = $this->search_short( $this->patt, $this->max_err, $text, $start, $end-$start, $text_len ); while (list($i, $cost) = each($m)) $matches[$i] = $cost; } unset($part_positions); return $matches; } } class smd_MLP { var $smd_strings; var $smd_owner; var $smd_prefix; var $smd_lang; var $smd_event; function smd_MLP($plug, $prefx, $strarray, $lng='en-gb', $ev='public') { $this->smd_owner = $plug; $this->smd_prefix = $prefx; $this->smd_strings = $strarray; $this->smd_lang = $lng; $this->smd_event = $ev; register_callback(array(&$this, 'smd_Callback'), 'l10n.enumerate_strings'); } function smd_Callback($event='l10n.enumerate_strings', $step='', $pre=0) { $r = array( 'owner' => $this->smd_owner, 'prefix' => $this->smd_prefix, 'lang' => $this->smd_lang, 'event' => $this->smd_event, 'strings' => $this->smd_strings, ); return $r; } // Generic lookup // $what = key to look up // $args = any arguments the key is expecting for replacement function gTxt($what, $args = array()) { global $textarray; // Prepare the prefixed key for use $key = $this->smd_prefix . '-' . $what; $key = strtolower($key); // Grab from the global textarray (possibly edited by MLP) if we can if(isset($textarray[$key])) { $str = $textarray[$key]; } else { // The string isn't in the localised textarray so fallback to using // the (non prefixed) string array in the plugin $key = strtolower($what); $str = (isset($this->smd_strings[$key])) ? $this->smd_strings[$key] : $what; } // Perform substitutions if(!empty($args)) { $str = strtr($str, $args); } return $str; } } function smd_addQSVar($url, $key, $value) { $url = smd_removeQSVar($url, $key); if (strpos($url, '?') === false) { return ($url . '?' . $key . '=' . $value); } else { return ($url . '&' . $key . '=' . $value); } } function smd_removeQSVar($url, $key) { $url = preg_replace('/(.*)(\?|&)' . $key . '=[^&]+?(&)(.*)/i', '$1$2$4', $url . '&'); $url = substr($url, 0, -1); return ($url); } // DEPRECATED: for backwards compatibility only function smd_getSubCats($parent,$cattype) { return getTree($parent,$cattype); //getTree() or getTreePath()?? } // DEPRECATED: for backwards compatibility only function smd_getAtts($str, $allowed, $idprefix="", $splitat="/(,|,\s)+/", $pregopt = PREG_SPLIT_NO_EMPTY) { return smd_getOpts($str, $allowed, $idprefix, true, $splitat, $pregopt); } function smd_getOpts($str, $allowed, $idprefix="", $allowRange=true, $splitat="/(,|,\s)+/", $pregopt = PREG_SPLIT_NO_EMPTY) { global $pretext, $thisarticle; $out = array(); $notout = array(); $matches = smd_split($str, $allowRange, $splitat, $pregopt); // An array that tells the loop what to do with each of the valid strings // arg1: type (1=exact match, 2=anywhere within string, 3=custom field) // arg2: prefix, if any // arg3: variable to substitute // arg4: extra check, if applicable // arg5: store in the in/exclude list (1=include; 2=exclude) $opt = array( "?c" => array(2, "", $pretext['c'], "1", 1), "!c" => array(2, "", $pretext['c'], "1", 2), "?s" => array(2, "", $pretext['s'], "1", 1), "!s" => array(2, "", $pretext['s'], "1", 2), "?q" => array(2, "", $pretext['q'], "1", 1), "!q" => array(2, "", $pretext['q'], "1", 2), "?t" => array(2, "", $thisarticle['url_title'], '$thisarticle!=NULL', 1), "!t" => array(2, "", $thisarticle['url_title'], '$thisarticle!=NULL', 2), "?id" => array(1, $idprefix, $pretext['id'], '$thisarticle!=NULL', 1), "!id" => array(1, $idprefix, $pretext['id'], '$thisarticle!=NULL', 2), "?" => array(3, "", "", '$thisarticle!=NULL', 1), "!" => array(3, "", "", '$thisarticle!=NULL', 2), ); for ($idx = 0; $idx < count($matches); $idx++) { $matched = false; $thismatch = $matches[$idx]; foreach ($opt as $var => $args) { $opvar = ($args[4] == 1) ? "out" : "notout"; switch ($args[0]) { case 1: if (($thismatch === $var) && in_array($var,$allowed)) { $matched = true; if ($args[2] != "" && $args[3]) { $rep = str_replace($var, $args[1].$args[2], $thismatch); if (!in_array($rep, ${$opvar})) { ${$opvar}[] = $rep; } } } break; case 2: $pat = '/\\'.$var.'$|\\'.$var.'[^A-Za-z0-9]/'; if ((is_int(smd_pregPos($pat, $thismatch, $fnd))) && in_array($var,$allowed)) { $matched = true; if ($args[2] != "" && $args[3]) { $rep = str_replace($var, $args[1].$args[2], $thismatch); if (!in_array($rep, ${$opvar})) { ${$opvar}[] = $rep; } } } break; case 3: $len = strlen($var); if ((substr($thismatch,0,$len) === $var) && in_array($var."field",$allowed)) { $matched = true; // Use the given field name; which may be a comma-separated sublist. // Split off the field name from the question mark $fieldname = substr($thismatch,$len); if (($args[3]) && (isset($thisarticle[strtolower($fieldname)]))) { $fieldContents = $thisarticle[strtolower($fieldname)]; } else { $fieldContents = $fieldname; } if (!empty($fieldContents)) { $subout = smd_split(strip_tags($fieldContents), $allowRange, $splitat, $pregopt); foreach ($subout as $subname) { if (!in_array($subname, ${$opvar})) { ${$opvar}[] = $subname; } } } } break; } if ($matched) { break; } } if (!$matched) { // Assign the variable verbatim if (!in_array($thismatch, $out)) { $out[] = $thismatch; } } } return array($out,$notout); } // DEPRECATED: for backwards compatibility only function smd_splitRange($str, $splitat="/(,|,\s)+/", $pregopt = PREG_SPLIT_NO_EMPTY) { return smd_split($str, true, $splitat, $pregopt); } function smd_split($str, $allowRange=true, $splitat="/(,|,\s)+/", $pregopt = PREG_SPLIT_NO_EMPTY) { $retarr = array(); if ((substr($splitat,0,1) == "/") && (substr($splitat, strlen($splitat)-1, 1) == "/")) { $pat = $splitat; } else { $pat = '/['.$splitat.']+/'; } $elems = preg_split($pat, $str, -1, $pregopt); foreach ($elems as $item) { $item = trim($item); $negate = false; // Does the item start with a negation character if (substr($item,0,1) === "!") { $negate = true; $item = substr($item,1); } // Is the item an integer list range if ($allowRange && preg_match('/^(\d+)\-(\d+)$/', $item)) { list($lo, $hi) = explode("-", $item, 2); $rng = range($lo, $hi); // Reapply the negation if necessary for($idx = 0; $idx < count($rng); $idx++) { $rng[$idx] = (($negate) ? "!" : "") . $rng[$idx]; } $retarr = array_merge($retarr, $rng); } else { $retarr[] = (($negate) ? "!" : "") . $item; } } return $retarr; } // Stolen from php.net: strpos page comments... function smd_pregPos($sPattern, $sSubject, &$FoundString, $iOffset = 0) { $FoundString = null; if (preg_match($sPattern, $sSubject, $aMatches, PREG_OFFSET_CAPTURE, $iOffset) > 0) { $FoundString = $aMatches[0][0]; return $aMatches[0][1]; } else { return false; } } //... and array_combine... if (!function_exists("array_combine")) { function array_combine($arr1,$arr2) { $out = array(); foreach($arr1 as $key1 => $value1) { $out[$value1] = $arr2[$key1]; } return $out; } } //... and htmlspecialchars_decode if (!function_exists("htmlspecialchars_decode")) { function htmlspecialchars_decode($string, $quote_style = ENT_COMPAT) { return strtr($string, array_flip(get_html_translation_table(HTML_SPECIALCHARS, $quote_style))); } } function smd_getWord($haystack,$searchterm,$offset=0) { $numwords = str_word_count($searchterm); $haystack = strrev(substr($haystack,0,$offset+1)); $spacePos = 1; for ($idx = 0; $idx < $numwords; $idx++) { $spacePos = (strpos($haystack, " ", $spacePos))+1; } return trim(strrev(substr($haystack, 0, $spacePos))); }