package String::EditDistance; use strict; use vars qw($VERSION); $VERSION = '0.01'; sub new($$$;@) { my ($class, $str1, $str2) = (shift, shift, shift); my %costs = ( 'insert'=>1, 'mutate'=>1, @_ ); $class = ref ($class) || $class; my ( $dist, $comm ); if ( exists $costs{'bound'} ) { ( $dist, $comm )= _edit_dist_bound($str1, $str2, $costs{'insert'}, $costs{'mutate'}, $costs{'bound'}); } else { ( $dist, $comm )= _edit_dist($str1, $str2, $costs{'insert'}, $costs{'mutate'}); } bless { "dist" => $dist, "comm" => $comm, }, $class; } sub distance($) { return $_[0]->{'dist'}; } sub commonality($) { return $_[0]->{'comm'}; } sub _edit_dist($$$$) { my ( $str1, $str2, $insdel, $mutate ) = @_; my ($i, $j); my ($left, $right); # ORDER STRINGS: length str1 > length str2 my $len1 = length $str1; my $len2 = length $str2; ($str2, $str1, $len2, $len1) = ($str1, $str2, $len1, $len2) if $len1 < $len2; # EXTRACT COMMON PREFIX for ($left=0; $left<$len2; $left++) { last if substr($str1,$left,1) ne substr($str2,$left,1); } # CREATE EDIT MATRIX my @dist = ( [], [] ); $#{$dist[0]} = $len2-$left; $#{$dist[1]} = $len2-$left; for ($j=0; $j<=$len2-$left; $j++) { $dist[0][$j] = $j*$insdel; } # LOOP VARIABLES my ($nw, $ne, $sw, $se, $mutcost, $modi, $modip1, $chari, $alt); # FOR EACH LETTER $chari IN $str1.... for ($i=0; $i<$len1-$left; $i++) { $chari = substr($str1,$i+$left,1); # INITIALIZE LOCAL CONTEXT $modi = $i & 1; $modip1 = $modi ? 0 : 1; $nw = $dist[$modi][0]; $ne = $dist[$modip1][0] = ($i+1) * $insdel; # FOR EACH LETTER IN $str2... for ($j=0; $j<$len2-$left; $j++) { # INITIALIZE REST OF LOCAL CONTEXT $sw = $dist[$modi][$j+1]; # FIND MINIMUM OF INSERTION/DELETION AND MUTATION COSTS $se = $sw + $insdel; $alt = $ne + $insdel; $se = $alt if $alt < $se; if ($nw < $se) { $nw += ($chari ne substr($str2,$j+$left,1)) ? $mutate : 0; $se = $nw if $nw < $se; } # CACHE RESULT AND MOVE DOWN $dist[$modip1][$j+1] = $se; $ne = $se; $nw = $sw; } } # RETURN EDIT DISTANCE, COMMONALITY my $edist = $dist[$i&1][$len2-$left]; return ( $edist, 1 - $edist / $len1); } sub _edit_dist_bound($$$$$) { my ( $str1, $str2, $insdel, $mutate, $bound ) = @_; my ($i, $j); # ORDER STRINGS: length str1 > length str2 my $len1 = length $str1; my $len2 = length $str2; ($str2, $str1, $len2, $len1) = ($str1, $str2, $len1, $len2) if $len1 < $len2; # COMPUTE BOUND IN ABSOLUTE EDIT OPS $bound = $len1 * (1-$bound) if $bound<1; # CREATE EDIT MATRIX my @dist = ( [], [] ); $#{$dist[0]} = $len2; $#{$dist[1]} = $len2; for ($j=0; $j<=$len2; $j++) { $dist[0][$j] = $j*$insdel; } # LOOP VARIABLES my ($nw, $ne, $sw, $se, $mutcost, $modi, $modip1, $chari, $alt, $min); # FOR EACH LETTER $chari IN $str1.... for ($i=0; $i<$len1; $i++) { $chari = substr($str1,$i,1); # INITIALIZE LOCAL CONTEXT $modi = $i & 1; $modip1 = $modi ? 0 : 1; $nw = $dist[$modi][0]; $ne = $dist[$modip1][0] = ($i+1) * $insdel; $min = $ne; # FOR EACH LETTER IN $str2... for ($j=0; $j<$len2; $j++) { # INITIALIZE REST OF LOCAL CONTEXT $sw = $dist[$modi][$j+1]; # FIND MINIMUM OF INSERTION/DELETION AND MUTATION COSTS $se = $sw + $insdel; $alt = $ne + $insdel; $se = $alt if $alt < $se; if ($nw < $se) { $nw += ($chari ne substr($str2,$j,1)) ? $mutate : 0; $se = $nw if $nw < $se; } # CACHE RESULT AND MOVE DOWN $dist[$modip1][$j+1] = $se; $ne = $se; $nw = $sw; $min = $se if $min > $se; } # SHORT-CIRCUIT IF BOUND EXCEEDED if ($min > $bound) { return ( $min, 1 - $min/$len1 ); } } # RETURN EDIT DISTANCE, COMMONALITY my $edist = $dist[$i&1][$len2]; return ( $edist, 1 - $edist / $len1); } 1; __END__