# # lexical database and tagger. *pHEAR* # sub loadModelObject { local('$handle $d @r'); $handle = openf("models/ $+ $1"); while $d (readObject($handle)) { push(@r, $d); } closef($handle); return iff(size(@r) == 1, @r[0], @r); } # @(@(word, tag, score), ...) sub isDifferent { local('$word $tag $score $option $_score $_tag %counts'); foreach $option ($1) { ($word, $tag, $score) = $option; %counts[$tag] += 1; # iff($score > 0.0, 1, 0); if (%counts[$tag] > 1) { return $null; } } return 1; } # tagAll(pre1, pre2, prevword, @(word1, word2, ...)) sub tagAll { local('$option $result $score @r'); # warn(@_); foreach $option ($4) { $result = tagSingle($1, $2, $3, $option); $score = scoreTagSequence($1, $2, $result); # $score += scoreTagAssociation($option, $result); # $score /= 2; push(@r, @($option, $result, $score)); } return @r; } sub getBest { local('$x'); $x = iff(size($1) > 1, reduce({ return iff($1[2] > $2[2], $1, $2); }, $1), $1[-1]); # warn("Best for $1 is: $x"); return $x; } sub initTaggerModels { global('$endings $lexdb $trigrams $trigramsr $bywords'); $endings = loadModelObject("endings.bin"); $lexdb = loadModelObject("lexicon.bin"); ($trigrams, $trigramsr) = loadModelObject("trigrams.bin"); # $bywords = loadModelObject("bywords.bin"); } sub scoreTags { local('$pre2 $pre1 $tag $base'); ($pre2, $pre1, $tag) = @_; if ($pre2 in $trigrams && $pre1 in $trigrams[$pre2]) { $base = $trigrams[$pre2][$pre1]; return iff ($tag in $base, $base[$tag], 0.0); } else { return 0.0; } } sub endsWith { return iff(strlen($1) >= strlen($2) && right($1, strlen($2)) eq $2); } sub beginsWith { return iff(strlen($1) >= strlen($2) && left($1, strlen($2)) eq $2); } sub past { # a regex to check if a word is a past participle or not. return '\w+ed|awoken|borne|beaten|become|begun|bent|bet|bitten|bled|blown|broken|bred|brought|built|burnt|burst|bought|caught|chosen|come|cost|cut|dealt|done|drawn|dreamt|drunk|driven|eaten|made|meant|met|paid|put|quit|read|ridden|rung|risen|run|said|seen|sought|sold|sent|set|shaken|shone|shot|shown|shut|sung|sunk|sat|slept|smelt|spoken|spent|spilt|spoilt|spread|stood|stolen|stuck|stung|stunk|struck|sworn|swum|taken|taught|torn|told|thought|thrown|understood|woken|worn|wept|won|written'; } sub fixSingleTag { local('$word $tag $pre1 $tag1'); ($word, $tag, $pre1, $tag1) = @_; # rule 1 : DT, {VBD | VBP} --> DT, NN if ($tag1 eq "DT" && ($tag eq "VBD" || $tag eq "VBP" || $tag eq "VB")) { return "NN"; } # rule 2: convert a noun to a number (CD) if "." appears in the word else if ([$tag startsWith: "N"] && -isnumber $word) { return "CD"; } # rule 3: convert a noun to a past participle if else if ([$tag startsWith: "N"] && [$word endsWith: "ed"]) { return "VBN"; } # rule 4: convert any type to adverb if it ends in "ly" else if ([$word endsWith: "ly"]) { return "RB"; } # rule 5: convert a common noun (NN or NNS) to a adjective if it ends with "al" else if ([$tag startsWith: "NN"] && [$word endsWith: "al"]) { return "JJ"; } # rule 6: convert a noun to a verb if the preceeding work is "would" else if ([$tag startsWith: "NN"] && (lc($pre1) eq "would")) # || $tag1 eq "TO")) { return "VB"; } # rule 7: if a word has been categorized as a common noun and it ends with "s", # then set its type to plural common noun (NNS) else if ($tag eq "NN" && [$word endsWith: "s"]) { return "NNS"; } # rule 8: convert a common noun to a present prticiple verb (i.e., a gerand) else if ([$tag startsWith: "NN"] && [$word endsWith: "ing"]) { return "VBG"; } # rule 9: give punctuation its own tag. else if ($word isin "-,()[];:/--") { return ","; } return $tag; } sub findWordTag { local('$base $wordtags $rtag $key $value $temp $result'); ($base, $wordtags) = @_; $rtag = -1.0; $result = $null; foreach $key => $value ($wordtags) { $temp = iff($key in $base, $base[$key], 0.00000000000000001) * $value; if ($temp > $rtag && $key ne "") { $rtag = $temp; $result = "$key"; } } if ($result is $null) { warn("Null: " . @_); return 'NN'; } return $result; } sub taggerWithLexProb { local('@results $word $pre2 $pre1 $count $wordl'); foreach $count => $word ($1) { $wordl = lc($word); local('%base'); %base = ohash(); setMissPolicy(%base, { return 1.0; }); if ($wordl !in $lexdb) { if (-isupper charAt($word, 0) && $pre1 eq "") { push(@results, @($word, "NN")); } else if (strlen($word) >= 3 && right($wordl, 3) in $endings) { push(@results, @($word, findWordTag(%base, $endings[right($wordl, 3)]) )); } else { push(@results, @($word, "NN")); } # @results[-1][1] = fixSingleTag($word, @results[-1][1], iff($pre1 eq "", "", @results[-2][0]), $pre1); } else if (size($lexdb[$wordl]) >= 1) { push(@results, @($word, findWordTag(%base, $lexdb[$wordl]))); } else { push(@results, @($word, "NN")); # @results[-1][1] = fixSingleTag($word, @results[-1][1], iff($pre1 eq "", "", @results[-2][0]), $pre1); } } return @results; } sub scoreWordTagFit { warn("scoreWordTagFit: " . @_); warn("scoreTagSequence( $+ $1 $+ , $2 $+ , $3 $+ ) = [".scoreTagSequence($1, $2, $3)."] * scoreTagAssociation( $+ $3 $+ , $4 $+ ) = [".scoreTagAssociation($3, $4)."]"); return scoreTagSequence($1, $2, $3) * scoreTagAssociation($3, $4); } sub scoreTagSequence { local('$pre2 $pre1 $tag $base'); ($pre2, $pre1, $tag) = @_; if ($pre2 in $trigrams && $pre1 in $trigrams[$pre2]) { $base = $trigrams[$pre2][$pre1]; if ($tag in $base) { return $base[$tag]; } } return 0.0; } sub scoreTagAssociation { local('$word $tag $wordl $end'); ($word, $tag) = @_; $wordl = lc($word); if ($wordl !in $lexdb) { $end = iff(strlen($wordl) > 3, right($wordl, 3), $wordl); if ($end in $endings && $tag in $endings[$end]) { return $endings[$end][$tag]; } } else if ($tag in $lexdb[$wordl]) { return $lexdb[$wordl][$tag]; } #warn("Assoc: $word -> $tag has nothing!"); return 0.0; } # pre2, pre1, last, word sub tagSingle { local('$pre2 $pre1 $word %base $last $wordl $result'); ($pre2, $pre1, $last, $word) = @_; $wordl = lc($word); if ($pre2 !in $trigrams || $pre1 !in $trigrams[$pre2]) { %base = ohash(); setMissPolicy(%base, { return 1.0; }); } else { %base = $trigrams[$pre2][$pre1]; } if ($wordl !in $lexdb) { if (-isupper charAt($word, 0) && $pre1 eq "") { $result = 'NN'; } else if (strlen($word) >= 3 && right($wordl, 3) in $endings) { $result = findWordTag(%base, $endings[right($wordl, 3)]); } else { $result = 'NN'; } $result = fixSingleTag($word, $result, $last, $pre1); } else if (size($lexdb[$wordl]) >= 1) { $result = findWordTag(%base, $lexdb[$wordl]); } else { $result = fixSingleTag($word, 'NN', $last, $pre1); } return $result; } # use trigrams to predict appropriate current word and brill strategy for unknown words sub taggerWithTrigrams { local('@results $word $pre2 $pre1 $count $wordl'); ($pre2, $pre1) = ""; foreach $count => $word ($1) { if (strlen($word) == 0) { warn("Broken: ' $+ $word $+ ' @ $count of $1"); } push(@results, @($word, tagSingle($pre2, $pre1, iff($pre1 eq "", "", @results[-1][0]), $word))); assert @results[-1][1] ne "" : "Eh?!? " . sublist(@_, 1) . " $word and " . @results; $pre2 = $pre1; $pre1 = @results[-1][1]; } return @results; } sub taggerToString { return join(" ", map({ return join("/", $1); }, $1)); }