// Copyright 2006-2007 Deutsches Forschungszentrum fuer Kuenstliche Intelligenz // or its licensors, as applicable. // // You may not use this file except under the terms of the accompanying license. // // Licensed under the Apache License, Version 2.0 (the "License"); you // may not use this file except in compliance with the License. You may // obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Project: // File: // Purpose: // Responsible: tmb // Reviewer: // Primary Repository: // Web Sites: www.iupr.org, www.dfki.de #include #include #include #include #ifdef GOOGLE_INTERNAL #include "nlp/fst/lib/fst-decl.h" #include "nlp/fst/lib/fst-inl.h" #include "nlp/fst/lib/fstlib-inl.h" #else #include "fst/lib/fst.h" #include "fst/lib/fstlib.h" #endif #undef CHECK #include "colib.h" #include "fstbuilder.h" #include "fstutil.h" #include "fstmodels.h" #define EPSILON 0 #define STAR kSigmaLabel #define REST kRhoLabel using namespace fst; using namespace colib; namespace ocropus { // General purpose minimization function; this uses lazy versions // and then finally performs the minimization. static StdVectorFst *fst_minimize(autodel &composition,bool rmeps,bool det,bool min) { autodel > epsfree; if(rmeps) epsfree = new RmEpsilonFst(*composition); else epsfree = composition.move(); autodel > determinization; if(det) determinization = new DeterminizeFst(*epsfree); else determinization = epsfree.move(); autodel result(new StdVectorFst(*determinization)); Minimize(result.ptr()); return result.move(); } // Create an fst that ignores the given set of symbols on its input. StdVectorFst *fst_ignoring(intarray &a,int maxsymbol,int minsymbol) { CHECK_ARG(minsymbol>0 && minsymbol0 && maxsymbol<10000000); autodel fst; fst = new StdVectorFst(); int start = fst->AddState(); fst->SetStart(start); fst->SetFinal(start,0.0); narray used(maxsymbol); fill(used,0); for(int i=0;iAddArc(start,StdArc(a[i],EPSILON,0.0,start)); } for(int i=minsymbol;iAddArc(start,StdArc(i,i,0.0,start)); } Verify(*fst); return fst.move(); } // Create an fst that keeps only the given set of symbols on its // input and deletes the rest. StdVectorFst *fst_keeping(intarray &a,int maxsymbol,int minsymbol) { CHECK_ARG(minsymbol>0 && minsymbol0 && maxsymbol<10000000); autodel fst; fst = new StdVectorFst(); int start = fst->AddState(); fst->SetStart(start); fst->SetFinal(start,0.0); narray used(maxsymbol); fill(used,0); for(int i=0;iAddArc(start,StdArc(a[i],a[i],0.0,start)); } for(int i=minsymbol;iAddArc(start,StdArc(i,EPSILON,0.0,start)); } Verify(*fst); return fst.move(); } // Create an fst that permits insertions and deletions of // arbitrary symbols with the given costs. StdVectorFst *fst_insdel(float ins,float del,int maxsymbol,int minsymbol) { CHECK_ARG(minsymbol>0 && minsymbol0 && maxsymbol<10000000); autodel fst; fst = new StdVectorFst(); int start = fst->AddState(); fst->SetStart(start); fst->SetFinal(start,0.0); for(int i=minsymbol;iAddArc(start,StdArc(i,i,0.0,start)); fst->AddArc(start,StdArc(EPSILON,i,ins,start)); fst->AddArc(start,StdArc(i,EPSILON,del,start)); } Verify(*fst); return fst.move(); } // Create an fst that permits limited insertions and deletions. StdVectorFst *fst_limited_edit_distance(int maxins,float ins,int maxdel,float del,int maxsymbol,int minsymbol) { CHECK_ARG(minsymbol>0 && minsymbol0 && maxsymbol<10000000); autodel fst; fst = new StdVectorFst(); int start = fst->AddState(); fst->SetStart(start); fst->SetFinal(start,0.0); for(int r=minsymbol;rAddArc(start,StdArc(r,r,0.0,start)); int last = start; for(int i=0;iAddState(); // add the transition from the last state to the current; // if the last state was the start state, make it a non-eps // transition for(int r=minsymbol;rAddArc(last,StdArc(i==0?r:EPSILON,r,ins,current)); // add the eps:a transition that returns to the start state for(int r=minsymbol;rAddArc(current,StdArc(EPSILON,r,ins,start)); last = current; current = fst->AddState(); } last = start; for(int i=0;iAddState(); // add the transition from the last state to the current; // if the last state was the start state, make it a non-eps // transition for(int r=minsymbol;rAddArc(last,StdArc(r,i==0?r:EPSILON,del,current)); // add the eps:a transition that returns to the start state for(int r=minsymbol;rAddArc(current,StdArc(r,EPSILON,del,start)); last = current; current = fst->AddState(); } Verify(*fst); return fst.move(); } // Create an fst that corresponds to edit distance. StdVectorFst *fst_edit_distance(float subst,float ins,float del,int maxsymbol,int minsymbol) { CHECK_ARG(minsymbol>0 && minsymbol0 && maxsymbol<10000000); autodel fst; fst = new StdVectorFst(); int start = fst->AddState(); fst->SetStart(start); fst->SetFinal(start,0.0); for(int i=minsymbol;iAddArc(start,StdArc(i,j,cost,start)); } fst->AddArc(start,StdArc(EPSILON,i,ins,start)); fst->AddArc(start,StdArc(i,EPSILON,del,start)); } Verify(*fst); return fst.move(); } // Create an fst that permits strings in the given size range. StdVectorFst *fst_size_range(int minsize,int maxsize,int maxsymbol,int minsymbol) { autodel fst; fst = new StdVectorFst(); int start = fst->AddState(); fst->SetStart(start); int current = start; for(int i=0;i=minsize) fst->SetFinal(current,0.0); int next = fst->AddState(); for(int j=minsymbol;jAddArc(current,StdArc(j,j,0.0,next)); } current = next; } fst->SetFinal(current,0.0); return fst.move(); } // Create a statistical unigram model. struct UnigramModelImpl : UnigramModel { int start; autodel fst; UnigramModelImpl() { clear(); } void clear() { fst = new StdVectorFst(); start = fst->AddState(); fst->SetStart(start); fst->SetFinal(start,0.0); Verify(*fst); } void addSymbol(int input,int output,float cost) { check_valid_symbol(input); check_valid_symbol(output); fst->AddArc(start,StdArc(input,output,cost,start)); } StdVectorFst *take() { Verify(*fst); return fst.move(); } }; UnigramModel *make_UnigramModel() { return new UnigramModelImpl(); } // Create an fst corresponding to a dictionary. struct DictionaryModelImpl : DictionaryModel { int start; // FIXME change this to use the ICharLattice/IGenericFst interface autodel fst; DictionaryModelImpl() { clear(); } void clear() { fst = new StdVectorFst(); start = fst->AddState(); fst->SetStart(start); } void addWord(intarray &w,float cost) { int n = w.length(); CHECK_ARG(n>0); int current = start; for(int i=0;iAddState(); fst->AddArc(current,StdArc(w[i],w[i],arc_cost,next)); current = next; } fst->SetFinal(current,0.0); } void addWordSymbol(intarray &w,int output,float cost) { int n = w.length(); CHECK_ARG(n>0); int current = start; for(int i=0;iAddState(); int output_symbol = (i==n-1)?output:EPSILON; fst->AddArc(current,StdArc(w[i],output_symbol,arc_cost,next)); current = next; } fst->SetFinal(current,0.0); } void addWordTranscription(intarray &input,intarray &output,float cost) { int n = input.length(); CHECK_ARG(n>0); int current = start; for(int i=0;iAddState(); int input_symbol = (iAddArc(current,StdArc(input_symbol,output_symbol,arc_cost,next)); current = next; } fst->SetFinal(current,0.0); } // convenience methods for UTF8 input void addWord(const char *w,float cost=0.0) { intarray a; utf8_decode(a,w); addWord(a,cost); } void addWordSymbol(const char *w,int output,float cost=0.0) { intarray a; utf8_decode(a,w); for(int i=0;i fst; intarray keys; floatarray costs; intarray states; bytearray done; // ngrams are in reading order, with the last element conditioned on the previous ones // use 0 for beginning and end of string void addNgram(intarray &ngram,float cost) { intarray key; reverse(key,ngram); rowpush(keys,key); costs.push(cost); } void addNgram(const char *ngram,float cost) { intarray a; utf8_decode(a,ngram); for(int i=0;iAddState(); fst->SetStart(start); states.resize(costs.length()); for(int i=0;iAddState(); int number_of_start_states = 0; int number_of_final_states = 0; for(int i=0;iSetFinal(states(i),0.0); number_of_final_states++; } remove_left(prefix,1); if(prefix(0)==0) { // start states have no inputs other than 0 fst->AddArc(start,StdArc(EPSILON,EPSILON,0.0,states(i))); // no further arcs going into this state number_of_start_states++; continue; } intarray matching; rowprefixselect(matching,keys,prefix); for(int j=0;jAddArc(states(k),StdArc(keys(k,0),keys(k,0),costs(k),states(i))); } } // make sure we actually got start and final states; this doesn't guarantee // connectivity, but it guards against the most common errors ASSERTWARN(number_of_start_states>0); ASSERTWARN(number_of_final_states>0); } StdVectorFst *take() { construct(); Verify(*fst); fst_minimize(fst,true,true,true); return fst.move(); } ~NgramModelImpl() {} }; NgramModel *make_NgramModel() { return new NgramModelImpl(); } }