Edinburgh Speech Tools 2.4-release
EST_WFST.h
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : November 1997 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* Weighted Finite State Transducers */
38/* */
39/*=======================================================================*/
40#ifndef __EST_WFST_H__
41#define __EST_WFST_H__
42
43#include "EST_simplestats.h"
44#include "EST_rw_status.h"
45#include "EST_Option.h"
46#include "EST_TList.h"
47#include "EST_TVector.h"
48#include "EST_THash.h"
49#include "siod.h"
50#define wfst_error_msg(WMESS) (cerr << WMESS << endl,siod_error())
51
52#define WFST_ERROR_STATE -1
53
54class EST_WFST_State;
55class EST_WFST;
56
57/** an internal class for \Ref{EST_WFST} for representing transitions
58 in an WFST
59 */
61 private:
62 float p_weight;
63 int p_state;
64 int p_in_symbol;
65 int p_out_symbol;
66 public:
69 { p_weight=t.p_weight; p_state=t.p_state;
70 p_in_symbol = t.p_in_symbol; p_out_symbol=t.p_out_symbol; }
71 EST_WFST_Transition(float w, int s, int i, int o)
72 { p_weight=w; p_state=s; p_in_symbol=i; p_out_symbol=o;}
74 float weight() const { return p_weight; }
75 int state() const { return p_state; }
76 int in_symbol() const { return p_in_symbol; }
77 int out_symbol() const { return p_out_symbol; }
78 void set_weight(float f) { p_weight = f; }
79 void set_state(int s) { p_state = s; }
80
81};
83
84enum wfst_state_type {wfst_final, wfst_nonfinal, wfst_error, wfst_licence};
85/** I'd like to use the enums but I need to binary read/write them **/
86/** I don't believe that's portable so we need to have ints for these **/
87#define WFST_FINAL 0
88#define WFST_NONFINAL 1
89#define WFST_ERROR 2
90#define WFST_LICENCE 3
91
92
93/** an internal class for \Ref{EST_WFST} used to represent a
94 state in a WFST
95*/
97 private:
98 int p_name;
99 enum wfst_state_type p_type;
100 int p_tag; // for marking in traversing
101 public:
102 wfst_translist transitions;
103
104 EST_WFST_State(int name);
105 EST_WFST_State(const EST_WFST_State &state);
107
108 EST_WFST_Transition *add_transition(float w,
109 int end,
110 int in,
111 int out);
112 int name() const { return p_name; }
113 int num_transitions() const { return transitions.length(); }
114 enum wfst_state_type type() const { return p_type; }
115 void set_type(wfst_state_type t) { p_type = t; }
116 void set_tag(int v) { p_tag = v;}
117 int tag() const { return p_tag;}
118};
120
122enum wfst_mstate_type {wfst_ms_set, wfst_ms_list};
123
124/** an internal class to \Ref{EST_WFST} used in holding multi-states
125 when determinizing and find the intersections of other WFSTs.
126 */
128 private:
129 int p_name;
130 float p_weight;
131 enum wfst_mstate_type p_type;
132 public:
134 { p_name = -1; p_weight = 0.0; p_type = wfst_ms_set; }
135 EST_WFST_MultiState(enum wfst_mstate_type ty) : EST_IList()
136 { p_name = -1; p_weight = 0.0; p_type = ty; }
137 int name() const { return p_name; }
138 void set_name(int i) { p_name = i; }
139 float weight() const { return p_weight; }
140 void set_weight(float w) { p_weight = w; }
141 void set_type(enum wfst_mstate_type s) { p_type = s; }
142 enum wfst_mstate_type type() const { return p_type; }
143 void add(int i);
144};
145
146int multistate_index(EST_WFST_MultiStateIndex &i,EST_WFST_MultiState *ms);
147
148/** a call representing a weighted finite-state transducer
149 */
150class EST_WFST {
151 private:
152 EST_Discrete p_in_symbols;
153 EST_Discrete p_out_symbols;
154 int p_start_state;
155 int current_tag;
156 int p_num_states;
157 int p_cumulate;
158 wfst_state_vector p_states;
159
160 int operator_and(LISP l);
161 int operator_or(LISP l);
162 int operator_star(LISP l);
163 int operator_plus(LISP l);
164 int operator_optional(LISP l);
165 int operator_not(LISP l);
166 int terminal(LISP l);
167 EST_WFST_State *copy_and_map_states(const EST_IVector &state_map,
168 const EST_WFST_State *s,
169 const EST_WFST &b) const;
170 void extend_alphabets(const EST_WFST &b);
171 int deterministiconstartstates(const EST_WFST &a, const EST_WFST &b) const;
172 EST_read_status load_transitions_from_lisp(int s, LISP trans);
173 void more_states(int new_max);
174
175 int can_reach_final(int state);
176 static int traverse_tag;
177 public:
178 /**@name Constructor and initialisation functions */
179 //@{
180 /// ?
181 EST_WFST();
182 /// ?
183 EST_WFST(const EST_WFST &wfst) { p_num_states = 0; copy(wfst); }
184 ~EST_WFST();
185 //@}
186
187 /**@name Reseting functions */
188 //@{
189 /// Clear with (estimation of number of states required)
190 void init(int init_num_states=10);
191 /// clear an initialise with given input and out alphabets
192 void init(LISP in, LISP out);
193 /// Copy from existing wfst
194 void copy(const EST_WFST &wfst);
195 /// clear removing existing states if any
196 void clear();
197 //@}
198
199 /**@name General utility functions */
200 //@{
201 int num_states() const { return p_num_states; }
202 int start_state() const { return p_start_state; }
203 /// Map input symbol to input alphabet index
204 int in_symbol(const EST_String &s) const
205 { return p_in_symbols.name(s); }
206 /// Map input alphabet index to input symbol
207 const EST_String &in_symbol(int i) const
208 { return p_in_symbols.name(i); }
209 /// Map output symbol to output alphabet index
210 int out_symbol(const EST_String &s) const
211 { return p_out_symbols.name(s); }
212 /// Map output alphabet index to output symbol
213 const EST_String &out_symbol(int i) const
214 { return p_out_symbols.name(i); }
215 /// LISP for on epsilon symbols
216 LISP epsilon_label() const { return rintern("__epsilon__"); }
217 /// Internal index for input epsilon
218 int in_epsilon() const { return p_in_symbols.name("__epsilon__"); }
219 /// Internal index for output epsilon
220 int out_epsilon() const { return p_out_symbols.name("__epsilon__"); }
221 /// Return internal state information
222 const EST_WFST_State *state(int i) const { return p_states(i); }
223 /// Return internal state information (non-const)
224 EST_WFST_State *state_non_const(int i) { return p_states(i); }
225 /// True if state {\tt i} is final
226 int final(int i) const
227 { return ((i != WFST_ERROR_STATE) && (state(i)->type() == wfst_final));}
228 /// Accessing the input alphabet
229 const EST_Discrete &in_symbols() const { return p_in_symbols; }
230 /// Accessing the output alphabet
231 const EST_Discrete &out_symbols() const { return p_out_symbols; }
232
233 //@}
234
235 /**@name file i/o */
236 //@{
237 /// ?
238 EST_write_status save(const EST_String &filename,
239 const EST_String type = "ascii");
240 EST_write_status save_binary(FILE *fd);
241 /// ?
242 EST_read_status load(const EST_String &filename);
243
244 EST_read_status load_binary(FILE *fd,
245 EST_Option &hinfo,
246 int num_states,
247 int swap);
248 //@}
249
250 /**@name transduction functions */
251 //@{
252 /// Find (first) new state given in and out symbols
253 int transition(int state,int in, int out) const;
254 int transition(int state,int in, int out, float &prob) const;
255 /// Find (first) transition given in and out symbols
256 EST_WFST_Transition *find_transition(int state,int in, int out) const;
257 /// Find (first) new state given in and out strings
258 int transition(int state,const EST_String &in,const EST_String &out) const;
259 /// Find (first) new state given in/out string
260 int transition(int state,const EST_String &inout) const;
261 /// Transduce in to out from state
262 int transduce(int state,int in,int &out) const;
263 /// Transduce in to out (strings) from state
264 int transduce(int state,const EST_String &in,EST_String &out) const;
265 /// Transduce in to list of transitions
266 void transduce(int state,int in,wfst_translist &out) const;
267 /// Find all possible transitions for given state/input/output
268 void transition_all(int state,int in, int out,
269 EST_WFST_MultiState *ms) const;
270
271 //@}
272
273 /**@name Cumulation functions for adding collective probabilities
274 for transitions from data */
275 //@{
276 /// Cumulation condition
277 int cumulate() const {return p_cumulate;}
278 /// Clear and start cumulation
279 void start_cumulate();
280 /// Stop cumulation and calculate probabilities on transitions
281 void stop_cumulate();
282 //@}
283
284 /**@name WFST construction functions from external representations **/
285 //@{
286 /// Add a new state, returns new name
287 int add_state(enum wfst_state_type state_type);
288 /// Given a multi-state return type (final, ok, error)
289 enum wfst_state_type ms_type(EST_WFST_MultiState *ms) const;
290
291 /// Basic regex constructor
292 void build_wfst(int start, int end,LISP regex);
293 /// Basic conjunction constructor
294 void build_and_transition(int start, int end, LISP conjunctions);
295 /// Basic disjunction constructor
296 void build_or_transition(int start, int end, LISP disjunctions);
297
298 // from standard REGEX
299 void build_from_regex(LISP inalpha, LISP outalpha, LISP regex);
300 // Kay/Kaplan/Koskenniemi rule compile
301 void kkrule_compile(LISP inalpha, LISP outalpha, LISP fp,
302 LISP rule, LISP sets);
303 // Build from regular (or pseudo-CF) grammar
304 void build_from_rg(LISP inalpha, LISP outalpha,
305 LISP distinguished, LISP rewrites,
306 LISP sets, LISP terms,
307 int max_depth);
308 // Build simple tree lexicon
309 void build_tree_lex(LISP inalpha, LISP outalpha,
310 LISP wlist);
311 //@}
312
313 /**@name Basic WFST operators */
314 //@{
315 /// Build determinized form of a
316 void determinize(const EST_WFST &a);
317 /// Build minimized form of a
318 void minimize(const EST_WFST &a);
319 /// Build complement of a
320 void complement(const EST_WFST &a);
321 /** Build intersection of all WFSTs in given list. The new WFST
322 recognizes the only the strings that are recognized by all WFSTs
323 in the given list */
325 /** Build intersection of WFSTs a and b The new WFST
326 recognizes the only the strings that are recognized by both a
327 and b list */
328 void intersection(const EST_WFST &a, const EST_WFST &b);
329 /** Build union of all WFSTs in given list. The new WFST
330 recognizes the only the strings that are recognized by at least
331 one WFSTs in the given list */
333 /** Build union of WFSTs a and b. The new WFST
334 recognizes the only the strings that are recognized by either
335 a or b */
336 void uunion(const EST_WFST &a, const EST_WFST &b);
337 /** Build new WFST by composition of a and b. Outputs of a are
338 fed to b, given a new WFSTs which has a's input language and b's
339 output set. a's output and b's input alphabets must be the same */
340 void compose(const EST_WFST &a,const EST_WFST &b);
341 /** Build WFST that accepts only strings in a that aren't also accepted
342 by strings in b. */
343 void difference(const EST_WFST &a,const EST_WFST &b);
344 /** Build WFST that accepts a language that consists of any string in
345 a followed by any string in b **/
346 void concat(const EST_WFST &a,const EST_WFST &b);
347 //@}
348
349 /**@name construction support functions */
350 //@{
351 /// True if WFST is deterministic
352 int deterministic() const;
353 /// Transduce a multi-state given n and out
356 int in, int out) const;
357 /// Extend multi-state with epsilon reachable states
359 /// Remove error states from the WFST.
360 void remove_error_states(const EST_WFST &a);
361
362 EST_String summary() const;
363 /// ?
364 EST_WFST & operator = (const EST_WFST &a) { copy(a); return *this; }
365};
367
368// The basic operations on WFST
369void minimize(EST_WFST &a,EST_WFST &b);
370void determinize(EST_WFST &a,EST_WFST &b);
371void concat(EST_WFST &a,EST_WFST &b,EST_WFST &c);
372void compose(EST_WFST &a,EST_WFST &b,EST_WFST &c);
373void intersect(wfst_list &wl,EST_WFST &wfst);
374void complement(EST_WFST &a,EST_WFST &b);
375void difference(EST_WFST &a,EST_WFST &b,EST_WFST &c);
376void concatenate(EST_WFST &a,EST_WFST &b,EST_WFST &c);
377
378// Compile a set of KK rules
379void kkcompile(LISP ruleset, EST_WFST &all_wfst);
380// Compile a set of LTS rules
381void ltscompile(LISP lts_rules, EST_WFST &all_wfst);
382// Compile a regular grammar
383void rgcompile(LISP rg, EST_WFST &all_wfst);
384// Compile a tree lexicon
385void tlcompile(LISP rg, EST_WFST &all_wfst);
386
387// Transduction and recognition functions
388int transduce(const EST_WFST &wfst,const EST_StrList &in,EST_StrList &out);
389int transduce(const EST_WFST &wfst,const EST_IList &in,EST_IList &out);
390int recognize(const EST_WFST &wfst,const EST_StrList &in,int quiet);
391int recognize(const EST_WFST &wfst,const EST_IList &in,
392 const EST_IList &out,int quite);
393int recognize_for_perplexity(const EST_WFST &wfst,
394 const EST_StrList &in,
395 int quiet,
396 float &count,
397 float &sumlogp);
398int recognize_for_perplexity(const EST_WFST &wfst,
399 const EST_IList &in,
400 const EST_IList &out,
401 int quiet,
402 float &count,
403 float &sumlogp);
404
405VAL_REGISTER_CLASS_DCLS(wfst,EST_WFST)
406
407#endif
const EST_String & name(const int n) const
The name given the index.
void difference(const EST_WFST &a, const EST_WFST &b)
Definition: wfst_ops.cc:898
void start_cumulate()
Clear and start cumulation.
Definition: EST_WFST.cc:670
int add_state(enum wfst_state_type state_type)
Add a new state, returns new name.
Definition: EST_WFST.cc:652
const EST_String & out_symbol(int i) const
Map output alphabet index to output symbol.
Definition: EST_WFST.h:213
EST_WFST_State * state_non_const(int i)
Return internal state information (non-const)
Definition: EST_WFST.h:224
EST_WFST & operator=(const EST_WFST &a)
?
Definition: EST_WFST.h:364
const EST_String & in_symbol(int i) const
Map input alphabet index to input symbol.
Definition: EST_WFST.h:207
void uunion(EST_TList< EST_WFST > &wl)
void add_epsilon_reachable(EST_WFST_MultiState *ms) const
Extend multi-state with epsilon reachable states.
Definition: wfst_ops.cc:319
EST_WFST()
?
Definition: EST_WFST.cc:126
void complement(const EST_WFST &a)
Build complement of a.
Definition: wfst_ops.cc:646
EST_WFST_Transition * find_transition(int state, int in, int out) const
Find (first) transition given in and out symbols.
Definition: EST_WFST.cc:267
void init(int init_num_states=10)
Clear with (estimation of number of states required)
Definition: EST_WFST.cc:145
void clear()
clear removing existing states if any
Definition: EST_WFST.cc:115
void transition_all(int state, int in, int out, EST_WFST_MultiState *ms) const
Find all possible transitions for given state/input/output.
Definition: wfst_ops.cc:294
void compose(const EST_WFST &a, const EST_WFST &b)
Definition: wfst_ops.cc:812
EST_WFST_MultiState * apply_multistate(const EST_WFST &wfst, EST_WFST_MultiState *ms, int in, int out) const
Transduce a multi-state given n and out.
Definition: wfst_ops.cc:250
int out_epsilon() const
Internal index for output epsilon.
Definition: EST_WFST.h:220
int in_symbol(const EST_String &s) const
Map input symbol to input alphabet index.
Definition: EST_WFST.h:204
void remove_error_states(const EST_WFST &a)
Remove error states from the WFST.
Definition: wfst_ops.cc:917
EST_WFST(const EST_WFST &wfst)
?
Definition: EST_WFST.h:183
void build_wfst(int start, int end, LISP regex)
Basic regex constructor.
Definition: wfst_regex.cc:140
int in_epsilon() const
Internal index for input epsilon.
Definition: EST_WFST.h:218
int deterministic() const
True if WFST is deterministic.
Definition: wfst_ops.cc:975
void copy(const EST_WFST &wfst)
Copy from existing wfst.
Definition: EST_WFST.cc:132
EST_write_status save(const EST_String &filename, const EST_String type="ascii")
?
Definition: EST_WFST.cc:349
int cumulate() const
Cumulation condition.
Definition: EST_WFST.h:277
const EST_WFST_State * state(int i) const
Return internal state information.
Definition: EST_WFST.h:222
int out_symbol(const EST_String &s) const
Map output symbol to output alphabet index.
Definition: EST_WFST.h:210
LISP epsilon_label() const
LISP for on epsilon symbols.
Definition: EST_WFST.h:216
const EST_Discrete & out_symbols() const
Accessing the output alphabet.
Definition: EST_WFST.h:231
void stop_cumulate()
Stop cumulation and calculate probabilities on transitions.
Definition: EST_WFST.cc:685
void build_or_transition(int start, int end, LISP disjunctions)
Basic disjunction constructor.
Definition: wfst_regex.cc:44
void minimize(const EST_WFST &a)
Build minimized form of a.
Definition: wfst_ops.cc:484
int transduce(int state, int in, int &out) const
Transduce in to out from state.
Definition: EST_WFST.cc:219
enum wfst_state_type ms_type(EST_WFST_MultiState *ms) const
Given a multi-state return type (final, ok, error)
Definition: wfst_ops.cc:271
void concat(const EST_WFST &a, const EST_WFST &b)
Definition: wfst_ops.cc:776
void intersection(EST_TList< EST_WFST > &wl)
Definition: wfst_ops.cc:356
void determinize(const EST_WFST &a)
Build determinized form of a.
Definition: wfst_ops.cc:164
EST_read_status load(const EST_String &filename)
?
Definition: EST_WFST.cc:508
int transition(int state, int in, int out) const
Find (first) new state given in and out symbols.
Definition: EST_WFST.cc:260
void build_and_transition(int start, int end, LISP conjunctions)
Basic conjunction constructor.
Definition: wfst_regex.cc:62
const EST_Discrete & in_symbols() const
Accessing the input alphabet.
Definition: EST_WFST.h:229