Edinburgh Speech Tools 2.4-release
EST_Ngrammar.h
/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996                              */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Author :  Simon King & Alan W Black                   */
/*                 Date   :  February 1997                               */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/*  A general class for ngrams (bi-gram, tri-gram etc)                   */
/*                                                                       */
/*=======================================================================*/
#ifndef __EST_NGRAMMAR_H__
#define __EST_NGRAMMAR_H__

#include <cstdarg>
#include <cstdlib>

using namespace std;

#include "EST_String.h"
#include "EST_Val.h"
#include "EST_rw_status.h"
#include "EST_types.h"
#include "EST_FMatrix.h"
#include "EST_TList.h"
#include "EST_StringTrie.h"
#include "EST_simplestats.h"
#include "EST_PST.h"
#include "EST_string_aux.h"
#include "EST_math.h"

// HTK style
#define SENTENCE_START_MARKER "!ENTER"
#define SENTENCE_END_MARKER "!EXIT"
#define OOV_MARKER "!OOV"

#define EST_NGRAMBIN_MAGIC 1315402337

// for compressed save/load
#define GZIP_FILENAME_EXTENSION "gz"
#define COMPRESS_FILENAME_EXTENSION "Z"

// Ultimate floor
#define TINY_FREQ 1.0e-10

// ngram state - represents the N-1 word history and contains
// the pdf of the next word

class EST_NgrammarState {

private:

protected:
    EST_DiscreteProbDistribution p_pdf;
    int p_id; // a 'name'

public:
    EST_NgrammarState() : p_pdf()
    {
        init();
    };
    EST_NgrammarState(int id,EST_Discrete *d){clear();init(id,d);};
    EST_NgrammarState(int id,const EST_DiscreteProbDistribution &pdf)
        {clear();init(id,pdf);};
    EST_NgrammarState(const EST_NgrammarState &s);
    EST_NgrammarState(const EST_NgrammarState *const s);
    ~EST_NgrammarState();

    EST_IVector path; // how we got here

    // initialise
    void clear();
    void init();
    void init(int id, EST_Discrete *d);
    void init(int id, const EST_DiscreteProbDistribution &pdf);

    // build
    void cumulate(const int index, const double count=1)
        {p_pdf.cumulate(index,count);};
    void cumulate(const EST_String &word, const double count=1)
        {p_pdf.cumulate(word,count);};

    // access
    int id() const {return p_id; };
    const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
    EST_DiscreteProbDistribution &pdf() {return p_pdf; };
    double probability(const EST_String &w) const
        {return p_pdf.probability(w);}
    double probability(int w) const {return p_pdf.probability(w);}
    double frequency(const EST_String &w) const
        {return p_pdf.frequency(w);}
    double frequency(int w) const {return p_pdf.frequency(w);}
    const EST_String &most_probable(double *prob = NULL) const
        {return p_pdf.most_probable(prob);}

friend ostream& operator<<(ostream& s, const EST_NgrammarState &a);

};
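
// Illustrative usage sketch (not part of the original header): a state's
// pdf accumulates counts and answers relative-frequency queries, roughly --
//
//   EST_Discrete vocab(wordlist);     // 'wordlist' is an EST_StrList
//   EST_NgrammarState s(0,&vocab);
//   s.cumulate("cat");                // one observation of "cat"
//   double p = s.probability("cat");  // relative frequency so far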

class EST_BackoffNgrammarState {

private:

protected:
    int p_level; // = 0 for root node
    double backoff_weight;
    EST_DiscreteProbDistribution p_pdf;
    EST_StringTrie children;

    EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
                                        const EST_StrVector &words);
    EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
                                        const EST_IVector &words);
public:
    EST_BackoffNgrammarState()
        { init(); };
    EST_BackoffNgrammarState(const EST_Discrete *d,int level)
        {clear();init(d,level);};
    EST_BackoffNgrammarState(const EST_DiscreteProbDistribution &pdf,int level)
        {clear();init(pdf,level);};
    EST_BackoffNgrammarState(const EST_BackoffNgrammarState &s);
    EST_BackoffNgrammarState(const EST_BackoffNgrammarState *const s);
    ~EST_BackoffNgrammarState();

    // initialise
    void clear();
    void init();
    void init(const EST_Discrete *d, int level);
    void init(const EST_DiscreteProbDistribution &pdf, int level);

    // build
    bool accumulate(const EST_StrVector &words,
                    const double count=1);
    bool accumulate(const EST_IVector &words,
                    const double count=1);
    // access
    const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
    EST_DiscreteProbDistribution &pdf() {return p_pdf; };
    double probability(const EST_String &w) const
        {return p_pdf.probability(w);}
    double frequency(const EST_String &w) const
        {return p_pdf.frequency(w);}
    const EST_String &most_probable(double *prob = NULL) const
        {return p_pdf.most_probable(prob);}

    const int level() const {return p_level;}

    EST_BackoffNgrammarState* get_child(const EST_String &word) const
    {
        return (EST_BackoffNgrammarState*)children.lookup(word);
    }
    EST_BackoffNgrammarState* get_child(const int word) const
    {
        return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word));
    }

    void remove_child(EST_BackoffNgrammarState *child,
                      const EST_String &name);

    // recursive delete of contents and children
    void zap();

    const EST_BackoffNgrammarState *const get_state(const EST_StrVector &words) const;

    bool ngram_exists(const EST_StrVector &words,
                      const double threshold) const;
    const double get_backoff_weight() const {return backoff_weight; }
    const double get_backoff_weight(const EST_StrVector &words) const;
    bool set_backoff_weight(const EST_StrVector &words, const double w);
    void frequency_of_frequencies(EST_DVector &ff);

    void print_freqs(ostream &os,const int order,EST_String followers="");

friend ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a);

};

class EST_Ngrammar {

public:

    // 3 representations : sparse, dense and backed off. User specifies which.
    enum representation_t {sparse, dense, backoff};

    // now only keep frequencies (or log frequencies)
    // probabilities (or log probabilities) can be done
    // on the fly quickly enough
    enum entry_t {frequencies, log_frequencies};

protected:

    // each instance of an EST_Ngrammar is a grammar of fixed order
    // e.g. a bigram (order = 2)
    int p_order;
    int p_num_samples;

    double p_number_of_sentences; // which were used to build this grammar


    EST_String p_sentence_start_marker;
    EST_String p_sentence_end_marker;

    // only one representation in use at a time
    representation_t p_representation;
    entry_t p_entry_type;

    // sparse representation is a tree structure
    // holding only those N-grams which were seen
    EST_PredictionSuffixTree sparse_representation;
    bool init_sparse_representation();

    // dense representation is just an array of all states
    bool init_dense_representation();

    // backoff representation is also a tree structure
    // but the root state pdf is the most recent word in the
    // ngram and going down the tree is going back in time....
    // here is the root node :
    EST_BackoffNgrammarState *backoff_representation;

    double backoff_threshold;

    // need a non-zero unigram floor to enable backing off
    double backoff_unigram_floor_freq;

    // instead of simple discounting, we have a (possibly) different
    // discount per order and per frequency
    // e.g. backoff_discount[2](4) contains the discount to be
    // applied to a trigram frequency of 4
    // backoff_discount[0] is unused (we don't discount unigrams)
    EST_DVector *backoff_discount;
    const double get_backoff_discount(const int order, const double freq) const;

    bool init_backoff_representation();
    void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL); // remove any zero frequency branches
    void backoff_restore_unigram_states();
    int p_num_states; // == p_vocab_size ^ (p_order-1) if fully dense
    EST_NgrammarState *p_states; // state id is index into this array
    int find_dense_state_index(const EST_IVector &words, int index=0) const;
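
    // Illustrative note (an assumption about the scheme, not taken from
    // this header): a dense order-N grammar over a vocabulary of size V
    // can map a history w[0]..w[N-2] to a state index by treating the
    // word indices as base-V digits, e.g.
    //
    //   int index = 0;
    //   for (int i = 0; i < p_order - 1; i++)  // the N-1 history words
    //       index = index * vocab->length() + words(i);
    //
    // find_dense_state_index() provides this mapping; the exact scheme
    // is an implementation detail of EST_Ngrammar.cc.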

    // and the reverse
    const EST_StrVector &make_ngram_from_index(const int i) const;

    // vocabulary
    EST_Discrete *vocab;
    EST_Discrete *pred_vocab; // may be different from state vocab
    bool init_vocab(const EST_StrList &wordlist);
    bool init_vocab(const EST_StrList &word_list,
                    const EST_StrList &pred_list);

    // make sure vocab matches a given wordlist
    bool check_vocab(const EST_StrList &wordlist);

    EST_DiscreteProbDistribution vocab_pdf; // overall pdf

    const EST_String &lastword(const EST_StrVector &words) const
        { return words(p_order-1); }
    const int lastword(const EST_IVector &words) const
        { return words(p_order-1); }
    // are we allowing out-of-vocabulary words, or is the vocabulary closed?
    bool allow_oov;

    bool sparse_to_dense();
    bool dense_to_sparse();

    // these aren't sorted yet ...
    void take_logs();
    void take_exps();
    void freqs_to_probs(); // just calls normalise

    bool build_sparse(const EST_String &filename,
                      const EST_String &prev,
                      const EST_String &prev_prev,
                      const EST_String &last);
    // for dense and backoff
    bool build_ngram(const EST_String &filename,
                     const EST_String &prev,
                     const EST_String &prev_prev,
                     const EST_String &last,
                     const EST_String &input_format);

    // go through all matching ngrams ( *(ngram[i])="" matches anything )
    void iterate(EST_StrVector &words,
                 void (*function)(EST_Ngrammar *n,
                                  EST_StrVector &words,
                                  void *params),
                 void *params);
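
    // Illustrative sketch (assumed usage, not from this header): iterate()
    // visits every matching ngram, with "" acting as the wildcard --
    //
    //   static void add_freq(EST_Ngrammar *n,
    //                        EST_StrVector &words, void *params)
    //       { *(double*)params += n->frequency(words); }
    //   ...
    //   EST_StrVector pattern(p_order);  // all "" : match every ngram
    //   double total = 0;
    //   iterate(pattern,add_freq,&total);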

    // same, but with a constant Ngrammar
    void const_iterate(EST_StrVector &words,
                       void (*function)(const EST_Ngrammar *const n,
                                        EST_StrVector &words,
                                        void *params),
                       void *params) const;

    bool p_init(int o, representation_t r);

    // new filename returned if we had to copy stdin to a
    // temporary file - must delete it later!
    bool oov_preprocess(const EST_String &filename,
                        EST_String &new_filename,
                        const EST_String &what);


    const EST_NgrammarState &find_state_const(const EST_StrVector &words) const;
    EST_NgrammarState &find_state(const EST_StrVector &words);
    const EST_NgrammarState &find_state_const(const EST_IVector &words) const;
    EST_NgrammarState &find_state(const EST_IVector &words);

    // special versions for backoff grammars
    const EST_DiscreteProbDistribution &backoff_prob_dist(const EST_StrVector &words) const;
    const double backoff_reverse_probability_sub(const EST_StrVector &words,
                                                 const EST_BackoffNgrammarState *root) const;
    const double backoff_probability(const EST_StrVector &words,
                                     const bool trace=false) const;
    const double backoff_reverse_probability(const EST_StrVector &words) const;
    const EST_String &backoff_most_probable(const EST_StrVector &words,
                                            double *prob = NULL) const;

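    // Illustrative note (an assumption; standard Katz-style backoff, which
    // is what the weights above suggest): backoff_probability() computes,
    // for history h and word w, something of the form
    //
    //   P(w|h) = ( f(h,w) - discount ) / f(h)  if f(h,w) > backoff_threshold
    //          = bow(h) * P(w|h')              otherwise
    //
    // where h' drops the oldest word of h and bow(h) is the weight kept by
    // set_backoff_weight(), with backoff_unigram_floor_freq stopping the
    // recursion from returning zero at the unigram level.
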
    // backoff representation isn't a nice array of states
    // so use this to visit every node in the tree
    // and apply the function to that node
    void backoff_traverse(EST_BackoffNgrammarState *start_state,
                          void (*function)(EST_BackoffNgrammarState *s,
                                           void *params),
                          void *params);

    // visit every node at a given level
    void backoff_traverse(EST_BackoffNgrammarState *start_state,
                          void (*function)(EST_BackoffNgrammarState *s,
                                           void *params),
                          void *params, const int level);
public:

    EST_Ngrammar() {default_values();}

    EST_Ngrammar(int o, representation_t r,
                 const EST_StrList &wordlist)
    {
        default_values(); init(o,r,wordlist);
    }

    // When state trans vocab differs from prediction vocab
    EST_Ngrammar(int o, representation_t r,
                 const EST_StrList &wordlist,
                 const EST_StrList &predlist)
    {
        default_values(); init(o,r,wordlist,predlist);
    }

    EST_Ngrammar(int o, representation_t r, EST_Discrete &v)
    {
        default_values(); init(o,r,v);
    }
    ~EST_Ngrammar();

    void default_values();
    void clear();
    bool init(int o, representation_t r,
              const EST_StrList &wordlist);
    bool init(int o, representation_t r,
              const EST_StrList &wordlist,
              const EST_StrList &predlist);
    bool init(int o, representation_t r, EST_Discrete &v);
    bool init(int o, representation_t r,
              EST_Discrete &v, EST_Discrete &pv);

    // access
    int num_states(void) const { return p_num_states;}
    double samples(void) const { return p_num_samples;}
    int order() const { return p_order; }
    int get_vocab_length() const { return vocab?vocab->length():0; }
    EST_String get_vocab_word(int i) const;
    int get_vocab_word(const EST_String &s) const;
    int get_pred_vocab_length() const { return pred_vocab->length(); }
    EST_String get_pred_vocab_word(int i) const { return pred_vocab->name(i); }
    int get_pred_vocab_word(const EST_String &s) const
        { return pred_vocab->index(s); }
    int closed_vocab() const {return !allow_oov; }
    entry_t entry_type() const {return p_entry_type;}
    representation_t representation() const
        { return p_representation;}

    // build
    bool build(const EST_StrList &filenames,
               const EST_String &prev = SENTENCE_START_MARKER,
               const EST_String &prev_prev = SENTENCE_END_MARKER,
               const EST_String &last = SENTENCE_END_MARKER,
               const EST_String &input_format = "",
               const EST_String &oov_mode = "",
               const int mincount=1,
               const int maxcount=10);
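
    // Illustrative sketch (assumed usage based on the declarations above):
    // construct and train a trigram with the default HTK-style markers --
    //
    //   EST_StrList wordlist, files;   // filled in by the caller
    //   EST_Ngrammar ng(3,EST_Ngrammar::backoff,wordlist);
    //   if (!ng.build(files))
    //       cerr << "ngram build failed" << endl;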

    // Accumulate ngrams
    void accumulate(const EST_StrVector &words,
                    const double count=1);
                    //const int index=0);
    void accumulate(const EST_IVector &words,
                    const double count=1);
                    //const int index=0);

    // hack - fix enter/exit probs s.t. P(...,!ENTER)=P(!EXIT,...)=0, for all x
    void make_htk_compatible();

    // I/O functions
    EST_read_status load(const EST_String &filename);
    EST_read_status load(const EST_String &filename, const EST_StrList &wordlist);
    EST_write_status save(const EST_String &filename,
                          const EST_String type="cstr_ascii",
                          const bool trace=false,
                          double floor=0.0);

    int wordlist_index(const EST_String &word, const bool report=true) const;
    const EST_String &wordlist_index(int i) const;
    int predlist_index(const EST_String &word) const;
    const EST_String &predlist_index(int i) const;

    // set
    bool set_entry_type(entry_t new_type);
    bool set_representation(representation_t new_representation);

    // probability distributions
    // -------------------------
    // flag 'force' forces computation of probs on-the-fly if necessary
    double probability(const EST_StrVector &words, bool force=false,
                       const bool trace=false) const;
    double frequency(const EST_StrVector &words, bool force=false,
                     const bool trace=false) const;

    const EST_String &predict(const EST_StrVector &words,
                              double *prob,int *state) const;
    const EST_String &predict(const EST_StrVector &words) const
        {double p; int state; return predict(words,&p,&state); }
    const EST_String &predict(const EST_StrVector &words,double *prob) const
        {int state; return predict(words,prob,&state); }

    const EST_String &predict(const EST_IVector &words,double *prob,int *state) const;
    const EST_String &predict(const EST_IVector &words) const
        {double p; int state; return predict(words,&p,&state); }
    const EST_String &predict(const EST_IVector &words,double *prob) const
        {int state; return predict(words,prob,&state); }
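
    // Illustrative sketch (an assumption about the vector shapes): for a
    // trigram model,
    //
    //   EST_StrVector ngram(3);
    //   ngram[0] = "the"; ngram[1] = "black"; ngram[2] = "cat";
    //   double p = ng.probability(ngram);    // P(cat | the,black)
    //
    //   ngram[2] = "";                       // leave the predictee empty
    //   double best; int state;
    //   const EST_String &w = ng.predict(ngram,&best,&state);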

    int find_state_id(const EST_StrVector &words) const;
    int find_state_id(const EST_IVector &words) const;
    int find_next_state_id(int state, int word) const;
    // fast versions for common N
    //const double probability(const EST_String w1);
    //const double probability(const EST_String w1,const EST_String w2);
    //const double probability(const EST_String w1,const EST_String w2,
    //const EST_String w2);

    // reverse - probability of words[0..order-2] given word[order-1]
    double reverse_probability(const EST_StrVector &words,
                               bool force=false) const;
    double reverse_probability(const EST_IVector &words,
                               bool force=false) const;

    // predict, where words has 'order' elements and the last one is "" or NULL
    const EST_DiscreteProbDistribution &prob_dist(const EST_StrVector &words) const;
    const EST_DiscreteProbDistribution &prob_dist(const EST_IVector &words) const;
    const EST_DiscreteProbDistribution &prob_dist(int state) const;

//    bool stats(const EST_String filename,
//               double &raw_entropy, double &count,
//               double &entropy, double &perplexity,
//               const EST_String &prev = SENTENCE_START_MARKER,
//               const EST_String &prev_prev = SENTENCE_END_MARKER,
//               const EST_String &last = SENTENCE_END_MARKER,
//               const EST_String &input_format = "") const;

    void fill_window_start(EST_IVector &window,
                           const EST_String &prev,
                           const EST_String &prev_prev) const;

    void fill_window_start(EST_StrVector &window,
                           const EST_String &prev,
                           const EST_String &prev_prev) const;

    // why anybody would want to do this ....
    //EST_Ngrammar &operator =(const EST_Ngrammar &a);

    bool ngram_exists(const EST_StrVector &words) const;
    bool ngram_exists(const EST_StrVector &words, const double threshold) const;
    const double get_backoff_weight(const EST_StrVector &words) const;
    bool set_backoff_weight(const EST_StrVector &words, const double w);

    void print_freqs(ostream &os,double floor=0.0);

    // i/o functions
    // -------------
    friend ostream& operator<<(ostream& s, EST_Ngrammar &n);
    friend EST_read_status load_ngram_htk_ascii(const EST_String filename,
                                                EST_Ngrammar &n);
    friend EST_read_status load_ngram_htk_binary(const EST_String filename,
                                                 EST_Ngrammar &n);
    friend EST_read_status load_ngram_arpa(const EST_String filename,
                                           EST_Ngrammar &n,
                                           const EST_StrList &vocab);
    friend EST_read_status load_ngram_cstr_ascii(const EST_String filename,
                                                 EST_Ngrammar &n);
    friend EST_read_status load_ngram_cstr_bin(const EST_String filename,
                                               EST_Ngrammar &n);

    friend EST_write_status save_ngram_htk_ascii_sub(const EST_String &word,
                                                     ostream *ost,
                                                     EST_Ngrammar &n,
                                                     double floor);
    friend EST_write_status save_ngram_htk_ascii(const EST_String filename,
                                                 EST_Ngrammar &n,
                                                 double floor);

    //friend EST_write_status save_ngram_htk_binary(const EST_String filename,
    //                                              EST_Ngrammar &n);
    friend EST_write_status save_ngram_cstr_ascii(const EST_String filename,
                                                  EST_Ngrammar &n,
                                                  const bool trace,
                                                  double floor);
    friend EST_write_status save_ngram_cstr_bin(const EST_String filename,
                                                EST_Ngrammar &n,
                                                const bool trace,
                                                double floor);
    friend EST_write_status save_ngram_arpa(const EST_String filename,
                                            EST_Ngrammar &n);
    friend EST_write_status save_ngram_arpa_sub(ostream *ost,
                                                EST_Ngrammar &n,
                                                const EST_StrVector &words);
    friend EST_write_status save_ngram_wfst(const EST_String filename,
                                            EST_Ngrammar &n);

    // Auxiliary functions

    // smoothing
friend void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order);
friend void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order);
friend bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount);
friend void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
                                 const double default_discount);

friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
                                    EST_Ngrammar &ngram);
friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
                             EST_Ngrammar &ngram, int smooth_thresh);

    // frequencies below mincount get backed off
    // frequencies above maxcount are not smoothed(discounted)
    bool compute_backoff_weights(const int mincount=1,
                                 const int maxcount=10);


    bool merge(EST_Ngrammar &n,float weight);

friend class EST_BackoffNgrammar;

};

void Ngram_freqsmooth(EST_Ngrammar &ngram,
                      int smooth_thresh1,
                      int smooth_thresh2);

// utils
void slide(EST_IVector &i, const int l);
void slide(EST_StrVector &i, const int l);

bool test_stats(EST_Ngrammar &ngram,
                const EST_String &filename,
                double &raw_entropy,
                double &count,
                double &entropy,
                double &perplexity,
                const EST_String &input_format,
                const EST_String &prev = SENTENCE_START_MARKER,
                const EST_String &prev_prev = SENTENCE_END_MARKER,
                const EST_String &last = SENTENCE_END_MARKER);

VAL_REGISTER_CLASS_DCLS(ngrammar,EST_Ngrammar)

#endif // __EST_NGRAMMAR_H__