Edinburgh Speech Tools 2.4-release
wagon_test_main.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996,1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : October 1997 */
35/*-----------------------------------------------------------------------*/
36/* A program for testing a CART tree against data, also may be used to */
37/* predict values using a tree and data */
38/* */
39/*=======================================================================*/
40#include <cstdlib>
41#include <iostream>
42#include <fstream>
43#include <cstring>
44#include "EST_Wagon.h"
45#include "EST_cutils.h"
46#include "EST_multistats.h"
47#include "EST_Token.h"
48#include "EST_cmd_line.h"
49
50static int wagon_test_main(int argc, char **argv);
51static LISP find_feature_value(const char *feature,
52 LISP vector, LISP description);
53static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description);
54static LISP get_data_vector(EST_TokenStream &data, LISP description);
55static void simple_predict(EST_TokenStream &data, FILE *output,
56 LISP tree, LISP description, int all_info);
57static void test_tree_class(EST_TokenStream &data, FILE *output,
58 LISP tree, LISP description);
59static void test_tree_float(EST_TokenStream &data, FILE *output,
60 LISP tree, LISP description);
61
62/** @name <command>wagon_test</command> <emphasis>Test CART models</emphasis>
63 @id wagon-test-manual
64 * @toc
65 */
66
67//@{
68
69
70/**@name Synopsis
71 */
72//@{
73
74//@synopsis
75
76/**
77Wagon_test is used to test CART models on feature data.
78
79A detailed description of the CART model can be found in the
80<link linkend="cart-overview">CART model overview</link> section.
81*/
82
83int main(int argc, char **argv)
84{
85
86 wagon_test_main(argc,argv);
87
88 exit(0);
89 return 0;
90}
91
92static int wagon_test_main(int argc, char **argv)
93{
94 // Top level function sets up data and creates a tree
95 EST_Option al;
96 EST_StrList files;
97 LISP description,tree=NIL;;
98 EST_TokenStream data;
99 FILE *wgn_output;
100
101 parse_command_line
102 (argc, argv,
103 EST_String("<options>\n")+
104 "Summary: program to test CART models on data\n"+
105 "-desc <ifile> Field description file\n"+
106 "-data <ifile> Datafile, one vector per line\n"+
107 "-tree <ifile> File containing CART tree\n"+
108 "-track <ifile>\n"+
109 " track for vertex indices\n"+
110 "-predict Predict for each vector returning full vector\n"+
111 "-predict_val Predict for each vector returning just value\n"+
112 "-predictee <string>\n"+
113 " name of field to predict (default is first field)\n"+
114 "-heap <int> {210000}\n"+
115 " Set size of Lisp heap, should not normally need\n"+
116 " to be changed from its default\n"+
117 "-o <ofile> File to save output in\n",
118 files, al);
119
120 siod_init(al.ival("-heap"));
121
122 if (al.present("-desc"))
123 {
124 gc_protect(&description);
125 description = car(vload(al.val("-desc"),1));
126 }
127 else
128 {
129 cerr << argv[0] << ": no description file specified" << endl;
130 exit(-1);
131 }
132
133 if (al.present("-tree"))
134 {
135 gc_protect(&tree);
136 tree = car(vload(al.val("-tree"),1));
137 if (tree == NIL)
138 {
139 cerr << argv[0] << ": no tree found in \"" << al.val("-tree")
140 << "\"" << endl;
141 exit(-1);
142 }
143 }
144 else
145 {
146 cerr << argv[0] << ": no tree file specified" << endl;
147 exit(-1);
148 }
149
150 if (al.present("-data"))
151 {
152 if (data.open(al.val("-data")) != 0)
153 {
154 cerr << argv[0] << ": can't open data file \"" <<
155 al.val("-data") << "\" for input." << endl;
156 exit(-1);
157 }
158 }
159 else
160 {
161 cerr << argv[0] << ": no data file specified" << endl;
162 exit(-1);
163 }
164
165 if (al.present("-track"))
166 {
167 wgn_VertexTrack.load(al.val("-track"));
168 }
169
170 if (al.present("-o"))
171 {
172 if ((wgn_output = fopen(al.val("-o"),"w")) == NULL)
173 {
174 cerr << argv[0] << ": can't open output file \"" <<
175 al.val("-o") << "\"" << endl;
176 }
177 }
178 else
179 wgn_output = stdout;
180
181 if (al.present("-predictee"))
182 {
183 LISP l;
184 int i;
185 wgn_predictee_name = al.val("-predictee");
186 for (l=description,i=0; l != NIL; l=cdr(l),i++)
187 if (streq(wgn_predictee_name,get_c_string(car(car(l)))))
188 {
189 wgn_predictee = i;
190 break;
191 }
192 if (l==NIL)
193 {
194 cerr << argv[0] << ": predictee \"" << wgn_predictee <<
195 "\" not in description\n";
196 }
197 }
198 const char *predict_type =
199 get_c_string(car(cdr(siod_nth(wgn_predictee,description))));
200
201 if (al.present("-predict"))
202 simple_predict(data,wgn_output,tree,description,FALSE);
203 else if (al.present("-predict_val"))
204 simple_predict(data,wgn_output,tree,description,TRUE);
205 else if (streq(predict_type,"float") ||
206 streq(predict_type,"int"))
207 test_tree_float(data,wgn_output,tree,description);
208#if 0
209 else if (streq(predict_type,"vector"))
210 test_tree_vector(data,wgn_output,tree,description);
211#endif
212 else
213 test_tree_class(data,wgn_output,tree,description);
214
215 if (wgn_output != stdout)
216 fclose(wgn_output);
217 data.close();
218 return 0;
219}
220
221static LISP get_data_vector(EST_TokenStream &data, LISP description)
222{
223 // read in one vector. Should be terminated with an newline
224 LISP v=NIL,d;
225
226 if (data.eof())
227 return NIL;
228
229 for (d=description; d != NIL; d=cdr(d))
230 {
231 EST_Token t = data.get();
232
233 if ((d != description) && (t.whitespace().contains("\n")))
234 {
235 cerr << "wagon_test: unexpected newline within vector " <<
236 t.row() << " wrong number of features" << endl;
237 siod_error();
238 }
239 if (streq(get_c_string(car(cdr(car(d)))),"float") ||
240 streq(get_c_string(car(cdr(car(d)))),"int"))
241 v = cons(flocons(atof(t.string())),v);
242 else if ((streq(get_c_string(car(cdr(car(d)))),"_other_")) &&
243 (siod_member_str(t.string(),cdr(car(d))) == NIL))
244 v = cons(strintern("_other_"),v);
245 else
246 v = cons(strintern(t.string()),v);
247 }
248
249 return reverse(v);
250}
251
252static void simple_predict(EST_TokenStream &data, FILE *output,
253 LISP tree, LISP description, int all_info)
254{
255 LISP vector,predict;
256 EST_String val;
257
258 for (vector=get_data_vector(data,description);
259 vector != NIL; vector=get_data_vector(data,description))
260 {
261 predict = wagon_vector_predict(tree,vector,description);
262 if (all_info)
263 val = siod_sprint(car(reverse(predict)));
264 else
265 val = siod_sprint(predict);
266 fprintf(output,"%s\n",(const char *)val);
267 }
268}
269
270static void test_tree_float(EST_TokenStream &data, FILE *output,
271 LISP tree, LISP description)
272{
273 // Test tree against data to get summary of results FLOAT
274 float predict_val,real_val;
275 EST_SuffStats x,y,xx,yy,xy,se,e;
276 double cor,error;
277 LISP vector,predict;
278
279 for (vector=get_data_vector(data,description);
280 vector != NIL; vector=get_data_vector(data,description))
281 {
282 predict = wagon_vector_predict(tree,vector,description);
283 predict_val = get_c_float(car(reverse(predict)));
284 real_val = get_c_float(siod_nth(wgn_predictee,vector));
285 x += predict_val;
286 y += real_val;
287 error = predict_val-real_val;
288 se += error*error;
289 e += fabs(error);
290 xx += predict_val*predict_val;
291 yy += real_val*real_val;
292 xy += predict_val*real_val;
293 }
294
295 cor = (xy.mean() - (x.mean()*y.mean()))/
296 (sqrt(xx.mean()-(x.mean()*x.mean())) *
297 sqrt(yy.mean()-(y.mean()*y.mean())));
298
299 fprintf(output,";; RMSE %1.4f Correlation is %1.4f Mean (abs) Error %1.4f (%1.4f)\n",
300 sqrt(se.mean()),
301 cor,
302 e.mean(),
303 e.stddev());
304}
305
306static void test_tree_class(EST_TokenStream &data, FILE *output,
307 LISP tree, LISP description)
308{
309 // Test tree against class data to get summary of results
310 EST_StrStr_KVL pairs;
311 EST_StrList lex;
312 EST_String predict_class,real_class;
313 LISP vector,w,predict;
314 double H=0,Q=0,prob;
315 (void)output;
316
317 for (vector=get_data_vector(data,description);
318 vector != NIL; vector=get_data_vector(data,description))
319 {
320 predict = wagon_vector_predict(tree,vector,description);
321 predict_class = get_c_string(car(reverse(predict)));
322 real_class = get_c_string(siod_nth(wgn_predictee,vector));
323 prob = get_c_float(car(cdr(siod_assoc_str(real_class,
324 predict))));
325 if (prob == 0)
326 H += log(0.000001);
327 else
328 H += log(prob);
329 Q ++;
330 pairs.add_item(real_class,predict_class,1);
331 }
332 for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
333 lex.append(get_c_string(car(w)));
334
335 const EST_FMatrix &m = confusion(pairs,lex);
336 print_confusion(m,pairs,lex);
337 fprintf(stdout,";; entropy %g perplexity %g\n",
338 (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
339}
340
341#if 0
342static void test_tree_vector(EST_TokenStream &data, FILE *output,
343 LISP tree, LISP description)
344{
345 // Test tree against class data to get summary of results
346 // Note we are talking about predicting vectors (a *bunch* of
347 // numbers, not just a single class here)
348 EST_StrStr_KVL pairs;
349 EST_StrList lex;
350 EST_String predict_class,real_class;
351 LISP vector,w,predict;
352 double H=0,Q=0,prob;
353 (void)output;
354
355 for (vector=get_data_vector(data,description);
356 vector != NIL; vector=get_data_vector(data,description))
357 {
358 predict = wagon_vector_predict(tree,vector,description);
359 predict_class = get_c_string(car(reverse(predict)));
360 real_class = get_c_string(siod_nth(wgn_predictee,vector));
361 prob = get_c_float(car(cdr(siod_assoc_str(real_class,
362 predict))));
363 if (prob == 0)
364 H += log(0.000001);
365 else
366 H += log(prob);
367 Q ++;
368 pairs.add_item(real_class,predict_class,1);
369 }
370 for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
371 lex.append(get_c_string(car(w)));
372
373 const EST_FMatrix &m = confusion(pairs,lex);
374 print_confusion(m,pairs,lex);
375 fprintf(stdout,";; entropy %g perplexity %g\n",
376 (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
377}
378#endif
379
380
381static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description)
382{
383 // Using the LISP tree, vector and description, do standard prediction
384
385 if (cdr(tree) == NIL)
386 return car(tree);
387
388 LISP value = find_feature_value(wgn_ques_feature(car(tree)),
389 vector, description);
390
391 if (wagon_ask_question(car(tree),value))
392 // Yes answer
393 return wagon_vector_predict(car(cdr(tree)),vector,description);
394 else
395 // No answer
396 return wagon_vector_predict(car(cdr(cdr(tree))),vector,description);
397}
398
399static LISP find_feature_value(const char *feature,
400 LISP vector, LISP description)
401{
402 LISP v,d;
403
404 for (v=vector,d=description; v != NIL; v=cdr(v),d=cdr(d))
405 if (streq(feature,get_c_string(car(car(d)))))
406 return car(v);
407
408 cerr << "wagon_test: can't find feature \"" << feature <<
409 "\" in description" << endl;
410 siod_error();
411 return NIL;
412
413}
414
415/** @name Testing trees
416<para>
417Decision trees generated by wagon (or otherwise) can be applied
418to and tested against data sets using this program. This program
419requires a data set which is in the same format as wagon (and
420other programs) requires. It also needs a dataset description
421file naming the fields and given their type (see
422<link linkend="wagon-manual">the wagon manual</link> for a description
423for the actual format.
424<screen>
425wagon_test -data feats.data -desc feats.desc -tree feats.tree
426</screen>
427This will simply uses the tree against each sample in the data
428file and compare the predicted value with the actual value and
429produce a summary of the result. For categorial predictees a
430percentage correct and confusion matrix is generated. For continuous
431values the root mean squared error (RMSE) and correlation between the
432predicted values and the actual values is given.
433</para><para>
434By default the predictee is the first field but may also be specified
435on the command line. The dataset may contain features which are not
436used by the tree.
437</para><para>
438This program can also be used to generate output values for sampled
439data. In this case the sample data must still contain a "value" for
440the predictee even if it is dummy. The option
441<command>-predict</command> will output the new sample vector with
442the predicted value in place, and the option
443<command>-predict_val</command> option will just output the value.
444</para><para>
445This program is specifically designed for testing purposes although it
446can also just be used for prediction. It is probably more efficient
447to use the Lisp function <command>wagon</command> or underlying
448C++ function <command>wagon_predict()</command>.
449
450*/
451
452//@}
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:76
int contains(const char *s, int pos=-1) const
Does it contain this substring?
Definition: EST_String.h:375
double stddev(void) const
standard deviation of currently cummulated values
double mean(void) const
mean of currently cummulated values
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
Definition: EST_TKVL.cc:248
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
const int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:191
int eof()
end of file
Definition: EST_Token.h:356
void close(void)
Close stream.
Definition: EST_Token.cc:406
int open(const EST_String &filename)
open a \Ref{EST_TokenStream} for a file.
Definition: EST_Token.cc:200
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:486
int row(void) const
Line number in original \Ref{EST_TokenStream}.
Definition: EST_Token.h:182
EST_read_status load(const EST_String name, float ishift=0.0, float startt=0.0)
Definition: EST_Track.cc:1309