Edinburgh Speech Tools 2.4-release
token_example.cc
1/************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996,1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/************************************************************************/
33/* Author: Alan W Black */
34/* Date: May 1997 */
35/************************************************************************/
36/* */
37/* Example of reading a file using the tokenizer */
38/* */
39/************************************************************************/
40
41#include <cstdlib>
42#include "EST_Token.h"
43
// Build the DATA string literal from DATAC (not defined in this file;
// presumably supplied by the build, e.g. -DDATAC=<dir> -- confirm).
//
// Stringizing must go through a second macro level: an argument that is
// the operand of '#' is not macro-expanded, so stringizing DATAC directly
// would yield the literal "DATAC" rather than the configured directory.
#define __STRINGIZE(X) #X
#define EXPAND_AND_STRINGIZE(X) __STRINGIZE(X)

#if defined(DATAC)
# define DATA EXPAND_AND_STRINGIZE(DATAC)
#endif
48
49int main(int argc,char **argv)
50{
51 // Simple program to read all the tokens in the named file
52 // a print a summary of them
54 int tokens, alices, quotes;
55 EST_Token t;
56 EST_String fname;
57
58 if (argc > 2)
59 {
60 cerr << argv[0] << ": wrong number of arguments\n";
61 exit(-1);
62 }
63 else if (argc == 2)
64 fname = argv[1];
65 else
66 fname = DATA "/alice";
67
68 if (ts.open(fname) == -1)
69 {
70 cerr << argv[0] << ": can't open input file \"" << argv[1] <<
71 "\"\n";
72 exit(-1);
73 }
74
75 // Control of whitespace characters, single character symbols,
76 // pre and post punctuation may be set here.
77
78 // The defaults are standard whitespace, and nothing for the rest
79 // (this is like awk's basic tokenizer). For language analysis
80 // you'll probably want to modify the punctuation
81 // \173 is '{', it is inserted by number because of a doc++ problem.
82
83 ts.set_PrePunctuationSymbols("\173[(\"'");
84 ts.set_PunctuationSymbols(EST_Token_Default_PunctuationSymbols);
85
86 // Note you may set quotes so quoted tokens are read as single
87 // tokens (a la C)
88
89 for (tokens=quotes=alices=0; !ts.eof(); tokens++)
90 {
91 t = ts.get();
92 if (t == "Alice")
93 alices++;
94 if (t.prepunctuation().contains("\""))
95 quotes++;
96 }
97
98 printf("Input file contains:\n");
99 printf(" %5d tokens\n",tokens);
100 printf(" %5d tokens preceeded by double quotes\n",quotes);
101 printf(" %5d occurrences of Alice\n",alices);
102
103 return 0;
104}
105
106
int contains(const char *s, int pos=-1) const
Does it contain this substring?
Definition: EST_String.h:375
int eof()
end of file
Definition: EST_Token.h:356
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (pre) punctuation
Definition: EST_Token.h:344
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:341
int open(const EST_String &filename)
open an EST_TokenStream for a file.
Definition: EST_Token.cc:200
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:486