001/* 002* TextTokenizer -- A javolution.text.Text compatible replacement for java.util.StringTokenizer 003* 004* Copyright (C) 2009-2025 by Joseph A. Huwaldt. 005* All rights reserved. 006* 007* This library is free software; you can redistribute it and/or 008* modify it under the terms of the GNU Lesser General Public 009* License as published by the Free Software Foundation; either 010* version 2 of the License, or (at your option) any later version. 011* 012* This library is distributed in the hope that it will be useful, 013* but WITHOUT ANY WARRANTY; without even the implied warranty of 014* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015* Lesser General Public License for more details. 016* 017* You should have received a copy of the GNU Lesser General Public License 018* along with this program; if not, write to the Free Software 019* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 020* Or visit: http://www.gnu.org/licenses/lgpl.html 021* 022**/ 023package jahuwaldt.js.util; 024 025import java.util.NoSuchElementException; 026import java.util.Iterator; 027import java.util.Enumeration; 028 029import javolution.context.ObjectFactory; 030import javolution.lang.Realtime; 031import javolution.lang.Reusable; 032import javolution.text.Text; 033 034 035/** 036 * The text tokenizer class allows an application to break a 037 * <code>Text</code> object into tokens. The tokenization method is much simpler than 038 * the one used by the <code>StreamTokenizer</code> class. The 039 * <code>TextTokenizer</code> methods do not distinguish among 040 * identifiers, numbers, and quoted strings, nor do they recognize 041 * and skip comments. 042 * <p> 043 * The set of delimiters (the characters that separate tokens) may 044 * be specified either at creation time or on a per-token basis. 045 * <p> 046 * An instance of <code>TextTokenizer</code> behaves in one of two 047 * ways, depending on whether it was created with the 048 * <code>returnDelims</code> flag having the value <code>true</code> 049 * or <code>false</code>: 050 * <ul> 051 * <li>If the flag is <code>false</code>, delimiter characters serve to 052 * separate tokens. A token is a maximal sequence of consecutive 053 * characters that are not delimiters. 054 * <li>If the flag is <code>true</code>, delimiter characters are themselves 055 * considered to be tokens. A token is thus either one delimiter 056 * character, or a maximal sequence of consecutive characters that are 057 * not delimiters. 058 * </ul><p> 059 * A TextTokenizer object internally maintains a current 060 * position within the text to be tokenized. Some operations advance this 061 * current position past the characters processed.<p> 062 * A token is returned by taking a subtext of the text that was used to 063 * create the TextTokenizer object. 064 * <p> 065 * The following is one example of the use of the tokenizer. The code: 066 * <blockquote><pre> 067 * TextTokenizer tt = TextTokenizer.valueOf("this is a test"); 068 * while (tt.hasMoreTokens()) { 069 * System.out.println(tt.nextToken()); 070 * } 071 * </pre></blockquote> 072 * <p> 073 * prints the following output: 074 * <blockquote><pre> 075 * this 076 * is 077 * a 078 * test 079 * </pre></blockquote> 080 * 081 * <p> 082 * TextTokenizer is heavily based on <code>java.util.StringTokenizer</code>. 083 * However, there are some improvements and additional methods and capabilities. 084 * <p> 085 * 086 * <p> Modified by: Joseph A. Huwaldt </p> 087 * 088 * @author Joseph A. Huwaldt Date: March 12, 2009 089 * @version February 17, 2025 090 */ 091public final class TextTokenizer implements Enumeration<Text>, Iterator<Text>, Iterable<Text>, Realtime, Reusable { 092 private static final Text DEFAULT_DELIMS = Text.intern(" \t\n\r\f"); 093 094 private int _currentPosition; 095 private int _newPosition; 096 private int _maxPosition; 097 private Text _text; 098 private Text _delimiters; 099 private boolean _retDelims; 100 private boolean _delimsChanged; 101 private boolean _honorQuotes = false; 102 private char _quoteChar = '"'; 103 104 /** 105 * Set to true if empty tokens should be returned. 106 * For example, if "" should be returned when text starts with 107 * a delimiter, has two delimiters next to each other, or 108 * ends with a delimiter. 109 */ 110 private boolean _returnEmptyTokens; 111 112 /** 113 * _maxDelimChar stores the value of the delimiter character with the 114 * highest value. It is used to optimize the detection of delimiter 115 * characters. 116 */ 117 private char _maxDelimChar; 118 119 120 /** 121 * Return a text tokenizer with an initially empty string of text and with 122 * no delimiters. Use {@link #setText} and {@link #setDelimiters} to make 123 * this instance useful. 124 */ 125 @SuppressWarnings("unchecked") 126 public static TextTokenizer newInstance() { 127 TextTokenizer o = FACTORY.object(); 128 o._text = Text.EMPTY; 129 o._delimiters = DEFAULT_DELIMS; 130 o._currentPosition = 0; 131 o._newPosition = -1; 132 o._maxPosition = o._text.length(); 133 o._delimsChanged = false; 134 o._returnEmptyTokens = false; 135 o._retDelims = false; 136 o._honorQuotes = false; 137 o._quoteChar = '"'; 138 o.setMaxDelimChar(); 139 return o; 140 } 141 142 /** 143 * Resets the internal state of this object to its default values. 144 **/ 145 @Override 146 public void reset() { 147 _text = Text.EMPTY; 148 _delimiters = DEFAULT_DELIMS; 149 _currentPosition = 0; 150 _newPosition = -1; 151 _maxPosition = _text.length(); 152 _delimsChanged = false; 153 _returnEmptyTokens = false; 154 _retDelims = false; 155 _honorQuotes = false; 156 _quoteChar = '"'; 157 setMaxDelimChar(); 158 } 159 160 /** 161 * Return a text tokenizer for the specified character sequence. All 162 * characters in the <code>delim</code> argument are the delimiters 163 * for separating tokens. 164 * <p> 165 * If the <code>returnDelims</code> flag is <code>true</code>, then 166 * the delimiter characters are also returned as tokens. Each 167 * delimiter is returned as a string of length one. If the flag is 168 * <code>false</code>, the delimiter characters are skipped and only 169 * serve as separators between tokens. 170 * <p> 171 * Note that if delim is null, this constructor does 172 * not throw an exception. However, trying to invoke other methods on the 173 * resulting TextTokenizer may result in a 174 * NullPointerException. 175 * 176 * @param text the text to be parsed. 177 * @param delim the delimiters. 178 * @param returnDelims flag indicating whether to return the delimiters 179 * as tokens. 180 */ 181 public static TextTokenizer valueOf(CharSequence text, CharSequence delim, boolean returnDelims) { 182 TextTokenizer o = TextTokenizer.newInstance(); 183 184 o._text = (text != null ? Text.valueOf(text) : Text.EMPTY); 185 o._currentPosition = 0; 186 o._newPosition = -1; 187 o._maxPosition = o._text.length(); 188 189 o._delimsChanged = false; 190 o._delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY); 191 o._retDelims = returnDelims; 192 o.setMaxDelimChar(); 193 194 return o; 195 } 196 197 /** 198 * Return a text tokenizer for the specified character sequence. The 199 * characters in the <code>delim</code> argument are the delimiters 200 * for separating tokens. Delimiter characters themselves will not 201 * be treated as tokens. 202 * 203 * @param text the text to be parsed. 204 * @param delim the delimiters. 205 */ 206 public static TextTokenizer valueOf(CharSequence text, CharSequence delim) { 207 return TextTokenizer.valueOf(text, delim, false); 208 } 209 210 /** 211 * Return a text tokenizer for the specified character sequence. The 212 * tokenizer uses the default delimiter set, which is 213 * <code>" \t\n\r\f"</code>: the space character, 214 * the tab character, the newline character, the carriage-return character, 215 * and the form-feed character. Delimiter characters themselves will 216 * not be treated as tokens. 217 * 218 * @param text the text to be parsed. 219 */ 220 public static TextTokenizer valueOf(CharSequence text) { 221 return TextTokenizer.valueOf(text, DEFAULT_DELIMS, false); 222 } 223 224 225 /** 226 * Set the text to be tokenized in this TextTokenizer. 227 * <p> 228 * This is useful when for TextTokenizer re-use so that new string tokenizers do not 229 * have to be created for each string you want to tokenizer. 230 * <p> 231 * The text will be tokenized from the beginning of the text. 232 * 233 * @param text the text to be parsed. 234 */ 235 public void setText(CharSequence text){ 236 _text = (text != null ? Text.valueOf(text) : Text.EMPTY); 237 _currentPosition = 0; 238 _newPosition = -1; 239 _maxPosition = _text.length(); 240 } 241 242 /** 243 * Set the delimiters for this TextTokenizer. 244 * The position must be initialized before this method is used 245 * (setText does this and it is called from the constructor). 246 * 247 * @param delim the delimiters 248 */ 249 public void setDelimiters(CharSequence delim){ 250 _delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY); 251 252 /* delimiter string specified, so set the appropriate flag. */ 253 _delimsChanged = true; 254 255 setMaxDelimChar(); 256 } 257 258 /** 259 * Set the character to use as the "quote" character. All text between quote 260 * characters is considered a single token. The default quote character is <code>'"'</code>. 261 * 262 * @see #setHonorQuotes 263 **/ 264 public void setQuoteChar(char quote) { 265 _quoteChar = quote; 266 } 267 268 /** 269 * Sets whether or not this tokenizer recognizes quoted text using the specified 270 * quote character. If <code>true</code> is passed, this tokenizer will consider any 271 * text between the specified quote characters as a single token. Honoring of 272 * quotes defaults to false. 273 * 274 * @see #setQuoteChar 275 **/ 276 public void setHonorQuotes(boolean honorQuotes) { 277 _honorQuotes = honorQuotes; 278 } 279 280 /** 281 * Returns <code>true</code> if this tokenizer honors quoted text (counts it as a single token). 282 **/ 283 public boolean getHonorQuotes() { 284 return _honorQuotes; 285 } 286 287 /** 288 * Set _maxDelimChar to the highest char in the delimiter set. 289 */ 290 private void setMaxDelimChar() { 291 char m = 0; 292 for (int i = 0; i < _delimiters.length(); i++) { 293 char c = _delimiters.charAt(i); 294 if (m < c) 295 m = c; 296 } 297 _maxDelimChar = m; 298 } 299 300 /** 301 * Set whether empty tokens should be returned from this point in 302 * in the tokenizing process onward. 303 * <p> 304 * Empty tokens occur when two delimiters are next to each other 305 * or a delimiter occurs at the beginning or end of a string. If 306 * empty tokens are set to be returned, and a comma is the non token 307 * delimiter, the following table shows how many tokens are in each 308 * string.<br> 309 * <table><tr><th>String<th><th>Number of tokens<th></tr> 310 * <tr><td align=right>"one,two"<td><td>2 - normal case with no empty tokens.<td></tr> 311 * <tr><td align=right>"one,,three"<td><td>3 including the empty token in the middle.<td></tr> 312 * <tr><td align=right>"one,"<td><td>2 including the empty token at the end.<td></tr> 313 * <tr><td align=right>",two"<td><td>2 including the empty token at the beginning.<td></tr> 314 * <tr><td align=right>","<td><td>2 including the empty tokens at the beginning and the ends.<td></tr> 315 * <tr><td align=right>""<td><td>1 - all strings will have at least one token if empty tokens are returned.<td></tr></table> 316 * 317 * @param returnEmptyTokens true if and only if empty tokens should be returned. 318 */ 319 public void setReturnEmptyTokens(boolean returnEmptyTokens){ 320 _returnEmptyTokens = returnEmptyTokens; 321 } 322 323 /** 324 * Skips delimiters starting from the specified position. If _retDelims 325 * is false, returns the index of the first non-delimiter character at or 326 * after startPos. If _retDelims is true, startPos is returned. 327 */ 328 private int skipDelimiters(int startPos) { 329 if (Text.EMPTY.equals(_delimiters)) return _maxPosition; 330 331 int position = startPos; 332 while (!_retDelims && position < _maxPosition) { 333 char c = _text.charAt(position); 334 if ( (c > _maxDelimChar) || (_delimiters.indexOf(c,0) < 0) ) 335 break; 336 position++; 337 if (_returnEmptyTokens) 338 break; 339 } 340 return position; 341 } 342 343 /** 344 * Skips ahead from startPos and returns the index of the next delimiter 345 * character encountered, or _maxPosition if no such delimiter is found. 346 */ 347 private int scanToken(int startPos) { 348 boolean inQuote = false; 349 int position = startPos; 350 while (position < _maxPosition) { 351 char c = _text.charAt(position); 352 if (_honorQuotes && c == _quoteChar) { 353 if (!inQuote) 354 inQuote = true; 355 else 356 inQuote = false; 357 358 } else if (!inQuote && (c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0)) 359 break; 360 position++; 361 } 362 if (_retDelims && (startPos == position)) { 363 char c = _text.charAt(position); 364 if ((c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0)) 365 position++; 366 } 367 return position; 368 } 369 370 /** 371 * Tests if there are more tokens available from this tokenizer's text. 372 * If this method returns true, then a subsequent call to 373 * nextToken with no argument will successfully return a token. 374 * 375 * @return <code>true</code> if and only if there is at least one token 376 * in the text after the current position; <code>false</code> 377 * otherwise. 378 */ 379 public boolean hasMoreTokens() { 380 /* 381 * Temporary store this position and use it in the following 382 * nextToken() method only if the delimiters have'nt been changed in 383 * that nextToken() invocation. 384 */ 385 _newPosition = skipDelimiters(_currentPosition); 386 return (_newPosition < _maxPosition); 387 } 388 389 /** 390 * Returns the next token from this text tokenizer. 391 * 392 * @return the next token from this text tokenizer. 393 * @exception NoSuchElementException if there are no more tokens in this 394 * tokenizer's text. 395 */ 396 public Text nextToken() { 397 /* 398 * If next position already computed in hasMoreElements() and 399 * delimiters have changed between the computation and this invocation, 400 * then use the computed value. 401 */ 402 _currentPosition = (_newPosition >= 0 && !_delimsChanged) ? 403 _newPosition : skipDelimiters(_currentPosition); 404 405 /* Reset these anyway */ 406 _delimsChanged = false; 407 _newPosition = -1; 408 409 if (_currentPosition >= _maxPosition) 410 throw new NoSuchElementException(); 411 412 int start = _currentPosition; 413 _currentPosition = scanToken(_currentPosition); 414 415 return _text.subtext(start, _currentPosition); 416 } 417 418 /** 419 * Returns the next token in this text tokenizer's text. First, 420 * the set of characters considered to be delimiters by this 421 * TextTokenizer object is changed to be the characters in 422 * the string delim. Then the next token in the text 423 * after the current position is returned. The current position is 424 * advanced beyond the recognized token. The new delimiter set 425 * remains the default after this call. 426 * 427 * @param delim the new delimiters. 428 * @return the next token, after switching to the new delimiter set. 429 * @exception NoSuchElementException if there are no more tokens in this 430 * tokenizer's text. 431 */ 432 public Text nextToken(CharSequence delim) { 433 setDelimiters(delim); 434 return nextToken(); 435 } 436 437 /** 438 * Returns the same value as the <code>hasMoreTokens</code> 439 * method. It exists so that this class can implement the 440 * <code>Enumeration</code> interface. 441 * 442 * @return <code>true</code> if there are more tokens; 443 * <code>false</code> otherwise. 444 * @see java.util.Enumeration 445 * @see #hasMoreTokens() 446 */ 447 @Override 448 public boolean hasMoreElements() { 449 return hasMoreTokens(); 450 } 451 452 /** 453 * Returns the same value as the <code>nextToken</code> method. 454 * It exists so that this class can implement the 455 * <code>Enumeration</code> interface. 456 * 457 * @return the next token in the text. 458 * @exception NoSuchElementException if there are no more tokens in this 459 * tokenizer's text. 460 * @see java.util.Enumeration 461 * @see #nextToken() 462 */ 463 @Override 464 public Text nextElement() { 465 return nextToken(); 466 } 467 468 469 /** 470 * Returns an iterator over the tokens returned by this tokenizer. 471 **/ 472 @Override 473 public Iterator<Text> iterator() { 474 return this; 475 } 476 477 /** 478 * Returns the same value as the <code>hasMoreTokens()</code> method. It exists 479 * so that this class can implement the <code>Iterator</code> interface. 480 * 481 * @return <code>true</code> if there are more tokens; 482 * <code>false</code> otherwise. 483 * 484 * @see java.util.Iterator 485 * @see #hasMoreTokens() 486 */ 487 @Override 488 public boolean hasNext(){ 489 return hasMoreTokens(); 490 } 491 492 /** 493 * Returns the same value as the <code>nextToken()</code> method. 494 * It exists so that this class can implement the 495 * <code>Iterator</code> interface. 496 * 497 * @return the next token in the text. 498 * @throws NoSuchElementException if there are no more tokens in this tokenizer's text. 499 * 500 * @see java.util.Iterator 501 * @see #nextToken() 502 */ 503 @Override 504 public Text next(){ 505 return nextToken(); 506 } 507 508 /** 509 * This implementation always throws <code>UnsupportedOperationException</code>. 510 * It exists so that this class can implement the <code>Iterator</code> interface. 511 * 512 * @throws UnsupportedOperationException always is thrown. 513 * 514 * @see java.util.Iterator 515 */ 516 @Override 517 public void remove(){ 518 throw new UnsupportedOperationException(); 519 } 520 521 /** 522 * Calculates the number of times that this tokenizer's 523 * <code>nextToken</code> method can be called before it generates an 524 * exception. The current position is not advanced. 525 * 526 * @return the number of tokens remaining in the text using the current 527 * delimiter set. 528 * @see #nextToken() 529 */ 530 public int countTokens() { 531 int count = 0; 532 int currpos = _currentPosition; 533 while (currpos < _maxPosition) { 534 currpos = skipDelimiters(currpos); 535 if (currpos >= _maxPosition) 536 break; 537 currpos = scanToken(currpos); 538 count++; 539 } 540 return count; 541 } 542 543 /** 544 * Calculates the number of times that this tokenizer's <code>nextToken</code> 545 * method can be called before it generates an exception using the given set of 546 * delimiters. The delimiters given will be used for future calls to 547 * nextToken() unless new delimiters are given. The current position 548 * is not advanced. 549 * 550 * @param delims the new set of delimiters. 551 * @return the number of tokens remaining in the text using the new 552 * delimiter set. 553 * 554 * @see #countTokens() 555 */ 556 public int countTokens(CharSequence delims){ 557 setDelimiters(delims); 558 return countTokens(); 559 } 560 561 /** 562 * Retrieves the rest of the text as a single token. 563 * After calling this method hasMoreTokens() will always return false. 564 * 565 * @return any part of the text that has not yet been tokenized. 566 */ 567 public Text restOfText() { 568 Text output = _text.subtext(_currentPosition, _maxPosition); 569 _currentPosition = _maxPosition; 570 return output; 571 } 572 573 /** 574 * Returns the same value as the <code>nextToken()</code> method. 575 * It exists so that this class can implement the 576 * <code>Realtime</code> interface. 577 * 578 * @return the next token in the text. 579 * @throws NoSuchElementException if there are no more tokens in this tokenizer's text. 580 * 581 * @see javolution.lang.Realtime 582 * @see #nextToken() 583 */ 584 @Override 585 public Text toText() { 586 return nextToken(); 587 } 588 589 /** 590 * Recycles a <code>TextTokenizer</code> instance immediately 591 * (on the stack when executing in a <code>StackContext</code>). 592 **/ 593 public static void recycle(TextTokenizer instance) { 594 FACTORY.recycle(instance); 595 } 596 597 598 ////////////////////// 599 // Factory Creation // 600 ////////////////////// 601 602 @SuppressWarnings("unchecked") 603 private static final ObjectFactory<TextTokenizer> FACTORY = new ObjectFactory<TextTokenizer>() { 604 @Override 605 protected TextTokenizer create() { 606 return new TextTokenizer(); 607 } 608 @Override 609 protected void cleanup(TextTokenizer instance) { 610 instance.reset(); 611 } 612 }; 613 614 private TextTokenizer() { } 615 616 617 /** 618 * Testing code for this class. 619 **/ 620 public static void main (String args[]) { 621 System.out.println("Testing TextTokenizer:"); 622 623 System.out.println("\nTokenize: \"this is a test\":"); 624 TextTokenizer tt = TextTokenizer.valueOf("this is a test"); 625 while (tt.hasMoreTokens()) { 626 tt.nextToken().println(); 627 } 628 629 System.out.println("\nTokenize: \"this,is,,a,test\" returning empty tokens:"); 630 tt.setText("this,is,,a,test"); 631 tt.setDelimiters(","); 632 tt.setReturnEmptyTokens(true); 633 while (tt.hasMoreTokens()) { 634 tt.nextToken().println(); 635 } 636 637 } 638}