001/* 002* TextTokenizer -- A javolution.text.Text compatible replacement for java.util.StringTokenizer 003* 004* Copyright (C) 2009-2025 by Joseph A. Huwaldt. 005* All rights reserved. 006* 007* This library is free software; you can redistribute it and/or 008* modify it under the terms of the GNU Lesser General Public 009* License as published by the Free Software Foundation; either 010* version 2 of the License, or (at your option) any later version. 011* 012* This library is distributed in the hope that it will be useful, 013* but WITHOUT ANY WARRANTY; without even the implied warranty of 014* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015* Lesser General Public License for more details. 016* 017* You should have received a copy of the GNU Lesser General Public License 018* along with this program; if not, write to the Free Software 019* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 020* Or visit: http://www.gnu.org/licenses/lgpl.html 021* 022*/ 023package jahuwaldt.js.util; 024 025import java.util.NoSuchElementException; 026import java.util.Iterator; 027import java.util.Enumeration; 028 029import javolution.context.ObjectFactory; 030import javolution.lang.Realtime; 031import javolution.lang.Reusable; 032import javolution.text.Text; 033 034 035/** 036 * The text tokenizer class allows an application to break a 037 * <code>Text</code> object into tokens. The tokenization method is much simpler than 038 * the one used by the <code>StreamTokenizer</code> class. The 039 * <code>TextTokenizer</code> methods do not distinguish among 040 * identifiers, numbers, and quoted strings, nor do they recognize 041 * and skip comments. 042 * <p> 043 * The set of delimiters (the characters that separate tokens) may 044 * be specified either at creation time or on a per-token basis. 045 * <p> 046 * An instance of <code>TextTokenizer</code> behaves in one of two 047 * ways, depending on whether it was created with the 048 * <code>returnDelims</code> flag having the value <code>true</code> 049 * or <code>false</code>: 050 * <ul> 051 * <li>If the flag is <code>false</code>, delimiter characters serve to 052 * separate tokens. A token is a maximal sequence of consecutive 053 * characters that are not delimiters. 054 * <li>If the flag is <code>true</code>, delimiter characters are themselves 055 * considered to be tokens. A token is thus either one delimiter 056 * character, or a maximal sequence of consecutive characters that are 057 * not delimiters. 058 * </ul><p> 059 * A TextTokenizer object internally maintains a current 060 * position within the text to be tokenized. Some operations advance this 061 * current position past the characters processed.<p> 062 * A token is returned by taking a subtext of the text that was used to 063 * create the TextTokenizer object. 064 * <p> 065 * The following is one example of the use of the tokenizer. The code: 066 * <blockquote><pre> 067 * TextTokenizer tt = TextTokenizer.valueOf("this is a test"); 068 * while (tt.hasMoreTokens()) { 069 * System.out.println(tt.nextToken()); 070 * } 071 * </pre></blockquote> 072 * <p> 073 * prints the following output: 074 * <blockquote><pre> 075 * this 076 * is 077 * a 078 * test 079 * </pre></blockquote> 080 * 081 * <p> 082 * TextTokenizer is heavily based on <code>java.util.StringTokenizer</code>. 083 * However, there are some improvements and additional methods and capabilities. 084 * </p> 085 * 086 * <p> Modified by: Joseph A. Huwaldt </p> 087 * 088 * @author Joseph A. Huwaldt Date: March 12, 2009 089 * @version February 23, 2025 090 */ 091public final class TextTokenizer implements Enumeration<Text>, Iterator<Text>, Iterable<Text>, Realtime, Reusable { 092 private static final Text DEFAULT_DELIMS = Text.intern(" \t\n\r\f"); 093 094 private int _currentPosition; 095 private int _newPosition; 096 private int _maxPosition; 097 private Text _text; 098 private Text _delimiters; 099 private boolean _retDelims; 100 private boolean _delimsChanged; 101 private boolean _honorQuotes = false; 102 private char _quoteChar = '"'; 103 104 /** 105 * Set to true if empty tokens should be returned. 106 * For example, if "" should be returned when text starts with 107 * a delimiter, has two delimiters next to each other, or 108 * ends with a delimiter. 109 */ 110 private boolean _returnEmptyTokens; 111 112 /** 113 * _maxDelimChar stores the value of the delimiter character with the 114 * highest value. It is used to optimize the detection of delimiter 115 * characters. 116 */ 117 private char _maxDelimChar; 118 119 120 /** 121 * Return a text tokenizer with an initially empty string of text and with 122 * no delimiters. Use {@link #setText} and {@link #setDelimiters} to make 123 * this instance useful. 124 * 125 * @return A text tokenizer with an initially empty string of text and with 126 * no delimiters. 127 */ 128 @SuppressWarnings("unchecked") 129 public static TextTokenizer newInstance() { 130 TextTokenizer o = FACTORY.object(); 131 o._text = Text.EMPTY; 132 o._delimiters = DEFAULT_DELIMS; 133 o._currentPosition = 0; 134 o._newPosition = -1; 135 o._maxPosition = o._text.length(); 136 o._delimsChanged = false; 137 o._returnEmptyTokens = false; 138 o._retDelims = false; 139 o._honorQuotes = false; 140 o._quoteChar = '"'; 141 o.setMaxDelimChar(); 142 return o; 143 } 144 145 /** 146 * Resets the internal state of this object to its default values. 147 */ 148 @Override 149 public void reset() { 150 _text = Text.EMPTY; 151 _delimiters = DEFAULT_DELIMS; 152 _currentPosition = 0; 153 _newPosition = -1; 154 _maxPosition = _text.length(); 155 _delimsChanged = false; 156 _returnEmptyTokens = false; 157 _retDelims = false; 158 _honorQuotes = false; 159 _quoteChar = '"'; 160 setMaxDelimChar(); 161 } 162 163 /** 164 * Return a text tokenizer for the specified character sequence. All 165 * characters in the <code>delim</code> argument are the delimiters 166 * for separating tokens. 167 * <p> 168 * If the <code>returnDelims</code> flag is <code>true</code>, then 169 * the delimiter characters are also returned as tokens. Each 170 * delimiter is returned as a string of length one. If the flag is 171 * <code>false</code>, the delimiter characters are skipped and only 172 * serve as separators between tokens. 173 * <p> 174 * Note that if delim is null, this constructor does 175 * not throw an exception. However, trying to invoke other methods on the 176 * resulting TextTokenizer may result in a 177 * NullPointerException. 178 * 179 * @param text the text to be parsed. 180 * @param delim the delimiters. 181 * @param returnDelims flag indicating whether to return the delimiters 182 * as tokens. 183 * @return A text tokenizer for the specified character sequence. 184 */ 185 public static TextTokenizer valueOf(CharSequence text, CharSequence delim, boolean returnDelims) { 186 TextTokenizer o = TextTokenizer.newInstance(); 187 188 o._text = (text != null ? Text.valueOf(text) : Text.EMPTY); 189 o._currentPosition = 0; 190 o._newPosition = -1; 191 o._maxPosition = o._text.length(); 192 193 o._delimsChanged = false; 194 o._delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY); 195 o._retDelims = returnDelims; 196 o.setMaxDelimChar(); 197 198 return o; 199 } 200 201 /** 202 * Return a text tokenizer for the specified character sequence. The 203 * characters in the <code>delim</code> argument are the delimiters 204 * for separating tokens. Delimiter characters themselves will not 205 * be treated as tokens. 206 * 207 * @param text the text to be parsed. 208 * @param delim the delimiters. 209 * @return A text tokenizer for the specified character sequence. 210 */ 211 public static TextTokenizer valueOf(CharSequence text, CharSequence delim) { 212 return TextTokenizer.valueOf(text, delim, false); 213 } 214 215 /** 216 * Return a text tokenizer for the specified character sequence. The 217 * tokenizer uses the default delimiter set, which is 218 * <code>" \t\n\r\f"</code>: the space character, 219 * the tab character, the newline character, the carriage-return character, 220 * and the form-feed character. Delimiter characters themselves will 221 * not be treated as tokens. 222 * 223 * @param text the text to be parsed. 224 * @return A text tokenizer for the specified character sequence. 225 */ 226 public static TextTokenizer valueOf(CharSequence text) { 227 return TextTokenizer.valueOf(text, DEFAULT_DELIMS, false); 228 } 229 230 231 /** 232 * Set the text to be tokenized in this TextTokenizer. 233 * <p> 234 * This is useful when for TextTokenizer re-use so that new string tokenizers do not 235 * have to be created for each string you want to tokenizer. 236 * <p> 237 * The text will be tokenized from the beginning of the text. 238 * 239 * @param text the text to be parsed. 240 */ 241 public void setText(CharSequence text){ 242 _text = (text != null ? Text.valueOf(text) : Text.EMPTY); 243 _currentPosition = 0; 244 _newPosition = -1; 245 _maxPosition = _text.length(); 246 } 247 248 /** 249 * Set the delimiters for this TextTokenizer. 250 * The position must be initialized before this method is used 251 * (setText does this and it is called from the constructor). 252 * 253 * @param delim the delimiters 254 */ 255 public void setDelimiters(CharSequence delim){ 256 _delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY); 257 258 /* delimiter string specified, so set the appropriate flag. */ 259 _delimsChanged = true; 260 261 setMaxDelimChar(); 262 } 263 264 /** 265 * Set the character to use as the "quote" character. All text between quote 266 * characters is considered a single token. The default quote character is <code>'"'</code>. 267 * 268 * @param quote The character to use as the "quote" character. 269 * @see #setHonorQuotes 270 */ 271 public void setQuoteChar(char quote) { 272 _quoteChar = quote; 273 } 274 275 /** 276 * Sets whether or not this tokenizer recognizes quoted text using the specified 277 * quote character. If <code>true</code> is passed, this tokenizer will consider any 278 * text between the specified quote characters as a single token. Honoring of 279 * quotes defaults to false. 280 * 281 * @param honorQuotes Treat quoted text as a single token if <code>true</code>. 282 * @see #setQuoteChar 283 */ 284 public void setHonorQuotes(boolean honorQuotes) { 285 _honorQuotes = honorQuotes; 286 } 287 288 /** 289 * Returns <code>true</code> if this tokenizer honors quoted text (counts it as a single token). 290 * 291 * @return <code>true</code> if this tokenizer honors quoted text. 292 */ 293 public boolean getHonorQuotes() { 294 return _honorQuotes; 295 } 296 297 /** 298 * Set _maxDelimChar to the highest char in the delimiter set. 299 */ 300 private void setMaxDelimChar() { 301 char m = 0; 302 for (int i = 0; i < _delimiters.length(); i++) { 303 char c = _delimiters.charAt(i); 304 if (m < c) 305 m = c; 306 } 307 _maxDelimChar = m; 308 } 309 310 /** 311 * Set whether empty tokens should be returned from this point in 312 * in the tokenizing process onward. 313 * <p> 314 * Empty tokens occur when two delimiters are next to each other 315 * or a delimiter occurs at the beginning or end of a string. If 316 * empty tokens are set to be returned, and a comma is the non token 317 * delimiter, the following table shows how many tokens are in each 318 * string.<br> 319 * <table><tr><th>String<th><th>Number of tokens<th></tr> 320 * <tr><td align=right>"one,two"<td><td>2 - normal case with no empty tokens.<td></tr> 321 * <tr><td align=right>"one,,three"<td><td>3 including the empty token in the middle.<td></tr> 322 * <tr><td align=right>"one,"<td><td>2 including the empty token at the end.<td></tr> 323 * <tr><td align=right>",two"<td><td>2 including the empty token at the beginning.<td></tr> 324 * <tr><td align=right>","<td><td>2 including the empty tokens at the beginning and the ends.<td></tr> 325 * <tr><td align=right>""<td><td>1 - all strings will have at least one token if empty tokens are returned.<td></tr></table> 326 * 327 * @param returnEmptyTokens true if and only if empty tokens should be returned. 328 */ 329 public void setReturnEmptyTokens(boolean returnEmptyTokens){ 330 _returnEmptyTokens = returnEmptyTokens; 331 } 332 333 /** 334 * Skips delimiters starting from the specified position. If _retDelims 335 * is false, returns the index of the first non-delimiter character at or 336 * after startPos. If _retDelims is true, startPos is returned. 337 */ 338 private int skipDelimiters(int startPos) { 339 if (Text.EMPTY.equals(_delimiters)) return _maxPosition; 340 341 int position = startPos; 342 while (!_retDelims && position < _maxPosition) { 343 char c = _text.charAt(position); 344 if ( (c > _maxDelimChar) || (_delimiters.indexOf(c,0) < 0) ) 345 break; 346 position++; 347 if (_returnEmptyTokens) 348 break; 349 } 350 return position; 351 } 352 353 /** 354 * Skips ahead from startPos and returns the index of the next delimiter 355 * character encountered, or _maxPosition if no such delimiter is found. 356 */ 357 private int scanToken(int startPos) { 358 boolean inQuote = false; 359 int position = startPos; 360 while (position < _maxPosition) { 361 char c = _text.charAt(position); 362 if (_honorQuotes && c == _quoteChar) { 363 if (!inQuote) 364 inQuote = true; 365 else 366 inQuote = false; 367 368 } else if (!inQuote && (c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0)) 369 break; 370 position++; 371 } 372 if (_retDelims && (startPos == position)) { 373 char c = _text.charAt(position); 374 if ((c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0)) 375 position++; 376 } 377 return position; 378 } 379 380 /** 381 * Tests if there are more tokens available from this tokenizer's text. 382 * If this method returns true, then a subsequent call to 383 * nextToken with no argument will successfully return a token. 384 * 385 * @return <code>true</code> if and only if there is at least one token 386 * in the text after the current position; <code>false</code> 387 * otherwise. 388 */ 389 public boolean hasMoreTokens() { 390 /* 391 * Temporary store this position and use it in the following 392 * nextToken() method only if the delimiters have'nt been changed in 393 * that nextToken() invocation. 394 */ 395 _newPosition = skipDelimiters(_currentPosition); 396 return (_newPosition < _maxPosition); 397 } 398 399 /** 400 * Returns the next token from this text tokenizer. 401 * 402 * @return the next token from this text tokenizer. 403 * @exception NoSuchElementException if there are no more tokens in this 404 * tokenizer's text. 405 */ 406 public Text nextToken() { 407 /* 408 * If next position already computed in hasMoreElements() and 409 * delimiters have changed between the computation and this invocation, 410 * then use the computed value. 411 */ 412 _currentPosition = (_newPosition >= 0 && !_delimsChanged) ? 413 _newPosition : skipDelimiters(_currentPosition); 414 415 /* Reset these anyway */ 416 _delimsChanged = false; 417 _newPosition = -1; 418 419 if (_currentPosition >= _maxPosition) 420 throw new NoSuchElementException(); 421 422 int start = _currentPosition; 423 _currentPosition = scanToken(_currentPosition); 424 425 return _text.subtext(start, _currentPosition); 426 } 427 428 /** 429 * Returns the next token in this text tokenizer's text. First, 430 * the set of characters considered to be delimiters by this 431 * TextTokenizer object is changed to be the characters in 432 * the string delim. Then the next token in the text 433 * after the current position is returned. The current position is 434 * advanced beyond the recognized token. The new delimiter set 435 * remains the default after this call. 436 * 437 * @param delim the new delimiters. 438 * @return the next token, after switching to the new delimiter set. 439 * @exception NoSuchElementException if there are no more tokens in this 440 * tokenizer's text. 441 */ 442 public Text nextToken(CharSequence delim) { 443 setDelimiters(delim); 444 return nextToken(); 445 } 446 447 /** 448 * Returns the same value as the <code>hasMoreTokens</code> 449 * method. It exists so that this class can implement the 450 * <code>Enumeration</code> interface. 451 * 452 * @return <code>true</code> if there are more tokens; 453 * <code>false</code> otherwise. 454 * @see java.util.Enumeration 455 * @see #hasMoreTokens() 456 */ 457 @Override 458 public boolean hasMoreElements() { 459 return hasMoreTokens(); 460 } 461 462 /** 463 * Returns the same value as the <code>nextToken</code> method. 464 * It exists so that this class can implement the 465 * <code>Enumeration</code> interface. 466 * 467 * @return the next token in the text. 468 * @exception NoSuchElementException if there are no more tokens in this 469 * tokenizer's text. 470 * @see java.util.Enumeration 471 * @see #nextToken() 472 */ 473 @Override 474 public Text nextElement() { 475 return nextToken(); 476 } 477 478 479 /** 480 * Returns an iterator over the tokens returned by this tokenizer. 481 */ 482 @Override 483 public Iterator<Text> iterator() { 484 return this; 485 } 486 487 /** 488 * Returns the same value as the <code>hasMoreTokens()</code> method. It exists 489 * so that this class can implement the <code>Iterator</code> interface. 490 * 491 * @return <code>true</code> if there are more tokens; 492 * <code>false</code> otherwise. 493 * 494 * @see java.util.Iterator 495 * @see #hasMoreTokens() 496 */ 497 @Override 498 public boolean hasNext(){ 499 return hasMoreTokens(); 500 } 501 502 /** 503 * Returns the same value as the <code>nextToken()</code> method. 504 * It exists so that this class can implement the 505 * <code>Iterator</code> interface. 506 * 507 * @return the next token in the text. 508 * @throws NoSuchElementException if there are no more tokens in this tokenizer's text. 509 * 510 * @see java.util.Iterator 511 * @see #nextToken() 512 */ 513 @Override 514 public Text next(){ 515 return nextToken(); 516 } 517 518 /** 519 * This implementation always throws <code>UnsupportedOperationException</code>. 520 * It exists so that this class can implement the <code>Iterator</code> interface. 521 * 522 * @throws UnsupportedOperationException always is thrown. 523 * 524 * @see java.util.Iterator 525 */ 526 @Override 527 public void remove(){ 528 throw new UnsupportedOperationException(); 529 } 530 531 /** 532 * Calculates the number of times that this tokenizer's 533 * <code>nextToken</code> method can be called before it generates an 534 * exception. The current position is not advanced. 535 * 536 * @return the number of tokens remaining in the text using the current 537 * delimiter set. 538 * @see #nextToken() 539 */ 540 public int countTokens() { 541 int count = 0; 542 int currpos = _currentPosition; 543 while (currpos < _maxPosition) { 544 currpos = skipDelimiters(currpos); 545 if (currpos >= _maxPosition) 546 break; 547 currpos = scanToken(currpos); 548 count++; 549 } 550 return count; 551 } 552 553 /** 554 * Calculates the number of times that this tokenizer's <code>nextToken</code> 555 * method can be called before it generates an exception using the given set of 556 * delimiters. The delimiters given will be used for future calls to 557 * nextToken() unless new delimiters are given. The current position 558 * is not advanced. 559 * 560 * @param delims the new set of delimiters. 561 * @return the number of tokens remaining in the text using the new 562 * delimiter set. 563 * 564 * @see #countTokens() 565 */ 566 public int countTokens(CharSequence delims){ 567 setDelimiters(delims); 568 return countTokens(); 569 } 570 571 /** 572 * Retrieves the rest of the text as a single token. 573 * After calling this method hasMoreTokens() will always return false. 574 * 575 * @return any part of the text that has not yet been tokenized. 576 */ 577 public Text restOfText() { 578 Text output = _text.subtext(_currentPosition, _maxPosition); 579 _currentPosition = _maxPosition; 580 return output; 581 } 582 583 /** 584 * Returns the same value as the <code>nextToken()</code> method. 585 * It exists so that this class can implement the 586 * <code>Realtime</code> interface. 587 * 588 * @return the next token in the text. 589 * @throws NoSuchElementException if there are no more tokens in this tokenizer's text. 590 * 591 * @see javolution.lang.Realtime 592 * @see #nextToken() 593 */ 594 @Override 595 public Text toText() { 596 return nextToken(); 597 } 598 599 /** 600 * Recycles a <code>TextTokenizer</code> instance immediately 601 * (on the stack when executing in a <code>StackContext</code>). 602 * 603 * @param instance the instance of this class to recycle. 604 */ 605 public static void recycle(TextTokenizer instance) { 606 FACTORY.recycle(instance); 607 } 608 609 610 ////////////////////// 611 // Factory Creation // 612 ////////////////////// 613 614 @SuppressWarnings("unchecked") 615 private static final ObjectFactory<TextTokenizer> FACTORY = new ObjectFactory<TextTokenizer>() { 616 @Override 617 protected TextTokenizer create() { 618 return new TextTokenizer(); 619 } 620 @Override 621 protected void cleanup(TextTokenizer instance) { 622 instance.reset(); 623 } 624 }; 625 626 private TextTokenizer() { } 627 628 629 /** 630 * Testing code for this class. 631 * 632 * @param args the command-line arguments 633 */ 634 public static void main (String args[]) { 635 System.out.println("Testing TextTokenizer:"); 636 637 System.out.println("\nTokenize: \"this is a test\":"); 638 TextTokenizer tt = TextTokenizer.valueOf("this is a test"); 639 while (tt.hasMoreTokens()) { 640 tt.nextToken().println(); 641 } 642 643 System.out.println("\nTokenize: \"this,is,,a,test\" returning empty tokens:"); 644 tt.setText("this,is,,a,test"); 645 tt.setDelimiters(","); 646 tt.setReturnEmptyTokens(true); 647 while (tt.hasMoreTokens()) { 648 tt.nextToken().println(); 649 } 650 651 } 652}