001/*
002*   TextTokenizer -- A javolution.text.Text compatible replacement for java.util.StringTokenizer
003*
004*   Copyright (C) 2009-2025 by Joseph A. Huwaldt.
005*   All rights reserved.
006*   
007*   This library is free software; you can redistribute it and/or
008*   modify it under the terms of the GNU Lesser General Public
009*   License as published by the Free Software Foundation; either
010*   version 2 of the License, or (at your option) any later version.
011*   
012*   This library is distributed in the hope that it will be useful,
013*   but WITHOUT ANY WARRANTY; without even the implied warranty of
014*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
015*   Lesser General Public License for more details.
016*
017*   You should have received a copy of the GNU Lesser General Public License
018*   along with this program; if not, write to the Free Software
019*   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
020*   Or visit:  http://www.gnu.org/licenses/lgpl.html
021*
022**/
023package jahuwaldt.js.util;
024
025import java.util.NoSuchElementException;
026import java.util.Iterator;
027import java.util.Enumeration;
028
029import javolution.context.ObjectFactory;
030import javolution.lang.Realtime;
031import javolution.lang.Reusable;
032import javolution.text.Text;
033
034
035/**
036 * The text tokenizer class allows an application to break a 
037 * <code>Text</code> object into tokens. The tokenization method is much simpler than 
038 * the one used by the <code>StreamTokenizer</code> class. The 
039 * <code>TextTokenizer</code> methods do not distinguish among 
040 * identifiers, numbers, and quoted strings, nor do they recognize 
041 * and skip comments. 
042 * <p>
043 * The set of delimiters (the characters that separate tokens) may 
044 * be specified either at creation time or on a per-token basis. 
045 * <p>
046 * An instance of <code>TextTokenizer</code> behaves in one of two 
047 * ways, depending on whether it was created with the 
048 * <code>returnDelims</code> flag having the value <code>true</code> 
049 * or <code>false</code>: 
050 * <ul>
051 * <li>If the flag is <code>false</code>, delimiter characters serve to 
052 *     separate tokens. A token is a maximal sequence of consecutive 
053 *     characters that are not delimiters. 
054 * <li>If the flag is <code>true</code>, delimiter characters are themselves 
055 *     considered to be tokens. A token is thus either one delimiter 
056 *     character, or a maximal sequence of consecutive characters that are 
057 *     not delimiters.
058 * </ul><p>
059 * A TextTokenizer object internally maintains a current 
060 * position within the text to be tokenized. Some operations advance this 
061 * current position past the characters processed.<p>
062 * A token is returned by taking a subtext of the text that was used to 
063 * create the TextTokenizer object.
064 * <p>
065 * The following is one example of the use of the tokenizer. The code:
066 * <blockquote><pre>
067 *     TextTokenizer tt = TextTokenizer.valueOf("this is a test");
068 *     while (tt.hasMoreTokens()) {
069 *         System.out.println(tt.nextToken());
070 *     }
071 * </pre></blockquote>
072 * <p>
073 * prints the following output:
074 * <blockquote><pre>
075 *     this
076 *     is
077 *     a
078 *     test
079 * </pre></blockquote>
080 *
081 * <p>
082 * TextTokenizer is heavily based on <code>java.util.StringTokenizer</code>.
083 * However, there are some improvements and additional methods and capabilities.
084 * <p>
085 *
086 *  <p>  Modified by:  Joseph A. Huwaldt   </p>
087 *
088 *  @author  Joseph A. Huwaldt   Date: March 12, 2009
089 *  @version February 17, 2025
090 */
091public final class TextTokenizer implements Enumeration<Text>, Iterator<Text>, Iterable<Text>, Realtime, Reusable {
092        private static final Text DEFAULT_DELIMS = Text.intern(" \t\n\r\f");
093        
094    private int _currentPosition;
095    private int _newPosition;
096    private int _maxPosition;
097    private Text _text;
098    private Text _delimiters;
099    private boolean _retDelims;
100    private boolean _delimsChanged;
101        private boolean _honorQuotes = false;
102        private char _quoteChar = '"';
103
104        /**
105         * Set to true if empty tokens should be returned.
106         * For example, if "" should be returned when text starts with
107         * a delimiter, has two delimiters next to each other, or
108         * ends with a delimiter.
109         */
110        private boolean _returnEmptyTokens;
111
112    /**
113     * _maxDelimChar stores the value of the delimiter character with the
114     * highest value. It is used to optimize the detection of delimiter
115     * characters.
116     */
117    private char _maxDelimChar;
118
119
120    /**
121     * Return a text tokenizer with an initially empty string of text and with
122         * no delimiters. Use {@link #setText} and {@link #setDelimiters} to make
123         * this instance useful.
124         */
125        @SuppressWarnings("unchecked")
126        public static TextTokenizer newInstance() {
127                TextTokenizer o = FACTORY.object();
128                o._text = Text.EMPTY;
129                o._delimiters = DEFAULT_DELIMS;
130                o._currentPosition = 0;
131                o._newPosition = -1;
132                o._maxPosition = o._text.length();
133                o._delimsChanged = false;
134                o._returnEmptyTokens = false;
135                o._retDelims = false;
136        o._honorQuotes = false;
137        o._quoteChar = '"';
138        o.setMaxDelimChar();
139                return o;
140        }
141
142        /**
143        *  Resets the internal state of this object to its default values.
144        **/
145    @Override
146        public void reset() {
147                _text = Text.EMPTY;
148                _delimiters = DEFAULT_DELIMS;
149                _currentPosition = 0;
150                _newPosition = -1;
151                _maxPosition = _text.length();
152                _delimsChanged = false;
153                _returnEmptyTokens = false;
154                _retDelims = false;
155        _honorQuotes = false;
156        _quoteChar = '"';
157                setMaxDelimChar();
158        }
159        
160    /**
161     * Return a text tokenizer for the specified character sequence. All  
162     * characters in the <code>delim</code> argument are the delimiters 
163     * for separating tokens. 
164     * <p>
165     * If the <code>returnDelims</code> flag is <code>true</code>, then 
166     * the delimiter characters are also returned as tokens. Each 
167     * delimiter is returned as a string of length one. If the flag is 
168     * <code>false</code>, the delimiter characters are skipped and only 
169     * serve as separators between tokens. 
170     * <p>
171     * Note that if delim is null, this constructor does
172     * not throw an exception. However, trying to invoke other methods on the
173     * resulting TextTokenizer may result in a 
174     * NullPointerException.
175     *
176     * @param   text           the text to be parsed.
177     * @param   delim          the delimiters.
178     * @param   returnDelims   flag indicating whether to return the delimiters
179     *                         as tokens.
180     */
181    public static TextTokenizer valueOf(CharSequence text, CharSequence delim, boolean returnDelims) {
182                TextTokenizer o = TextTokenizer.newInstance();
183                
184                o._text = (text != null ? Text.valueOf(text) : Text.EMPTY);
185                o._currentPosition = 0;
186                o._newPosition = -1;
187                o._maxPosition = o._text.length();
188                
189                o._delimsChanged = false;
190                o._delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY);
191                o._retDelims = returnDelims;
192        o.setMaxDelimChar();
193                
194                return o;
195    }
196
197    /**
198     * Return a text tokenizer for the specified character sequence. The 
199     * characters in the <code>delim</code> argument are the delimiters 
200     * for separating tokens. Delimiter characters themselves will not 
201     * be treated as tokens.
202     *
203     * @param   text    the text to be parsed.
204     * @param   delim   the delimiters.
205     */
206    public static TextTokenizer valueOf(CharSequence text, CharSequence delim) {
207                return TextTokenizer.valueOf(text, delim, false);
208    }
209
210    /**
211     * Return a text tokenizer for the specified character sequence. The 
212     * tokenizer uses the default delimiter set, which is 
213     * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character, 
214     * the tab character, the newline character, the carriage-return character,
215     * and the form-feed character. Delimiter characters themselves will 
216     * not be treated as tokens.
217     *
218     * @param   text  the text to be parsed.
219     */
220    public static TextTokenizer valueOf(CharSequence text) {
221                return TextTokenizer.valueOf(text, DEFAULT_DELIMS, false);
222    }
223
224
225        /**
226         * Set the text to be tokenized in this TextTokenizer.
227         * <p>
228         * This is useful when for TextTokenizer re-use so that new string tokenizers do not
229         * have to be created for each string you want to tokenizer.
230         * <p>
231         * The text will be tokenized from the beginning of the text.
232         *
233         * @param text the text to be parsed.
234         */
235        public void setText(CharSequence text){
236                _text = (text != null ? Text.valueOf(text) : Text.EMPTY);
237                _currentPosition = 0;
238                _newPosition = -1;
239                _maxPosition = _text.length();
240        }
241        
242        /**
243         * Set the delimiters for this TextTokenizer.
244         * The position must be initialized before this method is used
245         * (setText does this and it is called from the constructor).
246         *
247         * @param delim  the delimiters
248         */
249        public void setDelimiters(CharSequence delim){
250                _delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY);
251                
252                /* delimiter string specified, so set the appropriate flag. */
253                _delimsChanged = true;
254
255                setMaxDelimChar();
256        }
257        
258        /**
259        *  Set the character to use as the "quote" character.  All text between quote
260        *  characters is considered a single token.  The default quote character is <code>'"'</code>.
261        *
262        *  @see #setHonorQuotes
263        **/
264        public void setQuoteChar(char quote) {
265                _quoteChar = quote;
266        }
267        
268        /**
269        *  Sets whether or not this tokenizer recognizes quoted text using the specified
270        *  quote character.  If <code>true</code> is passed, this tokenizer will consider any
271        *  text between the specified quote characters as a single token.  Honoring of
272        *  quotes defaults to false.
273        *
274        *  @see #setQuoteChar
275        **/
276        public void setHonorQuotes(boolean honorQuotes) {
277                _honorQuotes = honorQuotes;
278        }
279        
280        /**
281        *  Returns <code>true</code> if this tokenizer honors quoted text (counts it as a single token).
282        **/
283        public boolean getHonorQuotes() {
284                return _honorQuotes;
285        }
286        
287     /**
288     * Set _maxDelimChar to the highest char in the delimiter set.
289     */
290    private void setMaxDelimChar() {
291                char m = 0;
292                for (int i = 0; i < _delimiters.length(); i++) {
293                        char c = _delimiters.charAt(i);
294                        if (m < c)
295                        m = c;
296                }
297                _maxDelimChar = m;
298    }
299
300        /**
301         * Set whether empty tokens should be returned from this point in
302         * in the tokenizing process onward.
303         * <p>
304         * Empty tokens occur when two delimiters are next to each other
305         * or a delimiter occurs at the beginning or end of a string. If
306         * empty tokens are set to be returned, and a comma is the non token
307         * delimiter, the following table shows how many tokens are in each
308         * string.<br>
309         * <table><tr><th>String<th><th>Number of tokens<th></tr>
310         * <tr><td align=right>"one,two"<td><td>2 - normal case with no empty tokens.<td></tr>
311         * <tr><td align=right>"one,,three"<td><td>3 including the empty token in the middle.<td></tr>
312         * <tr><td align=right>"one,"<td><td>2 including the empty token at the end.<td></tr>
313         * <tr><td align=right>",two"<td><td>2 including the empty token at the beginning.<td></tr>
314         * <tr><td align=right>","<td><td>2 including the empty tokens at the beginning and the ends.<td></tr>
315         * <tr><td align=right>""<td><td>1 - all strings will have at least one token if empty tokens are returned.<td></tr></table>
316         *
317         * @param returnEmptyTokens true if and only if empty tokens should be returned.
318         */
319        public void setReturnEmptyTokens(boolean returnEmptyTokens){
320                _returnEmptyTokens = returnEmptyTokens;
321        }
322
323   /**
324     * Skips delimiters starting from the specified position. If _retDelims
325     * is false, returns the index of the first non-delimiter character at or
326     * after startPos. If _retDelims is true, startPos is returned.
327     */
328    private int skipDelimiters(int startPos) {
329                if (Text.EMPTY.equals(_delimiters))     return _maxPosition;
330                
331        int position = startPos;
332                while (!_retDelims && position < _maxPosition) {
333                        char c = _text.charAt(position);
334                        if ( (c > _maxDelimChar) || (_delimiters.indexOf(c,0) < 0) )
335                                break;
336                        position++;
337                        if (_returnEmptyTokens)
338                                break;
339                }
340        return position;
341    }
342
343    /**
344     * Skips ahead from startPos and returns the index of the next delimiter
345     * character encountered, or _maxPosition if no such delimiter is found.
346     */
347    private int scanToken(int startPos) {
348                boolean inQuote = false;
349        int position = startPos;
350        while (position < _maxPosition) {
351            char c = _text.charAt(position);
352                        if (_honorQuotes && c == _quoteChar) {
353                                if (!inQuote)
354                                        inQuote = true;
355                                else
356                                        inQuote = false;
357                                
358                        } else if (!inQuote && (c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0))
359                break;
360            position++;
361                }
362                if (_retDelims && (startPos == position)) {
363                        char c = _text.charAt(position);
364                        if ((c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0))
365                                        position++;
366        }
367        return position;
368    }
369
370    /**
371     * Tests if there are more tokens available from this tokenizer's text. 
372     * If this method returns true, then a subsequent call to 
373     * nextToken with no argument will successfully return a token.
374     *
375     * @return  <code>true</code> if and only if there is at least one token 
376     *          in the text after the current position; <code>false</code> 
377     *          otherwise.
378     */
379    public boolean hasMoreTokens() {
380                /*
381                 * Temporary store this position and use it in the following
382                 * nextToken() method only if the delimiters have'nt been changed in
383                 * that nextToken() invocation.
384                 */
385                _newPosition = skipDelimiters(_currentPosition);
386                return (_newPosition < _maxPosition);
387    }
388
389    /**
390     * Returns the next token from this text tokenizer.
391     *
392     * @return     the next token from this text tokenizer.
393     * @exception  NoSuchElementException  if there are no more tokens in this
394     *               tokenizer's text.
395     */
396    public Text nextToken() {
397                /* 
398                 * If next position already computed in hasMoreElements() and
399                 * delimiters have changed between the computation and this invocation,
400                 * then use the computed value.
401                 */
402                _currentPosition = (_newPosition >= 0 && !_delimsChanged) ?  
403                                                                _newPosition : skipDelimiters(_currentPosition);
404
405                /* Reset these anyway */
406                _delimsChanged = false;
407                _newPosition = -1;
408
409                if (_currentPosition >= _maxPosition)
410                        throw new NoSuchElementException();
411                
412                int start = _currentPosition;
413                _currentPosition = scanToken(_currentPosition);
414                
415                return _text.subtext(start, _currentPosition);
416    }
417
418    /**
419     * Returns the next token in this text tokenizer's text. First, 
420     * the set of characters considered to be delimiters by this 
421     * TextTokenizer object is changed to be the characters in 
422     * the string delim. Then the next token in the text
423     * after the current position is returned. The current position is 
424     * advanced beyond the recognized token.  The new delimiter set 
425     * remains the default after this call. 
426     *
427     * @param      delim   the new delimiters.
428     * @return     the next token, after switching to the new delimiter set.
429     * @exception  NoSuchElementException  if there are no more tokens in this
430     *               tokenizer's text.
431     */
432    public Text nextToken(CharSequence delim) {
433                setDelimiters(delim);
434                return nextToken();
435    }
436
437    /**
438     * Returns the same value as the <code>hasMoreTokens</code>
439     * method. It exists so that this class can implement the
440     * <code>Enumeration</code> interface. 
441     *
442     * @return  <code>true</code> if there are more tokens;
443     *          <code>false</code> otherwise.
444     * @see     java.util.Enumeration
445     * @see     #hasMoreTokens()
446     */
447    @Override
448    public boolean hasMoreElements() {
449                return hasMoreTokens();
450    }
451
452    /**
453     * Returns the same value as the <code>nextToken</code> method.
454         * It exists so that this class can implement the
455     * <code>Enumeration</code> interface. 
456     *
457     * @return     the next token in the text.
458     * @exception  NoSuchElementException  if there are no more tokens in this
459     *               tokenizer's text.
460     * @see        java.util.Enumeration
461     * @see        #nextToken()
462     */
463    @Override
464    public Text nextElement() {
465                return nextToken();
466    }
467
468        
469        /**
470        *  Returns an iterator over the tokens returned by this tokenizer.
471        **/
472    @Override
473        public Iterator<Text> iterator() {
474                return this;
475        }
476        
477        /**
478         * Returns the same value as the <code>hasMoreTokens()</code> method. It exists
479         * so that this class can implement the <code>Iterator</code> interface.
480         *
481         * @return <code>true</code> if there are more tokens;
482         *     <code>false</code> otherwise.
483         *
484         * @see java.util.Iterator
485         * @see #hasMoreTokens()
486         */
487    @Override
488        public boolean hasNext(){
489                return hasMoreTokens();
490        }
491
492        /**
493         * Returns the same value as the <code>nextToken()</code> method.
494         * It exists so that this class can implement the
495         * <code>Iterator</code> interface.
496         *
497         * @return the next token in the text.
498         * @throws NoSuchElementException if there are no more tokens in this tokenizer's text.
499         *
500         * @see java.util.Iterator
501         * @see #nextToken()
502         */
503    @Override
504        public Text next(){
505                return nextToken();
506        }
507
508        /**
509         * This implementation always throws <code>UnsupportedOperationException</code>.
510         * It exists so that this class can implement the <code>Iterator</code> interface.
511         *
512         * @throws UnsupportedOperationException always is thrown.
513         *
514         * @see java.util.Iterator
515         */
516    @Override
517        public void remove(){
518                throw new UnsupportedOperationException();
519        }
520
521   /**
522     * Calculates the number of times that this tokenizer's 
523     * <code>nextToken</code> method can be called before it generates an 
524     * exception. The current position is not advanced.
525     *
526     * @return  the number of tokens remaining in the text using the current
527     *          delimiter set.
528     * @see     #nextToken()
529     */
530    public int countTokens() {
531                int count = 0;
532                int currpos = _currentPosition;
533                while (currpos < _maxPosition) {
534                                currpos = skipDelimiters(currpos);
535                        if (currpos >= _maxPosition)
536                        break;
537                                currpos = scanToken(currpos);
538                        count++;
539                }
540                return count;
541    }
542
543        /**
544         * Calculates the number of times that this tokenizer's <code>nextToken</code>
545         * method can be called before it generates an exception using the given set of
546         * delimiters.  The delimiters given will be used for future calls to
547         * nextToken() unless new delimiters are given. The current position
548         * is not advanced.
549         *
550         * @param delims the new set of delimiters.
551         * @return the number of tokens remaining in the text using the new
552         *    delimiter set.
553         *
554         * @see #countTokens()
555         */
556        public int countTokens(CharSequence delims){
557                setDelimiters(delims);
558                return countTokens();
559        }
560
561        /**
562         * Retrieves the rest of the text as a single token.
563         * After calling this method hasMoreTokens() will always return false.
564         *
565         * @return any part of the text that has not yet been tokenized.
566         */
567        public Text restOfText() {
568                Text output = _text.subtext(_currentPosition, _maxPosition);
569                _currentPosition = _maxPosition;
570                return output;
571        }
572
573        /**
574         * Returns the same value as the <code>nextToken()</code> method.
575         * It exists so that this class can implement the
576         * <code>Realtime</code> interface.
577         *
578         * @return the next token in the text.
579         * @throws NoSuchElementException if there are no more tokens in this tokenizer's text.
580         *
581         * @see javolution.lang.Realtime
582         * @see #nextToken()
583         */
584    @Override
585        public Text toText() {
586                return nextToken();
587        }
588
589        /**
590        * Recycles a <code>TextTokenizer</code> instance immediately
591        * (on the stack when executing in a <code>StackContext</code>).
592        **/
593        public static void recycle(TextTokenizer instance) {
594                FACTORY.recycle(instance);
595        }
596        
597
598        //////////////////////
599        // Factory Creation //
600        //////////////////////
601        
602        @SuppressWarnings("unchecked")
603        private static final ObjectFactory<TextTokenizer> FACTORY = new ObjectFactory<TextTokenizer>() {
604                @Override
605                protected TextTokenizer create() {
606                        return new TextTokenizer();
607                }
608        @Override
609                protected void cleanup(TextTokenizer instance) {
610                        instance.reset();
611                }
612        };
613
614        private TextTokenizer() { }
615
616
617        /**
618        *  Testing code for this class.
619        **/
620        public static void main (String args[]) {
621                System.out.println("Testing TextTokenizer:");
622                
623                System.out.println("\nTokenize: \"this is a test\":");
624                TextTokenizer tt = TextTokenizer.valueOf("this is a test");
625                while (tt.hasMoreTokens()) {
626                        tt.nextToken().println();
627                }
628                
629                System.out.println("\nTokenize: \"this,is,,a,test\" returning empty tokens:");
630                tt.setText("this,is,,a,test");
631                tt.setDelimiters(",");
632                tt.setReturnEmptyTokens(true);
633                while (tt.hasMoreTokens()) {
634                        tt.nextToken().println();
635                }
636                
637        }
638}