Source code

001/*
002*   TextTokenizer -- A javolution.text.Text compatible replacement for java.util.StringTokenizer
003*
004*   Copyright (C) 2009-2025 by Joseph A. Huwaldt.
005*   All rights reserved.
006*   
007*   This library is free software; you can redistribute it and/or
008*   modify it under the terms of the GNU Lesser General Public
009*   License as published by the Free Software Foundation; either
010*   version 2 of the License, or (at your option) any later version.
011*   
012*   This library is distributed in the hope that it will be useful,
013*   but WITHOUT ANY WARRANTY; without even the implied warranty of
014*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
015*   Lesser General Public License for more details.
016*
017*   You should have received a copy of the GNU Lesser General Public License
018*   along with this program; if not, write to the Free Software
019*   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
020*   Or visit:  http://www.gnu.org/licenses/lgpl.html
021*
022*/
023package jahuwaldt.js.util;
024
025import java.util.NoSuchElementException;
026import java.util.Iterator;
027import java.util.Enumeration;
028
029import javolution.context.ObjectFactory;
030import javolution.lang.Realtime;
031import javolution.lang.Reusable;
032import javolution.text.Text;
033
034
035/**
036 * The text tokenizer class allows an application to break a 
037 * <code>Text</code> object into tokens. The tokenization method is much simpler than 
038 * the one used by the <code>StreamTokenizer</code> class. The 
039 * <code>TextTokenizer</code> methods do not distinguish among 
040 * identifiers, numbers, and quoted strings, nor do they recognize 
041 * and skip comments. 
042 * <p>
043 * The set of delimiters (the characters that separate tokens) may 
044 * be specified either at creation time or on a per-token basis. 
045 * <p>
046 * An instance of <code>TextTokenizer</code> behaves in one of two 
047 * ways, depending on whether it was created with the 
048 * <code>returnDelims</code> flag having the value <code>true</code> 
049 * or <code>false</code>: 
050 * <ul>
051 * <li>If the flag is <code>false</code>, delimiter characters serve to 
052 *     separate tokens. A token is a maximal sequence of consecutive 
053 *     characters that are not delimiters. 
054 * <li>If the flag is <code>true</code>, delimiter characters are themselves 
055 *     considered to be tokens. A token is thus either one delimiter 
056 *     character, or a maximal sequence of consecutive characters that are 
057 *     not delimiters.
058 * </ul><p>
059 * A TextTokenizer object internally maintains a current 
060 * position within the text to be tokenized. Some operations advance this 
061 * current position past the characters processed.<p>
062 * A token is returned by taking a subtext of the text that was used to 
063 * create the TextTokenizer object.
064 * <p>
065 * The following is one example of the use of the tokenizer. The code:
066 * <blockquote><pre>
067 *     TextTokenizer tt = TextTokenizer.valueOf("this is a test");
068 *     while (tt.hasMoreTokens()) {
069 *         System.out.println(tt.nextToken());
070 *     }
071 * </pre></blockquote>
072 * <p>
073 * prints the following output:
074 * <blockquote><pre>
075 *     this
076 *     is
077 *     a
078 *     test
079 * </pre></blockquote>
080 *
081 * <p>
082 * TextTokenizer is heavily based on <code>java.util.StringTokenizer</code>.
083 * However, there are some improvements and additional methods and capabilities.
084 * </p>
085 *
086 *  <p>  Modified by:  Joseph A. Huwaldt   </p>
087 *
088 *  @author  Joseph A. Huwaldt   Date: March 12, 2009
089 *  @version February 23, 2025
090 */
091public final class TextTokenizer implements Enumeration<Text>, Iterator<Text>, Iterable<Text>, Realtime, Reusable {
092        private static final Text DEFAULT_DELIMS = Text.intern(" \t\n\r\f");
093        
094    private int _currentPosition;
095    private int _newPosition;
096    private int _maxPosition;
097    private Text _text;
098    private Text _delimiters;
099    private boolean _retDelims;
100    private boolean _delimsChanged;
101        private boolean _honorQuotes = false;
102        private char _quoteChar = '"';
103
104        /**
105         * Set to true if empty tokens should be returned.
106         * For example, if "" should be returned when text starts with
107         * a delimiter, has two delimiters next to each other, or
108         * ends with a delimiter.
109         */
110        private boolean _returnEmptyTokens;
111
112    /**
113     * _maxDelimChar stores the value of the delimiter character with the
114     * highest value. It is used to optimize the detection of delimiter
115     * characters.
116     */
117    private char _maxDelimChar;
118
119
120    /**
121     * Return a text tokenizer with an initially empty string of text and with
122         * no delimiters. Use {@link #setText} and {@link #setDelimiters} to make
123         * this instance useful.
124         *
125         * @return A text tokenizer with an initially empty string of text and with
126         * no delimiters.
127         */
128        @SuppressWarnings("unchecked")
129        public static TextTokenizer newInstance() {
130                TextTokenizer o = FACTORY.object();
131                o._text = Text.EMPTY;
132                o._delimiters = DEFAULT_DELIMS;
133                o._currentPosition = 0;
134                o._newPosition = -1;
135                o._maxPosition = o._text.length();
136                o._delimsChanged = false;
137                o._returnEmptyTokens = false;
138                o._retDelims = false;
139        o._honorQuotes = false;
140        o._quoteChar = '"';
141        o.setMaxDelimChar();
142                return o;
143        }
144
145        /**
146        *  Resets the internal state of this object to its default values.
147        */
148    @Override
149        public void reset() {
150                _text = Text.EMPTY;
151                _delimiters = DEFAULT_DELIMS;
152                _currentPosition = 0;
153                _newPosition = -1;
154                _maxPosition = _text.length();
155                _delimsChanged = false;
156                _returnEmptyTokens = false;
157                _retDelims = false;
158        _honorQuotes = false;
159        _quoteChar = '"';
160                setMaxDelimChar();
161        }
162        
163    /**
164     * Return a text tokenizer for the specified character sequence. All  
165     * characters in the <code>delim</code> argument are the delimiters 
166     * for separating tokens. 
167     * <p>
168     * If the <code>returnDelims</code> flag is <code>true</code>, then 
169     * the delimiter characters are also returned as tokens. Each 
170     * delimiter is returned as a string of length one. If the flag is 
171     * <code>false</code>, the delimiter characters are skipped and only 
172     * serve as separators between tokens. 
173     * <p>
174     * Note that if delim is null, this constructor does
175     * not throw an exception. However, trying to invoke other methods on the
176     * resulting TextTokenizer may result in a 
177     * NullPointerException.
178     *
179     * @param   text           the text to be parsed.
180     * @param   delim          the delimiters.
181     * @param   returnDelims   flag indicating whether to return the delimiters
182     *                         as tokens.
183     * @return A text tokenizer for the specified character sequence.
184     */
185    public static TextTokenizer valueOf(CharSequence text, CharSequence delim, boolean returnDelims) {
186                TextTokenizer o = TextTokenizer.newInstance();
187                
188                o._text = (text != null ? Text.valueOf(text) : Text.EMPTY);
189                o._currentPosition = 0;
190                o._newPosition = -1;
191                o._maxPosition = o._text.length();
192                
193                o._delimsChanged = false;
194                o._delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY);
195                o._retDelims = returnDelims;
196        o.setMaxDelimChar();
197                
198                return o;
199    }
200
201    /**
202     * Return a text tokenizer for the specified character sequence. The 
203     * characters in the <code>delim</code> argument are the delimiters 
204     * for separating tokens. Delimiter characters themselves will not 
205     * be treated as tokens.
206     *
207     * @param   text    the text to be parsed.
208     * @param   delim   the delimiters.
209     * @return A text tokenizer for the specified character sequence.
210     */
211    public static TextTokenizer valueOf(CharSequence text, CharSequence delim) {
212                return TextTokenizer.valueOf(text, delim, false);
213    }
214
215    /**
216     * Return a text tokenizer for the specified character sequence. The 
217     * tokenizer uses the default delimiter set, which is 
218     * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character, 
219     * the tab character, the newline character, the carriage-return character,
220     * and the form-feed character. Delimiter characters themselves will 
221     * not be treated as tokens.
222     *
223     * @param   text  the text to be parsed.
224     * @return A text tokenizer for the specified character sequence.
225     */
226    public static TextTokenizer valueOf(CharSequence text) {
227                return TextTokenizer.valueOf(text, DEFAULT_DELIMS, false);
228    }
229
230
231        /**
232         * Set the text to be tokenized in this TextTokenizer.
233         * <p>
234         * This is useful when for TextTokenizer re-use so that new string tokenizers do not
235         * have to be created for each string you want to tokenizer.
236         * <p>
237         * The text will be tokenized from the beginning of the text.
238         *
239         * @param text the text to be parsed.
240         */
241        public void setText(CharSequence text){
242                _text = (text != null ? Text.valueOf(text) : Text.EMPTY);
243                _currentPosition = 0;
244                _newPosition = -1;
245                _maxPosition = _text.length();
246        }
247        
248        /**
249         * Set the delimiters for this TextTokenizer.
250         * The position must be initialized before this method is used
251         * (setText does this and it is called from the constructor).
252         *
253         * @param delim  the delimiters
254         */
255        public void setDelimiters(CharSequence delim){
256                _delimiters = (delim != null ? Text.valueOf(delim) : Text.EMPTY);
257                
258                /* delimiter string specified, so set the appropriate flag. */
259                _delimsChanged = true;
260
261                setMaxDelimChar();
262        }
263        
264        /**
265        *  Set the character to use as the "quote" character.  All text between quote
266        *  characters is considered a single token.  The default quote character is <code>'"'</code>.
267        *
268        * @param quote The character to use as the "quote" character.
269        * @see #setHonorQuotes
270        */
271        public void setQuoteChar(char quote) {
272                _quoteChar = quote;
273        }
274        
275        /**
276        *  Sets whether or not this tokenizer recognizes quoted text using the specified
277        *  quote character.  If <code>true</code> is passed, this tokenizer will consider any
278        *  text between the specified quote characters as a single token.  Honoring of
279        *  quotes defaults to false.
280        *
281        * @param honorQuotes Treat quoted text as a single token if <code>true</code>.
282        * @see #setQuoteChar
283        */
284        public void setHonorQuotes(boolean honorQuotes) {
285                _honorQuotes = honorQuotes;
286        }
287        
288        /**
289        *  Returns <code>true</code> if this tokenizer honors quoted text (counts it as a single token).
290        *
291        * @return <code>true</code> if this tokenizer honors quoted text.
292        */
293        public boolean getHonorQuotes() {
294                return _honorQuotes;
295        }
296        
297     /**
298     * Set _maxDelimChar to the highest char in the delimiter set.
299     */
300    private void setMaxDelimChar() {
301                char m = 0;
302                for (int i = 0; i < _delimiters.length(); i++) {
303                        char c = _delimiters.charAt(i);
304                        if (m < c)
305                        m = c;
306                }
307                _maxDelimChar = m;
308    }
309
310        /**
311         * Set whether empty tokens should be returned from this point in
312         * in the tokenizing process onward.
313         * <p>
314         * Empty tokens occur when two delimiters are next to each other
315         * or a delimiter occurs at the beginning or end of a string. If
316         * empty tokens are set to be returned, and a comma is the non token
317         * delimiter, the following table shows how many tokens are in each
318         * string.<br>
319         * <table><tr><th>String<th><th>Number of tokens<th></tr>
320         * <tr><td align=right>"one,two"<td><td>2 - normal case with no empty tokens.<td></tr>
321         * <tr><td align=right>"one,,three"<td><td>3 including the empty token in the middle.<td></tr>
322         * <tr><td align=right>"one,"<td><td>2 including the empty token at the end.<td></tr>
323         * <tr><td align=right>",two"<td><td>2 including the empty token at the beginning.<td></tr>
324         * <tr><td align=right>","<td><td>2 including the empty tokens at the beginning and the ends.<td></tr>
325         * <tr><td align=right>""<td><td>1 - all strings will have at least one token if empty tokens are returned.<td></tr></table>
326         *
327         * @param returnEmptyTokens true if and only if empty tokens should be returned.
328         */
329        public void setReturnEmptyTokens(boolean returnEmptyTokens){
330                _returnEmptyTokens = returnEmptyTokens;
331        }
332
333   /**
334     * Skips delimiters starting from the specified position. If _retDelims
335     * is false, returns the index of the first non-delimiter character at or
336     * after startPos. If _retDelims is true, startPos is returned.
337     */
338    private int skipDelimiters(int startPos) {
339                if (Text.EMPTY.equals(_delimiters))     return _maxPosition;
340                
341        int position = startPos;
342                while (!_retDelims && position < _maxPosition) {
343                        char c = _text.charAt(position);
344                        if ( (c > _maxDelimChar) || (_delimiters.indexOf(c,0) < 0) )
345                                break;
346                        position++;
347                        if (_returnEmptyTokens)
348                                break;
349                }
350        return position;
351    }
352
353    /**
354     * Skips ahead from startPos and returns the index of the next delimiter
355     * character encountered, or _maxPosition if no such delimiter is found.
356     */
357    private int scanToken(int startPos) {
358                boolean inQuote = false;
359        int position = startPos;
360        while (position < _maxPosition) {
361            char c = _text.charAt(position);
362                        if (_honorQuotes && c == _quoteChar) {
363                                if (!inQuote)
364                                        inQuote = true;
365                                else
366                                        inQuote = false;
367                                
368                        } else if (!inQuote && (c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0))
369                break;
370            position++;
371                }
372                if (_retDelims && (startPos == position)) {
373                        char c = _text.charAt(position);
374                        if ((c <= _maxDelimChar) && (_delimiters.indexOf(c,0) >= 0))
375                                        position++;
376        }
377        return position;
378    }
379
380    /**
381     * Tests if there are more tokens available from this tokenizer's text. 
382     * If this method returns true, then a subsequent call to 
383     * nextToken with no argument will successfully return a token.
384     *
385     * @return  <code>true</code> if and only if there is at least one token 
386     *          in the text after the current position; <code>false</code> 
387     *          otherwise.
388     */
389    public boolean hasMoreTokens() {
390                /*
391                 * Temporary store this position and use it in the following
392                 * nextToken() method only if the delimiters have'nt been changed in
393                 * that nextToken() invocation.
394                 */
395                _newPosition = skipDelimiters(_currentPosition);
396                return (_newPosition < _maxPosition);
397    }
398
399    /**
400     * Returns the next token from this text tokenizer.
401     *
402     * @return     the next token from this text tokenizer.
403     * @exception  NoSuchElementException  if there are no more tokens in this
404     *               tokenizer's text.
405     */
406    public Text nextToken() {
407                /* 
408                 * If next position already computed in hasMoreElements() and
409                 * delimiters have changed between the computation and this invocation,
410                 * then use the computed value.
411                 */
412                _currentPosition = (_newPosition >= 0 && !_delimsChanged) ?  
413                                                                _newPosition : skipDelimiters(_currentPosition);
414
415                /* Reset these anyway */
416                _delimsChanged = false;
417                _newPosition = -1;
418
419                if (_currentPosition >= _maxPosition)
420                        throw new NoSuchElementException();
421                
422                int start = _currentPosition;
423                _currentPosition = scanToken(_currentPosition);
424                
425                return _text.subtext(start, _currentPosition);
426    }
427
428    /**
429     * Returns the next token in this text tokenizer's text. First, 
430     * the set of characters considered to be delimiters by this 
431     * TextTokenizer object is changed to be the characters in 
432     * the string delim. Then the next token in the text
433     * after the current position is returned. The current position is 
434     * advanced beyond the recognized token.  The new delimiter set 
435     * remains the default after this call. 
436     *
437     * @param      delim   the new delimiters.
438     * @return     the next token, after switching to the new delimiter set.
439     * @exception  NoSuchElementException  if there are no more tokens in this
440     *               tokenizer's text.
441     */
442    public Text nextToken(CharSequence delim) {
443                setDelimiters(delim);
444                return nextToken();
445    }
446
447    /**
448     * Returns the same value as the <code>hasMoreTokens</code>
449     * method. It exists so that this class can implement the
450     * <code>Enumeration</code> interface. 
451     *
452     * @return  <code>true</code> if there are more tokens;
453     *          <code>false</code> otherwise.
454     * @see     java.util.Enumeration
455     * @see     #hasMoreTokens()
456     */
457    @Override
458    public boolean hasMoreElements() {
459                return hasMoreTokens();
460    }
461
462    /**
463     * Returns the same value as the <code>nextToken</code> method.
464         * It exists so that this class can implement the
465     * <code>Enumeration</code> interface. 
466     *
467     * @return     the next token in the text.
468     * @exception  NoSuchElementException  if there are no more tokens in this
469     *               tokenizer's text.
470     * @see        java.util.Enumeration
471     * @see        #nextToken()
472     */
473    @Override
474    public Text nextElement() {
475                return nextToken();
476    }
477
478        
479        /**
480        *  Returns an iterator over the tokens returned by this tokenizer.
481        */
482    @Override
483        public Iterator<Text> iterator() {
484                return this;
485        }
486        
487        /**
488         * Returns the same value as the <code>hasMoreTokens()</code> method. It exists
489         * so that this class can implement the <code>Iterator</code> interface.
490         *
491         * @return <code>true</code> if there are more tokens;
492         *     <code>false</code> otherwise.
493         *
494         * @see java.util.Iterator
495         * @see #hasMoreTokens()
496         */
497    @Override
498        public boolean hasNext(){
499                return hasMoreTokens();
500        }
501
502        /**
503         * Returns the same value as the <code>nextToken()</code> method.
504         * It exists so that this class can implement the
505         * <code>Iterator</code> interface.
506         *
507         * @return the next token in the text.
508         * @throws NoSuchElementException if there are no more tokens in this tokenizer's text.
509         *
510         * @see java.util.Iterator
511         * @see #nextToken()
512         */
513    @Override
514        public Text next(){
515                return nextToken();
516        }
517
518        /**
519         * This implementation always throws <code>UnsupportedOperationException</code>.
520         * It exists so that this class can implement the <code>Iterator</code> interface.
521         *
522         * @throws UnsupportedOperationException always is thrown.
523         *
524         * @see java.util.Iterator
525         */
526    @Override
527        public void remove(){
528                throw new UnsupportedOperationException();
529        }
530
531   /**
532     * Calculates the number of times that this tokenizer's 
533     * <code>nextToken</code> method can be called before it generates an 
534     * exception. The current position is not advanced.
535     *
536     * @return  the number of tokens remaining in the text using the current
537     *          delimiter set.
538     * @see     #nextToken()
539     */
540    public int countTokens() {
541                int count = 0;
542                int currpos = _currentPosition;
543                while (currpos < _maxPosition) {
544                                currpos = skipDelimiters(currpos);
545                        if (currpos >= _maxPosition)
546                        break;
547                                currpos = scanToken(currpos);
548                        count++;
549                }
550                return count;
551    }
552
553        /**
554         * Calculates the number of times that this tokenizer's <code>nextToken</code>
555         * method can be called before it generates an exception using the given set of
556         * delimiters.  The delimiters given will be used for future calls to
557         * nextToken() unless new delimiters are given. The current position
558         * is not advanced.
559         *
560         * @param delims the new set of delimiters.
561         * @return the number of tokens remaining in the text using the new
562         *    delimiter set.
563         *
564         * @see #countTokens()
565         */
566        public int countTokens(CharSequence delims){
567                setDelimiters(delims);
568                return countTokens();
569        }
570
571        /**
572         * Retrieves the rest of the text as a single token.
573         * After calling this method hasMoreTokens() will always return false.
574         *
575         * @return any part of the text that has not yet been tokenized.
576         */
577        public Text restOfText() {
578                Text output = _text.subtext(_currentPosition, _maxPosition);
579                _currentPosition = _maxPosition;
580                return output;
581        }
582
583        /**
584         * Returns the same value as the <code>nextToken()</code> method.
585         * It exists so that this class can implement the
586         * <code>Realtime</code> interface.
587         *
588         * @return the next token in the text.
589         * @throws NoSuchElementException if there are no more tokens in this tokenizer's text.
590         *
591         * @see javolution.lang.Realtime
592         * @see #nextToken()
593         */
594    @Override
595        public Text toText() {
596                return nextToken();
597        }
598
599        /**
600        * Recycles a <code>TextTokenizer</code> instance immediately
601        * (on the stack when executing in a <code>StackContext</code>).
602        *
603        * @param instance the instance of this class to recycle.
604        */
605        public static void recycle(TextTokenizer instance) {
606                FACTORY.recycle(instance);
607        }
608        
609
610        //////////////////////
611        // Factory Creation //
612        //////////////////////
613        
614        @SuppressWarnings("unchecked")
615        private static final ObjectFactory<TextTokenizer> FACTORY = new ObjectFactory<TextTokenizer>() {
616                @Override
617                protected TextTokenizer create() {
618                        return new TextTokenizer();
619                }
620        @Override
621                protected void cleanup(TextTokenizer instance) {
622                        instance.reset();
623                }
624        };
625
626        private TextTokenizer() { }
627
628
629        /**
630        *  Testing code for this class.
631        *
632        * @param args the command-line arguments
633        */
634        public static void main (String args[]) {
635                System.out.println("Testing TextTokenizer:");
636                
637                System.out.println("\nTokenize: \"this is a test\":");
638                TextTokenizer tt = TextTokenizer.valueOf("this is a test");
639                while (tt.hasMoreTokens()) {
640                        tt.nextToken().println();
641                }
642                
643                System.out.println("\nTokenize: \"this,is,,a,test\" returning empty tokens:");
644                tt.setText("this,is,,a,test");
645                tt.setDelimiters(",");
646                tt.setReturnEmptyTokens(true);
647                while (tt.hasMoreTokens()) {
648                        tt.nextToken().println();
649                }
650                
651        }
652}