/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

/**
 * Tokenizes a string based on delimiters (separators),
 * with support for quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer};
 * however, it offers much more control and flexibility, including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there may be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
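 * <p>
 * For example, a typical CSV use (an illustrative sketch based on the factory
 * and accessor methods defined below):
 * <pre>
 * StrTokenizer tokenizer = StrTokenizer.getCSVInstance("a, b, \"c, d\"");
 * List&lt;String&gt; tokens = tokenizer.getTokenList(); // ["a", "b", "c, d"]
 * </pre>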
 *
 * @since 1.0
 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input.  The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings,
     * initializing it with the given input.  The default for TSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings,
     * initializing it with the given input.  The default for TSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * {@link #setTrimmerMatcher(StrMatcher)}).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed
     * as per StringTokenizer.
     *
     * @param input  the character array which is to be parsed; this array is cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the character array which is to be parsed; this array is cloned
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the character array which is to be parsed; this array is cloned
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the character array which is to be parsed; this array is cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed; this array is cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed; this array is cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when there are no previous tokens
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);

        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
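     * <p>
     * For example (an illustrative sketch; {@code lines} and {@code processTokens}
     * are hypothetical placeholders, not part of this class):
     * <pre>
     * StrTokenizer tokenizer = StrTokenizer.getCSVInstance();
     * for (final String line : lines) {
     *     processTokens(tokenizer.reset(line).getTokenList());
     * }
     * </pre>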
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize; the array is cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks whether tokenization has been done, and if not, performs it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method; however, a subclass
     * may pass other values, or even an entirely different array.
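     * <p>
     * For example, a subclass might filter out comment tokens (an illustrative
     * sketch only; the {@code "#"} prefix convention is an assumption, not part
     * of this class):
     * <pre>
     * protected List&lt;String&gt; tokenize(final char[] srcChars, final int offset, final int count) {
     *     // copy into a modifiable list, then drop tokens that look like comments
     *     final List&lt;String&gt; filtered = new ArrayList&lt;&gt;(super.tokenize(srcChars, offset, count));
     *     filtered.removeIf(token -&gt; token != null &amp;&amp; token.startsWith("#"));
     *     return filtered;
     * }
     * </pre>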
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars,
                              int start,
                              final int len,
                              final StrBuilder workArea,
                              final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string
     *  is reached
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars,
                            final int pos,
                            final int len,
                            final int quoteStart,
                            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quoting.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of this tokenizer, including the list
     * of tokens if the input has already been tokenized.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}