001 /*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021
022 import com.google.common.annotations.Beta;
023 import com.google.common.annotations.GwtCompatible;
024
025 import java.util.Arrays;
026 import javax.annotation.CheckReturnValue;
027
028 /**
029 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
030 * for any {@link Object}. Also offers basic text processing methods based on this function.
031 * Implementations are strongly encouraged to be side-effect-free and immutable.
032 *
033 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
034 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
035 *
036 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
037 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
038 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
039 * treats these just as two separate characters.
040 *
041 * <p>Example usages: <pre>
042 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
043 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
044 *
045 * <p>See the Guava User Guide article on <a href=
046 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
047 * {@code CharMatcher}</a>.
048 *
049 * @author Kevin Bourrillion
050 * @since 1.0
051 */
052 @Beta // Possibly change from chars to code points; decide constants vs. methods
053 @GwtCompatible
054 public abstract class CharMatcher implements Predicate<Character> {
055 // Constants
056 /**
057 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
058 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
059 * discussion of that term.
060 *
061 * @since 2.0
062 */
063 public static final CharMatcher BREAKING_WHITESPACE =
064 anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
065 .or(inRange('\u2000', '\u2006'))
066 .or(inRange('\u2008', '\u200a'))
067 .withToString("CharMatcher.BREAKING_WHITESPACE")
068 .precomputed();
069
070 /**
071 * Determines whether a character is ASCII, meaning that its code point is less than 128.
072 */
073 public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
074
075 /**
076 * Determines whether a character is a digit according to
077 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
078 */
079 public static final CharMatcher DIGIT;
080
081 static {
082 CharMatcher digit = inRange('0', '9');
083 String zeroes =
084 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
085 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
086 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
087 for (char base : zeroes.toCharArray()) {
088 digit = digit.or(inRange(base, (char) (base + 9)));
089 }
090 DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed();
091 }
092
093 /**
094 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
095 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
096 */
097 public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
098 @Override public boolean matches(char c) {
099 return Character.isDigit(c);
100 }
101 };
102
103 /**
104 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
105 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
106 * inRange('a', 'z').or(inRange('A', 'Z'))}.
107 */
108 public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
109 @Override public boolean matches(char c) {
110 return Character.isLetter(c);
111 }
112
113 @Override public CharMatcher precomputed() {
114 return this;
115 }
116 };
117
118 /**
119 * Determines whether a character is a letter or digit according to {@link
120 * Character#isLetterOrDigit(char) Java's definition}.
121 */
122 public static final CharMatcher JAVA_LETTER_OR_DIGIT =
123 new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
124 @Override public boolean matches(char c) {
125 return Character.isLetterOrDigit(c);
126 }
127 };
128
129 /**
130 * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
131 * Java's definition}.
132 */
133 public static final CharMatcher JAVA_UPPER_CASE =
134 new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
135 @Override public boolean matches(char c) {
136 return Character.isUpperCase(c);
137 }
138 };
139
140 /**
141 * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
142 * Java's definition}.
143 */
144 public static final CharMatcher JAVA_LOWER_CASE =
145 new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
146 @Override public boolean matches(char c) {
147 return Character.isLowerCase(c);
148 }
149 };
150
151 /**
152 * Determines whether a character is an ISO control character as specified by {@link
153 * Character#isISOControl(char)}.
154 */
155 public static final CharMatcher JAVA_ISO_CONTROL =
156 inRange('\u0000', '\u001f')
157 .or(inRange('\u007f', '\u009f'))
158 .withToString("CharMatcher.JAVA_ISO_CONTROL");
159
160 /**
161 * Determines whether a character is invisible; that is, if its Unicode category is any of
162 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
163 * PRIVATE_USE according to ICU4J.
164 */
165 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
166 .or(inRange('\u007f', '\u00a0'))
167 .or(is('\u00ad'))
168 .or(inRange('\u0600', '\u0604'))
169 .or(anyOf("\u06dd\u070f\u1680\u180e"))
170 .or(inRange('\u2000', '\u200f'))
171 .or(inRange('\u2028', '\u202f'))
172 .or(inRange('\u205f', '\u2064'))
173 .or(inRange('\u206a', '\u206f'))
174 .or(is('\u3000'))
175 .or(inRange('\ud800', '\uf8ff'))
176 .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
177 .withToString("CharMatcher.INVISIBLE")
178 .precomputed();
179
180 /**
181 * Determines whether a character is single-width (not double-width). When in doubt, this matcher
182 * errs on the side of returning {@code false} (that is, it tends to assume a character is
183 * double-width).
184 *
185 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
186 * date.
187 */
188 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
189 .or(is('\u05be'))
190 .or(inRange('\u05d0', '\u05ea'))
191 .or(is('\u05f3'))
192 .or(is('\u05f4'))
193 .or(inRange('\u0600', '\u06ff'))
194 .or(inRange('\u0750', '\u077f'))
195 .or(inRange('\u0e00', '\u0e7f'))
196 .or(inRange('\u1e00', '\u20af'))
197 .or(inRange('\u2100', '\u213a'))
198 .or(inRange('\ufb50', '\ufdff'))
199 .or(inRange('\ufe70', '\ufeff'))
200 .or(inRange('\uff61', '\uffdc'))
201 .withToString("CharMatcher.SINGLE_WIDTH")
202 .precomputed();
203
204 /** Matches any character. */
205 public static final CharMatcher ANY =
206 new CharMatcher("CharMatcher.ANY") {
207 @Override public boolean matches(char c) {
208 return true;
209 }
210
211 @Override public int indexIn(CharSequence sequence) {
212 return (sequence.length() == 0) ? -1 : 0;
213 }
214
215 @Override public int indexIn(CharSequence sequence, int start) {
216 int length = sequence.length();
217 Preconditions.checkPositionIndex(start, length);
218 return (start == length) ? -1 : start;
219 }
220
221 @Override public int lastIndexIn(CharSequence sequence) {
222 return sequence.length() - 1;
223 }
224
225 @Override public boolean matchesAllOf(CharSequence sequence) {
226 checkNotNull(sequence);
227 return true;
228 }
229
230 @Override public boolean matchesNoneOf(CharSequence sequence) {
231 return sequence.length() == 0;
232 }
233
234 @Override public String removeFrom(CharSequence sequence) {
235 checkNotNull(sequence);
236 return "";
237 }
238
239 @Override public String replaceFrom(CharSequence sequence, char replacement) {
240 char[] array = new char[sequence.length()];
241 Arrays.fill(array, replacement);
242 return new String(array);
243 }
244
245 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
246 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
247 for (int i = 0; i < sequence.length(); i++) {
248 retval.append(replacement);
249 }
250 return retval.toString();
251 }
252
253 @Override public String collapseFrom(CharSequence sequence, char replacement) {
254 return (sequence.length() == 0) ? "" : String.valueOf(replacement);
255 }
256
257 @Override public String trimFrom(CharSequence sequence) {
258 checkNotNull(sequence);
259 return "";
260 }
261
262 @Override public int countIn(CharSequence sequence) {
263 return sequence.length();
264 }
265
266 @Override public CharMatcher and(CharMatcher other) {
267 return checkNotNull(other);
268 }
269
270 @Override public CharMatcher or(CharMatcher other) {
271 checkNotNull(other);
272 return this;
273 }
274
275 @Override public CharMatcher negate() {
276 return NONE;
277 }
278
279 @Override public CharMatcher precomputed() {
280 return this;
281 }
282 };
283
284 /** Matches no characters. */
285 public static final CharMatcher NONE =
286 new CharMatcher("CharMatcher.NONE") {
287 @Override public boolean matches(char c) {
288 return false;
289 }
290
291 @Override public int indexIn(CharSequence sequence) {
292 checkNotNull(sequence);
293 return -1;
294 }
295
296 @Override public int indexIn(CharSequence sequence, int start) {
297 int length = sequence.length();
298 Preconditions.checkPositionIndex(start, length);
299 return -1;
300 }
301
302 @Override public int lastIndexIn(CharSequence sequence) {
303 checkNotNull(sequence);
304 return -1;
305 }
306
307 @Override public boolean matchesAllOf(CharSequence sequence) {
308 return sequence.length() == 0;
309 }
310
311 @Override public boolean matchesNoneOf(CharSequence sequence) {
312 checkNotNull(sequence);
313 return true;
314 }
315
316 @Override public String removeFrom(CharSequence sequence) {
317 return sequence.toString();
318 }
319
320 @Override public String replaceFrom(CharSequence sequence, char replacement) {
321 return sequence.toString();
322 }
323
324 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
325 checkNotNull(replacement);
326 return sequence.toString();
327 }
328
329 @Override public String collapseFrom(CharSequence sequence, char replacement) {
330 return sequence.toString();
331 }
332
333 @Override public String trimFrom(CharSequence sequence) {
334 return sequence.toString();
335 }
336
337 @Override public int countIn(CharSequence sequence) {
338 checkNotNull(sequence);
339 return 0;
340 }
341
342 @Override public CharMatcher and(CharMatcher other) {
343 checkNotNull(other);
344 return this;
345 }
346
347 @Override public CharMatcher or(CharMatcher other) {
348 return checkNotNull(other);
349 }
350
351 @Override public CharMatcher negate() {
352 return ANY;
353 }
354
355 @Override void setBits(LookupTable table) {}
356
357 @Override public CharMatcher precomputed() {
358 return this;
359 }
360 };
361
362 // Static factories
363
364 /**
365 * Returns a {@code char} matcher that matches only one specified character.
366 */
367 public static CharMatcher is(final char match) {
368 String description = new StringBuilder("CharMatcher.is(")
369 .append(Integer.toHexString(match))
370 .append(")")
371 .toString();
372 return new CharMatcher(description) {
373 @Override public boolean matches(char c) {
374 return c == match;
375 }
376
377 @Override public String replaceFrom(CharSequence sequence, char replacement) {
378 return sequence.toString().replace(match, replacement);
379 }
380
381 @Override public CharMatcher and(CharMatcher other) {
382 return other.matches(match) ? this : NONE;
383 }
384
385 @Override public CharMatcher or(CharMatcher other) {
386 return other.matches(match) ? other : super.or(other);
387 }
388
389 @Override public CharMatcher negate() {
390 return isNot(match);
391 }
392
393 @Override void setBits(LookupTable table) {
394 table.set(match);
395 }
396
397 @Override public CharMatcher precomputed() {
398 return this;
399 }
400 };
401 }
402
403 /**
404 * Returns a {@code char} matcher that matches any character except the one specified.
405 *
406 * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
407 */
408 public static CharMatcher isNot(final char match) {
409 String description = new StringBuilder("CharMatcher.isNot(")
410 .append(Integer.toHexString(match))
411 .append(")")
412 .toString();
413 return new CharMatcher(description) {
414 @Override public boolean matches(char c) {
415 return c != match;
416 }
417
418 @Override public CharMatcher and(CharMatcher other) {
419 return other.matches(match) ? super.and(other) : other;
420 }
421
422 @Override public CharMatcher or(CharMatcher other) {
423 return other.matches(match) ? ANY : this;
424 }
425
426 @Override public CharMatcher negate() {
427 return is(match);
428 }
429 };
430 }
431
432 /**
433 * Returns a {@code char} matcher that matches any character present in the given character
434 * sequence.
435 */
436 public static CharMatcher anyOf(final CharSequence sequence) {
437 switch (sequence.length()) {
438 case 0:
439 return NONE;
440 case 1:
441 return is(sequence.charAt(0));
442 case 2:
443 final char match1 = sequence.charAt(0);
444 final char match2 = sequence.charAt(1);
445 return new CharMatcher(
446 new StringBuilder("CharMatcher.anyOf(\"").append(sequence).append("\")").toString()) {
447 @Override public boolean matches(char c) {
448 return c == match1 || c == match2;
449 }
450
451 @Override void setBits(LookupTable table) {
452 table.set(match1);
453 table.set(match2);
454 }
455
456 @Override public CharMatcher precomputed() {
457 return this;
458 }
459 };
460 }
461 final char[] chars = sequence.toString().toCharArray();
462 Arrays.sort(chars);
463
464 return new CharMatcher(new StringBuilder("CharMatcher.anyOf(\"").append(chars)
465 .append("\")").toString()) {
466 @Override public boolean matches(char c) {
467 return Arrays.binarySearch(chars, c) >= 0;
468 }
469 };
470 }
471
472 /**
473 * Returns a {@code char} matcher that matches any character not present in the given character
474 * sequence.
475 */
476 public static CharMatcher noneOf(CharSequence sequence) {
477 return anyOf(sequence).negate();
478 }
479
480 /**
481 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
482 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
483 * CharMatcher.inRange('a', 'z')}.
484 *
485 * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
486 */
487 public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
488 checkArgument(endInclusive >= startInclusive);
489 String description = new StringBuilder("CharMatcher.inRange(")
490 .append(Integer.toHexString(startInclusive))
491 .append(", ")
492 .append(Integer.toHexString(endInclusive))
493 .append(")")
494 .toString();
495 return inRange(startInclusive, endInclusive, description);
496 }
497
498 static CharMatcher inRange(final char startInclusive, final char endInclusive,
499 String description) {
500 return new CharMatcher(description) {
501 @Override public boolean matches(char c) {
502 return startInclusive <= c && c <= endInclusive;
503 }
504
505 @Override void setBits(LookupTable table) {
506 char c = startInclusive;
507 while (true) {
508 table.set(c);
509 if (c++ == endInclusive) {
510 break;
511 }
512 }
513 }
514
515 @Override public CharMatcher precomputed() {
516 return this;
517 }
518 };
519 }
520
521 /**
522 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
523 * which operates on primitive {@code char} instances instead.
524 */
525 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
526 checkNotNull(predicate);
527 if (predicate instanceof CharMatcher) {
528 return (CharMatcher) predicate;
529 }
530 String description = new StringBuilder("CharMatcher.forPredicate(")
531 .append(predicate)
532 .append(')')
533 .toString();
534 return new CharMatcher(description) {
535 @Override public boolean matches(char c) {
536 return predicate.apply(c);
537 }
538
539 @Override public boolean apply(Character character) {
540 return predicate.apply(checkNotNull(character));
541 }
542 };
543 }
544
545 // State
546 final String description;
547
548 // Constructors
549
550 /**
551 * Sets the {@code toString()} from the given description.
552 */
553 CharMatcher(String description) {
554 this.description = description;
555 }
556
557 /**
558 * Constructor for use by subclasses. When subclassing, you may want to override
559 * {@code toString()} to provide a useful description.
560 */
561 protected CharMatcher() {
562 description = "UnknownCharMatcher";
563 }
564
565 // Abstract methods
566
567 /** Determines a true or false value for the given character. */
568 public abstract boolean matches(char c);
569
570 // Non-static factories
571
572 /**
573 * Returns a matcher that matches any character not matched by this matcher.
574 */
575 public CharMatcher negate() {
576 final CharMatcher original = this;
577 return new CharMatcher(original + ".negate()") {
578 @Override public boolean matches(char c) {
579 return !original.matches(c);
580 }
581
582 @Override public boolean matchesAllOf(CharSequence sequence) {
583 return original.matchesNoneOf(sequence);
584 }
585
586 @Override public boolean matchesNoneOf(CharSequence sequence) {
587 return original.matchesAllOf(sequence);
588 }
589
590 @Override public int countIn(CharSequence sequence) {
591 return sequence.length() - original.countIn(sequence);
592 }
593
594 @Override public CharMatcher negate() {
595 return original;
596 }
597 };
598 }
599
600 /**
601 * Returns a matcher that matches any character matched by both this matcher and {@code other}.
602 */
603 public CharMatcher and(CharMatcher other) {
604 return new And(this, checkNotNull(other));
605 }
606
607 private static class And extends CharMatcher {
608 final CharMatcher first;
609 final CharMatcher second;
610
611 And(CharMatcher a, CharMatcher b) {
612 this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
613 }
614
615 And(CharMatcher a, CharMatcher b, String description) {
616 super(description);
617 first = checkNotNull(a);
618 second = checkNotNull(b);
619 }
620
621 @Override
622 public CharMatcher and(CharMatcher other) {
623 return new And(this, other);
624 }
625
626 @Override
627 public boolean matches(char c) {
628 return first.matches(c) && second.matches(c);
629 }
630
631 @Override
632 CharMatcher withToString(String description) {
633 return new And(first, second, description);
634 }
635 }
636
637 /**
638 * Returns a matcher that matches any character matched by either this matcher or {@code other}.
639 */
640 public CharMatcher or(CharMatcher other) {
641 return new Or(this, checkNotNull(other));
642 }
643
644 private static class Or extends CharMatcher {
645 final CharMatcher first;
646 final CharMatcher second;
647
648 Or(CharMatcher a, CharMatcher b, String description) {
649 super(description);
650 first = checkNotNull(a);
651 second = checkNotNull(b);
652 }
653
654 Or(CharMatcher a, CharMatcher b) {
655 this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
656 }
657
658 @Override
659 public CharMatcher or(CharMatcher other) {
660 return new Or(this, checkNotNull(other));
661 }
662
663 @Override
664 public boolean matches(char c) {
665 return first.matches(c) || second.matches(c);
666 }
667
668 @Override
669 CharMatcher withToString(String description) {
670 return new Or(first, second, description);
671 }
672 }
673
674 /**
675 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
676 * query than the original; your mileage may vary. Precomputation takes time and is likely to be
677 * worthwhile only if the precomputed matcher is queried many thousands of times.
678 *
679 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
680 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
681 * worthwhile tradeoff in a browser.
682 */
683 public CharMatcher precomputed() {
684 return Platform.precomputeCharMatcher(this);
685 }
686
687 /**
688 * Construct an array of all possible chars in the slowest way possible.
689 */
690 char[] slowGetChars() {
691 char[] allChars = new char[65536];
692 int size = 0;
693 for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
694 if (matches((char) c)) {
695 allChars[size++] = (char) c;
696 }
697 }
698 char[] retValue = new char[size];
699 System.arraycopy(allChars, 0, retValue, 0, size);
700 return retValue;
701 }
702
703 /**
704 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
705 * on {@link Platform} so that we can have different behavior in GWT.
706 *
707 * <p>If the number of matched characters is small enough, we try to build a small hash
708 * table to contain all of the characters. Otherwise, we record the characters in eight-kilobyte
709 * bit array. In many situations this produces a matcher which is faster to query
710 * than the original.
711 */
712 CharMatcher precomputedInternal() {
713 final char[] chars = slowGetChars();
714 int totalCharacters = chars.length;
715 if (totalCharacters == 0) {
716 return NONE;
717 } else if (totalCharacters == 1) {
718 return is(chars[0]);
719 } else if (totalCharacters < SmallCharMatcher.MAX_SIZE) {
720 return SmallCharMatcher.from(chars, toString());
721 } else if (totalCharacters < MediumCharMatcher.MAX_SIZE) {
722 return MediumCharMatcher.from(chars, toString());
723 }
724 // Otherwise, make the full lookup table.
725 final LookupTable table = new LookupTable();
726 setBits(table);
727 final CharMatcher outer = this;
728
729 return new CharMatcher(outer.toString()) {
730 @Override public boolean matches(char c) {
731 return table.get(c);
732 }
733
734 // TODO(kevinb): make methods like negate() smart?
735
736 @Override public CharMatcher precomputed() {
737 return this;
738 }
739 };
740 }
741
742 /**
743 * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
744 * but with their {@code toString} method overridden with the new description.
745 *
746 * <p>This is unsupported by default.
747 */
748 CharMatcher withToString(String description) {
749 throw new UnsupportedOperationException();
750
751 }
752
753 /**
754 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
755 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
756 *
757 * <p>The default implementation loops over every possible character value, invoking {@link
758 * #matches} for each one.
759 */
760 void setBits(LookupTable table) {
761 char c = Character.MIN_VALUE;
762 while (true) {
763 if (matches(c)) {
764 table.set(c);
765 }
766 if (c++ == Character.MAX_VALUE) {
767 break;
768 }
769 }
770 }
771
772 /**
773 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
774 *
775 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
776 * simpler java.util.BitSet.
777 */
778 private static final class LookupTable {
779 int[] data = new int[2048];
780
781 void set(char index) {
782 data[index >> 5] |= (1 << index);
783 }
784
785 boolean get(char index) {
786 return (data[index >> 5] & (1 << index)) != 0;
787 }
788 }
789
790 // Text processing routines
791
792 /**
793 * Returns {@code true} if a character sequence contains at least one matching character.
794 * Equivalent to {@code !matchesNoneOf(sequence)}.
795 *
796 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
797 * character, until this returns {@code true} or the end is reached.
798 *
799 * @param sequence the character sequence to examine, possibly empty
800 * @return {@code true} if this matcher matches at least one character in the sequence
801 * @since 8.0
802 */
803 public boolean matchesAnyOf(CharSequence sequence) {
804 return !matchesNoneOf(sequence);
805 }
806
807 /**
808 * Returns {@code true} if a character sequence contains only matching characters.
809 *
810 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
811 * character, until this returns {@code false} or the end is reached.
812 *
813 * @param sequence the character sequence to examine, possibly empty
814 * @return {@code true} if this matcher matches every character in the sequence, including when
815 * the sequence is empty
816 */
817 public boolean matchesAllOf(CharSequence sequence) {
818 for (int i = sequence.length() - 1; i >= 0; i--) {
819 if (!matches(sequence.charAt(i))) {
820 return false;
821 }
822 }
823 return true;
824 }
825
826 /**
827 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
828 * {@code !matchesAnyOf(sequence)}.
829 *
830 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
831 * character, until this returns {@code false} or the end is reached.
832 *
833 * @param sequence the character sequence to examine, possibly empty
834 * @return {@code true} if this matcher matches every character in the sequence, including when
835 * the sequence is empty
836 */
837 public boolean matchesNoneOf(CharSequence sequence) {
838 return indexIn(sequence) == -1;
839 }
840
841 /**
842 * Returns the index of the first matching character in a character sequence, or {@code -1} if no
843 * matching character is present.
844 *
845 * <p>The default implementation iterates over the sequence in forward order calling {@link
846 * #matches} for each character.
847 *
848 * @param sequence the character sequence to examine from the beginning
849 * @return an index, or {@code -1} if no character matches
850 */
851 public int indexIn(CharSequence sequence) {
852 int length = sequence.length();
853 for (int i = 0; i < length; i++) {
854 if (matches(sequence.charAt(i))) {
855 return i;
856 }
857 }
858 return -1;
859 }
860
861 /**
862 * Returns the index of the first matching character in a character sequence, starting from a
863 * given position, or {@code -1} if no character matches after that position.
864 *
865 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
866 * start}, calling {@link #matches} for each character.
867 *
868 * @param sequence the character sequence to examine
869 * @param start the first index to examine; must be nonnegative and no greater than {@code
870 * sequence.length()}
871 * @return the index of the first matching character, guaranteed to be no less than {@code start},
872 * or {@code -1} if no character matches
873 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
874 * sequence.length()}
875 */
876 public int indexIn(CharSequence sequence, int start) {
877 int length = sequence.length();
878 Preconditions.checkPositionIndex(start, length);
879 for (int i = start; i < length; i++) {
880 if (matches(sequence.charAt(i))) {
881 return i;
882 }
883 }
884 return -1;
885 }
886
887 /**
888 * Returns the index of the last matching character in a character sequence, or {@code -1} if no
889 * matching character is present.
890 *
891 * <p>The default implementation iterates over the sequence in reverse order calling {@link
892 * #matches} for each character.
893 *
894 * @param sequence the character sequence to examine from the end
895 * @return an index, or {@code -1} if no character matches
896 */
897 public int lastIndexIn(CharSequence sequence) {
898 for (int i = sequence.length() - 1; i >= 0; i--) {
899 if (matches(sequence.charAt(i))) {
900 return i;
901 }
902 }
903 return -1;
904 }
905
906 /**
907 * Returns the number of matching characters found in a character sequence.
908 */
909 public int countIn(CharSequence sequence) {
910 int count = 0;
911 for (int i = 0; i < sequence.length(); i++) {
912 if (matches(sequence.charAt(i))) {
913 count++;
914 }
915 }
916 return count;
917 }
918
919 /**
920 * Returns a string containing all non-matching characters of a character sequence, in order. For
921 * example: <pre> {@code
922 *
923 * CharMatcher.is('a').removeFrom("bazaar")}</pre>
924 *
925 * ... returns {@code "bzr"}.
926 */
927 @CheckReturnValue
928 public String removeFrom(CharSequence sequence) {
929 String string = sequence.toString();
930 int pos = indexIn(string);
931 if (pos == -1) {
932 return string;
933 }
934
935 char[] chars = string.toCharArray();
936 int spread = 1;
937
938 // This unusual loop comes from extensive benchmarking
939 OUT: while (true) {
940 pos++;
941 while (true) {
942 if (pos == chars.length) {
943 break OUT;
944 }
945 if (matches(chars[pos])) {
946 break;
947 }
948 chars[pos - spread] = chars[pos];
949 pos++;
950 }
951 spread++;
952 }
953 return new String(chars, 0, pos - spread);
954 }
955
956 /**
957 * Returns a string containing all matching characters of a character sequence, in order. For
958 * example: <pre> {@code
959 *
960 * CharMatcher.is('a').retainFrom("bazaar")}</pre>
961 *
962 * ... returns {@code "aaa"}.
963 */
964 @CheckReturnValue
965 public String retainFrom(CharSequence sequence) {
966 return negate().removeFrom(sequence);
967 }
968
969 /**
970 * Returns a string copy of the input character sequence, with each character that matches this
971 * matcher replaced by a given replacement character. For example: <pre> {@code
972 *
973 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
974 *
975 * ... returns {@code "rodor"}.
976 *
977 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
978 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
979 * character.
980 *
981 * @param sequence the character sequence to replace matching characters in
982 * @param replacement the character to append to the result string in place of each matching
983 * character in {@code sequence}
984 * @return the new string
985 */
986 @CheckReturnValue
987 public String replaceFrom(CharSequence sequence, char replacement) {
988 String string = sequence.toString();
989 int pos = indexIn(string);
990 if (pos == -1) {
991 return string;
992 }
993 char[] chars = string.toCharArray();
994 chars[pos] = replacement;
995 for (int i = pos + 1; i < chars.length; i++) {
996 if (matches(chars[i])) {
997 chars[i] = replacement;
998 }
999 }
1000 return new String(chars);
1001 }
1002
1003 /**
1004 * Returns a string copy of the input character sequence, with each character that matches this
1005 * matcher replaced by a given replacement sequence. For example: <pre> {@code
1006 *
1007 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1008 *
1009 * ... returns {@code "yoohoo"}.
1010 *
1011 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1012 * off calling {@link #replaceFrom(CharSequence, char)} directly.
1013 *
1014 * @param sequence the character sequence to replace matching characters in
1015 * @param replacement the characters to append to the result string in place of each matching
1016 * character in {@code sequence}
1017 * @return the new string
1018 */
1019 @CheckReturnValue
1020 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1021 int replacementLen = replacement.length();
1022 if (replacementLen == 0) {
1023 return removeFrom(sequence);
1024 }
1025 if (replacementLen == 1) {
1026 return replaceFrom(sequence, replacement.charAt(0));
1027 }
1028
1029 String string = sequence.toString();
1030 int pos = indexIn(string);
1031 if (pos == -1) {
1032 return string;
1033 }
1034
1035 int len = string.length();
1036 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1037
1038 int oldpos = 0;
1039 do {
1040 buf.append(string, oldpos, pos);
1041 buf.append(replacement);
1042 oldpos = pos + 1;
1043 pos = indexIn(string, oldpos);
1044 } while (pos != -1);
1045
1046 buf.append(string, oldpos, len);
1047 return buf.toString();
1048 }
1049
1050 /**
1051 * Returns a substring of the input character sequence that omits all characters this matcher
1052 * matches from the beginning and from the end of the string. For example: <pre> {@code
1053 *
1054 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1055 *
1056 * ... returns {@code "cat"}.
1057 *
1058 * <p>Note that: <pre> {@code
1059 *
1060 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1061 *
1062 * ... is equivalent to {@link String#trim()}.
1063 */
1064 @CheckReturnValue
1065 public String trimFrom(CharSequence sequence) {
1066 int len = sequence.length();
1067 int first;
1068 int last;
1069
1070 for (first = 0; first < len; first++) {
1071 if (!matches(sequence.charAt(first))) {
1072 break;
1073 }
1074 }
1075 for (last = len - 1; last > first; last--) {
1076 if (!matches(sequence.charAt(last))) {
1077 break;
1078 }
1079 }
1080
1081 return sequence.subSequence(first, last + 1).toString();
1082 }
1083
1084 /**
1085 * Returns a substring of the input character sequence that omits all characters this matcher
1086 * matches from the beginning of the string. For example: <pre> {@code
1087 *
1088 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1089 *
1090 * ... returns {@code "catbab"}.
1091 */
1092 @CheckReturnValue
1093 public String trimLeadingFrom(CharSequence sequence) {
1094 int len = sequence.length();
1095 int first;
1096
1097 for (first = 0; first < len; first++) {
1098 if (!matches(sequence.charAt(first))) {
1099 break;
1100 }
1101 }
1102
1103 return sequence.subSequence(first, len).toString();
1104 }
1105
1106 /**
1107 * Returns a substring of the input character sequence that omits all characters this matcher
1108 * matches from the end of the string. For example: <pre> {@code
1109 *
1110 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1111 *
1112 * ... returns {@code "abacat"}.
1113 */
1114 @CheckReturnValue
1115 public String trimTrailingFrom(CharSequence sequence) {
1116 int len = sequence.length();
1117 int last;
1118
1119 for (last = len - 1; last >= 0; last--) {
1120 if (!matches(sequence.charAt(last))) {
1121 break;
1122 }
1123 }
1124
1125 return sequence.subSequence(0, last + 1).toString();
1126 }
1127
1128 /**
1129 * Returns a string copy of the input character sequence, with each group of consecutive
1130 * characters that match this matcher replaced by a single replacement character. For example:
1131 * <pre> {@code
1132 *
1133 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1134 *
1135 * ... returns {@code "b-p-r"}.
1136 *
1137 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1138 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1139 * character.
1140 *
1141 * @param sequence the character sequence to replace matching groups of characters in
1142 * @param replacement the character to append to the result string in place of each group of
1143 * matching characters in {@code sequence}
1144 * @return the new string
1145 */
1146 @CheckReturnValue
1147 public String collapseFrom(CharSequence sequence, char replacement) {
1148 int first = indexIn(sequence);
1149 if (first == -1) {
1150 return sequence.toString();
1151 }
1152
1153 // TODO(kevinb): see if this implementation can be made faster
1154 StringBuilder builder = new StringBuilder(sequence.length())
1155 .append(sequence.subSequence(0, first))
1156 .append(replacement);
1157 boolean in = true;
1158 for (int i = first + 1; i < sequence.length(); i++) {
1159 char c = sequence.charAt(i);
1160 if (matches(c)) {
1161 if (!in) {
1162 builder.append(replacement);
1163 in = true;
1164 }
1165 } else {
1166 builder.append(c);
1167 in = false;
1168 }
1169 }
1170 return builder.toString();
1171 }
1172
1173 /**
1174 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1175 * groups of matching characters at the start or end of the sequence are removed without
1176 * replacement.
1177 */
1178 @CheckReturnValue
1179 public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1180 int first = negate().indexIn(sequence);
1181 if (first == -1) {
1182 return ""; // everything matches. nothing's left.
1183 }
1184 StringBuilder builder = new StringBuilder(sequence.length());
1185 boolean inMatchingGroup = false;
1186 for (int i = first; i < sequence.length(); i++) {
1187 char c = sequence.charAt(i);
1188 if (matches(c)) {
1189 inMatchingGroup = true;
1190 } else {
1191 if (inMatchingGroup) {
1192 builder.append(replacement);
1193 inMatchingGroup = false;
1194 }
1195 builder.append(c);
1196 }
1197 }
1198 return builder.toString();
1199 }
1200
1201 // Predicate interface
1202
1203 /**
1204 * Returns {@code true} if this matcher matches the given character.
1205 *
1206 * @throws NullPointerException if {@code character} is null
1207 */
1208 @Override public boolean apply(Character character) {
1209 return matches(character);
1210 }
1211
1212 /**
1213 * Returns a string representation of this {@code CharMatcher}, such as
1214 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1215 */
1216 @Override
1217 public String toString() {
1218 return description;
1219 }
1220
1221 /**
1222 * Determines whether a character is whitespace according to the latest Unicode standard, as
1223 * illustrated
1224 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1225 * This is not the same definition used by other Java APIs. (See a
1226 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1227 * definitions of "whitespace"</a>.)
1228 *
1229 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1230 * to date.
1231 */
1232 public static final CharMatcher WHITESPACE = new CharMatcher("CharMatcher.WHITESPACE") {
1233 /**
1234 * A special-case CharMatcher for Unicode whitespace characters that is extremely
1235 * efficient both in space required and in time to check for matches.
1236 *
1237 * Implementation details.
1238 * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1239 * so we can construct a lookup table of exactly 79 entries, and just check the character code
1240 * mod 79, and see if that character is in the table.
1241 *
1242 * There is a 1 at the beginning of the table so that the null character is not listed
1243 * as whitespace.
1244 *
1245 * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1246 *
1247 * * Binary search into the sorted list of characters, i.e., what
1248 * CharMatcher.anyOf() does</li>
1249 * * Perfect hash function into a table of size 26 (using an offset table and a special
1250 * Jenkins hash function)</li>
1251 * * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1252 * * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1253 *
1254 * --Christopher Swenson, February 2012.
1255 */
1256
1257 // Mod-79 lookup table.
1258 private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0,
1259 8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1260 12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
1261 8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0};
1262
1263 @Override public boolean matches(char c) {
1264 return table[c % 79] == c;
1265 }
1266
1267 @Override public CharMatcher precomputed() {
1268 return this;
1269 }
1270 };
1271 }