Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002, 2005, 2006, 2007 3: Free Software Foundation, Inc. 4: 5: This file is part of GNU Classpath. 6: 7: GNU Classpath is free software; you can redistribute it and/or modify 8: it under the terms of the GNU General Public License as published by 9: the Free Software Foundation; either version 2, or (at your option) 10: any later version. 11: 12: GNU Classpath is distributed in the hope that it will be useful, but 13: WITHOUT ANY WARRANTY; without even the implied warranty of 14: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15: General Public License for more details. 16: 17: You should have received a copy of the GNU General Public License 18: along with GNU Classpath; see the file COPYING. If not, write to the 19: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 20: 02110-1301 USA. 21: 22: Linking this library statically or dynamically with other modules is 23: making a combined work based on this library. Thus, the terms and 24: conditions of the GNU General Public License cover the whole 25: combination. 26: 27: As a special exception, the copyright holders of this library give you 28: permission to link this library with independent modules to produce an 29: executable, regardless of the license terms of these independent 30: modules, and to copy and distribute the resulting executable under 31: terms of your choice, provided that you also meet, for each linked 32: independent module, the terms and conditions of the license of that 33: module. An independent module is a module which is not derived from 34: or based on this library. If you modify this library, you may extend 35: this exception to your version of the library, but you are not 36: obligated to do so. If you do not wish to do so, delete this 37: exception statement from your version. 
*/ 38: 39: /* 40: * Note: This class must not be merged with Classpath. Gcj uses C-style 41: * arrays (see include/java-chartables.h) to store the Unicode character 42: * database, whereas Classpath uses Java objects (char[] extracted from 43: * String constants) in gnu.java.lang.CharData. Gcj's approach is more 44: * efficient, because there is no vtable or data relocation to worry about. 45: * However, despite the difference in the database interface, the two 46: * versions share identical algorithms. 47: */ 48: 49: package java.lang; 50: 51: import java.io.Serializable; 52: import java.text.Collator; 53: import java.util.Locale; 54: 55: /** 56: * Wrapper class for the primitive char data type. In addition, this class 57: * allows one to retrieve property information and perform transformations 58: * on the defined characters in the Unicode Standard, Version 4.0.0. 59: * java.lang.Character is designed to be very dynamic, and as such, it 60: * retrieves information on the Unicode character set from a separate 61: * database, gnu.java.lang.CharData, which can be easily upgraded. 62: * 63: * <p>For predicates, boundaries are used to describe 64: * the set of characters for which the method will return true. 65: * This syntax uses fairly normal regular expression notation. 66: * See 5.13 of the Unicode Standard, Version 4.0, for the 67: * boundary specification. 68: * 69: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 70: * for more information on the Unicode Standard. 71: * 72: * @author Tom Tromey (tromey@cygnus.com) 73: * @author Paul N. Fisher 74: * @author Jochen Hoenicke 75: * @author Eric Blake (ebb9@email.byu.edu) 76: * @author Andrew John Hughes (gnu_andrew@member.fsf.org) 77: * @since 1.0 78: * @status partly updated to 1.5; some things still missing 79: */ 80: public final class Character implements Serializable, Comparable<Character> 81: { 82: /** 83: * A subset of Unicode blocks. 84: * 85: * @author Paul N. 
Fisher 86: * @author Eric Blake (ebb9@email.byu.edu) 87: * @since 1.2 88: */ 89: public static class Subset 90: { 91: /** The name of the subset. */ 92: private final String name; 93: 94: /** 95: * Construct a new subset of characters. 96: * 97: * @param name the name of the subset 98: * @throws NullPointerException if name is null 99: */ 100: protected Subset(String name) 101: { 102: // Note that name.toString() is name, unless name was null. 103: this.name = name.toString(); 104: } 105: 106: /** 107: * Compares two Subsets for equality. This is <code>final</code>, and 108: * restricts the comparison on the <code>==</code> operator, so it returns 109: * true only for the same object. 110: * 111: * @param o the object to compare 112: * @return true if o is this 113: */ 114: public final boolean equals(Object o) 115: { 116: return o == this; 117: } 118: 119: /** 120: * Makes the original hashCode of Object final, to be consistent with 121: * equals. 122: * 123: * @return the hash code for this object 124: */ 125: public final int hashCode() 126: { 127: return super.hashCode(); 128: } 129: 130: /** 131: * Returns the name of the subset. 132: * 133: * @return the name 134: */ 135: public final String toString() 136: { 137: return name; 138: } 139: } // class Subset 140: 141: /** 142: * A family of character subsets in the Unicode specification. A character 143: * is in at most one of these blocks. 144: * 145: * This inner class was generated automatically from 146: * <code>libjava/gnu/gcj/convert/Blocks-3.txt</code>, by some perl scripts. 147: * This Unicode definition file can be found on the 148: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 149: * JDK 1.4 uses Unicode version 3.0.0. 150: * 151: * @author scripts/unicode-blocks.pl (written by Eric Blake) 152: * @since 1.2 153: */ 154: public static final class UnicodeBlock extends Subset 155: { 156: /** The start of the subset. 
*/ 157: private final int start; 158: 159: /** The end of the subset. */ 160: private final int end; 161: 162: /** The canonical name of the block according to the Unicode standard. */ 163: private final String canonicalName; 164: 165: /** Enumeration for the <code>forName()</code> method */ 166: private enum NameType { CANONICAL, NO_SPACES, CONSTANT; } 167: 168: /** 169: * Constructor for strictly defined blocks. 170: * 171: * @param start the start character of the range 172: * @param end the end character of the range 173: * @param name the block name 174: */ 175: private UnicodeBlock(int start, int end, String name, 176: String canonicalName) 177: { 178: super(name); 179: this.start = start; 180: this.end = end; 181: this.canonicalName = canonicalName; 182: } 183: 184: /** 185: * Returns the Unicode character block which a character belongs to. 186: * <strong>Note</strong>: This method does not support the use of 187: * supplementary characters. For such support, <code>of(int)</code> 188: * should be used instead. 189: * 190: * @param ch the character to look up 191: * @return the set it belongs to, or null if it is not in one 192: */ 193: public static UnicodeBlock of(char ch) 194: { 195: return of((int) ch); 196: } 197: 198: /** 199: * Returns the Unicode character block which a code point belongs to. 200: * 201: * @param codePoint the character to look up 202: * @return the set it belongs to, or null if it is not in one. 203: * @throws IllegalArgumentException if the specified code point is 204: * invalid. 205: * @since 1.5 206: */ 207: public static UnicodeBlock of(int codePoint) 208: { 209: if (codePoint > MAX_CODE_POINT) 210: throw new IllegalArgumentException("The supplied integer value is " + 211: "too large to be a codepoint."); 212: // Simple binary search for the correct block. 
213: int low = 0; 214: int hi = sets.length - 1; 215: while (low <= hi) 216: { 217: int mid = (low + hi) >> 1; 218: UnicodeBlock b = sets[mid]; 219: if (codePoint < b.start) 220: hi = mid - 1; 221: else if (codePoint > b.end) 222: low = mid + 1; 223: else 224: return b; 225: } 226: return null; 227: } 228: 229: /** 230: * <p> 231: * Returns the <code>UnicodeBlock</code> with the given name, as defined 232: * by the Unicode standard. The version of Unicode in use is defined by 233: * the <code>Character</code> class, and the names are given in the 234: * <code>Blocks-<version>.txt</code> file corresponding to that version. 235: * The name may be specified in one of three ways: 236: * </p> 237: * <ol> 238: * <li>The canonical, human-readable name used by the Unicode standard. 239: * This is the name with all spaces and hyphens retained. For example, 240: * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li> 241: * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li> 242: * <li>The name used for the constants specified by this class, which 243: * is the canonical name with all spaces and hyphens replaced with 244: * underscores e.g. `BASIC_LATIN'</li> 245: * </ol> 246: * <p> 247: * The names are compared case-insensitively using the case comparison 248: * associated with the U.S. English locale. The method recognises the 249: * previous names used for blocks as well as the current ones. At 250: * present, this simply means that the deprecated `SURROGATES_AREA' 251: * will be recognised by this method (the <code>of()</code> methods 252: * only return one of the three new surrogate blocks). 253: * </p> 254: * 255: * @param blockName the name of the block to look up. 256: * @return the specified block. 257: * @throws NullPointerException if the <code>blockName</code> is 258: * <code>null</code>. 259: * @throws IllegalArgumentException if the name does not match any Unicode 260: * block. 
261: * @since 1.5 262: */ 263: public static final UnicodeBlock forName(String blockName) 264: { 265: NameType type; 266: if (blockName.indexOf(' ') != -1) 267: type = NameType.CANONICAL; 268: else if (blockName.indexOf('_') != -1) 269: type = NameType.CONSTANT; 270: else 271: type = NameType.NO_SPACES; 272: Collator usCollator = Collator.getInstance(Locale.US); 273: usCollator.setStrength(Collator.PRIMARY); 274: /* Special case for deprecated blocks not in sets */ 275: switch (type) 276: { 277: case CANONICAL: 278: if (usCollator.compare(blockName, "Surrogates Area") == 0) 279: return SURROGATES_AREA; 280: break; 281: case NO_SPACES: 282: if (usCollator.compare(blockName, "SurrogatesArea") == 0) 283: return SURROGATES_AREA; 284: break; 285: case CONSTANT: 286: if (usCollator.compare(blockName, "SURROGATES_AREA") == 0) 287: return SURROGATES_AREA; 288: break; 289: } 290: /* Other cases */ 291: switch (type) 292: { 293: case CANONICAL: 294: for (UnicodeBlock block : sets) 295: if (usCollator.compare(blockName, block.canonicalName) == 0) 296: return block; 297: break; 298: case NO_SPACES: 299: for (UnicodeBlock block : sets) 300: { 301: String nsName = block.canonicalName.replaceAll(" ",""); 302: if (usCollator.compare(blockName, nsName) == 0) 303: return block; 304: } 305: break; 306: case CONSTANT: 307: for (UnicodeBlock block : sets) 308: if (usCollator.compare(blockName, block.toString()) == 0) 309: return block; 310: break; 311: } 312: throw new IllegalArgumentException("No Unicode block found for " + 313: blockName + "."); 314: } 315: 316: /** 317: * Basic Latin. 318: * 0x0000 - 0x007F. 319: */ 320: public static final UnicodeBlock BASIC_LATIN 321: = new UnicodeBlock(0x0000, 0x007F, 322: "BASIC_LATIN", 323: "Basic Latin"); 324: 325: /** 326: * Latin-1 Supplement. 327: * 0x0080 - 0x00FF. 
328: */ 329: public static final UnicodeBlock LATIN_1_SUPPLEMENT 330: = new UnicodeBlock(0x0080, 0x00FF, 331: "LATIN_1_SUPPLEMENT", 332: "Latin-1 Supplement"); 333: 334: /** 335: * Latin Extended-A. 336: * 0x0100 - 0x017F. 337: */ 338: public static final UnicodeBlock LATIN_EXTENDED_A 339: = new UnicodeBlock(0x0100, 0x017F, 340: "LATIN_EXTENDED_A", 341: "Latin Extended-A"); 342: 343: /** 344: * Latin Extended-B. 345: * 0x0180 - 0x024F. 346: */ 347: public static final UnicodeBlock LATIN_EXTENDED_B 348: = new UnicodeBlock(0x0180, 0x024F, 349: "LATIN_EXTENDED_B", 350: "Latin Extended-B"); 351: 352: /** 353: * IPA Extensions. 354: * 0x0250 - 0x02AF. 355: */ 356: public static final UnicodeBlock IPA_EXTENSIONS 357: = new UnicodeBlock(0x0250, 0x02AF, 358: "IPA_EXTENSIONS", 359: "IPA Extensions"); 360: 361: /** 362: * Spacing Modifier Letters. 363: * 0x02B0 - 0x02FF. 364: */ 365: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 366: = new UnicodeBlock(0x02B0, 0x02FF, 367: "SPACING_MODIFIER_LETTERS", 368: "Spacing Modifier Letters"); 369: 370: /** 371: * Combining Diacritical Marks. 372: * 0x0300 - 0x036F. 373: */ 374: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 375: = new UnicodeBlock(0x0300, 0x036F, 376: "COMBINING_DIACRITICAL_MARKS", 377: "Combining Diacritical Marks"); 378: 379: /** 380: * Greek. 381: * 0x0370 - 0x03FF. 382: */ 383: public static final UnicodeBlock GREEK 384: = new UnicodeBlock(0x0370, 0x03FF, 385: "GREEK", 386: "Greek"); 387: 388: /** 389: * Cyrillic. 390: * 0x0400 - 0x04FF. 391: */ 392: public static final UnicodeBlock CYRILLIC 393: = new UnicodeBlock(0x0400, 0x04FF, 394: "CYRILLIC", 395: "Cyrillic"); 396: 397: /** 398: * Cyrillic Supplementary. 399: * 0x0500 - 0x052F. 400: * @since 1.5 401: */ 402: public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY 403: = new UnicodeBlock(0x0500, 0x052F, 404: "CYRILLIC_SUPPLEMENTARY", 405: "Cyrillic Supplementary"); 406: 407: /** 408: * Armenian. 409: * 0x0530 - 0x058F. 
410: */ 411: public static final UnicodeBlock ARMENIAN 412: = new UnicodeBlock(0x0530, 0x058F, 413: "ARMENIAN", 414: "Armenian"); 415: 416: /** 417: * Hebrew. 418: * 0x0590 - 0x05FF. 419: */ 420: public static final UnicodeBlock HEBREW 421: = new UnicodeBlock(0x0590, 0x05FF, 422: "HEBREW", 423: "Hebrew"); 424: 425: /** 426: * Arabic. 427: * 0x0600 - 0x06FF. 428: */ 429: public static final UnicodeBlock ARABIC 430: = new UnicodeBlock(0x0600, 0x06FF, 431: "ARABIC", 432: "Arabic"); 433: 434: /** 435: * Syriac. 436: * 0x0700 - 0x074F. 437: * @since 1.4 438: */ 439: public static final UnicodeBlock SYRIAC 440: = new UnicodeBlock(0x0700, 0x074F, 441: "SYRIAC", 442: "Syriac"); 443: 444: /** 445: * Thaana. 446: * 0x0780 - 0x07BF. 447: * @since 1.4 448: */ 449: public static final UnicodeBlock THAANA 450: = new UnicodeBlock(0x0780, 0x07BF, 451: "THAANA", 452: "Thaana"); 453: 454: /** 455: * Devanagari. 456: * 0x0900 - 0x097F. 457: */ 458: public static final UnicodeBlock DEVANAGARI 459: = new UnicodeBlock(0x0900, 0x097F, 460: "DEVANAGARI", 461: "Devanagari"); 462: 463: /** 464: * Bengali. 465: * 0x0980 - 0x09FF. 466: */ 467: public static final UnicodeBlock BENGALI 468: = new UnicodeBlock(0x0980, 0x09FF, 469: "BENGALI", 470: "Bengali"); 471: 472: /** 473: * Gurmukhi. 474: * 0x0A00 - 0x0A7F. 475: */ 476: public static final UnicodeBlock GURMUKHI 477: = new UnicodeBlock(0x0A00, 0x0A7F, 478: "GURMUKHI", 479: "Gurmukhi"); 480: 481: /** 482: * Gujarati. 483: * 0x0A80 - 0x0AFF. 484: */ 485: public static final UnicodeBlock GUJARATI 486: = new UnicodeBlock(0x0A80, 0x0AFF, 487: "GUJARATI", 488: "Gujarati"); 489: 490: /** 491: * Oriya. 492: * 0x0B00 - 0x0B7F. 493: */ 494: public static final UnicodeBlock ORIYA 495: = new UnicodeBlock(0x0B00, 0x0B7F, 496: "ORIYA", 497: "Oriya"); 498: 499: /** 500: * Tamil. 501: * 0x0B80 - 0x0BFF. 502: */ 503: public static final UnicodeBlock TAMIL 504: = new UnicodeBlock(0x0B80, 0x0BFF, 505: "TAMIL", 506: "Tamil"); 507: 508: /** 509: * Telugu. 
510: * 0x0C00 - 0x0C7F. 511: */ 512: public static final UnicodeBlock TELUGU 513: = new UnicodeBlock(0x0C00, 0x0C7F, 514: "TELUGU", 515: "Telugu"); 516: 517: /** 518: * Kannada. 519: * 0x0C80 - 0x0CFF. 520: */ 521: public static final UnicodeBlock KANNADA 522: = new UnicodeBlock(0x0C80, 0x0CFF, 523: "KANNADA", 524: "Kannada"); 525: 526: /** 527: * Malayalam. 528: * 0x0D00 - 0x0D7F. 529: */ 530: public static final UnicodeBlock MALAYALAM 531: = new UnicodeBlock(0x0D00, 0x0D7F, 532: "MALAYALAM", 533: "Malayalam"); 534: 535: /** 536: * Sinhala. 537: * 0x0D80 - 0x0DFF. 538: * @since 1.4 539: */ 540: public static final UnicodeBlock SINHALA 541: = new UnicodeBlock(0x0D80, 0x0DFF, 542: "SINHALA", 543: "Sinhala"); 544: 545: /** 546: * Thai. 547: * 0x0E00 - 0x0E7F. 548: */ 549: public static final UnicodeBlock THAI 550: = new UnicodeBlock(0x0E00, 0x0E7F, 551: "THAI", 552: "Thai"); 553: 554: /** 555: * Lao. 556: * 0x0E80 - 0x0EFF. 557: */ 558: public static final UnicodeBlock LAO 559: = new UnicodeBlock(0x0E80, 0x0EFF, 560: "LAO", 561: "Lao"); 562: 563: /** 564: * Tibetan. 565: * 0x0F00 - 0x0FFF. 566: */ 567: public static final UnicodeBlock TIBETAN 568: = new UnicodeBlock(0x0F00, 0x0FFF, 569: "TIBETAN", 570: "Tibetan"); 571: 572: /** 573: * Myanmar. 574: * 0x1000 - 0x109F. 575: * @since 1.4 576: */ 577: public static final UnicodeBlock MYANMAR 578: = new UnicodeBlock(0x1000, 0x109F, 579: "MYANMAR", 580: "Myanmar"); 581: 582: /** 583: * Georgian. 584: * 0x10A0 - 0x10FF. 585: */ 586: public static final UnicodeBlock GEORGIAN 587: = new UnicodeBlock(0x10A0, 0x10FF, 588: "GEORGIAN", 589: "Georgian"); 590: 591: /** 592: * Hangul Jamo. 593: * 0x1100 - 0x11FF. 594: */ 595: public static final UnicodeBlock HANGUL_JAMO 596: = new UnicodeBlock(0x1100, 0x11FF, 597: "HANGUL_JAMO", 598: "Hangul Jamo"); 599: 600: /** 601: * Ethiopic. 602: * 0x1200 - 0x137F. 
603: * @since 1.4 604: */ 605: public static final UnicodeBlock ETHIOPIC 606: = new UnicodeBlock(0x1200, 0x137F, 607: "ETHIOPIC", 608: "Ethiopic"); 609: 610: /** 611: * Cherokee. 612: * 0x13A0 - 0x13FF. 613: * @since 1.4 614: */ 615: public static final UnicodeBlock CHEROKEE 616: = new UnicodeBlock(0x13A0, 0x13FF, 617: "CHEROKEE", 618: "Cherokee"); 619: 620: /** 621: * Unified Canadian Aboriginal Syllabics. 622: * 0x1400 - 0x167F. 623: * @since 1.4 624: */ 625: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 626: = new UnicodeBlock(0x1400, 0x167F, 627: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 628: "Unified Canadian Aboriginal Syllabics"); 629: 630: /** 631: * Ogham. 632: * 0x1680 - 0x169F. 633: * @since 1.4 634: */ 635: public static final UnicodeBlock OGHAM 636: = new UnicodeBlock(0x1680, 0x169F, 637: "OGHAM", 638: "Ogham"); 639: 640: /** 641: * Runic. 642: * 0x16A0 - 0x16FF. 643: * @since 1.4 644: */ 645: public static final UnicodeBlock RUNIC 646: = new UnicodeBlock(0x16A0, 0x16FF, 647: "RUNIC", 648: "Runic"); 649: 650: /** 651: * Tagalog. 652: * 0x1700 - 0x171F. 653: * @since 1.5 654: */ 655: public static final UnicodeBlock TAGALOG 656: = new UnicodeBlock(0x1700, 0x171F, 657: "TAGALOG", 658: "Tagalog"); 659: 660: /** 661: * Hanunoo. 662: * 0x1720 - 0x173F. 663: * @since 1.5 664: */ 665: public static final UnicodeBlock HANUNOO 666: = new UnicodeBlock(0x1720, 0x173F, 667: "HANUNOO", 668: "Hanunoo"); 669: 670: /** 671: * Buhid. 672: * 0x1740 - 0x175F. 673: * @since 1.5 674: */ 675: public static final UnicodeBlock BUHID 676: = new UnicodeBlock(0x1740, 0x175F, 677: "BUHID", 678: "Buhid"); 679: 680: /** 681: * Tagbanwa. 682: * 0x1760 - 0x177F. 683: * @since 1.5 684: */ 685: public static final UnicodeBlock TAGBANWA 686: = new UnicodeBlock(0x1760, 0x177F, 687: "TAGBANWA", 688: "Tagbanwa"); 689: 690: /** 691: * Khmer. 692: * 0x1780 - 0x17FF. 
693: * @since 1.4 694: */ 695: public static final UnicodeBlock KHMER 696: = new UnicodeBlock(0x1780, 0x17FF, 697: "KHMER", 698: "Khmer"); 699: 700: /** 701: * Mongolian. 702: * 0x1800 - 0x18AF. 703: * @since 1.4 704: */ 705: public static final UnicodeBlock MONGOLIAN 706: = new UnicodeBlock(0x1800, 0x18AF, 707: "MONGOLIAN", 708: "Mongolian"); 709: 710: /** 711: * Limbu. 712: * 0x1900 - 0x194F. 713: * @since 1.5 714: */ 715: public static final UnicodeBlock LIMBU 716: = new UnicodeBlock(0x1900, 0x194F, 717: "LIMBU", 718: "Limbu"); 719: 720: /** 721: * Tai Le. 722: * 0x1950 - 0x197F. 723: * @since 1.5 724: */ 725: public static final UnicodeBlock TAI_LE 726: = new UnicodeBlock(0x1950, 0x197F, 727: "TAI_LE", 728: "Tai Le"); 729: 730: /** 731: * Khmer Symbols. 732: * 0x19E0 - 0x19FF. 733: * @since 1.5 734: */ 735: public static final UnicodeBlock KHMER_SYMBOLS 736: = new UnicodeBlock(0x19E0, 0x19FF, 737: "KHMER_SYMBOLS", 738: "Khmer Symbols"); 739: 740: /** 741: * Phonetic Extensions. 742: * 0x1D00 - 0x1D7F. 743: * @since 1.5 744: */ 745: public static final UnicodeBlock PHONETIC_EXTENSIONS 746: = new UnicodeBlock(0x1D00, 0x1D7F, 747: "PHONETIC_EXTENSIONS", 748: "Phonetic Extensions"); 749: 750: /** 751: * Latin Extended Additional. 752: * 0x1E00 - 0x1EFF. 753: */ 754: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 755: = new UnicodeBlock(0x1E00, 0x1EFF, 756: "LATIN_EXTENDED_ADDITIONAL", 757: "Latin Extended Additional"); 758: 759: /** 760: * Greek Extended. 761: * 0x1F00 - 0x1FFF. 762: */ 763: public static final UnicodeBlock GREEK_EXTENDED 764: = new UnicodeBlock(0x1F00, 0x1FFF, 765: "GREEK_EXTENDED", 766: "Greek Extended"); 767: 768: /** 769: * General Punctuation. 770: * 0x2000 - 0x206F. 771: */ 772: public static final UnicodeBlock GENERAL_PUNCTUATION 773: = new UnicodeBlock(0x2000, 0x206F, 774: "GENERAL_PUNCTUATION", 775: "General Punctuation"); 776: 777: /** 778: * Superscripts and Subscripts. 779: * 0x2070 - 0x209F. 
780: */ 781: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 782: = new UnicodeBlock(0x2070, 0x209F, 783: "SUPERSCRIPTS_AND_SUBSCRIPTS", 784: "Superscripts and Subscripts"); 785: 786: /** 787: * Currency Symbols. 788: * 0x20A0 - 0x20CF. 789: */ 790: public static final UnicodeBlock CURRENCY_SYMBOLS 791: = new UnicodeBlock(0x20A0, 0x20CF, 792: "CURRENCY_SYMBOLS", 793: "Currency Symbols"); 794: 795: /** 796: * Combining Marks for Symbols. 797: * 0x20D0 - 0x20FF. 798: */ 799: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 800: = new UnicodeBlock(0x20D0, 0x20FF, 801: "COMBINING_MARKS_FOR_SYMBOLS", 802: "Combining Marks for Symbols"); 803: 804: /** 805: * Letterlike Symbols. 806: * 0x2100 - 0x214F. 807: */ 808: public static final UnicodeBlock LETTERLIKE_SYMBOLS 809: = new UnicodeBlock(0x2100, 0x214F, 810: "LETTERLIKE_SYMBOLS", 811: "Letterlike Symbols"); 812: 813: /** 814: * Number Forms. 815: * 0x2150 - 0x218F. 816: */ 817: public static final UnicodeBlock NUMBER_FORMS 818: = new UnicodeBlock(0x2150, 0x218F, 819: "NUMBER_FORMS", 820: "Number Forms"); 821: 822: /** 823: * Arrows. 824: * 0x2190 - 0x21FF. 825: */ 826: public static final UnicodeBlock ARROWS 827: = new UnicodeBlock(0x2190, 0x21FF, 828: "ARROWS", 829: "Arrows"); 830: 831: /** 832: * Mathematical Operators. 833: * 0x2200 - 0x22FF. 834: */ 835: public static final UnicodeBlock MATHEMATICAL_OPERATORS 836: = new UnicodeBlock(0x2200, 0x22FF, 837: "MATHEMATICAL_OPERATORS", 838: "Mathematical Operators"); 839: 840: /** 841: * Miscellaneous Technical. 842: * 0x2300 - 0x23FF. 843: */ 844: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 845: = new UnicodeBlock(0x2300, 0x23FF, 846: "MISCELLANEOUS_TECHNICAL", 847: "Miscellaneous Technical"); 848: 849: /** 850: * Control Pictures. 851: * 0x2400 - 0x243F. 
852: */ 853: public static final UnicodeBlock CONTROL_PICTURES 854: = new UnicodeBlock(0x2400, 0x243F, 855: "CONTROL_PICTURES", 856: "Control Pictures"); 857: 858: /** 859: * Optical Character Recognition. 860: * 0x2440 - 0x245F. 861: */ 862: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 863: = new UnicodeBlock(0x2440, 0x245F, 864: "OPTICAL_CHARACTER_RECOGNITION", 865: "Optical Character Recognition"); 866: 867: /** 868: * Enclosed Alphanumerics. 869: * 0x2460 - 0x24FF. 870: */ 871: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 872: = new UnicodeBlock(0x2460, 0x24FF, 873: "ENCLOSED_ALPHANUMERICS", 874: "Enclosed Alphanumerics"); 875: 876: /** 877: * Box Drawing. 878: * 0x2500 - 0x257F. 879: */ 880: public static final UnicodeBlock BOX_DRAWING 881: = new UnicodeBlock(0x2500, 0x257F, 882: "BOX_DRAWING", 883: "Box Drawing"); 884: 885: /** 886: * Block Elements. 887: * 0x2580 - 0x259F. 888: */ 889: public static final UnicodeBlock BLOCK_ELEMENTS 890: = new UnicodeBlock(0x2580, 0x259F, 891: "BLOCK_ELEMENTS", 892: "Block Elements"); 893: 894: /** 895: * Geometric Shapes. 896: * 0x25A0 - 0x25FF. 897: */ 898: public static final UnicodeBlock GEOMETRIC_SHAPES 899: = new UnicodeBlock(0x25A0, 0x25FF, 900: "GEOMETRIC_SHAPES", 901: "Geometric Shapes"); 902: 903: /** 904: * Miscellaneous Symbols. 905: * 0x2600 - 0x26FF. 906: */ 907: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 908: = new UnicodeBlock(0x2600, 0x26FF, 909: "MISCELLANEOUS_SYMBOLS", 910: "Miscellaneous Symbols"); 911: 912: /** 913: * Dingbats. 914: * 0x2700 - 0x27BF. 915: */ 916: public static final UnicodeBlock DINGBATS 917: = new UnicodeBlock(0x2700, 0x27BF, 918: "DINGBATS", 919: "Dingbats"); 920: 921: /** 922: * Miscellaneous Mathematical Symbols-A. 923: * 0x27C0 - 0x27EF. 
924: * @since 1.5 925: */ 926: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A 927: = new UnicodeBlock(0x27C0, 0x27EF, 928: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 929: "Miscellaneous Mathematical Symbols-A"); 930: 931: /** 932: * Supplemental Arrows-A. 933: * 0x27F0 - 0x27FF. 934: * @since 1.5 935: */ 936: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A 937: = new UnicodeBlock(0x27F0, 0x27FF, 938: "SUPPLEMENTAL_ARROWS_A", 939: "Supplemental Arrows-A"); 940: 941: /** 942: * Braille Patterns. 943: * 0x2800 - 0x28FF. 944: * @since 1.4 945: */ 946: public static final UnicodeBlock BRAILLE_PATTERNS 947: = new UnicodeBlock(0x2800, 0x28FF, 948: "BRAILLE_PATTERNS", 949: "Braille Patterns"); 950: 951: /** 952: * Supplemental Arrows-B. 953: * 0x2900 - 0x297F. 954: * @since 1.5 955: */ 956: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B 957: = new UnicodeBlock(0x2900, 0x297F, 958: "SUPPLEMENTAL_ARROWS_B", 959: "Supplemental Arrows-B"); 960: 961: /** 962: * Miscellaneous Mathematical Symbols-B. 963: * 0x2980 - 0x29FF. 964: * @since 1.5 965: */ 966: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B 967: = new UnicodeBlock(0x2980, 0x29FF, 968: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 969: "Miscellaneous Mathematical Symbols-B"); 970: 971: /** 972: * Supplemental Mathematical Operators. 973: * 0x2A00 - 0x2AFF. 974: * @since 1.5 975: */ 976: public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS 977: = new UnicodeBlock(0x2A00, 0x2AFF, 978: "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 979: "Supplemental Mathematical Operators"); 980: 981: /** 982: * Miscellaneous Symbols and Arrows. 983: * 0x2B00 - 0x2BFF. 984: * @since 1.5 985: */ 986: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS 987: = new UnicodeBlock(0x2B00, 0x2BFF, 988: "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 989: "Miscellaneous Symbols and Arrows"); 990: 991: /** 992: * CJK Radicals Supplement. 993: * 0x2E80 - 0x2EFF. 
994: * @since 1.4 995: */ 996: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 997: = new UnicodeBlock(0x2E80, 0x2EFF, 998: "CJK_RADICALS_SUPPLEMENT", 999: "CJK Radicals Supplement"); 1000: 1001: /** 1002: * Kangxi Radicals. 1003: * 0x2F00 - 0x2FDF. 1004: * @since 1.4 1005: */ 1006: public static final UnicodeBlock KANGXI_RADICALS 1007: = new UnicodeBlock(0x2F00, 0x2FDF, 1008: "KANGXI_RADICALS", 1009: "Kangxi Radicals"); 1010: 1011: /** 1012: * Ideographic Description Characters. 1013: * 0x2FF0 - 0x2FFF. 1014: * @since 1.4 1015: */ 1016: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1017: = new UnicodeBlock(0x2FF0, 0x2FFF, 1018: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 1019: "Ideographic Description Characters"); 1020: 1021: /** 1022: * CJK Symbols and Punctuation. 1023: * 0x3000 - 0x303F. 1024: */ 1025: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 1026: = new UnicodeBlock(0x3000, 0x303F, 1027: "CJK_SYMBOLS_AND_PUNCTUATION", 1028: "CJK Symbols and Punctuation"); 1029: 1030: /** 1031: * Hiragana. 1032: * 0x3040 - 0x309F. 1033: */ 1034: public static final UnicodeBlock HIRAGANA 1035: = new UnicodeBlock(0x3040, 0x309F, 1036: "HIRAGANA", 1037: "Hiragana"); 1038: 1039: /** 1040: * Katakana. 1041: * 0x30A0 - 0x30FF. 1042: */ 1043: public static final UnicodeBlock KATAKANA 1044: = new UnicodeBlock(0x30A0, 0x30FF, 1045: "KATAKANA", 1046: "Katakana"); 1047: 1048: /** 1049: * Bopomofo. 1050: * 0x3100 - 0x312F. 1051: */ 1052: public static final UnicodeBlock BOPOMOFO 1053: = new UnicodeBlock(0x3100, 0x312F, 1054: "BOPOMOFO", 1055: "Bopomofo"); 1056: 1057: /** 1058: * Hangul Compatibility Jamo. 1059: * 0x3130 - 0x318F. 1060: */ 1061: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 1062: = new UnicodeBlock(0x3130, 0x318F, 1063: "HANGUL_COMPATIBILITY_JAMO", 1064: "Hangul Compatibility Jamo"); 1065: 1066: /** 1067: * Kanbun. 1068: * 0x3190 - 0x319F. 
1069: */ 1070: public static final UnicodeBlock KANBUN 1071: = new UnicodeBlock(0x3190, 0x319F, 1072: "KANBUN", 1073: "Kanbun"); 1074: 1075: /** 1076: * Bopomofo Extended. 1077: * 0x31A0 - 0x31BF. 1078: * @since 1.4 1079: */ 1080: public static final UnicodeBlock BOPOMOFO_EXTENDED 1081: = new UnicodeBlock(0x31A0, 0x31BF, 1082: "BOPOMOFO_EXTENDED", 1083: "Bopomofo Extended"); 1084: 1085: /** 1086: * Katakana Phonetic Extensions. 1087: * 0x31F0 - 0x31FF. 1088: * @since 1.5 1089: */ 1090: public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS 1091: = new UnicodeBlock(0x31F0, 0x31FF, 1092: "KATAKANA_PHONETIC_EXTENSIONS", 1093: "Katakana Phonetic Extensions"); 1094: 1095: /** 1096: * Enclosed CJK Letters and Months. 1097: * 0x3200 - 0x32FF. 1098: */ 1099: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 1100: = new UnicodeBlock(0x3200, 0x32FF, 1101: "ENCLOSED_CJK_LETTERS_AND_MONTHS", 1102: "Enclosed CJK Letters and Months"); 1103: 1104: /** 1105: * CJK Compatibility. 1106: * 0x3300 - 0x33FF. 1107: */ 1108: public static final UnicodeBlock CJK_COMPATIBILITY 1109: = new UnicodeBlock(0x3300, 0x33FF, 1110: "CJK_COMPATIBILITY", 1111: "CJK Compatibility"); 1112: 1113: /** 1114: * CJK Unified Ideographs Extension A. 1115: * 0x3400 - 0x4DBF. 1116: * @since 1.4 1117: */ 1118: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1119: = new UnicodeBlock(0x3400, 0x4DBF, 1120: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 1121: "CJK Unified Ideographs Extension A"); 1122: 1123: /** 1124: * Yijing Hexagram Symbols. 1125: * 0x4DC0 - 0x4DFF. 1126: * @since 1.5 1127: */ 1128: public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS 1129: = new UnicodeBlock(0x4DC0, 0x4DFF, 1130: "YIJING_HEXAGRAM_SYMBOLS", 1131: "Yijing Hexagram Symbols"); 1132: 1133: /** 1134: * CJK Unified Ideographs. 1135: * 0x4E00 - 0x9FFF. 
1136: */ 1137: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 1138: = new UnicodeBlock(0x4E00, 0x9FFF, 1139: "CJK_UNIFIED_IDEOGRAPHS", 1140: "CJK Unified Ideographs"); 1141: 1142: /** 1143: * Yi Syllables. 1144: * 0xA000 - 0xA48F. 1145: * @since 1.4 1146: */ 1147: public static final UnicodeBlock YI_SYLLABLES 1148: = new UnicodeBlock(0xA000, 0xA48F, 1149: "YI_SYLLABLES", 1150: "Yi Syllables"); 1151: 1152: /** 1153: * Yi Radicals. 1154: * 0xA490 - 0xA4CF. 1155: * @since 1.4 1156: */ 1157: public static final UnicodeBlock YI_RADICALS 1158: = new UnicodeBlock(0xA490, 0xA4CF, 1159: "YI_RADICALS", 1160: "Yi Radicals"); 1161: 1162: /** 1163: * Hangul Syllables. 1164: * 0xAC00 - 0xD7AF. 1165: */ 1166: public static final UnicodeBlock HANGUL_SYLLABLES 1167: = new UnicodeBlock(0xAC00, 0xD7AF, 1168: "HANGUL_SYLLABLES", 1169: "Hangul Syllables"); 1170: 1171: /** 1172: * High Surrogates. 1173: * 0xD800 - 0xDB7F. 1174: * @since 1.5 1175: */ 1176: public static final UnicodeBlock HIGH_SURROGATES 1177: = new UnicodeBlock(0xD800, 0xDB7F, 1178: "HIGH_SURROGATES", 1179: "High Surrogates"); 1180: 1181: /** 1182: * High Private Use Surrogates. 1183: * 0xDB80 - 0xDBFF. 1184: * @since 1.5 1185: */ 1186: public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES 1187: = new UnicodeBlock(0xDB80, 0xDBFF, 1188: "HIGH_PRIVATE_USE_SURROGATES", 1189: "High Private Use Surrogates"); 1190: 1191: /** 1192: * Low Surrogates. 1193: * 0xDC00 - 0xDFFF. 1194: * @since 1.5 1195: */ 1196: public static final UnicodeBlock LOW_SURROGATES 1197: = new UnicodeBlock(0xDC00, 0xDFFF, 1198: "LOW_SURROGATES", 1199: "Low Surrogates"); 1200: 1201: /** 1202: * Private Use Area. 1203: * 0xE000 - 0xF8FF. 1204: */ 1205: public static final UnicodeBlock PRIVATE_USE_AREA 1206: = new UnicodeBlock(0xE000, 0xF8FF, 1207: "PRIVATE_USE_AREA", 1208: "Private Use Area"); 1209: 1210: /** 1211: * CJK Compatibility Ideographs. 1212: * 0xF900 - 0xFAFF. 
1213: */ 1214: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 1215: = new UnicodeBlock(0xF900, 0xFAFF, 1216: "CJK_COMPATIBILITY_IDEOGRAPHS", 1217: "CJK Compatibility Ideographs"); 1218: 1219: /** 1220: * Alphabetic Presentation Forms. 1221: * 0xFB00 - 0xFB4F. 1222: */ 1223: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 1224: = new UnicodeBlock(0xFB00, 0xFB4F, 1225: "ALPHABETIC_PRESENTATION_FORMS", 1226: "Alphabetic Presentation Forms"); 1227: 1228: /** 1229: * Arabic Presentation Forms-A. 1230: * 0xFB50 - 0xFDFF. 1231: */ 1232: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 1233: = new UnicodeBlock(0xFB50, 0xFDFF, 1234: "ARABIC_PRESENTATION_FORMS_A", 1235: "Arabic Presentation Forms-A"); 1236: 1237: /** 1238: * Variation Selectors. 1239: * 0xFE00 - 0xFE0F. 1240: * @since 1.5 1241: */ 1242: public static final UnicodeBlock VARIATION_SELECTORS 1243: = new UnicodeBlock(0xFE00, 0xFE0F, 1244: "VARIATION_SELECTORS", 1245: "Variation Selectors"); 1246: 1247: /** 1248: * Combining Half Marks. 1249: * 0xFE20 - 0xFE2F. 1250: */ 1251: public static final UnicodeBlock COMBINING_HALF_MARKS 1252: = new UnicodeBlock(0xFE20, 0xFE2F, 1253: "COMBINING_HALF_MARKS", 1254: "Combining Half Marks"); 1255: 1256: /** 1257: * CJK Compatibility Forms. 1258: * 0xFE30 - 0xFE4F. 1259: */ 1260: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 1261: = new UnicodeBlock(0xFE30, 0xFE4F, 1262: "CJK_COMPATIBILITY_FORMS", 1263: "CJK Compatibility Forms"); 1264: 1265: /** 1266: * Small Form Variants. 1267: * 0xFE50 - 0xFE6F. 1268: */ 1269: public static final UnicodeBlock SMALL_FORM_VARIANTS 1270: = new UnicodeBlock(0xFE50, 0xFE6F, 1271: "SMALL_FORM_VARIANTS", 1272: "Small Form Variants"); 1273: 1274: /** 1275: * Arabic Presentation Forms-B. 1276: * 0xFE70 - 0xFEFF. 
1277: */ 1278: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 1279: = new UnicodeBlock(0xFE70, 0xFEFF, 1280: "ARABIC_PRESENTATION_FORMS_B", 1281: "Arabic Presentation Forms-B"); 1282: 1283: /** 1284: * Halfwidth and Fullwidth Forms. 1285: * 0xFF00 - 0xFFEF. 1286: */ 1287: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 1288: = new UnicodeBlock(0xFF00, 0xFFEF, 1289: "HALFWIDTH_AND_FULLWIDTH_FORMS", 1290: "Halfwidth and Fullwidth Forms"); 1291: 1292: /** 1293: * Specials. 1294: * 0xFFF0 - 0xFFFF. 1295: */ 1296: public static final UnicodeBlock SPECIALS 1297: = new UnicodeBlock(0xFFF0, 0xFFFF, 1298: "SPECIALS", 1299: "Specials"); 1300: 1301: /** 1302: * Linear B Syllabary. 1303: * 0x10000 - 0x1007F. 1304: * @since 1.5 1305: */ 1306: public static final UnicodeBlock LINEAR_B_SYLLABARY 1307: = new UnicodeBlock(0x10000, 0x1007F, 1308: "LINEAR_B_SYLLABARY", 1309: "Linear B Syllabary"); 1310: 1311: /** 1312: * Linear B Ideograms. 1313: * 0x10080 - 0x100FF. 1314: * @since 1.5 1315: */ 1316: public static final UnicodeBlock LINEAR_B_IDEOGRAMS 1317: = new UnicodeBlock(0x10080, 0x100FF, 1318: "LINEAR_B_IDEOGRAMS", 1319: "Linear B Ideograms"); 1320: 1321: /** 1322: * Aegean Numbers. 1323: * 0x10100 - 0x1013F. 1324: * @since 1.5 1325: */ 1326: public static final UnicodeBlock AEGEAN_NUMBERS 1327: = new UnicodeBlock(0x10100, 0x1013F, 1328: "AEGEAN_NUMBERS", 1329: "Aegean Numbers"); 1330: 1331: /** 1332: * Old Italic. 1333: * 0x10300 - 0x1032F. 1334: * @since 1.5 1335: */ 1336: public static final UnicodeBlock OLD_ITALIC 1337: = new UnicodeBlock(0x10300, 0x1032F, 1338: "OLD_ITALIC", 1339: "Old Italic"); 1340: 1341: /** 1342: * Gothic. 1343: * 0x10330 - 0x1034F. 1344: * @since 1.5 1345: */ 1346: public static final UnicodeBlock GOTHIC 1347: = new UnicodeBlock(0x10330, 0x1034F, 1348: "GOTHIC", 1349: "Gothic"); 1350: 1351: /** 1352: * Ugaritic. 1353: * 0x10380 - 0x1039F. 
1354: * @since 1.5 1355: */ 1356: public static final UnicodeBlock UGARITIC 1357: = new UnicodeBlock(0x10380, 0x1039F, 1358: "UGARITIC", 1359: "Ugaritic"); 1360: 1361: /** 1362: * Deseret. 1363: * 0x10400 - 0x1044F. 1364: * @since 1.5 1365: */ 1366: public static final UnicodeBlock DESERET 1367: = new UnicodeBlock(0x10400, 0x1044F, 1368: "DESERET", 1369: "Deseret"); 1370: 1371: /** 1372: * Shavian. 1373: * 0x10450 - 0x1047F. 1374: * @since 1.5 1375: */ 1376: public static final UnicodeBlock SHAVIAN 1377: = new UnicodeBlock(0x10450, 0x1047F, 1378: "SHAVIAN", 1379: "Shavian"); 1380: 1381: /** 1382: * Osmanya. 1383: * 0x10480 - 0x104AF. 1384: * @since 1.5 1385: */ 1386: public static final UnicodeBlock OSMANYA 1387: = new UnicodeBlock(0x10480, 0x104AF, 1388: "OSMANYA", 1389: "Osmanya"); 1390: 1391: /** 1392: * Cypriot Syllabary. 1393: * 0x10800 - 0x1083F. 1394: * @since 1.5 1395: */ 1396: public static final UnicodeBlock CYPRIOT_SYLLABARY 1397: = new UnicodeBlock(0x10800, 0x1083F, 1398: "CYPRIOT_SYLLABARY", 1399: "Cypriot Syllabary"); 1400: 1401: /** 1402: * Byzantine Musical Symbols. 1403: * 0x1D000 - 0x1D0FF. 1404: * @since 1.5 1405: */ 1406: public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS 1407: = new UnicodeBlock(0x1D000, 0x1D0FF, 1408: "BYZANTINE_MUSICAL_SYMBOLS", 1409: "Byzantine Musical Symbols"); 1410: 1411: /** 1412: * Musical Symbols. 1413: * 0x1D100 - 0x1D1FF. 1414: * @since 1.5 1415: */ 1416: public static final UnicodeBlock MUSICAL_SYMBOLS 1417: = new UnicodeBlock(0x1D100, 0x1D1FF, 1418: "MUSICAL_SYMBOLS", 1419: "Musical Symbols"); 1420: 1421: /** 1422: * Tai Xuan Jing Symbols. 1423: * 0x1D300 - 0x1D35F. 1424: * @since 1.5 1425: */ 1426: public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS 1427: = new UnicodeBlock(0x1D300, 0x1D35F, 1428: "TAI_XUAN_JING_SYMBOLS", 1429: "Tai Xuan Jing Symbols"); 1430: 1431: /** 1432: * Mathematical Alphanumeric Symbols. 1433: * 0x1D400 - 0x1D7FF. 
1434: * @since 1.5 1435: */ 1436: public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS 1437: = new UnicodeBlock(0x1D400, 0x1D7FF, 1438: "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 1439: "Mathematical Alphanumeric Symbols"); 1440: 1441: /** 1442: * CJK Unified Ideographs Extension B. 1443: * 0x20000 - 0x2A6DF. 1444: * @since 1.5 1445: */ 1446: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1447: = new UnicodeBlock(0x20000, 0x2A6DF, 1448: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 1449: "CJK Unified Ideographs Extension B"); 1450: 1451: /** 1452: * CJK Compatibility Ideographs Supplement. 1453: * 0x2F800 - 0x2FA1F. 1454: * @since 1.5 1455: */ 1456: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT 1457: = new UnicodeBlock(0x2F800, 0x2FA1F, 1458: "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 1459: "CJK Compatibility Ideographs Supplement"); 1460: 1461: /** 1462: * Tags. 1463: * 0xE0000 - 0xE007F. 1464: * @since 1.5 1465: */ 1466: public static final UnicodeBlock TAGS 1467: = new UnicodeBlock(0xE0000, 0xE007F, 1468: "TAGS", 1469: "Tags"); 1470: 1471: /** 1472: * Variation Selectors Supplement. 1473: * 0xE0100 - 0xE01EF. 1474: * @since 1.5 1475: */ 1476: public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT 1477: = new UnicodeBlock(0xE0100, 0xE01EF, 1478: "VARIATION_SELECTORS_SUPPLEMENT", 1479: "Variation Selectors Supplement"); 1480: 1481: /** 1482: * Supplementary Private Use Area-A. 1483: * 0xF0000 - 0xFFFFF. 1484: * @since 1.5 1485: */ 1486: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A 1487: = new UnicodeBlock(0xF0000, 0xFFFFF, 1488: "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 1489: "Supplementary Private Use Area-A"); 1490: 1491: /** 1492: * Supplementary Private Use Area-B. 1493: * 0x100000 - 0x10FFFF. 
1494: * @since 1.5 1495: */ 1496: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B 1497: = new UnicodeBlock(0x100000, 0x10FFFF, 1498: "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 1499: "Supplementary Private Use Area-B"); 1500: 1501: /** 1502: * Surrogates Area. 1503: * 0xD800 - 0xDFFF. 1504: * @deprecated As of 1.5, the three areas, 1505: * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>, 1506: * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a> 1507: * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined 1508: * by the Unicode standard, should be used in preference to 1509: * this. These are also returned from calls to <code>of(int)</code> 1510: * and <code>of(char)</code>. 1511: */ 1512: @Deprecated 1513: public static final UnicodeBlock SURROGATES_AREA 1514: = new UnicodeBlock(0xD800, 0xDFFF, 1515: "SURROGATES_AREA", 1516: "Surrogates Area"); 1517: 1518: /** 1519: * The defined subsets. The deprecated SURROGATES_AREA is not listed here; its range (0xD800 - 0xDFFF) is covered by HIGH_SURROGATES, HIGH_PRIVATE_USE_SURROGATES and LOW_SURROGATES. 1520: */ 1521: private static final UnicodeBlock sets[] = { 1522: BASIC_LATIN, 1523: LATIN_1_SUPPLEMENT, 1524: LATIN_EXTENDED_A, 1525: LATIN_EXTENDED_B, 1526: IPA_EXTENSIONS, 1527: SPACING_MODIFIER_LETTERS, 1528: COMBINING_DIACRITICAL_MARKS, 1529: GREEK, 1530: CYRILLIC, 1531: CYRILLIC_SUPPLEMENTARY, 1532: ARMENIAN, 1533: HEBREW, 1534: ARABIC, 1535: SYRIAC, 1536: THAANA, 1537: DEVANAGARI, 1538: BENGALI, 1539: GURMUKHI, 1540: GUJARATI, 1541: ORIYA, 1542: TAMIL, 1543: TELUGU, 1544: KANNADA, 1545: MALAYALAM, 1546: SINHALA, 1547: THAI, 1548: LAO, 1549: TIBETAN, 1550: MYANMAR, 1551: GEORGIAN, 1552: HANGUL_JAMO, 1553: ETHIOPIC, 1554: CHEROKEE, 1555: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 1556: OGHAM, 1557: RUNIC, 1558: TAGALOG, 1559: HANUNOO, 1560: BUHID, 1561: TAGBANWA, 1562: KHMER, 1563: MONGOLIAN, 1564: LIMBU, 1565: TAI_LE, 1566: KHMER_SYMBOLS, 1567: PHONETIC_EXTENSIONS, 1568: LATIN_EXTENDED_ADDITIONAL, 1569: GREEK_EXTENDED, 1570: GENERAL_PUNCTUATION, 1571: SUPERSCRIPTS_AND_SUBSCRIPTS, 1572: CURRENCY_SYMBOLS, 1573: COMBINING_MARKS_FOR_SYMBOLS,
1574: LETTERLIKE_SYMBOLS, 1575: NUMBER_FORMS, 1576: ARROWS, 1577: MATHEMATICAL_OPERATORS, 1578: MISCELLANEOUS_TECHNICAL, 1579: CONTROL_PICTURES, 1580: OPTICAL_CHARACTER_RECOGNITION, 1581: ENCLOSED_ALPHANUMERICS, 1582: BOX_DRAWING, 1583: BLOCK_ELEMENTS, 1584: GEOMETRIC_SHAPES, 1585: MISCELLANEOUS_SYMBOLS, 1586: DINGBATS, 1587: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, 1588: SUPPLEMENTAL_ARROWS_A, 1589: BRAILLE_PATTERNS, 1590: SUPPLEMENTAL_ARROWS_B, 1591: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, 1592: SUPPLEMENTAL_MATHEMATICAL_OPERATORS, 1593: MISCELLANEOUS_SYMBOLS_AND_ARROWS, 1594: CJK_RADICALS_SUPPLEMENT, 1595: KANGXI_RADICALS, 1596: IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 1597: CJK_SYMBOLS_AND_PUNCTUATION, 1598: HIRAGANA, 1599: KATAKANA, 1600: BOPOMOFO, 1601: HANGUL_COMPATIBILITY_JAMO, 1602: KANBUN, 1603: BOPOMOFO_EXTENDED, 1604: KATAKANA_PHONETIC_EXTENSIONS, 1605: ENCLOSED_CJK_LETTERS_AND_MONTHS, 1606: CJK_COMPATIBILITY, 1607: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 1608: YIJING_HEXAGRAM_SYMBOLS, 1609: CJK_UNIFIED_IDEOGRAPHS, 1610: YI_SYLLABLES, 1611: YI_RADICALS, 1612: HANGUL_SYLLABLES, 1613: HIGH_SURROGATES, 1614: HIGH_PRIVATE_USE_SURROGATES, 1615: LOW_SURROGATES, 1616: PRIVATE_USE_AREA, 1617: CJK_COMPATIBILITY_IDEOGRAPHS, 1618: ALPHABETIC_PRESENTATION_FORMS, 1619: ARABIC_PRESENTATION_FORMS_A, 1620: VARIATION_SELECTORS, 1621: COMBINING_HALF_MARKS, 1622: CJK_COMPATIBILITY_FORMS, 1623: SMALL_FORM_VARIANTS, 1624: ARABIC_PRESENTATION_FORMS_B, 1625: HALFWIDTH_AND_FULLWIDTH_FORMS, 1626: SPECIALS, 1627: LINEAR_B_SYLLABARY, 1628: LINEAR_B_IDEOGRAMS, 1629: AEGEAN_NUMBERS, 1630: OLD_ITALIC, 1631: GOTHIC, 1632: UGARITIC, 1633: DESERET, 1634: SHAVIAN, 1635: OSMANYA, 1636: CYPRIOT_SYLLABARY, 1637: BYZANTINE_MUSICAL_SYMBOLS, 1638: MUSICAL_SYMBOLS, 1639: TAI_XUAN_JING_SYMBOLS, 1640: MATHEMATICAL_ALPHANUMERIC_SYMBOLS, 1641: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 1642: CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, 1643: TAGS, 1644: VARIATION_SELECTORS_SUPPLEMENT, 1645: 
SUPPLEMENTARY_PRIVATE_USE_AREA_A, 1646: SUPPLEMENTARY_PRIVATE_USE_AREA_B, 1647: }; 1648: } // class UnicodeBlock 1649: 1650: /** 1651: * The immutable value of this Character. 1652: * 1653: * @serial the value of this Character 1654: */ 1655: private final char value; 1656: 1657: /** 1658: * Compatible with JDK 1.0+. 1659: */ 1660: private static final long serialVersionUID = 3786198910865385080L; 1661: 1662: /** 1663: * Smallest value allowed for radix arguments in Java. This value is 2. 1664: * 1665: * @see #digit(char, int) 1666: * @see #forDigit(int, int) 1667: * @see Integer#toString(int, int) 1668: * @see Integer#valueOf(String) 1669: */ 1670: public static final int MIN_RADIX = 2; 1671: 1672: /** 1673: * Largest value allowed for radix arguments in Java. This value is 36. 1674: * 1675: * @see #digit(char, int) 1676: * @see #forDigit(int, int) 1677: * @see Integer#toString(int, int) 1678: * @see Integer#valueOf(String) 1679: */ 1680: public static final int MAX_RADIX = 36; 1681: 1682: /** 1683: * The minimum value the char data type can hold. 1684: * This value is <code>'\\u0000'</code>. 1685: */ 1686: public static final char MIN_VALUE = '\u0000'; 1687: 1688: /** 1689: * The maximum value the char data type can hold. 1690: * This value is <code>'\\uFFFF'</code>. 1691: */ 1692: public static final char MAX_VALUE = '\uFFFF'; 1693: 1694: /** 1695: * The minimum Unicode 4.0 code point. This value is <code>0</code>. 1696: * @since 1.5 1697: */ 1698: public static final int MIN_CODE_POINT = 0; 1699: 1700: /** 1701: * The maximum Unicode 4.0 code point, which is greater than the range 1702: * of the char data type. 1703: * This value is <code>0x10FFFF</code>. 1704: * @since 1.5 1705: */ 1706: public static final int MAX_CODE_POINT = 0x10FFFF; 1707: 1708: /** 1709: * The minimum Unicode high surrogate code unit, or 1710: * <em>leading-surrogate</em>, in the UTF-16 character encoding. 1711: * This value is <code>'\uD800'</code>.
1712: * @since 1.5 1713: */ 1714: public static final char MIN_HIGH_SURROGATE = '\uD800'; 1715: 1716: /** 1717: * The maximum Unicode high surrogate code unit, or 1718: * <em>leading-surrogate</em>, in the UTF-16 character encoding. 1719: * This value is <code>'\uDBFF'</code>. 1720: * @since 1.5 1721: */ 1722: public static final char MAX_HIGH_SURROGATE = '\uDBFF'; 1723: 1724: /** 1725: * The minimum Unicode low surrogate code unit, or 1726: * <em>trailing-surrogate</em>, in the UTF-16 character encoding. 1727: * This value is <code>'\uDC00'</code>. 1728: * @since 1.5 1729: */ 1730: public static final char MIN_LOW_SURROGATE = '\uDC00'; 1731: 1732: /** 1733: * The maximum Unicode low surrogate code unit, or 1734: * <em>trailing-surrogate</em>, in the UTF-16 character encoding. 1735: * This value is <code>'\uDFFF'</code>. 1736: * @since 1.5 1737: */ 1738: public static final char MAX_LOW_SURROGATE = '\uDFFF'; 1739: 1740: /** 1741: * The minimum Unicode surrogate code unit in the UTF-16 character encoding. 1742: * This value is <code>'\uD800'</code>. 1743: * @since 1.5 1744: */ 1745: public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE; 1746: 1747: /** 1748: * The maximum Unicode surrogate code unit in the UTF-16 character encoding. 1749: * This value is <code>'\uDFFF'</code>. 1750: * @since 1.5 1751: */ 1752: public static final char MAX_SURROGATE = MAX_LOW_SURROGATE; 1753: 1754: /** 1755: * The lowest possible supplementary Unicode code point (the first code 1756: * point outside the basic multilingual plane (BMP)). 1757: * This value is <code>0x10000</code>. * @since 1.5 1758: */ 1759: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 1760: 1761: /** 1762: * Class object representing the primitive char data type. 1763: * 1764: * @since 1.1 1765: */ 1766: public static final Class<Character> TYPE = (Class<Character>) VMClassLoader.getPrimitiveClass('C'); 1767: 1768: /** 1769: * The number of bits needed to represent a <code>char</code>.
1770: * @since 1.5 1771: */ 1772: public static final int SIZE = 16; 1773: 1774: // This caches some Character values, and is used by boxing 1775: // conversions via valueOf(). We must cache at least 0..127; 1776: // this constant controls how much we actually cache. 1777: private static final int MAX_CACHE = 127; 1778: private static Character[] charCache = new Character[MAX_CACHE + 1]; 1779: 1780: /** 1781: * Lu = Letter, Uppercase (Informative). 1782: * 1783: * @since 1.1 1784: */ 1785: public static final byte UPPERCASE_LETTER = 1; 1786: 1787: /** 1788: * Ll = Letter, Lowercase (Informative). 1789: * 1790: * @since 1.1 1791: */ 1792: public static final byte LOWERCASE_LETTER = 2; 1793: 1794: /** 1795: * Lt = Letter, Titlecase (Informative). 1796: * 1797: * @since 1.1 1798: */ 1799: public static final byte TITLECASE_LETTER = 3; 1800: 1801: /** 1802: * Mn = Mark, Non-Spacing (Normative). 1803: * 1804: * @since 1.1 1805: */ 1806: public static final byte NON_SPACING_MARK = 6; 1807: 1808: /** 1809: * Mc = Mark, Spacing Combining (Normative). 1810: * 1811: * @since 1.1 1812: */ 1813: public static final byte COMBINING_SPACING_MARK = 8; 1814: 1815: /** 1816: * Me = Mark, Enclosing (Normative). 1817: * 1818: * @since 1.1 1819: */ 1820: public static final byte ENCLOSING_MARK = 7; 1821: 1822: /** 1823: * Nd = Number, Decimal Digit (Normative). 1824: * 1825: * @since 1.1 1826: */ 1827: public static final byte DECIMAL_DIGIT_NUMBER = 9; 1828: 1829: /** 1830: * Nl = Number, Letter (Normative). 1831: * 1832: * @since 1.1 1833: */ 1834: public static final byte LETTER_NUMBER = 10; 1835: 1836: /** 1837: * No = Number, Other (Normative). 1838: * 1839: * @since 1.1 1840: */ 1841: public static final byte OTHER_NUMBER = 11; 1842: 1843: /** 1844: * Zs = Separator, Space (Normative). 1845: * 1846: * @since 1.1 1847: */ 1848: public static final byte SPACE_SEPARATOR = 12; 1849: 1850: /** 1851: * Zl = Separator, Line (Normative). 
1852: * 1853: * @since 1.1 1854: */ 1855: public static final byte LINE_SEPARATOR = 13; 1856: 1857: /** 1858: * Zp = Separator, Paragraph (Normative). 1859: * 1860: * @since 1.1 1861: */ 1862: public static final byte PARAGRAPH_SEPARATOR = 14; 1863: 1864: /** 1865: * Cc = Other, Control (Normative). 1866: * 1867: * @since 1.1 1868: */ 1869: public static final byte CONTROL = 15; 1870: 1871: /** 1872: * Cf = Other, Format (Normative). 1873: * 1874: * @since 1.1 1875: */ 1876: public static final byte FORMAT = 16; 1877: 1878: /** 1879: * Cs = Other, Surrogate (Normative). 1880: * 1881: * @since 1.1 1882: */ 1883: public static final byte SURROGATE = 19; 1884: 1885: /** 1886: * Co = Other, Private Use (Normative). 1887: * 1888: * @since 1.1 1889: */ 1890: public static final byte PRIVATE_USE = 18; 1891: 1892: /** 1893: * Cn = Other, Not Assigned (Normative). 1894: * 1895: * @since 1.1 1896: */ 1897: public static final byte UNASSIGNED = 0; 1898: 1899: /** 1900: * Lm = Letter, Modifier (Informative). 1901: * 1902: * @since 1.1 1903: */ 1904: public static final byte MODIFIER_LETTER = 4; 1905: 1906: /** 1907: * Lo = Letter, Other (Informative). 1908: * 1909: * @since 1.1 1910: */ 1911: public static final byte OTHER_LETTER = 5; 1912: 1913: /** 1914: * Pc = Punctuation, Connector (Informative). 1915: * 1916: * @since 1.1 1917: */ 1918: public static final byte CONNECTOR_PUNCTUATION = 23; 1919: 1920: /** 1921: * Pd = Punctuation, Dash (Informative). 1922: * 1923: * @since 1.1 1924: */ 1925: public static final byte DASH_PUNCTUATION = 20; 1926: 1927: /** 1928: * Ps = Punctuation, Open (Informative). 1929: * 1930: * @since 1.1 1931: */ 1932: public static final byte START_PUNCTUATION = 21; 1933: 1934: /** 1935: * Pe = Punctuation, Close (Informative). 1936: * 1937: * @since 1.1 1938: */ 1939: public static final byte END_PUNCTUATION = 22; 1940: 1941: /** 1942: * Pi = Punctuation, Initial Quote (Informative). 
1943: * 1944: * @since 1.4 1945: */ 1946: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 1947: 1948: /** 1949: * Pf = Punctuation, Final Quote (Informative). 1950: * 1951: * @since 1.4 1952: */ 1953: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 1954: 1955: /** 1956: * Po = Punctuation, Other (Informative). 1957: * 1958: * @since 1.1 1959: */ 1960: public static final byte OTHER_PUNCTUATION = 24; 1961: 1962: /** 1963: * Sm = Symbol, Math (Informative). 1964: * 1965: * @since 1.1 1966: */ 1967: public static final byte MATH_SYMBOL = 25; 1968: 1969: /** 1970: * Sc = Symbol, Currency (Informative). 1971: * 1972: * @since 1.1 1973: */ 1974: public static final byte CURRENCY_SYMBOL = 26; 1975: 1976: /** 1977: * Sk = Symbol, Modifier (Informative). 1978: * 1979: * @since 1.1 1980: */ 1981: public static final byte MODIFIER_SYMBOL = 27; 1982: 1983: /** 1984: * So = Symbol, Other (Informative). 1985: * 1986: * @since 1.1 1987: */ 1988: public static final byte OTHER_SYMBOL = 28; 1989: 1990: /** 1991: * Undefined bidirectional character type. Undefined char values have 1992: * undefined directionality in the Unicode specification. 1993: * 1994: * @since 1.4 1995: */ 1996: public static final byte DIRECTIONALITY_UNDEFINED = -1; 1997: 1998: /** 1999: * Strong bidirectional character type "L". 2000: * 2001: * @since 1.4 2002: */ 2003: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 2004: 2005: /** 2006: * Strong bidirectional character type "R". 2007: * 2008: * @since 1.4 2009: */ 2010: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 2011: 2012: /** 2013: * Strong bidirectional character type "AL". 2014: * 2015: * @since 1.4 2016: */ 2017: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 2018: 2019: /** 2020: * Weak bidirectional character type "EN". 2021: * 2022: * @since 1.4 2023: */ 2024: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 2025: 2026: /** 2027: * Weak bidirectional character type "ES". 
2028: * 2029: * @since 1.4 2030: */ 2031: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 2032: 2033: /** 2034: * Weak bidirectional character type "ET". 2035: * 2036: * @since 1.4 2037: */ 2038: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 2039: 2040: /** 2041: * Weak bidirectional character type "AN". 2042: * 2043: * @since 1.4 2044: */ 2045: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 2046: 2047: /** 2048: * Weak bidirectional character type "CS". 2049: * 2050: * @since 1.4 2051: */ 2052: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 2053: 2054: /** 2055: * Weak bidirectional character type "NSM". 2056: * 2057: * @since 1.4 2058: */ 2059: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 2060: 2061: /** 2062: * Weak bidirectional character type "BN". 2063: * 2064: * @since 1.4 2065: */ 2066: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 2067: 2068: /** 2069: * Neutral bidirectional character type "B". 2070: * 2071: * @since 1.4 2072: */ 2073: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 2074: 2075: /** 2076: * Neutral bidirectional character type "S". 2077: * 2078: * @since 1.4 2079: */ 2080: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 2081: 2082: /** 2083: * Neutral bidirectional character type "WS". 2084: * 2085: * @since 1.4 2086: */ 2087: public static final byte DIRECTIONALITY_WHITESPACE = 12; 2088: 2089: /** 2090: * Neutral bidirectional character type "ON". 2091: * 2092: * @since 1.4 2093: */ 2094: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 2095: 2096: /** 2097: * Strong bidirectional character type "LRE". 2098: * 2099: * @since 1.4 2100: */ 2101: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 2102: 2103: /** 2104: * Strong bidirectional character type "LRO".
2105: * 2106: * @since 1.4 2107: */ 2108: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 2109: 2110: /** 2111: * Strong bidirectional character type "RLE". 2112: * 2113: * @since 1.4 2114: */ 2115: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 2116: 2117: /** 2118: * Strong bidirectional character type "RLO". 2119: * 2120: * @since 1.4 2121: */ 2122: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 2123: 2124: /** 2125: * Weak bidirectional character type "PDF". 2126: * 2127: * @since 1.4 2128: */ 2129: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 2130: 2131: /** 2132: * Mask for grabbing the type out of the result of readChar. 2133: * @see #readChar(char) 2134: */ 2135: private static final int TYPE_MASK = 0x1F; 2136: 2137: /** 2138: * Mask for grabbing the non-breaking space flag out of the result of 2139: * readChar. 2140: * @see #readChar(char) 2141: */ 2142: private static final int NO_BREAK_MASK = 0x20; 2143: 2144: /** 2145: * Mask for grabbing the mirrored directionality flag out of the result 2146: * of readChar. 2147: * @see #readChar(char) 2148: */ 2149: private static final int MIRROR_MASK = 0x40; 2150: 2151: /** 2152: * Grabs an attribute offset from the Unicode attribute database. The lower 2153: * 5 bits are the character type, the next 2 bits are flags, and the top 2154: * 9 bits are the offset into the attribute tables. Note that the top 9 2155: * bits are meaningless in this context; they are useful only in the native 2156: * code. 2157: * 2158: * @param ch the character to look up 2159: * @return the character's attribute offset and type 2160: * @see #TYPE_MASK 2161: * @see #NO_BREAK_MASK 2162: * @see #MIRROR_MASK 2163: */ 2164: private static native char readChar(char ch); 2165: 2166: /** 2167: * Grabs an attribute offset from the Unicode attribute database. 
The lower 2168: * 5 bits are the character type, the next 2 bits are flags, and the top 2169: * 9 bits are the offset into the attribute tables. Note that the top 9 2170: * bits are meaningless in this context; they are useful only in the native 2171: * code. 2172: * 2173: * @param codePoint the character to look up 2174: * @return the character's attribute offset and type 2175: * @see #TYPE_MASK 2176: * @see #NO_BREAK_MASK 2177: * @see #MIRROR_MASK 2178: */ 2179: private static native char readCodePoint(int codePoint); 2180: 2181: /** 2182: * Wraps up a character. 2183: * 2184: * @param value the character to wrap 2185: */ 2186: public Character(char value) 2187: { 2188: this.value = value; 2189: } 2190: 2191: /** 2192: * Returns the character which has been wrapped by this class. 2193: * 2194: * @return the character wrapped 2195: */ 2196: public char charValue() 2197: { 2198: return value; 2199: } 2200: 2201: /** 2202: * Returns the numerical value (unsigned) of the wrapped character. 2203: * Range of returned values: 0x0000-0xFFFF. 2204: * 2205: * @return the value of the wrapped character 2206: */ 2207: public int hashCode() 2208: { 2209: return value; 2210: } 2211: 2212: /** 2213: * Determines if an object is equal to this object. This is only true for 2214: * another Character object wrapping the same value. 2215: * 2216: * @param o object to compare 2217: * @return true if o is a Character with the same value 2218: */ 2219: public boolean equals(Object o) 2220: { 2221: return o instanceof Character && value == ((Character) o).value; 2222: } 2223: 2224: /** 2225: * Converts the wrapped character into a String. 2226: * 2227: * @return a String containing one character -- the wrapped character 2228: * of this instance 2229: */ 2230: public String toString() 2231: { 2232: // This assumes that String.valueOf(char) can create a single-character 2233: // String more efficiently than through the public API. 
2234: return String.valueOf(value); 2235: } 2236: 2237: /** 2238: * Returns a String of length 1 representing the specified character. 2239: * 2240: * @param ch the character to convert 2241: * @return a String containing the character 2242: * @since 1.4 2243: */ 2244: public static String toString(char ch) 2245: { 2246: // This assumes that String.valueOf(char) can create a single-character 2247: // String more efficiently than through the public API. 2248: return String.valueOf(ch); 2249: } 2250: 2251: /** 2252: * Determines if a character is a Unicode lowercase letter. For example, 2253: * <code>'a'</code> is lowercase. 2254: * <br> 2255: * lowercase = [Ll] 2256: * 2257: * @param ch character to test 2258: * @return true if ch is a Unicode lowercase letter, else false 2259: * @see #isUpperCase(char) 2260: * @see #isTitleCase(char) 2261: * @see #toLowerCase(char) 2262: * @see #getType(char) 2263: */ 2264: public static boolean isLowerCase(char ch) 2265: { 2266: return getType(ch) == LOWERCASE_LETTER; 2267: } 2268: 2269: /** 2270: * Determines if a character is a Unicode lowercase letter. For example, 2271: * <code>'a'</code> is lowercase. Unlike isLowerCase(char), this method 2272: * supports supplementary Unicode code points. 2273: * <br> 2274: * lowercase = [Ll] 2275: * 2276: * @param codePoint character to test 2277: * @return true if codePoint is a Unicode lowercase letter, else false 2278: * @see #isUpperCase(int) 2279: * @see #isTitleCase(int) 2280: * @see #toLowerCase(int) 2281: * @see #getType(int) 2282: * @since 1.5 2283: */ 2284: public static boolean isLowerCase(int codePoint) 2285: { 2286: return getType(codePoint) == LOWERCASE_LETTER; 2287: } 2288: 2289: /** 2290: * Determines if a character is a Unicode uppercase letter. For example, 2291: * <code>'A'</code> is uppercase. 
2292: * <br> 2293: * uppercase = [Lu] 2294: * 2295: * @param ch character to test 2296: * @return true if ch is a Unicode uppercase letter, else false 2297: * @see #isLowerCase(char) 2298: * @see #isTitleCase(char) 2299: * @see #toUpperCase(char) 2300: * @see #getType(char) 2301: */ 2302: public static boolean isUpperCase(char ch) 2303: { 2304: return getType(ch) == UPPERCASE_LETTER; 2305: } 2306: 2307: /** 2308: * Determines if a character is a Unicode uppercase letter. For example, 2309: * <code>'A'</code> is uppercase. Unlike isUpperCase(char), this method 2310: * supports supplementary Unicode code points. 2311: * <br> 2312: * uppercase = [Lu] 2313: * 2314: * @param codePoint character to test 2315: * @return true if codePoint is a Unicode uppercase letter, else false 2316: * @see #isLowerCase(int) 2317: * @see #isTitleCase(int) 2318: * @see #toUpperCase(int) 2319: * @see #getType(int) 2320: * @since 1.5 2321: */ 2322: public static boolean isUpperCase(int codePoint) 2323: { 2324: return getType(codePoint) == UPPERCASE_LETTER; 2325: } 2326: 2327: /** 2328: * Determines if a character is a Unicode titlecase letter. For example, 2329: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2330: * <br> 2331: * titlecase = [Lt] 2332: * 2333: * @param ch character to test 2334: * @return true if ch is a Unicode titlecase letter, else false 2335: * @see #isLowerCase(char) 2336: * @see #isUpperCase(char) 2337: * @see #toTitleCase(char) 2338: * @see #getType(char) 2339: */ 2340: public static boolean isTitleCase(char ch) 2341: { 2342: return getType(ch) == TITLECASE_LETTER; 2343: } 2344: 2345: /** 2346: * Determines if a character is a Unicode titlecase letter. For example, 2347: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2348: * Unlike isTitleCase(char), this method supports supplementary Unicode 2349: * code points. 
2350: * <br> 2351: * titlecase = [Lt] 2352: * 2353: * @param codePoint character to test 2354: * @return true if codePoint is a Unicode titlecase letter, else false 2355: * @see #isLowerCase(int) 2356: * @see #isUpperCase(int) 2357: * @see #toTitleCase(int) 2358: * @see #getType(int) 2359: * @since 1.5 2360: */ 2361: public static boolean isTitleCase(int codePoint) 2362: { 2363: return getType(codePoint) == TITLECASE_LETTER; 2364: } 2365: 2366: /** 2367: * Determines if a character is a Unicode decimal digit. For example, 2368: * <code>'0'</code> is a digit. 2369: * <br> 2370: * Unicode decimal digit = [Nd] 2371: * 2372: * @param ch character to test 2373: * @return true if ch is a Unicode decimal digit, else false 2374: * @see #digit(char, int) 2375: * @see #forDigit(int, int) 2376: * @see #getType(char) 2377: */ 2378: public static boolean isDigit(char ch) 2379: { 2380: return getType(ch) == DECIMAL_DIGIT_NUMBER; 2381: } 2382: 2383: /** 2384: * Determines if a character is a Unicode decimal digit. For example, 2385: * <code>'0'</code> is a digit. Unlike isDigit(char), this method 2386: * supports supplementary Unicode code points. 2387: * <br> 2388: * Unicode decimal digit = [Nd] 2389: * 2390: * @param codePoint character to test 2391: * @return true if codePoint is a Unicode decimal digit, else false 2392: * @see #digit(int, int) 2393: * @see #forDigit(int, int) 2394: * @see #getType(int) 2395: * @since 1.5 2396: */ 2397: public static boolean isDigit(int codePoint) 2398: { 2399: return getType(codePoint) == DECIMAL_DIGIT_NUMBER; 2400: } 2401: 2402: /** 2403: * Determines if a character is part of the Unicode Standard. This is an 2404: * evolving standard, but covers every character in the data file.
2405: * <br> 2406: * defined = not [Cn] 2407: * 2408: * @param ch character to test 2409: * @return true if ch is a Unicode character, else false 2410: * @see #isDigit(char) 2411: * @see #isLetter(char) 2412: * @see #isLetterOrDigit(char) 2413: * @see #isLowerCase(char) 2414: * @see #isTitleCase(char) 2415: * @see #isUpperCase(char) 2416: */ 2417: public static boolean isDefined(char ch) 2418: { 2419: return getType(ch) != UNASSIGNED; 2420: } 2421: 2422: /** 2423: * Determines if a character is part of the Unicode Standard. This is an 2424: * evolving standard, but covers every character in the data file. Unlike 2425: * isDefined(char), this method supports supplementary Unicode code points. 2426: * <br> 2427: * defined = not [Cn] 2428: * 2429: * @param codePoint character to test 2430: * @return true if codePoint is a Unicode character, else false 2431: * @see #isDigit(int) 2432: * @see #isLetter(int) 2433: * @see #isLetterOrDigit(int) 2434: * @see #isLowerCase(int) 2435: * @see #isTitleCase(int) 2436: * @see #isUpperCase(int) 2437: * @since 1.5 2438: */ 2439: public static boolean isDefined(int codePoint) 2440: { 2441: return getType(codePoint) != UNASSIGNED; 2442: } 2443: 2444: /** 2445: * Determines if a character is a Unicode letter. Not all letters have case, 2446: * so this may return true when isLowerCase and isUpperCase return false. 
2447: * <br> 2448: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2449: * 2450: * @param ch character to test 2451: * @return true if ch is a Unicode letter, else false 2452: * @see #isDigit(char) 2453: * @see #isJavaIdentifierStart(char) 2454: * @see #isJavaLetter(char) 2455: * @see #isJavaLetterOrDigit(char) 2456: * @see #isLetterOrDigit(char) 2457: * @see #isLowerCase(char) 2458: * @see #isTitleCase(char) 2459: * @see #isUnicodeIdentifierStart(char) 2460: * @see #isUpperCase(char) 2461: */ 2462: public static boolean isLetter(char ch) 2463: { 2464: return ((1 << getType(ch)) 2465: & ((1 << UPPERCASE_LETTER) 2466: | (1 << LOWERCASE_LETTER) 2467: | (1 << TITLECASE_LETTER) 2468: | (1 << MODIFIER_LETTER) 2469: | (1 << OTHER_LETTER))) != 0; 2470: } 2471: 2472: /** 2473: * Determines if a character is a Unicode letter. Not all letters have case, 2474: * so this may return true when isLowerCase and isUpperCase return false. 2475: * Unlike isLetter(char), this method supports supplementary Unicode code 2476: * points. 2477: * <br> 2478: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2479: * 2480: * @param codePoint character to test 2481: * @return true if codePoint is a Unicode letter, else false 2482: * @see #isDigit(int) 2483: * @see #isJavaIdentifierStart(int) 2484: * @see #isJavaLetter(int) 2485: * @see #isJavaLetterOrDigit(int) 2486: * @see #isLetterOrDigit(int) 2487: * @see #isLowerCase(int) 2488: * @see #isTitleCase(int) 2489: * @see #isUnicodeIdentifierStart(int) 2490: * @see #isUpperCase(int) 2491: * @since 1.5 2492: */ 2493: public static boolean isLetter(int codePoint) 2494: { 2495: return ((1 << getType(codePoint)) 2496: & ((1 << UPPERCASE_LETTER) 2497: | (1 << LOWERCASE_LETTER) 2498: | (1 << TITLECASE_LETTER) 2499: | (1 << MODIFIER_LETTER) 2500: | (1 << OTHER_LETTER))) != 0; 2501: } 2502: 2503: /** 2504: * Returns the index into the given CharSequence that is offset 2505: * <code>codePointOffset</code> code points from <code>index</code>. 
2506: * @param seq the CharSequence 2507: * @param index the start position in the CharSequence 2508: * @param codePointOffset the number of code points offset from the start 2509: * position 2510: * @return the index into the CharSequence that is codePointOffset code 2511: * points offset from index 2512: * 2513: * @throws NullPointerException if seq is null 2514: * @throws IndexOutOfBoundsException if index is negative or greater than the 2515: * length of the sequence. 2516: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 2517: * subsequence from index to the end of seq has fewer than codePointOffset 2518: * code points 2519: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 2520: * subsequence from the start of seq to index has fewer than 2521: * (-codePointOffset) code points 2522: * @since 1.5 2523: */ 2524: public static int offsetByCodePoints(CharSequence seq, 2525: int index, 2526: int codePointOffset) 2527: { 2528: int len = seq.length(); 2529: if (index < 0 || index > len) 2530: throw new IndexOutOfBoundsException(); 2531: 2532: int numToGo = codePointOffset; 2533: int offset = index; 2534: int adjust = 1; 2535: if (numToGo >= 0) 2536: { 2537: for (; numToGo > 0; offset++) 2538: { 2539: numToGo--; 2540: if (Character.isHighSurrogate(seq.charAt(offset)) 2541: && (offset + 1) < len 2542: && Character.isLowSurrogate(seq.charAt(offset + 1))) 2543: offset++; 2544: } 2545: return offset; 2546: } 2547: else 2548: { 2549: numToGo *= -1; 2550: for (; numToGo > 0;) 2551: { 2552: numToGo--; 2553: offset--; 2554: if (Character.isLowSurrogate(seq.charAt(offset)) 2555: && (offset - 1) >= 0 2556: && Character.isHighSurrogate(seq.charAt(offset - 1))) 2557: offset--; 2558: } 2559: return offset; 2560: } 2561: } 2562: 2563: /** 2564: * Returns the index into the given char subarray that is offset 2565: * <code>codePointOffset</code> code points from <code>index</code>. 
2566: * @param a the char array 2567: * @param start the start index of the subarray 2568: * @param count the length of the subarray 2569: * @param index the index to be offset 2570: * @param codePointOffset the number of code points offset from <code>index 2571: * </code> 2572: * @return the index into the char array 2573: * 2574: * @throws NullPointerException if a is null 2575: * @throws IndexOutOfBoundsException if start or count is negative or if 2576: * start + count is greater than the length of the array 2577: * @throws IndexOutOfBoundsException if index is less than start or larger 2578: * than start + count 2579: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 2580: * subarray from index to start + count - 1 has fewer than codePointOffset 2581: * code points. 2582: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 2583: * subarray from start to index - 1 has fewer than (-codePointOffset) code 2584: * points 2585: * @since 1.5 2586: 2587: */ 2588: public static int offsetByCodePoints(char[] a, 2589: int start, 2590: int count, 2591: int index, 2592: int codePointOffset) 2593: { 2594: int len = a.length; 2595: int end = start + count; 2596: if (start < 0 || count < 0 || end > len || index < start || index > end) 2597: throw new IndexOutOfBoundsException(); 2598: 2599: int numToGo = codePointOffset; 2600: int offset = index; 2601: int adjust = 1; 2602: if (numToGo >= 0) 2603: { 2604: for (; numToGo > 0; offset++) 2605: { 2606: numToGo--; 2607: if (Character.isHighSurrogate(a[offset]) 2608: && (offset + 1) < len 2609: && Character.isLowSurrogate(a[offset + 1])) 2610: offset++; 2611: } 2612: return offset; 2613: } 2614: else 2615: { 2616: numToGo *= -1; 2617: for (; numToGo > 0;) 2618: { 2619: numToGo--; 2620: offset--; 2621: if (Character.isLowSurrogate(a[offset]) 2622: && (offset - 1) >= 0 2623: && Character.isHighSurrogate(a[offset - 1])) 2624: offset--; 2625: if (offset < start) 2626: throw new 
IndexOutOfBoundsException(); 2627: } 2628: return offset; 2629: } 2630: 2631: } 2632: 2633: /** 2634: * Returns the number of Unicode code points in the specified range of the 2635: * given CharSequence. The first char in the range is at position 2636: * beginIndex and the last one is at position endIndex - 1. Paired 2637: * surrogates (supplementary characters are represented by a pair of chars - 2638: * one from the high surrogates and one from the low surrogates) 2639: * count as just one code point. 2640: * @param seq the CharSequence to inspect 2641: * @param beginIndex the beginning of the range 2642: * @param endIndex the end of the range 2643: * @return the number of Unicode code points in the given range of the 2644: * sequence 2645: * @throws NullPointerException if seq is null 2646: * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is 2647: * larger than the length of seq, or if beginIndex is greater than endIndex. 2648: * @since 1.5 2649: */ 2650: public static int codePointCount(CharSequence seq, int beginIndex, 2651: int endIndex) 2652: { 2653: int len = seq.length(); 2654: if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) 2655: throw new IndexOutOfBoundsException(); 2656: 2657: int count = 0; 2658: for (int i = beginIndex; i < endIndex; i++) 2659: { 2660: count++; 2661: // If there is a pairing, count it only once. 2662: if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex 2663: && isLowSurrogate(seq.charAt(i + 1))) 2664: i ++; 2665: } 2666: return count; 2667: } 2668: 2669: /** 2670: * Returns the number of Unicode code points in the specified range of the 2671: * given char array. The first char in the range is at position 2672: * offset and the length of the range is count. Paired surrogates 2673: * (supplementary characters are represented by a pair of chars - 2674: * one from the high surrogates and one from the low surrogates) 2675: * count as just one code point. 
2676: * @param a the char array to inspect 2677: * @param offset the beginning of the range 2678: * @param count the length of the range 2679: * @return the number of Unicode code points in the given range of the 2680: * array 2681: * @throws NullPointerException if a is null 2682: * @throws IndexOutOfBoundsException if offset or count is negative or if 2683: * offset + countendIndex is larger than the length of a. 2684: * @since 1.5 2685: */ 2686: public static int codePointCount(char[] a, int offset, 2687: int count) 2688: { 2689: int len = a.length; 2690: int end = offset + count; 2691: if (offset < 0 || count < 0 || end > len) 2692: throw new IndexOutOfBoundsException(); 2693: 2694: int counter = 0; 2695: for (int i = offset; i < end; i++) 2696: { 2697: counter++; 2698: // If there is a pairing, count it only once. 2699: if (isHighSurrogate(a[i]) && (i + 1) < end 2700: && isLowSurrogate(a[i + 1])) 2701: i ++; 2702: } 2703: return counter; 2704: } 2705: 2706: /** 2707: * Determines if a character is a Unicode letter or a Unicode digit. This 2708: * is the combination of isLetter and isDigit. 2709: * <br> 2710: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 2711: * 2712: * @param ch character to test 2713: * @return true if ch is a Unicode letter or a Unicode digit, else false 2714: * @see #isDigit(char) 2715: * @see #isJavaIdentifierPart(char) 2716: * @see #isJavaLetter(char) 2717: * @see #isJavaLetterOrDigit(char) 2718: * @see #isLetter(char) 2719: * @see #isUnicodeIdentifierPart(char) 2720: */ 2721: public static boolean isLetterOrDigit(char ch) 2722: { 2723: return ((1 << getType(ch)) 2724: & ((1 << UPPERCASE_LETTER) 2725: | (1 << LOWERCASE_LETTER) 2726: | (1 << TITLECASE_LETTER) 2727: | (1 << MODIFIER_LETTER) 2728: | (1 << OTHER_LETTER) 2729: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 2730: } 2731: 2732: /** 2733: * Determines if a character is a Unicode letter or a Unicode digit. This 2734: * is the combination of isLetter and isDigit. 
Unlike isLetterOrDigit(char), 2735: * this method supports supplementary Unicode code points. 2736: * <br> 2737: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 2738: * 2739: * @param codePoint character to test 2740: * @return true if codePoint is a Unicode letter or a Unicode digit, else false 2741: * @see #isDigit(int) 2742: * @see #isJavaIdentifierPart(int) 2743: * @see #isJavaLetter(int) 2744: * @see #isJavaLetterOrDigit(int) 2745: * @see #isLetter(int) 2746: * @see #isUnicodeIdentifierPart(int) 2747: * @since 1.5 2748: */ 2749: public static boolean isLetterOrDigit(int codePoint) 2750: { 2751: return ((1 << getType(codePoint) 2752: & ((1 << UPPERCASE_LETTER) 2753: | (1 << LOWERCASE_LETTER) 2754: | (1 << TITLECASE_LETTER) 2755: | (1 << MODIFIER_LETTER) 2756: | (1 << OTHER_LETTER) 2757: | (1 << DECIMAL_DIGIT_NUMBER))) != 0); 2758: } 2759: 2760: /** 2761: * Determines if a character can start a Java identifier. This is the 2762: * combination of isLetter, any character where getType returns 2763: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2764: * (like '_'). 2765: * 2766: * @param ch character to test 2767: * @return true if ch can start a Java identifier, else false 2768: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 2769: * @see #isJavaLetterOrDigit(char) 2770: * @see #isJavaIdentifierStart(char) 2771: * @see #isJavaIdentifierPart(char) 2772: * @see #isLetter(char) 2773: * @see #isLetterOrDigit(char) 2774: * @see #isUnicodeIdentifierStart(char) 2775: */ 2776: public static boolean isJavaLetter(char ch) 2777: { 2778: return isJavaIdentifierStart(ch); 2779: } 2780: 2781: /** 2782: * Determines if a character can start a Java identifier. This is the 2783: * combination of isLetter, any character where getType returns 2784: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2785: * (like '_'). Unlike isJavaIdentifierStart(char), this method supports 2786: * supplementary Unicode code points. 
2787: * <br> 2788: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 2789: * 2790: * @param codePoint character to test 2791: * @return true if codePoint can start a Java identifier, else false 2792: * @see #isJavaIdentifierPart(int) 2793: * @see #isLetter(int) 2794: * @see #isUnicodeIdentifierStart(int) 2795: * @since 1.5 2796: */ 2797: public static boolean isJavaIdentifierStart(int codePoint) 2798: { 2799: return ((1 << getType(codePoint)) 2800: & ((1 << UPPERCASE_LETTER) 2801: | (1 << LOWERCASE_LETTER) 2802: | (1 << TITLECASE_LETTER) 2803: | (1 << MODIFIER_LETTER) 2804: | (1 << OTHER_LETTER) 2805: | (1 << LETTER_NUMBER) 2806: | (1 << CURRENCY_SYMBOL) 2807: | (1 << CONNECTOR_PUNCTUATION))) != 0; 2808: } 2809: 2810: /** 2811: * Determines if a character can follow the first letter in 2812: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2813: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2814: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2815: * or isIdentifierIgnorable. 2816: * 2817: * @param ch character to test 2818: * @return true if ch can follow the first letter in a Java identifier 2819: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 2820: * @see #isJavaLetter(char) 2821: * @see #isJavaIdentifierStart(char) 2822: * @see #isJavaIdentifierPart(char) 2823: * @see #isLetter(char) 2824: * @see #isLetterOrDigit(char) 2825: * @see #isUnicodeIdentifierPart(char) 2826: * @see #isIdentifierIgnorable(char) 2827: */ 2828: public static boolean isJavaLetterOrDigit(char ch) 2829: { 2830: return isJavaIdentifierPart(ch); 2831: } 2832: 2833: /** 2834: * Determines if a character can start a Java identifier. This is the 2835: * combination of isLetter, any character where getType returns 2836: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2837: * (like '_'). 
2838: * <br> 2839: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 2840: * 2841: * @param ch character to test 2842: * @return true if ch can start a Java identifier, else false 2843: * @see #isJavaIdentifierPart(char) 2844: * @see #isLetter(char) 2845: * @see #isUnicodeIdentifierStart(char) 2846: * @since 1.1 2847: */ 2848: public static boolean isJavaIdentifierStart(char ch) 2849: { 2850: return ((1 << getType(ch)) 2851: & ((1 << UPPERCASE_LETTER) 2852: | (1 << LOWERCASE_LETTER) 2853: | (1 << TITLECASE_LETTER) 2854: | (1 << MODIFIER_LETTER) 2855: | (1 << OTHER_LETTER) 2856: | (1 << LETTER_NUMBER) 2857: | (1 << CURRENCY_SYMBOL) 2858: | (1 << CONNECTOR_PUNCTUATION))) != 0; 2859: } 2860: 2861: /** 2862: * Determines if a character can follow the first letter in 2863: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2864: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2865: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2866: * or isIdentifierIgnorable. 
2867: * <br> 2868: * Java identifier extender = 2869: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 2870: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2871: * 2872: * @param ch character to test 2873: * @return true if ch can follow the first letter in a Java identifier 2874: * @see #isIdentifierIgnorable(char) 2875: * @see #isJavaIdentifierStart(char) 2876: * @see #isLetterOrDigit(char) 2877: * @see #isUnicodeIdentifierPart(char) 2878: * @since 1.1 2879: */ 2880: public static boolean isJavaIdentifierPart(char ch) 2881: { 2882: int category = getType(ch); 2883: return ((1 << category) 2884: & ((1 << UPPERCASE_LETTER) 2885: | (1 << LOWERCASE_LETTER) 2886: | (1 << TITLECASE_LETTER) 2887: | (1 << MODIFIER_LETTER) 2888: | (1 << OTHER_LETTER) 2889: | (1 << NON_SPACING_MARK) 2890: | (1 << COMBINING_SPACING_MARK) 2891: | (1 << DECIMAL_DIGIT_NUMBER) 2892: | (1 << LETTER_NUMBER) 2893: | (1 << CURRENCY_SYMBOL) 2894: | (1 << CONNECTOR_PUNCTUATION) 2895: | (1 << FORMAT))) != 0 2896: || (category == CONTROL && isIdentifierIgnorable(ch)); 2897: } 2898: 2899: /** 2900: * Determines if a character can follow the first letter in 2901: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2902: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2903: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2904: * or isIdentifierIgnorable. Unlike isJavaIdentifierPart(char), this method 2905: * supports supplementary Unicode code points. 
2906: * <br> 2907: * Java identifier extender = 2908: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 2909: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2910: * 2911: * @param codePoint character to test 2912: * @return true if codePoint can follow the first letter in a Java identifier 2913: * @see #isIdentifierIgnorable(int) 2914: * @see #isJavaIdentifierStart(int) 2915: * @see #isLetterOrDigit(int) 2916: * @see #isUnicodeIdentifierPart(int) 2917: * @since 1.5 2918: */ 2919: public static boolean isJavaIdentifierPart(int codePoint) 2920: { 2921: int category = getType(codePoint); 2922: return ((1 << category) 2923: & ((1 << UPPERCASE_LETTER) 2924: | (1 << LOWERCASE_LETTER) 2925: | (1 << TITLECASE_LETTER) 2926: | (1 << MODIFIER_LETTER) 2927: | (1 << OTHER_LETTER) 2928: | (1 << NON_SPACING_MARK) 2929: | (1 << COMBINING_SPACING_MARK) 2930: | (1 << DECIMAL_DIGIT_NUMBER) 2931: | (1 << LETTER_NUMBER) 2932: | (1 << CURRENCY_SYMBOL) 2933: | (1 << CONNECTOR_PUNCTUATION) 2934: | (1 << FORMAT))) != 0 2935: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 2936: } 2937: 2938: /** 2939: * Determines if a character can start a Unicode identifier. Only 2940: * letters can start a Unicode identifier, but this includes characters 2941: * in LETTER_NUMBER. 
2942: * <br> 2943: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 2944: * 2945: * @param ch character to test 2946: * @return true if ch can start a Unicode identifier, else false 2947: * @see #isJavaIdentifierStart(char) 2948: * @see #isLetter(char) 2949: * @see #isUnicodeIdentifierPart(char) 2950: * @since 1.1 2951: */ 2952: public static boolean isUnicodeIdentifierStart(char ch) 2953: { 2954: return ((1 << getType(ch)) 2955: & ((1 << UPPERCASE_LETTER) 2956: | (1 << LOWERCASE_LETTER) 2957: | (1 << TITLECASE_LETTER) 2958: | (1 << MODIFIER_LETTER) 2959: | (1 << OTHER_LETTER) 2960: | (1 << LETTER_NUMBER))) != 0; 2961: } 2962: 2963: /** 2964: * Determines if a character can start a Unicode identifier. Only 2965: * letters can start a Unicode identifier, but this includes characters 2966: * in LETTER_NUMBER. Unlike isUnicodeIdentifierStart(char), this method 2967: * supports supplementary Unicode code points. 2968: * <br> 2969: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 2970: * 2971: * @param codePoint character to test 2972: * @return true if codePoint can start a Unicode identifier, else false 2973: * @see #isJavaIdentifierStart(int) 2974: * @see #isLetter(int) 2975: * @see #isUnicodeIdentifierPart(int) 2976: * @since 1.5 2977: */ 2978: public static boolean isUnicodeIdentifierStart(int codePoint) 2979: { 2980: return ((1 << getType(codePoint)) 2981: & ((1 << UPPERCASE_LETTER) 2982: | (1 << LOWERCASE_LETTER) 2983: | (1 << TITLECASE_LETTER) 2984: | (1 << MODIFIER_LETTER) 2985: | (1 << OTHER_LETTER) 2986: | (1 << LETTER_NUMBER))) != 0; 2987: } 2988: 2989: /** 2990: * Determines if a character can follow the first letter in 2991: * a Unicode identifier. This includes letters, connecting punctuation, 2992: * digits, numeric letters, combining marks, non-spacing marks, and 2993: * isIdentifierIgnorable. 
2994: * <br> 2995: * Unicode identifier extender = 2996: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 2997: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2998: * 2999: * @param ch character to test 3000: * @return true if ch can follow the first letter in a Unicode identifier 3001: * @see #isIdentifierIgnorable(char) 3002: * @see #isJavaIdentifierPart(char) 3003: * @see #isLetterOrDigit(char) 3004: * @see #isUnicodeIdentifierStart(char) 3005: * @since 1.1 3006: */ 3007: public static boolean isUnicodeIdentifierPart(char ch) 3008: { 3009: int category = getType(ch); 3010: return ((1 << category) 3011: & ((1 << UPPERCASE_LETTER) 3012: | (1 << LOWERCASE_LETTER) 3013: | (1 << TITLECASE_LETTER) 3014: | (1 << MODIFIER_LETTER) 3015: | (1 << OTHER_LETTER) 3016: | (1 << NON_SPACING_MARK) 3017: | (1 << COMBINING_SPACING_MARK) 3018: | (1 << DECIMAL_DIGIT_NUMBER) 3019: | (1 << LETTER_NUMBER) 3020: | (1 << CONNECTOR_PUNCTUATION) 3021: | (1 << FORMAT))) != 0 3022: || (category == CONTROL && isIdentifierIgnorable(ch)); 3023: } 3024: 3025: /** 3026: * Determines if a character can follow the first letter in 3027: * a Unicode identifier. This includes letters, connecting punctuation, 3028: * digits, numeric letters, combining marks, non-spacing marks, and 3029: * isIdentifierIgnorable. Unlike isUnicodeIdentifierPart(char), this method 3030: * supports supplementary Unicode code points. 
3031: * <br> 3032: * Unicode identifier extender = 3033: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3034: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3035: * 3036: * @param codePoint character to test 3037: * @return true if codePoint can follow the first letter in a Unicode 3038: * identifier 3039: * @see #isIdentifierIgnorable(int) 3040: * @see #isJavaIdentifierPart(int) 3041: * @see #isLetterOrDigit(int) 3042: * @see #isUnicodeIdentifierStart(int) 3043: * @since 1.5 3044: */ 3045: public static boolean isUnicodeIdentifierPart(int codePoint) 3046: { 3047: int category = getType(codePoint); 3048: return ((1 << category) 3049: & ((1 << UPPERCASE_LETTER) 3050: | (1 << LOWERCASE_LETTER) 3051: | (1 << TITLECASE_LETTER) 3052: | (1 << MODIFIER_LETTER) 3053: | (1 << OTHER_LETTER) 3054: | (1 << NON_SPACING_MARK) 3055: | (1 << COMBINING_SPACING_MARK) 3056: | (1 << DECIMAL_DIGIT_NUMBER) 3057: | (1 << LETTER_NUMBER) 3058: | (1 << CONNECTOR_PUNCTUATION) 3059: | (1 << FORMAT))) != 0 3060: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3061: } 3062: 3063: /** 3064: * Determines if a character is ignorable in a Unicode identifier. This 3065: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3066: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3067: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3068: * <code>'\u009F'</code>), and FORMAT characters. 
3069: * <br> 3070: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3071: * |U+007F-U+009F 3072: * 3073: * @param ch character to test 3074: * @return true if ch is ignorable in a Unicode or Java identifier 3075: * @see #isJavaIdentifierPart(char) 3076: * @see #isUnicodeIdentifierPart(char) 3077: * @since 1.1 3078: */ 3079: public static boolean isIdentifierIgnorable(char ch) 3080: { 3081: return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F' 3082: || (ch <= '\u001B' && ch >= '\u000E'))) 3083: || getType(ch) == FORMAT; 3084: } 3085: 3086: /** 3087: * Determines if a character is ignorable in a Unicode identifier. This 3088: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3089: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3090: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3091: * <code>'\u009F'</code>), and FORMAT characters. Unlike 3092: * isIdentifierIgnorable(char), this method supports supplementary Unicode 3093: * code points. 3094: * <br> 3095: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3096: * |U+007F-U+009F 3097: * 3098: * @param codePoint character to test 3099: * @return true if codePoint is ignorable in a Unicode or Java identifier 3100: * @see #isJavaIdentifierPart(int) 3101: * @see #isUnicodeIdentifierPart(int) 3102: * @since 1.5 3103: */ 3104: public static boolean isIdentifierIgnorable(int codePoint) 3105: { 3106: return ((codePoint >= 0 && codePoint <= 0x0008) 3107: || (codePoint >= 0x000E && codePoint <= 0x001B) 3108: || (codePoint >= 0x007F && codePoint <= 0x009F) 3109: || getType(codePoint) == FORMAT); 3110: } 3111: 3112: /** 3113: * Converts a Unicode character into its lowercase equivalent mapping. 3114: * If a mapping does not exist, then the character passed is returned. 3115: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 
3116: * 3117: * @param ch character to convert to lowercase 3118: * @return lowercase mapping of ch, or ch if lowercase mapping does 3119: * not exist 3120: * @see #isLowerCase(char) 3121: * @see #isUpperCase(char) 3122: * @see #toTitleCase(char) 3123: * @see #toUpperCase(char) 3124: */ 3125: public static native char toLowerCase(char ch); 3126: 3127: /** 3128: * Converts a Unicode character into its lowercase equivalent mapping. 3129: * If a mapping does not exist, then the character passed is returned. 3130: * Note that isLowerCase(toLowerCase(codePoint)) does not always return true. 3131: * Unlike toLowerCase(char), this method supports supplementary Unicode 3132: * code points. 3133: * 3134: * @param codePoint character to convert to lowercase 3135: * @return lowercase mapping of codePoint, or codePoint if lowercase 3136: * mapping does not exist 3137: * @see #isLowerCase(int) 3138: * @see #isUpperCase(int) 3139: * @see #toTitleCase(int) 3140: * @see #toUpperCase(int) 3141: * @since 1.5 3142: */ 3143: public static native int toLowerCase(int codePoint); 3144: 3145: /** 3146: * Converts a Unicode character into its uppercase equivalent mapping. 3147: * If a mapping does not exist, then the character passed is returned. 3148: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 3149: * 3150: * @param ch character to convert to uppercase 3151: * @return uppercase mapping of ch, or ch if uppercase mapping does 3152: * not exist 3153: * @see #isLowerCase(char) 3154: * @see #isUpperCase(char) 3155: * @see #toLowerCase(char) 3156: * @see #toTitleCase(char) 3157: */ 3158: public static native char toUpperCase(char ch); 3159: 3160: /** 3161: * Converts a Unicode character into its uppercase equivalent mapping. 3162: * If a mapping does not exist, then the character passed is returned. 3163: * Note that isUpperCase(toUpperCase(codePoint)) does not always return true. 
3164: * Unlike toUpperCase(char), this method supports supplementary 3165: * Unicode code points. 3166: * 3167: * @param codePoint character to convert to uppercase 3168: * @return uppercase mapping of codePoint, or codePoint if uppercase 3169: * mapping does not exist 3170: * @see #isLowerCase(int) 3171: * @see #isUpperCase(int) 3172: * @see #toLowerCase(int) 3173: * @see #toTitleCase(int) 3174: * @since 1.5 3175: */ 3176: public static native int toUpperCase(int codePoint); 3177: 3178: /** 3179: * Converts a Unicode character into its titlecase equivalent mapping. 3180: * If a mapping does not exist, then the character passed is returned. 3181: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 3182: * 3183: * @param ch character to convert to titlecase 3184: * @return titlecase mapping of ch, or ch if titlecase mapping does 3185: * not exist 3186: * @see #isTitleCase(char) 3187: * @see #toLowerCase(char) 3188: * @see #toUpperCase(char) 3189: */ 3190: public static native char toTitleCase(char ch); 3191: 3192: /** 3193: * Converts a Unicode character into its titlecase equivalent mapping. 3194: * If a mapping does not exist, then the character passed is returned. 3195: * Note that isTitleCase(toTitleCase(codePoint)) does not always return true. 3196: * Unlike toTitleCase(char), this method supports supplementary 3197: * Unicode code points. 3198: * 3199: * @param codePoint character to convert to titlecase 3200: * @return titlecase mapping of codePoint, or codePoint if titlecase 3201: * mapping does not exist 3202: * @see #isTitleCase(int) 3203: * @see #toLowerCase(int) 3204: * @see #toUpperCase(int) 3205: * @since 1.5 3206: */ 3207: public static native int toTitleCase(int codePoint); 3208: 3209: /** 3210: * Converts a character into a digit of the specified radix. 
If the radix 3211: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3212: * exceeds the radix, or if ch is not a decimal digit or in the case 3213: * insensitive set of 'a'-'z', the result is -1. 3214: * <br> 3215: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3216: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3217: * 3218: * @param ch character to convert into a digit 3219: * @param radix radix in which ch is a digit 3220: * @return digit which ch represents in radix, or -1 not a valid digit 3221: * @see #MIN_RADIX 3222: * @see #MAX_RADIX 3223: * @see #forDigit(int, int) 3224: * @see #isDigit(char) 3225: * @see #getNumericValue(char) 3226: */ 3227: public static native int digit(char ch, int radix); 3228: 3229: /** 3230: * Converts a character into a digit of the specified radix. If the radix 3231: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(int) 3232: * exceeds the radix, or if codePoint is not a decimal digit or in the case 3233: * insensitive set of 'a'-'z', the result is -1. Unlike digit(char, int), 3234: * this method supports supplementary Unicode code points. 3235: * <br> 3236: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3237: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3238: * 3239: * @param codePoint character to convert into a digit 3240: * @param radix radix in which codePoint is a digit 3241: * @return digit which codePoint represents in radix, or -1 not a valid digit 3242: * @see #MIN_RADIX 3243: * @see #MAX_RADIX 3244: * @see #forDigit(int, int) 3245: * @see #isDigit(int) 3246: * @see #getNumericValue(int) 3247: * @since 1.5 3248: */ 3249: public static native int digit(int codePoint, int radix); 3250: 3251: /** 3252: * Returns the Unicode numeric value property of a character. For example, 3253: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 
3254: * 3255: * <p>This method also returns values for the letters A through Z, (not 3256: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3257: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3258: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3259: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3260: * <code>'\uFF5A'</code> (full width variants). 3261: * 3262: * <p>If the character lacks a numeric value property, -1 is returned. 3263: * If the character has a numeric value property which is not representable 3264: * as a nonnegative integer, such as a fraction, -2 is returned. 3265: * 3266: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3267: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3268: * 3269: * @param ch character from which the numeric value property will 3270: * be retrieved 3271: * @return the numeric value property of ch, or -1 if it does not exist, or 3272: * -2 if it is not representable as a nonnegative integer 3273: * @see #forDigit(int, int) 3274: * @see #digit(char, int) 3275: * @see #isDigit(char) 3276: * @since 1.1 3277: */ 3278: public static native int getNumericValue(char ch); 3279: 3280: /** 3281: * Returns the Unicode numeric value property of a character. For example, 3282: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 3283: * 3284: * <p>This method also returns values for the letters A through Z, (not 3285: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3286: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3287: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3288: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3289: * <code>'\uFF5A'</code> (full width variants). 3290: * 3291: * <p>If the character lacks a numeric value property, -1 is returned. 
3292: * If the character has a numeric value property which is not representable 3293: * as a nonnegative integer, such as a fraction, -2 is returned. 3294: * 3295: * Unlike getNumericValue(char), this method supports supplementary Unicode 3296: * code points. 3297: * 3298: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3299: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3300: * 3301: * @param codePoint character from which the numeric value property will 3302: * be retrieved 3303: * @return the numeric value property of codePoint, or -1 if it does not 3304: * exist, or -2 if it is not representable as a nonnegative integer 3305: * @see #forDigit(int, int) 3306: * @see #digit(int, int) 3307: * @see #isDigit(int) 3308: * @since 1.5 3309: */ 3310: public static native int getNumericValue(int codePoint); 3311: 3312: /** 3313: * Determines if a character is a ISO-LATIN-1 space. This is only the five 3314: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 3315: * <code>'\r'</code>, and <code>' '</code>. 3316: * <br> 3317: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 3318: * 3319: * @param ch character to test 3320: * @return true if ch is a space, else false 3321: * @deprecated Replaced by {@link #isWhitespace(char)} 3322: * @see #isSpaceChar(char) 3323: * @see #isWhitespace(char) 3324: */ 3325: public static boolean isSpace(char ch) 3326: { 3327: // Performing the subtraction up front alleviates need to compare longs. 3328: return ch-- <= ' ' && ((1 << ch) 3329: & ((1 << (' ' - 1)) 3330: | (1 << ('\t' - 1)) 3331: | (1 << ('\n' - 1)) 3332: | (1 << ('\r' - 1)) 3333: | (1 << ('\f' - 1)))) != 0; 3334: } 3335: 3336: /** 3337: * Determines if a character is a Unicode space character. This includes 3338: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 
3339: * <br> 3340: * Unicode space = [Zs]|[Zp]|[Zl] 3341: * 3342: * @param ch character to test 3343: * @return true if ch is a Unicode space, else false 3344: * @see #isWhitespace(char) 3345: * @since 1.1 3346: */ 3347: public static boolean isSpaceChar(char ch) 3348: { 3349: return ((1 << getType(ch)) 3350: & ((1 << SPACE_SEPARATOR) 3351: | (1 << LINE_SEPARATOR) 3352: | (1 << PARAGRAPH_SEPARATOR))) != 0; 3353: } 3354: 3355: /** 3356: * Determines if a character is a Unicode space character. This includes 3357: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. Unlike 3358: * isSpaceChar(char), this method supports supplementary Unicode code points. 3359: * <br> 3360: * Unicode space = [Zs]|[Zp]|[Zl] 3361: * 3362: * @param codePoint character to test 3363: * @return true if codePoint is a Unicode space, else false 3364: * @see #isWhitespace(int) 3365: * @since 1.5 3366: */ 3367: public static boolean isSpaceChar(int codePoint) 3368: { 3369: return ((1 << getType(codePoint)) 3370: & ((1 << SPACE_SEPARATOR) 3371: | (1 << LINE_SEPARATOR) 3372: | (1 << PARAGRAPH_SEPARATOR))) != 0; 3373: } 3374: 3375: /** 3376: * Determines if a character is Java whitespace. This includes Unicode 3377: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3378: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3379: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3380: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3381: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3382: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3383: * and <code>'\u001F'</code>. 
3384: * <br> 3385: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3386: * 3387: * @param ch character to test 3388: * @return true if ch is Java whitespace, else false 3389: * @see #isSpaceChar(char) 3390: * @since 1.1 3391: */ 3392: public static boolean isWhitespace(char ch) 3393: { 3394: int attr = readChar(ch); 3395: return ((((1 << (attr & TYPE_MASK)) 3396: & ((1 << SPACE_SEPARATOR) 3397: | (1 << LINE_SEPARATOR) 3398: | (1 << PARAGRAPH_SEPARATOR))) != 0) 3399: && (attr & NO_BREAK_MASK) == 0) 3400: || (ch <= '\u001F' && ((1 << ch) 3401: & ((1 << '\t') 3402: | (1 << '\n') 3403: | (1 << '\u000B') 3404: | (1 << '\u000C') 3405: | (1 << '\r') 3406: | (1 << '\u001C') 3407: | (1 << '\u001D') 3408: | (1 << '\u001E') 3409: | (1 << '\u001F'))) != 0); 3410: } 3411: 3412: /** 3413: * Determines if a character is Java whitespace. This includes Unicode 3414: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3415: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3416: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3417: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3418: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3419: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3420: * and <code>'\u001F'</code>. Unlike isWhitespace(char), this method 3421: * supports supplementary Unicode code points. 
3422: * <br> 3423: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3424: * 3425: * @param codePoint character to test 3426: * @return true if codePoint is Java whitespace, else false 3427: * @see #isSpaceChar(int) 3428: * @since 1.5 3429: */ 3430: public static boolean isWhitespace(int codePoint) 3431: { 3432: int plane = codePoint >>> 16; 3433: if (plane > 2 && plane != 14) 3434: return false; 3435: int attr = readCodePoint(codePoint); 3436: return ((((1 << (attr & TYPE_MASK)) 3437: & ((1 << SPACE_SEPARATOR) 3438: | (1 << LINE_SEPARATOR) 3439: | (1 << PARAGRAPH_SEPARATOR))) != 0) 3440: && (attr & NO_BREAK_MASK) == 0) 3441: || (codePoint <= '\u001F' && ((1 << codePoint) 3442: & ((1 << '\t') 3443: | (1 << '\n') 3444: | (1 << '\u000B') 3445: | (1 << '\u000C') 3446: | (1 << '\r') 3447: | (1 << '\u001C') 3448: | (1 << '\u001D') 3449: | (1 << '\u001E') 3450: | (1 << '\u001F'))) != 0); 3451: } 3452: 3453: /** 3454: * Determines if a character has the ISO Control property. 3455: * <br> 3456: * ISO Control = [Cc] 3457: * 3458: * @param ch character to test 3459: * @return true if ch is an ISO Control character, else false 3460: * @see #isSpaceChar(char) 3461: * @see #isWhitespace(char) 3462: * @since 1.1 3463: */ 3464: public static boolean isISOControl(char ch) 3465: { 3466: return getType(ch) == CONTROL; 3467: } 3468: 3469: /** 3470: * Determines if a character has the ISO Control property. Unlike 3471: * isISOControl(char), this method supports supplementary unicode 3472: * code points. 3473: * <br> 3474: * ISO Control = [Cc] 3475: * 3476: * @param codePoint character to test 3477: * @return true if codePoint is an ISO Control character, else false 3478: * @see #isSpaceChar(int) 3479: * @see #isWhitespace(int) 3480: * @since 1.5 3481: */ 3482: public static boolean isISOControl(int codePoint) 3483: { 3484: return getType(codePoint) == CONTROL; 3485: } 3486: 3487: /** 3488: * Returns the Unicode general category property of a character. 
3489: * 3490: * @param ch character from which the general category property will 3491: * be retrieved 3492: * @return the character category property of ch as an integer 3493: * @see #UNASSIGNED 3494: * @see #UPPERCASE_LETTER 3495: * @see #LOWERCASE_LETTER 3496: * @see #TITLECASE_LETTER 3497: * @see #MODIFIER_LETTER 3498: * @see #OTHER_LETTER 3499: * @see #NON_SPACING_MARK 3500: * @see #ENCLOSING_MARK 3501: * @see #COMBINING_SPACING_MARK 3502: * @see #DECIMAL_DIGIT_NUMBER 3503: * @see #LETTER_NUMBER 3504: * @see #OTHER_NUMBER 3505: * @see #SPACE_SEPARATOR 3506: * @see #LINE_SEPARATOR 3507: * @see #PARAGRAPH_SEPARATOR 3508: * @see #CONTROL 3509: * @see #FORMAT 3510: * @see #PRIVATE_USE 3511: * @see #SURROGATE 3512: * @see #DASH_PUNCTUATION 3513: * @see #START_PUNCTUATION 3514: * @see #END_PUNCTUATION 3515: * @see #CONNECTOR_PUNCTUATION 3516: * @see #OTHER_PUNCTUATION 3517: * @see #MATH_SYMBOL 3518: * @see #CURRENCY_SYMBOL 3519: * @see #MODIFIER_SYMBOL 3520: * @see #INITIAL_QUOTE_PUNCTUATION 3521: * @see #FINAL_QUOTE_PUNCTUATION 3522: * @since 1.1 3523: */ 3524: public static native int getType(char ch); 3525: 3526: /** 3527: * Returns the Unicode general category property of a character. Supports 3528: * supplementary Unicode code points. 
3529: * 3530: * @param codePoint character from which the general category property will 3531: * be retrieved 3532: * @return the character category property of codePoint as an integer 3533: * @see #UNASSIGNED 3534: * @see #UPPERCASE_LETTER 3535: * @see #LOWERCASE_LETTER 3536: * @see #TITLECASE_LETTER 3537: * @see #MODIFIER_LETTER 3538: * @see #OTHER_LETTER 3539: * @see #NON_SPACING_MARK 3540: * @see #ENCLOSING_MARK 3541: * @see #COMBINING_SPACING_MARK 3542: * @see #DECIMAL_DIGIT_NUMBER 3543: * @see #LETTER_NUMBER 3544: * @see #OTHER_NUMBER 3545: * @see #SPACE_SEPARATOR 3546: * @see #LINE_SEPARATOR 3547: * @see #PARAGRAPH_SEPARATOR 3548: * @see #CONTROL 3549: * @see #FORMAT 3550: * @see #PRIVATE_USE 3551: * @see #SURROGATE 3552: * @see #DASH_PUNCTUATION 3553: * @see #START_PUNCTUATION 3554: * @see #END_PUNCTUATION 3555: * @see #CONNECTOR_PUNCTUATION 3556: * @see #OTHER_PUNCTUATION 3557: * @see #MATH_SYMBOL 3558: * @see #CURRENCY_SYMBOL 3559: * @see #MODIFIER_SYMBOL 3560: * @see #INITIAL_QUOTE_PUNCTUATION 3561: * @see #FINAL_QUOTE_PUNCTUATION 3562: * @since 1.5 3563: */ 3564: public static native int getType(int codePoint); 3565: 3566: /** 3567: * Converts a digit into a character which represents that digit 3568: * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 3569: * or the digit exceeds the radix, then the null character <code>'\0'</code> 3570: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 3571: * <br> 3572: * return value boundary = U+0030-U+0039|U+0061-U+007A 3573: * 3574: * @param digit digit to be converted into a character 3575: * @param radix radix of digit 3576: * @return character representing digit in radix, or '\0' 3577: * @see #MIN_RADIX 3578: * @see #MAX_RADIX 3579: * @see #digit(char, int) 3580: */ 3581: public static char forDigit(int digit, int radix) 3582: { 3583: if (radix < MIN_RADIX || radix > MAX_RADIX 3584: || digit < 0 || digit >= radix) 3585: return '\0'; 3586: return (char) (digit < 10 ? 
('0' + digit) : ('a' - 10 + digit)); 3587: } 3588: 3589: /** 3590: * Returns the Unicode directionality property of the character. This 3591: * is used in the visual ordering of text. 3592: * 3593: * @param ch the character to look up 3594: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 3595: * @see #DIRECTIONALITY_UNDEFINED 3596: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 3597: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 3598: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 3599: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 3600: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 3601: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 3602: * @see #DIRECTIONALITY_ARABIC_NUMBER 3603: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 3604: * @see #DIRECTIONALITY_NONSPACING_MARK 3605: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 3606: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 3607: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 3608: * @see #DIRECTIONALITY_WHITESPACE 3609: * @see #DIRECTIONALITY_OTHER_NEUTRALS 3610: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 3611: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 3612: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 3613: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 3614: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 3615: * @since 1.4 3616: */ 3617: public static native byte getDirectionality(char ch); 3618: 3619: /** 3620: * Returns the Unicode directionality property of the character. This 3621: * is used in the visual ordering of text. Unlike getDirectionality(char), 3622: * this method supports supplementary Unicode code points. 
3623: * 3624: * @param codePoint the character to look up 3625: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 3626: * @see #DIRECTIONALITY_UNDEFINED 3627: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 3628: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 3629: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 3630: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 3631: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 3632: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 3633: * @see #DIRECTIONALITY_ARABIC_NUMBER 3634: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 3635: * @see #DIRECTIONALITY_NONSPACING_MARK 3636: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 3637: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 3638: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 3639: * @see #DIRECTIONALITY_WHITESPACE 3640: * @see #DIRECTIONALITY_OTHER_NEUTRALS 3641: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 3642: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 3643: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 3644: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 3645: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 3646: * @since 1.5 3647: */ 3648: public static native byte getDirectionality(int codePoint); 3649: 3650: /** 3651: * Determines whether the character is mirrored according to Unicode. For 3652: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 3653: * left-to-right text, but ')' in right-to-left text. 3654: * 3655: * @param ch the character to look up 3656: * @return true if the character is mirrored 3657: * @since 1.4 3658: */ 3659: public static boolean isMirrored(char ch) 3660: { 3661: return (readChar(ch) & MIRROR_MASK) != 0; 3662: } 3663: 3664: /** 3665: * Determines whether the character is mirrored according to Unicode. For 3666: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 3667: * left-to-right text, but ')' in right-to-left text. Unlike 3668: * isMirrored(char), this method supports supplementary Unicode code points. 
3669: * 3670: * @param codePoint the character to look up 3671: * @return true if the character is mirrored 3672: * @since 1.5 3673: */ 3674: public static boolean isMirrored(int codePoint) 3675: { 3676: int plane = codePoint >>> 16; 3677: if (plane > 2 && plane != 14) 3678: return false; 3679: return (readCodePoint(codePoint) & MIRROR_MASK) != 0; 3680: } 3681: 3682: /** 3683: * Compares another Character to this Character, numerically. 3684: * 3685: * @param anotherCharacter Character to compare with this Character 3686: * @return a negative integer if this Character is less than 3687: * anotherCharacter, zero if this Character is equal, and 3688: * a positive integer if this Character is greater 3689: * @throws NullPointerException if anotherCharacter is null 3690: * @since 1.2 3691: */ 3692: public int compareTo(Character anotherCharacter) 3693: { 3694: return value - anotherCharacter.value; 3695: } 3696: 3697: /** 3698: * Returns an <code>Character</code> object wrapping the value. 3699: * In contrast to the <code>Character</code> constructor, this method 3700: * will cache some values. It is used by boxing conversion. 3701: * 3702: * @param val the value to wrap 3703: * @return the <code>Character</code> 3704: * 3705: * @since 1.5 3706: */ 3707: public static Character valueOf(char val) 3708: { 3709: if (val > MAX_CACHE) 3710: return new Character(val); 3711: synchronized (charCache) 3712: { 3713: if (charCache[val - MIN_VALUE] == null) 3714: charCache[val - MIN_VALUE] = new Character(val); 3715: return charCache[val - MIN_VALUE]; 3716: } 3717: } 3718: 3719: /** 3720: * Reverse the bytes in val. 3721: * @since 1.5 3722: */ 3723: public static char reverseBytes(char val) 3724: { 3725: return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 3726: } 3727: 3728: /** 3729: * Converts a unicode code point to a UTF-16 representation of that 3730: * code point. 
3731: * 3732: * @param codePoint the unicode code point 3733: * 3734: * @return the UTF-16 representation of that code point 3735: * 3736: * @throws IllegalArgumentException if the code point is not a valid 3737: * unicode code point 3738: * 3739: * @since 1.5 3740: */ 3741: public static char[] toChars(int codePoint) 3742: { 3743: if (!isValidCodePoint(codePoint)) 3744: throw new IllegalArgumentException("Illegal Unicode code point : " 3745: + codePoint); 3746: char[] result = new char[charCount(codePoint)]; 3747: int ignore = toChars(codePoint, result, 0); 3748: return result; 3749: } 3750: 3751: /** 3752: * Converts a unicode code point to its UTF-16 representation. 3753: * 3754: * @param codePoint the unicode code point 3755: * @param dst the target char array 3756: * @param dstIndex the start index for the target 3757: * 3758: * @return number of characters written to <code>dst</code> 3759: * 3760: * @throws IllegalArgumentException if <code>codePoint</code> is not a 3761: * valid unicode code point 3762: * @throws NullPointerException if <code>dst</code> is <code>null</code> 3763: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 3764: * in <code>dst</code> or if the UTF-16 representation does not 3765: * fit into <code>dst</code> 3766: * 3767: * @since 1.5 3768: */ 3769: public static int toChars(int codePoint, char[] dst, int dstIndex) 3770: { 3771: if (!isValidCodePoint(codePoint)) 3772: { 3773: throw new IllegalArgumentException("not a valid code point: " 3774: + codePoint); 3775: } 3776: 3777: int result; 3778: if (isSupplementaryCodePoint(codePoint)) 3779: { 3780: // Write second char first to cause IndexOutOfBoundsException 3781: // immediately. 
3782: final int cp2 = codePoint - 0x10000; 3783: dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 3784: dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 3785: result = 2; 3786: } 3787: else 3788: { 3789: dst[dstIndex] = (char) codePoint; 3790: result = 1; 3791: } 3792: return result; 3793: } 3794: 3795: /** 3796: * Return number of 16-bit characters required to represent the given 3797: * code point. 3798: * 3799: * @param codePoint a unicode code point 3800: * 3801: * @return 2 if codePoint >= 0x10000, 1 otherwise. 3802: * 3803: * @since 1.5 3804: */ 3805: public static int charCount(int codePoint) 3806: { 3807: return 3808: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 3809: ? 2 3810: : 1; 3811: } 3812: 3813: /** 3814: * Determines whether the specified code point is 3815: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 3816: * supplementary character range. 3817: * 3818: * @param codePoint a Unicode code point 3819: * 3820: * @return <code>true</code> if code point is in supplementary range 3821: * 3822: * @since 1.5 3823: */ 3824: public static boolean isSupplementaryCodePoint(int codePoint) 3825: { 3826: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 3827: && codePoint <= MAX_CODE_POINT; 3828: } 3829: 3830: /** 3831: * Determines whether the specified code point is 3832: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 3833: * 3834: * @param codePoint a Unicode code point 3835: * 3836: * @return <code>true</code> if code point is valid 3837: * 3838: * @since 1.5 3839: */ 3840: public static boolean isValidCodePoint(int codePoint) 3841: { 3842: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 3843: } 3844: 3845: /** 3846: * Return true if the given character is a high surrogate. 
3847: * @param ch the character 3848: * @return true if the character is a high surrogate character 3849: * 3850: * @since 1.5 3851: */ 3852: public static boolean isHighSurrogate(char ch) 3853: { 3854: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 3855: } 3856: 3857: /** 3858: * Return true if the given character is a low surrogate. 3859: * @param ch the character 3860: * @return true if the character is a low surrogate character 3861: * 3862: * @since 1.5 3863: */ 3864: public static boolean isLowSurrogate(char ch) 3865: { 3866: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 3867: } 3868: 3869: /** 3870: * Return true if the given characters compose a surrogate pair. 3871: * This is true if the first character is a high surrogate and the 3872: * second character is a low surrogate. 3873: * @param ch1 the first character 3874: * @param ch2 the first character 3875: * @return true if the characters compose a surrogate pair 3876: * 3877: * @since 1.5 3878: */ 3879: public static boolean isSurrogatePair(char ch1, char ch2) 3880: { 3881: return isHighSurrogate(ch1) && isLowSurrogate(ch2); 3882: } 3883: 3884: /** 3885: * Given a valid surrogate pair, this returns the corresponding 3886: * code point. 3887: * @param high the high character of the pair 3888: * @param low the low character of the pair 3889: * @return the corresponding code point 3890: * 3891: * @since 1.5 3892: */ 3893: public static int toCodePoint(char high, char low) 3894: { 3895: return ((high - MIN_HIGH_SURROGATE) * 0x400) + 3896: (low - MIN_LOW_SURROGATE) + 0x10000; 3897: } 3898: 3899: /** 3900: * Get the code point at the specified index in the CharSequence. 3901: * This is like CharSequence#charAt(int), but if the character is 3902: * the start of a surrogate pair, and there is a following 3903: * character, and this character completes the pair, then the 3904: * corresponding supplementary code point is returned. 
Otherwise, 3905: * the character at the index is returned. 3906: * 3907: * @param sequence the CharSequence 3908: * @param index the index of the codepoint to get, starting at 0 3909: * @return the codepoint at the specified index 3910: * @throws IndexOutOfBoundsException if index is negative or >= length() 3911: * @since 1.5 3912: */ 3913: public static int codePointAt(CharSequence sequence, int index) 3914: { 3915: int len = sequence.length(); 3916: if (index < 0 || index >= len) 3917: throw new IndexOutOfBoundsException(); 3918: char high = sequence.charAt(index); 3919: if (! isHighSurrogate(high) || ++index >= len) 3920: return high; 3921: char low = sequence.charAt(index); 3922: if (! isLowSurrogate(low)) 3923: return high; 3924: return toCodePoint(high, low); 3925: } 3926: 3927: /** 3928: * Get the code point at the specified index in the CharSequence. 3929: * If the character is the start of a surrogate pair, and there is a 3930: * following character, and this character completes the pair, then 3931: * the corresponding supplementary code point is returned. 3932: * Otherwise, the character at the index is returned. 3933: * 3934: * @param chars the character array in which to look 3935: * @param index the index of the codepoint to get, starting at 0 3936: * @return the codepoint at the specified index 3937: * @throws IndexOutOfBoundsException if index is negative or >= length() 3938: * @since 1.5 3939: */ 3940: public static int codePointAt(char[] chars, int index) 3941: { 3942: return codePointAt(chars, index, chars.length); 3943: } 3944: 3945: /** 3946: * Get the code point at the specified index in the CharSequence. 3947: * If the character is the start of a surrogate pair, and there is a 3948: * following character within the specified range, and this 3949: * character completes the pair, then the corresponding 3950: * supplementary code point is returned. Otherwise, the character 3951: * at the index is returned. 
3952: * 3953: * @param chars the character array in which to look 3954: * @param index the index of the codepoint to get, starting at 0 3955: * @param limit the limit past which characters should not be examined 3956: * @return the codepoint at the specified index 3957: * @throws IndexOutOfBoundsException if index is negative or >= 3958: * limit, or if limit is negative or >= the length of the array 3959: * @since 1.5 3960: */ 3961: public static int codePointAt(char[] chars, int index, int limit) 3962: { 3963: if (index < 0 || index >= limit || limit < 0 || limit > chars.length) 3964: throw new IndexOutOfBoundsException(); 3965: char high = chars[index]; 3966: if (! isHighSurrogate(high) || ++index >= limit) 3967: return high; 3968: char low = chars[index]; 3969: if (! isLowSurrogate(low)) 3970: return high; 3971: return toCodePoint(high, low); 3972: } 3973: 3974: /** 3975: * Get the code point before the specified index. This is like 3976: * #codePointAt(char[], int), but checks the characters at 3977: * <code>index-1</code> and <code>index-2</code> to see if they form 3978: * a supplementary code point. If they do not, the character at 3979: * <code>index-1</code> is returned. 3980: * 3981: * @param chars the character array 3982: * @param index the index just past the codepoint to get, starting at 0 3983: * @return the codepoint at the specified index 3984: * @throws IndexOutOfBoundsException if index is negative or >= length() 3985: * @since 1.5 3986: */ 3987: public static int codePointBefore(char[] chars, int index) 3988: { 3989: return codePointBefore(chars, index, 1); 3990: } 3991: 3992: /** 3993: * Get the code point before the specified index. This is like 3994: * #codePointAt(char[], int), but checks the characters at 3995: * <code>index-1</code> and <code>index-2</code> to see if they form 3996: * a supplementary code point. If they do not, the character at 3997: * <code>index-1</code> is returned. 
The start parameter is used to 3998: * limit the range of the array which may be examined. 3999: * 4000: * @param chars the character array 4001: * @param index the index just past the codepoint to get, starting at 0 4002: * @param start the index before which characters should not be examined 4003: * @return the codepoint at the specified index 4004: * @throws IndexOutOfBoundsException if index is > start or > 4005: * the length of the array, or if limit is negative or >= the 4006: * length of the array 4007: * @since 1.5 4008: */ 4009: public static int codePointBefore(char[] chars, int index, int start) 4010: { 4011: if (index < start || index > chars.length 4012: || start < 0 || start >= chars.length) 4013: throw new IndexOutOfBoundsException(); 4014: --index; 4015: char low = chars[index]; 4016: if (! isLowSurrogate(low) || --index < start) 4017: return low; 4018: char high = chars[index]; 4019: if (! isHighSurrogate(high)) 4020: return low; 4021: return toCodePoint(high, low); 4022: } 4023: 4024: /** 4025: * Get the code point before the specified index. This is like 4026: * #codePointAt(CharSequence, int), but checks the characters at 4027: * <code>index-1</code> and <code>index-2</code> to see if they form 4028: * a supplementary code point. If they do not, the character at 4029: * <code>index-1</code> is returned. 4030: * 4031: * @param sequence the CharSequence 4032: * @param index the index just past the codepoint to get, starting at 0 4033: * @return the codepoint at the specified index 4034: * @throws IndexOutOfBoundsException if index is negative or >= length() 4035: * @since 1.5 4036: */ 4037: public static int codePointBefore(CharSequence sequence, int index) 4038: { 4039: int len = sequence.length(); 4040: if (index < 1 || index > len) 4041: throw new IndexOutOfBoundsException(); 4042: --index; 4043: char low = sequence.charAt(index); 4044: if (! 
isLowSurrogate(low) || --index < 0) 4045: return low; 4046: char high = sequence.charAt(index); 4047: if (! isHighSurrogate(high)) 4048: return low; 4049: return toCodePoint(high, low); 4050: } 4051: } // class Character