package org.madore.android.unicodeMap; import java.util.Arrays; import java.util.Set; import java.util.Map; import java.util.EnumSet; import java.util.HashMap; import java.util.Formatter; import java.io.ByteArrayOutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.IOException; 
/**
 * One code point together with its name, its general category, the block
 * ({@link Range}) it falls in, and precomputed string and label forms.
 */
public class UnicodeCharacter implements UnicodeDisplayable { 
/**
 * Table of Unicode blocks. Each constant lists the first code point, the
 * last code point (inclusive) and a display name for the block.
 *
 * NOTE(review): two display-name literals in this table ("Mathematical
 * Operators" and "Inscriptional Pahlavi") are split across physical lines,
 * which a Java string literal does not allow; this looks like a
 * line-wrapping artifact and must be rejoined before compiling.
 */
public static enum Range implements UnicodeDisplayable, UnicodeRangeable { BASIC_LATIN(0x0000, 0x007F, "Basic Latin"), LATIN_1_SUPPLEMENT(0x0080, 0x00FF, "Latin-1 Supplement"), LATIN_EXTENDED_A(0x0100, 0x017F, "Latin Extended-A"), LATIN_EXTENDED_B(0x0180, 0x024F, "Latin Extended-B"), IPA_EXTENSIONS(0x0250, 0x02AF, "IPA Extensions"), SPACING_MODIFIER_LETTERS(0x02B0, 0x02FF, "Spacing Modifier Letters"), COMBINING_DIACRITICAL_MARKS(0x0300, 0x036F, "Combining Diacritical Marks"), GREEK_AND_COPTIC(0x0370, 0x03FF, "Greek and Coptic"), CYRILLIC(0x0400, 0x04FF, "Cyrillic"), CYRILLIC_SUPPLEMENT(0x0500, 0x052F, "Cyrillic Supplement"), ARMENIAN(0x0530, 0x058F, "Armenian"), HEBREW(0x0590, 0x05FF, "Hebrew"), ARABIC(0x0600, 0x06FF, "Arabic"), SYRIAC(0x0700, 0x074F, "Syriac"), ARABIC_SUPPLEMENT(0x0750, 0x077F, "Arabic Supplement"), THAANA(0x0780, 0x07BF, "Thaana"), NKO(0x07C0, 0x07FF, "NKo"), SAMARITAN(0x0800, 0x083F, "Samaritan"), MANDAIC(0x0840, 0x085F, "Mandaic"), ARABIC_EXTENDED_A(0x08A0, 0x08FF, "Arabic Extended-A"), DEVANAGARI(0x0900, 0x097F, "Devanagari"), BENGALI(0x0980, 0x09FF, "Bengali"), GURMUKHI(0x0A00, 0x0A7F, "Gurmukhi"), GUJARATI(0x0A80, 0x0AFF, "Gujarati"), ORIYA(0x0B00, 0x0B7F, "Oriya"), TAMIL(0x0B80, 0x0BFF, "Tamil"), TELUGU(0x0C00, 0x0C7F, "Telugu"), KANNADA(0x0C80, 0x0CFF, "Kannada"), MALAYALAM(0x0D00, 0x0D7F, "Malayalam"), SINHALA(0x0D80, 0x0DFF, "Sinhala"), THAI(0x0E00, 0x0E7F, "Thai"), LAO(0x0E80, 0x0EFF, "Lao"), TIBETAN(0x0F00, 0x0FFF, "Tibetan"), MYANMAR(0x1000, 0x109F, "Myanmar"), GEORGIAN(0x10A0, 0x10FF, "Georgian"), HANGUL_JAMO(0x1100, 0x11FF, "Hangul Jamo"), 
ETHIOPIC(0x1200, 0x137F, "Ethiopic"), ETHIOPIC_SUPPLEMENT(0x1380, 0x139F, "Ethiopic Supplement"), CHEROKEE(0x13A0, 0x13FF, "Cherokee"), UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS(0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics"), OGHAM(0x1680, 0x169F, "Ogham"), RUNIC(0x16A0, 0x16FF, "Runic"), TAGALOG(0x1700, 0x171F, "Tagalog"), HANUNOO(0x1720, 0x173F, "Hanunoo"), BUHID(0x1740, 0x175F, "Buhid"), TAGBANWA(0x1760, 0x177F, "Tagbanwa"), KHMER(0x1780, 0x17FF, "Khmer"), MONGOLIAN(0x1800, 0x18AF, "Mongolian"), UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED(0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended"), LIMBU(0x1900, 0x194F, "Limbu"), TAI_LE(0x1950, 0x197F, "Tai Le"), NEW_TAI_LUE(0x1980, 0x19DF, "New Tai Lue"), KHMER_SYMBOLS(0x19E0, 0x19FF, "Khmer Symbols"), BUGINESE(0x1A00, 0x1A1F, "Buginese"), TAI_THAM(0x1A20, 0x1AAF, "Tai Tham"), BALINESE(0x1B00, 0x1B7F, "Balinese"), SUNDANESE(0x1B80, 0x1BBF, "Sundanese"), BATAK(0x1BC0, 0x1BFF, "Batak"), LEPCHA(0x1C00, 0x1C4F, "Lepcha"), OL_CHIKI(0x1C50, 0x1C7F, "Ol Chiki"), SUNDANESE_SUPPLEMENT(0x1CC0, 0x1CCF, "Sundanese Supplement"), VEDIC_EXTENSIONS(0x1CD0, 0x1CFF, "Vedic Extensions"), PHONETIC_EXTENSIONS(0x1D00, 0x1D7F, "Phonetic Extensions"), PHONETIC_EXTENSIONS_SUPPLEMENT(0x1D80, 0x1DBF, "Phonetic Extensions Supplement"), COMBINING_DIACRITICAL_MARKS_SUPPLEMENT(0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement"), LATIN_EXTENDED_ADDITIONAL(0x1E00, 0x1EFF, "Latin Extended Additional"), GREEK_EXTENDED(0x1F00, 0x1FFF, "Greek Extended"), GENERAL_PUNCTUATION(0x2000, 0x206F, "General Punctuation"), SUPERSCRIPTS_AND_SUBSCRIPTS(0x2070, 0x209F, "Superscripts and Subscripts"), CURRENCY_SYMBOLS(0x20A0, 0x20CF, "Currency Symbols"), COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS(0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols"), LETTERLIKE_SYMBOLS(0x2100, 0x214F, "Letterlike Symbols"), NUMBER_FORMS(0x2150, 0x218F, "Number Forms"), ARROWS(0x2190, 0x21FF, "Arrows"), MATHEMATICAL_OPERATORS(0x2200, 0x22FF, "Mathematical 
Operators"), MISCELLANEOUS_TECHNICAL(0x2300, 0x23FF, "Miscellaneous Technical"), CONTROL_PICTURES(0x2400, 0x243F, "Control Pictures"), OPTICAL_CHARACTER_RECOGNITION(0x2440, 0x245F, "Optical Character Recognition"), ENCLOSED_ALPHANUMERICS(0x2460, 0x24FF, "Enclosed Alphanumerics"), BOX_DRAWING(0x2500, 0x257F, "Box Drawing"), BLOCK_ELEMENTS(0x2580, 0x259F, "Block Elements"), GEOMETRIC_SHAPES(0x25A0, 0x25FF, "Geometric Shapes"), MISCELLANEOUS_SYMBOLS(0x2600, 0x26FF, "Miscellaneous Symbols"), DINGBATS(0x2700, 0x27BF, "Dingbats"), MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A(0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A"), SUPPLEMENTAL_ARROWS_A(0x27F0, 0x27FF, "Supplemental Arrows-A"), BRAILLE_PATTERNS(0x2800, 0x28FF, "Braille Patterns"), SUPPLEMENTAL_ARROWS_B(0x2900, 0x297F, "Supplemental Arrows-B"), MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B(0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B"), SUPPLEMENTAL_MATHEMATICAL_OPERATORS(0x2A00, 0x2AFF, "Supplemental Mathematical Operators"), MISCELLANEOUS_SYMBOLS_AND_ARROWS(0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows"), GLAGOLITIC(0x2C00, 0x2C5F, "Glagolitic"), LATIN_EXTENDED_C(0x2C60, 0x2C7F, "Latin Extended-C"), COPTIC(0x2C80, 0x2CFF, "Coptic"), GEORGIAN_SUPPLEMENT(0x2D00, 0x2D2F, "Georgian Supplement"), TIFINAGH(0x2D30, 0x2D7F, "Tifinagh"), ETHIOPIC_EXTENDED(0x2D80, 0x2DDF, "Ethiopic Extended"), CYRILLIC_EXTENDED_A(0x2DE0, 0x2DFF, "Cyrillic Extended-A"), SUPPLEMENTAL_PUNCTUATION(0x2E00, 0x2E7F, "Supplemental Punctuation"), CJK_RADICALS_SUPPLEMENT(0x2E80, 0x2EFF, "CJK Radicals Supplement"), KANGXI_RADICALS(0x2F00, 0x2FDF, "Kangxi Radicals"), IDEOGRAPHIC_DESCRIPTION_CHARACTERS(0x2FF0, 0x2FFF, "Ideographic Description Characters"), CJK_SYMBOLS_AND_PUNCTUATION(0x3000, 0x303F, "CJK Symbols and Punctuation"), HIRAGANA(0x3040, 0x309F, "Hiragana"), KATAKANA(0x30A0, 0x30FF, "Katakana"), BOPOMOFO(0x3100, 0x312F, "Bopomofo"), HANGUL_COMPATIBILITY_JAMO(0x3130, 0x318F, "Hangul Compatibility Jamo"), KANBUN(0x3190, 0x319F, "Kanbun"), 
BOPOMOFO_EXTENDED(0x31A0, 0x31BF, "Bopomofo Extended"), CJK_STROKES(0x31C0, 0x31EF, "CJK Strokes"), KATAKANA_PHONETIC_EXTENSIONS(0x31F0, 0x31FF, "Katakana Phonetic Extensions"), ENCLOSED_CJK_LETTERS_AND_MONTHS(0x3200, 0x32FF, "Enclosed CJK Letters and Months"), CJK_COMPATIBILITY(0x3300, 0x33FF, "CJK Compatibility"), CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A(0x3400, 0x4DB5, "CJK Unified Ideographs Extension A"), YIJING_HEXAGRAM_SYMBOLS(0x4DC0, 0x4DFF, "Yijing Hexagram Symbols"), CJK_UNIFIED_IDEOGRAPHS(0x4E00, 0x9FCC, "CJK Unified Ideographs"), YI_SYLLABLES(0xA000, 0xA48F, "Yi Syllables"), YI_RADICALS(0xA490, 0xA4CF, "Yi Radicals"), LISU(0xA4D0, 0xA4FF, "Lisu"), VAI(0xA500, 0xA63F, "Vai"), CYRILLIC_EXTENDED_B(0xA640, 0xA69F, "Cyrillic Extended-B"), BAMUM(0xA6A0, 0xA6FF, "Bamum"), MODIFIER_TONE_LETTERS(0xA700, 0xA71F, "Modifier Tone Letters"), LATIN_EXTENDED_D(0xA720, 0xA7FF, "Latin Extended-D"), SYLOTI_NAGRI(0xA800, 0xA82F, "Syloti Nagri"), COMMON_INDIC_NUMBER_FORMS(0xA830, 0xA83F, "Common Indic Number Forms"), PHAGS_PA(0xA840, 0xA87F, "Phags-pa"), SAURASHTRA(0xA880, 0xA8DF, "Saurashtra"), DEVANAGARI_EXTENDED(0xA8E0, 0xA8FF, "Devanagari Extended"), KAYAH_LI(0xA900, 0xA92F, "Kayah Li"), REJANG(0xA930, 0xA95F, "Rejang"), HANGUL_JAMO_EXTENDED_A(0xA960, 0xA97F, "Hangul Jamo Extended-A"), JAVANESE(0xA980, 0xA9DF, "Javanese"), CHAM(0xAA00, 0xAA5F, "Cham"), MYANMAR_EXTENDED_A(0xAA60, 0xAA7F, "Myanmar Extended-A"), TAI_VIET(0xAA80, 0xAADF, "Tai Viet"), MEETEI_MAYEK_EXTENSIONS(0xAAE0, 0xAAFF, "Meetei Mayek Extensions"), ETHIOPIC_EXTENDED_A(0xAB00, 0xAB2F, "Ethiopic Extended-A"), MEETEI_MAYEK(0xABC0, 0xABFF, "Meetei Mayek"), HANGUL_SYLLABLES(0xAC00, 0xD7A3, "Hangul Syllables"), HANGUL_JAMO_EXTENDED_B(0xD7B0, 0xD7FF, "Hangul Jamo Extended-B"), HIGH_SURROGATES(0xD800, 0xDB7F, "High Surrogates"), HIGH_PRIVATE_USE_SURROGATES(0xDB80, 0xDBFF, "High Private Use Surrogates"), LOW_SURROGATES(0xDC00, 0xDFFF, "Low Surrogates"), PRIVATE_USE_AREA(0xE000, 0xF8FF, "Private Use Area"), 
CJK_COMPATIBILITY_IDEOGRAPHS(0xF900, 0xFAFF, "CJK Compatibility Ideographs"), ALPHABETIC_PRESENTATION_FORMS(0xFB00, 0xFB4F, "Alphabetic Presentation Forms"), ARABIC_PRESENTATION_FORMS_A(0xFB50, 0xFDFF, "Arabic Presentation Forms-A"), VARIATION_SELECTORS(0xFE00, 0xFE0F, "Variation Selectors"), VERTICAL_FORMS(0xFE10, 0xFE1F, "Vertical Forms"), COMBINING_HALF_MARKS(0xFE20, 0xFE2F, "Combining Half Marks"), CJK_COMPATIBILITY_FORMS(0xFE30, 0xFE4F, "CJK Compatibility Forms"), SMALL_FORM_VARIANTS(0xFE50, 0xFE6F, "Small Form Variants"), ARABIC_PRESENTATION_FORMS_B(0xFE70, 0xFEFF, "Arabic Presentation Forms-B"), HALFWIDTH_AND_FULLWIDTH_FORMS(0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms"), SPECIALS(0xFFF0, 0xFFFF, "Specials"), LINEAR_B_SYLLABARY(0x10000, 0x1007F, "Linear B Syllabary"), LINEAR_B_IDEOGRAMS(0x10080, 0x100FF, "Linear B Ideograms"), AEGEAN_NUMBERS(0x10100, 0x1013F, "Aegean Numbers"), ANCIENT_GREEK_NUMBERS(0x10140, 0x1018F, "Ancient Greek Numbers"), ANCIENT_SYMBOLS(0x10190, 0x101CF, "Ancient Symbols"), PHAISTOS_DISC(0x101D0, 0x101FF, "Phaistos Disc"), LYCIAN(0x10280, 0x1029F, "Lycian"), CARIAN(0x102A0, 0x102DF, "Carian"), OLD_ITALIC(0x10300, 0x1032F, "Old Italic"), GOTHIC(0x10330, 0x1034F, "Gothic"), UGARITIC(0x10380, 0x1039F, "Ugaritic"), OLD_PERSIAN(0x103A0, 0x103DF, "Old Persian"), DESERET(0x10400, 0x1044F, "Deseret"), SHAVIAN(0x10450, 0x1047F, "Shavian"), OSMANYA(0x10480, 0x104AF, "Osmanya"), CYPRIOT_SYLLABARY(0x10800, 0x1083F, "Cypriot Syllabary"), IMPERIAL_ARAMAIC(0x10840, 0x1085F, "Imperial Aramaic"), PHOENICIAN(0x10900, 0x1091F, "Phoenician"), LYDIAN(0x10920, 0x1093F, "Lydian"), MEROITIC_HIEROGLYPHS(0x10980, 0x1099F, "Meroitic Hieroglyphs"), MEROITIC_CURSIVE(0x109A0, 0x109FF, "Meroitic Cursive"), KHAROSHTHI(0x10A00, 0x10A5F, "Kharoshthi"), OLD_SOUTH_ARABIAN(0x10A60, 0x10A7F, "Old South Arabian"), AVESTAN(0x10B00, 0x10B3F, "Avestan"), INSCRIPTIONAL_PARTHIAN(0x10B40, 0x10B5F, "Inscriptional Parthian"), INSCRIPTIONAL_PAHLAVI(0x10B60, 0x10B7F, "Inscriptional 
Pahlavi"), OLD_TURKIC(0x10C00, 0x10C4F, "Old Turkic"), RUMI_NUMERAL_SYMBOLS(0x10E60, 0x10E7F, "Rumi Numeral Symbols"), BRAHMI(0x11000, 0x1107F, "Brahmi"), KAITHI(0x11080, 0x110CF, "Kaithi"), SORA_SOMPENG(0x110D0, 0x110FF, "Sora Sompeng"), CHAKMA(0x11100, 0x1114F, "Chakma"), SHARADA(0x11180, 0x111DF, "Sharada"), TAKRI(0x11680, 0x116CF, "Takri"), CUNEIFORM(0x12000, 0x123FF, "Cuneiform"), CUNEIFORM_NUMBERS_AND_PUNCTUATION(0x12400, 0x1247F, "Cuneiform Numbers and Punctuation"), EGYPTIAN_HIEROGLYPHS(0x13000, 0x1342F, "Egyptian Hieroglyphs"), BAMUM_SUPPLEMENT(0x16800, 0x16A3F, "Bamum Supplement"), MIAO(0x16F00, 0x16F9F, "Miao"), KANA_SUPPLEMENT(0x1B000, 0x1B0FF, "Kana Supplement"), BYZANTINE_MUSICAL_SYMBOLS(0x1D000, 0x1D0FF, "Byzantine Musical Symbols"), MUSICAL_SYMBOLS(0x1D100, 0x1D1FF, "Musical Symbols"), ANCIENT_GREEK_MUSICAL_NOTATION(0x1D200, 0x1D24F, "Ancient Greek Musical Notation"), TAI_XUAN_JING_SYMBOLS(0x1D300, 0x1D35F, "Tai Xuan Jing Symbols"), COUNTING_ROD_NUMERALS(0x1D360, 0x1D37F, "Counting Rod Numerals"), MATHEMATICAL_ALPHANUMERIC_SYMBOLS(0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols"), ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS(0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols"), MAHJONG_TILES(0x1F000, 0x1F02F, "Mahjong Tiles"), DOMINO_TILES(0x1F030, 0x1F09F, "Domino Tiles"), PLAYING_CARDS(0x1F0A0, 0x1F0FF, "Playing Cards"), ENCLOSED_ALPHANUMERIC_SUPPLEMENT(0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement"), ENCLOSED_IDEOGRAPHIC_SUPPLEMENT(0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement"), MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS(0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs"), EMOTICONS(0x1F600, 0x1F64F, "Emoticons"), TRANSPORT_AND_MAP_SYMBOLS(0x1F680, 0x1F6FF, "Transport and Map Symbols"), ALCHEMICAL_SYMBOLS(0x1F700, 0x1F77F, "Alchemical Symbols"), CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B(0x20000, 0x2A6D6, "CJK Unified Ideographs Extension B"), CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C(0x2A700, 0x2B734, "CJK Unified Ideographs Extension C"), 
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D(0x2B740, 0x2B81D, "CJK Unified Ideographs Extension D"), CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT(0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement"), TAGS(0xE0000, 0xE007F, "Tags"), VARIATION_SELECTORS_SUPPLEMENT(0xE0100, 0xE01EF, "Variation Selectors Supplement"), SUPPLEMENTARY_PRIVATE_USE_AREA_A(0xF0000, 0xFFFFF, "Supplementary Private Use Area-A"), SUPPLEMENTARY_PRIVATE_USE_AREA_B(0x100000, 0x10FFFF, "Supplementary Private Use Area-B"), UNASSIGNED(0x110000, 0x110000, "Not Assigned"); protected final int from; protected final int to; protected final String descr; 
/*
 * The constructor takes the LAST code point of the block (inclusive) but
 * stores `to` as last + 1, so [from, to) is a half-open interval.
 * getDescr() and getTitle() both return the same display name.
 */
Range(int from, int last, String descr) { this.from = from; this.to = last+1; this.descr = descr; } public int getFrom() { return this.from; } public int getTo() { return this.to; } public String getDescr() { return this.descr; } public String getTitle() { return this.descr; } 
/*
 * NOTE(review): the text from here up to `to <= this.to ); }` is damaged.
 * Everything between a '<' character and the following '>' character
 * appears to have been removed (markup stripping), which leaves
 * `codePoint=this.from` where a range comparison once stood. Judging by
 * what follows, the lost text presumably included the end of the Range
 * enum and the header, constants and fields of a second declaration that
 * defines CJK_IDEOGRAPH, CJK_IDEOGRAPH_EXTENSION_A/B/C and HANGUL_SYLLABLE;
 * none of that is visible here. Restore this span from the original
 * source; do not try to compile it as written.
 */
public boolean belongs(int codePoint) { return ( codePoint>=this.from && codePoint=this.from && ch.codePoint=this.from && codePoint= this.from && to <= this.to ); } 
/*
 * Static helpers of the (not visible) declaration described in the note
 * above. isCjkIdeograph tests four of its constants, isHangulSyllable
 * tests one; cjkIdeographName formats the code point as at least four
 * upper-case hex digits after a fixed prefix.
 */
public static boolean isCjkIdeograph(int codePoint) { return ( CJK_IDEOGRAPH.belongs(codePoint) || CJK_IDEOGRAPH_EXTENSION_A.belongs(codePoint) || CJK_IDEOGRAPH_EXTENSION_B.belongs(codePoint) || CJK_IDEOGRAPH_EXTENSION_C.belongs(codePoint) ); } public static boolean isHangulSyllable(int codePoint) { return HANGUL_SYLLABLE.belongs(codePoint); } protected static String cjkIdeographName(int codePoint) { return String.format("CJK UNIFIED IDEOGRAPH-%04X", codePoint); } 
/*
 * Builds "HANGUL SYLLABLE xyz" arithmetically: the offset of the code
 * point from HANGUL_SYLLABLE.getFrom() is split into three indices
 * (l = index / (21*28), v = (index % (21*28)) / 28, t = index % 28) that
 * select one entry each from the 19-entry partL, 21-entry partV and
 * 28-entry partT tables. No range check is done here, so an offset outside
 * the tables fails with an array index exception.
 */
protected static String hangulSyllableName(int codePoint) { int index = codePoint - HANGUL_SYLLABLE.getFrom(); final int tCount = 28; final int nCount = 21*tCount; int l = index/nCount; int v = (index%nCount)/tCount; int t = index%tCount; final String[] partL = { "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H" }; final String[] partV = { "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", 
"U", "WEO", "WE", "WI", "YU", "EU", "YI", "I" }; final String[] partT = { "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H" }; return String.format("HANGUL SYLLABLE %s%s%s", partL[l], partV[v], partT[t]); } } 
/**
 * General categories. Each constant carries its two-letter code, the
 * matching {@code java.lang.Character} category constant and a
 * human-readable description.
 */
public static enum Category { UPPERCASE_LETTER("Lu", Character.UPPERCASE_LETTER, "Letter, Uppercase"), LOWERCASE_LETTER("Ll", Character.LOWERCASE_LETTER, "Letter, Lowercase"), TITLECASE_LETTER("Lt", Character.TITLECASE_LETTER, "Letter, Titlecase"), MODIFIER_LETTER("Lm", Character.MODIFIER_LETTER, "Letter, Modifier"), OTHER_LETTER("Lo", Character.OTHER_LETTER, "Letter, Other"), NON_SPACING_MARK("Mn", Character.NON_SPACING_MARK, "Mark, Nonspacing"), COMBINING_SPACING_MARK("Mc", Character.COMBINING_SPACING_MARK, "Mark, Spacing Combining"), ENCLOSING_MARK("Me", Character.ENCLOSING_MARK, "Mark, Enclosing"), DECIMAL_DIGIT_NUMBER("Nd", Character.DECIMAL_DIGIT_NUMBER, "Number, Decimal Digit"), LETTER_NUMBER("Nl", Character.LETTER_NUMBER, "Number, Letter"), OTHER_NUMBER("No", Character.OTHER_NUMBER, "Number, Other"), CONNECTOR_PUNCTUATION("Pc", Character.CONNECTOR_PUNCTUATION, "Punctuation, Connector"), DASH_PUNCTUATION("Pd", Character.DASH_PUNCTUATION, "Punctuation, Dash"), START_PUNCTUATION("Ps", Character.START_PUNCTUATION, "Punctuation, Open"), END_PUNCTUATION("Pe", Character.END_PUNCTUATION, "Punctuation, Close"), INITIAL_QUOTE_PUNCTUATION("Pi", Character.INITIAL_QUOTE_PUNCTUATION, "Punctuation, Initial quote"), FINAL_QUOTE_PUNCTUATION("Pf", Character.FINAL_QUOTE_PUNCTUATION, "Punctuation, Final quote"), OTHER_PUNCTUATION("Po", Character.OTHER_PUNCTUATION, "Punctuation, Other"), MATH_SYMBOL("Sm", Character.MATH_SYMBOL, "Symbol, Math"), CURRENCY_SYMBOL("Sc", Character.CURRENCY_SYMBOL, "Symbol, Currency"), MODIFIER_SYMBOL("Sk", Character.MODIFIER_SYMBOL, "Symbol, Modifier"), OTHER_SYMBOL("So", Character.OTHER_SYMBOL, "Symbol, Other"), SPACE_SEPARATOR("Zs", 
Character.SPACE_SEPARATOR, "Separator, Space"), LINE_SEPARATOR("Zl", Character.LINE_SEPARATOR, "Separator, Line"), PARAGRAPH_SEPARATOR("Zp", Character.PARAGRAPH_SEPARATOR, "Separator, Paragraph"), CONTROL("Cc", Character.CONTROL, "Other, Control"), FORMAT("Cf", Character.FORMAT, "Other, Format"), SURROGATE("Cs", Character.SURROGATE, "Other, Surrogate"), PRIVATE_USE("Co", Character.PRIVATE_USE, "Other, Private Use"), UNASSIGNED("Cn", Character.UNASSIGNED, "Other, Not Assigned"); protected final String code; protected final byte javaValue; protected final String descr; Category(String code, byte javaValue, String descr) { this.code = code; this.javaValue = javaValue; this.descr = descr; } 
/*
 * Reverse index from two-letter code to constant, filled once by the
 * static initializer that follows.
 * NOTE(review): declared with raw Map/HashMap, yet fromCode() assigns
 * revMap.get(code) to a Category without a cast; the type arguments were
 * presumably lost together with the other '<'...'>' text. Confirm against
 * the original source.
 */
protected final static Map revMap = new HashMap(); static { for ( Category cat : Category.values() ) revMap.put(cat.code, cat); } 
/* Looks a category up by its two-letter code; a code that is not in the map yields UNASSIGNED rather than null. */
public static Category fromCode(String code) { Category cat = revMap.get(code); if ( cat == null ) cat = UNASSIGNED; return cat; } public String getCode() { return this.code; } public byte getJavaValue() { return this.javaValue; } public String getDescr() { return this.descr; } } 
/*
 * Categories that isPrintable() accepts: every letter, mark, number,
 * punctuation and symbol category. The separator categories and the
 * "Other" categories (control, format, surrogate, private use, unassigned)
 * are not listed.
 * NOTE(review): raw Set here as well; type argument presumably stripped.
 */
protected final static Set printable; static { Category[] prlist = new Category[] { Category.UPPERCASE_LETTER, Category.LOWERCASE_LETTER, Category.TITLECASE_LETTER, Category.MODIFIER_LETTER, Category.OTHER_LETTER, Category.NON_SPACING_MARK, Category.COMBINING_SPACING_MARK, Category.ENCLOSING_MARK, Category.DECIMAL_DIGIT_NUMBER, Category.LETTER_NUMBER, Category.OTHER_NUMBER, Category.CONNECTOR_PUNCTUATION, Category.DASH_PUNCTUATION, Category.START_PUNCTUATION, Category.END_PUNCTUATION, Category.INITIAL_QUOTE_PUNCTUATION, Category.FINAL_QUOTE_PUNCTUATION, Category.OTHER_PUNCTUATION, Category.MATH_SYMBOL, Category.CURRENCY_SYMBOL, Category.MODIFIER_SYMBOL, Category.OTHER_SYMBOL }; printable = EnumSet.copyOf(Arrays.asList(prlist)); } 
/* Per-character state. Everything is final except `range`, which getRange() fills in on first use. */
protected final int codePoint; protected final String name; protected Range range; protected final Category category; protected final boolean isUnicode; 
/** String containing exactly this code point, computed once at construction. */
protected final String charStr;

/** Display label of the form {@code U+XXXX NAME}, computed once at construction. */
protected final String label;

/**
 * Builds the string form of {@link #codePoint}.
 *
 * @return a string of one or two UTF-16 units holding the code point
 * @throws IllegalArgumentException if the code point is not a valid
 *         Unicode code point (raised by {@link Character#toChars(int)})
 */
protected String makeCharStr() {
    return new String(Character.toChars(codePoint));
}

/**
 * Builds the display label: {@code "U+"}, the code point as at least four
 * upper-case hex digits, a space, then the name.
 *
 * @return the label text
 */
protected String makeLabel() {
    // String.format yields the same text as the former Formatter-over-
    // StringBuilder and leaves no unclosed Formatter behind.
    return String.format("U+%04X %s", codePoint, name);
}

/**
 * Creates a character that is flagged as a Unicode character.
 *
 * @param codePoint the code point
 * @param name      the character name used in the label
 * @param category  the general category
 */
public UnicodeCharacter(int codePoint, String name, Category category) {
    this(codePoint, name, category, true);
}

/**
 * Creates a character with an explicit {@code isUnicode} flag.
 *
 * @param codePoint the code point
 * @param name      the character name used in the label
 * @param category  the general category
 * @param isUnicode value later reported by {@link #isUnicode()} and
 *                  required by {@link #isPrintable()}
 */
public UnicodeCharacter(int codePoint, String name, Category category, boolean isUnicode) {
    this.codePoint = codePoint;
    this.name = name;
    this.range = null; // resolved on demand by getRange()
    this.category = category;
    this.isUnicode = isUnicode;
    // These two read codePoint and name, so they must run after the
    // assignments above.
    this.charStr = makeCharStr();
    this.label = makeLabel();
}

/** @return the code point given at construction */
public int getCodePoint() {
    return this.codePoint;
}

/** @return the name given at construction */
public String getName() {
    return this.name;
}

/**
 * Returns the block containing this code point. The first call scans
 * {@code Range.values()} in declaration order and stores the result in
 * {@link #range}; later calls return the stored value.
 *
 * @return the first range whose half-open interval [from, to) contains
 *         the code point, or {@code Range.UNASSIGNED} if none does
 */
public Range getRange() {
    if (this.range != null) {
        return this.range;
    }
    Range found = Range.UNASSIGNED;
    for (Range r : Range.values()) {
        if (this.codePoint >= r.from && this.codePoint < r.to) {
            found = r;
            break;
        }
    }
    this.range = found;
    return found;
}

/** @return the general category given at construction */
public Category getCategory() {
    return this.category;
}

/** @return the precomputed string holding this code point */
public String getChar() {
    return this.charStr;
}

/** @return the precomputed {@code U+XXXX NAME} label */
public String getLabel() {
    return this.label;
}

/** @return the {@code isUnicode} flag given at construction */
public boolean isUnicode() {
    return this.isUnicode;
}

/**
 * @return {@code true} when the character is flagged as Unicode and its
 *         category is one of those in the {@code printable} set
 */
public boolean isPrintable() {
    return this.isUnicode && printable.contains(this.category);
}

/** @return the same text as {@link #getLabel()} */
@Override
public String toString() {
    return this.getLabel();
}

/**
 * Encodes a string as UTF-8.
 *
 * @param s the text to encode
 * @return the encoded bytes
 */
static byte[] toUtf8(String s) {
    return encode(s, "UTF-8");
}

/**
 * Encodes a string as big-endian UTF-16 (charset {@code UTF-16BE}).
 *
 * @param s the text to encode
 * @return the encoded bytes
 */
static byte[] toUtf16(String s) {
    return encode(s, "UTF-16BE");
}

/**
 * Shared implementation of {@link #toUtf8} and {@link #toUtf16}: writes
 * the string through an {@link OutputStreamWriter} into an in-memory
 * buffer.
 *
 * @param s           the text to encode
 * @param charsetName name of the charset to encode with
 * @return the encoded bytes
 * @throws AssertionError if the charset is unavailable or the in-memory
 *         write fails; the triggering exception is attached as the cause
 */
private static byte[] encode(String s, String charsetName) {
    try {
        ByteArrayOutputStream buf = new ByteArrayOutputStream(8);
        OutputStreamWriter writer = new OutputStreamWriter(buf, charsetName);
        writer.write(s, 0, s.length());
        writer.close(); // flushes the encoder into buf
        return buf.toByteArray();
    } catch (UnsupportedEncodingException e) {
        throw impossible(charsetName + " encoding unsupported", e);
    } catch (IOException e) {
        throw impossible("this is impossible", e);
    }
}

/**
 * Creates an {@link AssertionError} that keeps the original exception as
 * its cause. {@code initCause} is used instead of a two-argument
 * constructor so no newer library API is required.
 */
private static AssertionError impossible(String message, Throwable cause) {
    AssertionError err = new AssertionError(message);
    err.initCause(cause);
    return err;
}

} // end of class UnicodeCharacter