/**
 * @file
 * Phoebe DOM Implementation.
 *
 * This is a C++ approximation of the W3C DOM model, which follows
 * fairly closely the specifications in the various .idl files, copies of
 * which are provided for reference.  Most important is this one:
 *
 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
 *
 * More thorough explanations of the various classes and their algorithms
 * can be found there.
 *
 * This header declares the Unicode Character Database (UCD) helpers:
 * general-category classification, case conversion and block lookup.
 */
/*
 * Authors:
 *   Bob Jamison
 *
 * Copyright (C) 2006-2008 Bob Jamison
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#ifndef SEEN_UCD_H
#define SEEN_UCD_H

/************************************************
** Unicode character classification
************************************************/

/**
 * Enumerated Unicode general category types.
 * The two-letter abbreviation of each category is given beside its
 * enumerator.
 */
typedef enum UniCharType
{
    UNI_UNASSIGNED                = 0,  /* Cn */
    UNI_UPPERCASE_LETTER          = 1,  /* Lu */
    UNI_LOWERCASE_LETTER          = 2,  /* Ll */
    UNI_TITLECASE_LETTER          = 3,  /* Lt */
    UNI_MODIFIER_LETTER           = 4,  /* Lm */
    UNI_OTHER_LETTER              = 5,  /* Lo */
    UNI_NON_SPACING_MARK          = 6,  /* Mn */
    UNI_ENCLOSING_MARK            = 7,  /* Me */
    UNI_COMBINING_SPACING_MARK    = 8,  /* Mc */
    UNI_DECIMAL_DIGIT_NUMBER      = 9,  /* Nd */
    UNI_LETTER_NUMBER             = 10, /* Nl */
    UNI_OTHER_NUMBER              = 11, /* No */
    UNI_SPACE_SEPARATOR           = 12, /* Zs */
    UNI_LINE_SEPARATOR            = 13, /* Zl */
    UNI_PARAGRAPH_SEPARATOR       = 14, /* Zp */
    UNI_CONTROL                   = 15, /* Cc */
    UNI_FORMAT                    = 16, /* Cf */
    UNI_UNUSED_RESERVE            = 17, /* xx */
    UNI_PRIVATE_USE               = 18, /* Co */
    UNI_SURROGATE                 = 19, /* Cs */
    UNI_DASH_PUNCTUATION          = 20, /* Pd */
    UNI_START_PUNCTUATION         = 21, /* Ps */
    UNI_END_PUNCTUATION           = 22, /* Pe */
    UNI_CONNECTOR_PUNCTUATION     = 23, /* Pc */
    UNI_OTHER_PUNCTUATION         = 24, /* Po */
    UNI_MATH_SYMBOL               = 25, /* Sm */
    UNI_CURRENCY_SYMBOL           = 26, /* Sc */
    UNI_MODIFIER_SYMBOL           = 27, /* Sk */
    UNI_OTHER_SYMBOL              = 28, /* So */
    UNI_INITIAL_QUOTE_PUNCTUATION = 29, /* Pi */
    UNI_FINAL_QUOTE_PUNCTUATION   = 30  /* Pf */
} UnicodeCharType;

/**
 * Get the raw table entry for this Unicode codepoint
 * @param ch the Unicode codepoint to test
 * @return the raw UCD property table entry
 */
unsigned int uni_code(int ch);

/**
 * Get the Unicode General Category of this character
 * @param ch the Unicode codepoint to test
 * @return the 'UniCharType' General Category enumeration (above)
 */
unsigned int uni_type(int ch);

/**
 * Test if this Unicode code point is lower case
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is lower case, else 0
 */
int uni_is_lower(int ch);

/**
 * Test if this Unicode code point is upper case
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is upper case, else 0
 */
int uni_is_upper(int ch);

/**
 * Test if this Unicode code point is title case
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is title case, else 0
 */
int uni_is_title(int ch);

/**
 * Test if this Unicode code point is a numeric digit
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is a numeric digit, else 0
 */
int uni_is_digit(int ch);

/**
 * Test if this Unicode code point is defined in the database
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is defined, else 0
 */
int uni_is_defined(int ch);

/**
 * Test if this Unicode code point is a letter
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is a letter, else 0
 */
int uni_is_letter(int ch);

/**
 * Test if this Unicode code point is a letter or a digit
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is a letter or a digit, else 0
 */
int uni_is_letter_or_digit(int ch);

/**
 * Test if this Unicode code point is considered to be a space
 * @param ch the Unicode codepoint to test
 * @return 1 if the code point is considered to be a space, else 0
 */
int uni_is_space(int ch);

/************************************************
** Unicode case conversion
************************************************/

/**
 * Convert the given codepoint to its lower case mapping.
 * If there is none, return the codepoint.
 * @param ch the Unicode codepoint to convert
 * @return the converted codepoint
 */
int uni_to_lower(int ch);

/**
 * Convert the given codepoint to its upper case mapping.
 * If there is none, return the codepoint.
 * @param ch the Unicode codepoint to convert
 * @return the converted codepoint
 */
int uni_to_upper(int ch);

/**
 * Convert the given codepoint to its title case mapping.
 * If there is none, return the codepoint.
 * @param ch the Unicode codepoint to convert
 * @return the converted codepoint
 */
int uni_to_title(int ch);

/************************************************
** Unicode blocks
************************************************/

/**
 * Used to hold the information for a Unicode codepoint
 * block
 */
typedef struct
{
    /**
     * Low end of the block range
     */
    unsigned long low;

    /**
     * High end of the block range
     */
    unsigned long high;

    /**
     * Name string for the block
     */
    const char *name;
} UcdBlockData;

/**
 * Return the Unicode block (defined below) for the given
 * codepoint.  If not found, return UCD_BLOCK_NO_BLOCK.
 * @param ch the Unicode codepoint to search
 * @return the block
 */
int uni_block(int ch);

/**
 * Return the Unicode block data for the enumerated block number.
 * @param blockNr the Unicode block number
 * @return the block data if found, else NULL
 */
UcdBlockData *uni_block_data(int blockNr);

/**
 * The Unicode codepoint blocks as defined in Blocks.txt.
 * Block list has 171 entries
 *
 * The comment in front of each enumerator gives its block index and its
 * codepoint range.  Every enumerator is pinned to that index with an
 * explicit initializer: this header does not declare every index, so
 * implicit numbering would make the values drift away from the documented
 * ones (UCD_BLOCK_NO_BLOCK, for instance, must be 171, one past the last
 * block index).
 *
 * NOTE(review): confirm that the table behind uni_block() and
 * uni_block_data() is indexed by these documented numbers.
 */
typedef enum
{
/*   0, 000000 - 00007f */  UCD_BLOCK_BASIC_LATIN                              = 0,
/*   2, 000100 - 00017f */  UCD_BLOCK_LATIN_EXTENDED_A                         = 2,
/*   4, 000250 - 0002af */  UCD_BLOCK_IPA_EXTENSIONS                           = 4,
/*   6, 000300 - 00036f */  UCD_BLOCK_COMBINING_DIACRITICAL_MARKS              = 6,
/*   8, 000400 - 0004ff */  UCD_BLOCK_CYRILLIC                                 = 8,
/*  10, 000530 - 00058f */  UCD_BLOCK_ARMENIAN                                 = 10,
/*  12, 000600 - 0006ff */  UCD_BLOCK_ARABIC                                   = 12,
/*  14, 000750 - 00077f */  UCD_BLOCK_ARABIC_SUPPLEMENT                        = 14,
/*  16, 0007c0 - 0007ff */  UCD_BLOCK_NKO                                      = 16,
/*  18, 000980 - 0009ff */  UCD_BLOCK_BENGALI                                  = 18,
/*  20, 000a80 - 000aff */  UCD_BLOCK_GUJARATI                                 = 20,
/*  22, 000b80 - 000bff */  UCD_BLOCK_TAMIL                                    = 22,
/*  24, 000c80 - 000cff */  UCD_BLOCK_KANNADA                                  = 24,
/*  26, 000d80 - 000dff */  UCD_BLOCK_SINHALA                                  = 26,
/*  28, 000e80 - 000eff */  UCD_BLOCK_LAO                                      = 28,
/*  30, 001000 - 00109f */  UCD_BLOCK_MYANMAR                                  = 30,
/*  32, 001100 - 0011ff */  UCD_BLOCK_HANGUL_JAMO                              = 32,
/*  34, 001380 - 00139f */  UCD_BLOCK_ETHIOPIC_SUPPLEMENT                      = 34,
/*  36, 001400 - 00167f */  UCD_BLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS    = 36,
/*  38, 0016a0 - 0016ff */  UCD_BLOCK_RUNIC                                    = 38,
/*  40, 001720 - 00173f */  UCD_BLOCK_HANUNOO                                  = 40,
/*  42, 001760 - 00177f */  UCD_BLOCK_TAGBANWA                                 = 42,
/*  44, 001800 - 0018af */  UCD_BLOCK_MONGOLIAN                                = 44,
/*  46, 001950 - 00197f */  UCD_BLOCK_TAI_LE                                   = 46,
/*  48, 0019e0 - 0019ff */  UCD_BLOCK_KHMER_SYMBOLS                            = 48,
/*  50, 001b00 - 001b7f */  UCD_BLOCK_BALINESE                                 = 50,
/*  52, 001c00 - 001c4f */  UCD_BLOCK_LEPCHA                                   = 52,
/*  54, 001d00 - 001d7f */  UCD_BLOCK_PHONETIC_EXTENSIONS                      = 54,
/*  56, 001dc0 - 001dff */  UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT   = 56,
/*  58, 001f00 - 001fff */  UCD_BLOCK_GREEK_EXTENDED                           = 58,
/*  60, 002070 - 00209f */  UCD_BLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS              = 60,
/*  62, 0020d0 - 0020ff */  UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS  = 62,
/*  64, 002150 - 00218f */  UCD_BLOCK_NUMBER_FORMS                             = 64,
/*  66, 002200 - 0022ff */  UCD_BLOCK_MATHEMATICAL_OPERATORS                   = 66,
/*  68, 002400 - 00243f */  UCD_BLOCK_CONTROL_PICTURES                         = 68,
/*  70, 002460 - 0024ff */  UCD_BLOCK_ENCLOSED_ALPHANUMERICS                   = 70,
/*  72, 002580 - 00259f */  UCD_BLOCK_BLOCK_ELEMENTS                           = 72,
/*  74, 002600 - 0026ff */  UCD_BLOCK_MISCELLANEOUS_SYMBOLS                    = 74,
/*  76, 0027c0 - 0027ef */  UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A     = 76,
/*  78, 002800 - 0028ff */  UCD_BLOCK_BRAILLE_PATTERNS                         = 78,
/*  80, 002980 - 0029ff */  UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B     = 80,
/*  82, 002b00 - 002bff */  UCD_BLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS         = 82,
/*  84, 002c60 - 002c7f */  UCD_BLOCK_LATIN_EXTENDED_C                         = 84,
/*  86, 002d00 - 002d2f */  UCD_BLOCK_GEORGIAN_SUPPLEMENT                      = 86,
/*  88, 002d80 - 002ddf */  UCD_BLOCK_ETHIOPIC_EXTENDED                        = 88,
/*  90, 002e00 - 002e7f */  UCD_BLOCK_SUPPLEMENTAL_PUNCTUATION                 = 90,
/*  92, 002f00 - 002fdf */  UCD_BLOCK_KANGXI_RADICALS                          = 92,
/*  94, 003000 - 00303f */  UCD_BLOCK_CJK_SYMBOLS_AND_PUNCTUATION              = 94,
/*  96, 0030a0 - 0030ff */  UCD_BLOCK_KATAKANA                                 = 96,
/*  98, 003130 - 00318f */  UCD_BLOCK_HANGUL_COMPATIBILITY_JAMO                = 98,
/* 100, 0031a0 - 0031bf */  UCD_BLOCK_BOPOMOFO_EXTENDED                        = 100,
/* 102, 0031f0 - 0031ff */  UCD_BLOCK_KATAKANA_PHONETIC_EXTENSIONS             = 102,
/* 104, 003300 - 0033ff */  UCD_BLOCK_CJK_COMPATIBILITY                        = 104,
/* 106, 004dc0 - 004dff */  UCD_BLOCK_YIJING_HEXAGRAM_SYMBOLS                  = 106,
/* 108, 00a000 - 00a48f */  UCD_BLOCK_YI_SYLLABLES                             = 108,
/* 110, 00a500 - 00a63f */  UCD_BLOCK_VAI                                      = 110,
/* 112, 00a700 - 00a71f */  UCD_BLOCK_MODIFIER_TONE_LETTERS                    = 112,
/* 114, 00a800 - 00a82f */  UCD_BLOCK_SYLOTI_NAGRI                             = 114,
/* 116, 00a880 - 00a8df */  UCD_BLOCK_SAURASHTRA                               = 116,
/* 118, 00a930 - 00a95f */  UCD_BLOCK_REJANG                                   = 118,
/* 120, 00ac00 - 00d7af */  UCD_BLOCK_HANGUL_SYLLABLES                         = 120,
/* 122, 00db80 - 00dbff */  UCD_BLOCK_HIGH_PRIVATE_USE_SURROGATES              = 122,
/* 124, 00e000 - 00f8ff */  UCD_BLOCK_PRIVATE_USE_AREA                         = 124,
/* 126, 00fb00 - 00fb4f */  UCD_BLOCK_ALPHABETIC_PRESENTATION_FORMS            = 126,
/* 128, 00fe00 - 00fe0f */  UCD_BLOCK_VARIATION_SELECTORS                      = 128,
/* 130, 00fe20 - 00fe2f */  UCD_BLOCK_COMBINING_HALF_MARKS                     = 130,
/* 132, 00fe50 - 00fe6f */  UCD_BLOCK_SMALL_FORM_VARIANTS                      = 132,
/* 134, 00ff00 - 00ffef */  UCD_BLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS            = 134,
/* 136, 010000 - 01007f */  UCD_BLOCK_LINEAR_B_SYLLABARY                       = 136,
/* 138, 010100 - 01013f */  UCD_BLOCK_AEGEAN_NUMBERS                           = 138,
/* 140, 010190 - 0101cf */  UCD_BLOCK_ANCIENT_SYMBOLS                          = 140,
/* 142, 010280 - 01029f */  UCD_BLOCK_LYCIAN                                   = 142,
/* 144, 010300 - 01032f */  UCD_BLOCK_OLD_ITALIC                               = 144,
/* 146, 010380 - 01039f */  UCD_BLOCK_UGARITIC                                 = 146,
/* 148, 010400 - 01044f */  UCD_BLOCK_DESERET                                  = 148,
/* 150, 010480 - 0104af */  UCD_BLOCK_OSMANYA                                  = 150,
/* 152, 010900 - 01091f */  UCD_BLOCK_PHOENICIAN                               = 152,
/* 154, 010a00 - 010a5f */  UCD_BLOCK_KHAROSHTHI                               = 154,
/* 156, 012400 - 01247f */  UCD_BLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION        = 156,
/* 158, 01d100 - 01d1ff */  UCD_BLOCK_MUSICAL_SYMBOLS                          = 158,
/* 160, 01d300 - 01d35f */  UCD_BLOCK_TAI_XUAN_JING_SYMBOLS                    = 160,
/* 162, 01d400 - 01d7ff */  UCD_BLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS        = 162,
/* 164, 01f030 - 01f09f */  UCD_BLOCK_DOMINO_TILES                             = 164,
/* 166, 02f800 - 02fa1f */  UCD_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT  = 166,
/* 168, 0e0100 - 0e01ef */  UCD_BLOCK_VARIATION_SELECTORS_SUPPLEMENT           = 168,
/* 170, 100000 - 10ffff */  UCD_BLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B         = 170,
/* 171, 000000 - 10ffff */  UCD_BLOCK_NO_BLOCK                                 = 171
} UnicodeBlocks;

#endif // SEEN_UCD_H