00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00032 #ifndef _UCOMMON_UNICODE_H_
00033 #define _UCOMMON_UNICODE_H_
00034
00035 #ifndef _UCOMMON_STRING_H_
00036 #include <ucommon/string.h>
00037 #endif
00038
00039 NAMESPACE_UCOMMON
00040
/**
 * 32 bit unicode character code, declared as a signed 32 bit integer.
 */
typedef int32_t ucs4_t;

/**
 * 16 bit unicode character code, declared as a signed 16 bit integer.
 * NOTE(review): because the type is signed, values above 0x7FFF are
 * negative when stored in it.  Left as-is since the type is part of the
 * public interface.
 */
typedef int16_t ucs2_t;

/**
 * Untyped handle used to pass unicode text around.  The element width of
 * the text it refers to is not visible from this declaration.
 * NOTE(review): since this is a pointer typedef, writing "const unicode_t"
 * makes the pointer itself const, not the text it points to.
 */
typedef void *unicode_t;

00062 class __EXPORT utf8
00063 {
00064 public:
00068 static const unsigned ucsize;
00069
00073 static const char *nil;
00074
00080 static unsigned size(const char *codepoint);
00081
00087 static size_t count(const char *string);
00088
00095 static char *offset(char *string, ssize_t position);
00096
00102 static ucs4_t codepoint(const char *encoded);
00103
00109 static size_t chars(const unicode_t string);
00110
00116 static size_t chars(ucs4_t character);
00117
00124 static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
00125
00133 static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
00134
00142 static const char *find(const char *string, ucs4_t character, size_t start = 0);
00143
00151 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
00152
00159 static unsigned ccount(const char *string, ucs4_t character);
00160
00166 static ucs4_t get(CharacterProtocol& buffer);
00167
00174 static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
00175 };
00176
00183 class __EXPORT UString : public String, public utf8
00184 {
00185 protected:
00189 UString();
00190
00195 UString(strsize_t size);
00196
00201 UString(const unicode_t text);
00202
00209 UString(const char *text, strsize_t size);
00210
00217 UString(const unicode_t *text, const unicode_t *end);
00218
00224 UString(const UString& existing);
00225
00230 virtual ~UString();
00231
00238 UString get(strsize_t codepoint, strsize_t size = 0) const;
00239
00246 size_t get(unicode_t unicode, size_t size) const;
00247
00252 void set(const unicode_t unicode);
00253
00258 void add(const unicode_t unicode);
00259
00265 ucs4_t at(int position) const;
00266
00273 inline size_t operator()(unicode_t unicode, size_t size) const
00274 {return get(unicode, size);};
00275
00282 UString operator()(int codepoint, strsize_t size) const;
00283
00291 const char *operator()(int offset) const;
00292
00298 inline ucs4_t operator[](int position) const
00299 {return UString::at(position);};
00300
00305 inline strsize_t count(void) const
00306 {return utf8::count(str->text);}
00307
00313 unsigned ccount(ucs4_t character) const;
00314
00321 const char *find(ucs4_t character, strsize_t start = 0) const;
00322
00329 const char *rfind(ucs4_t character, strsize_t end = npos) const;
00330 };
00331
00337 class __EXPORT utf8_pointer
00338 {
00339 protected:
00340 uint8_t *text;
00341
00342 public:
00346 utf8_pointer();
00347
00352 utf8_pointer(const char *string);
00353
00358 utf8_pointer(const utf8_pointer& copy);
00359
00364 utf8_pointer& operator ++();
00365
00370 utf8_pointer& operator --();
00371
00377 utf8_pointer& operator +=(long offset);
00378
00384 utf8_pointer& operator -=(long offset);
00385
00391 utf8_pointer operator+(long offset) const;
00392
00398 utf8_pointer operator-(long offset) const;
00399
00404 inline operator bool() const
00405 {return text != NULL;};
00406
00411 inline bool operator!() const
00412 {return text == NULL;};
00413
00419 ucs4_t operator[](long codepoint) const;
00420
00426 utf8_pointer& operator=(const char *string);
00427
00431 void inc(void);
00432
00436 void dec(void);
00437
00443 inline bool operator==(const char *string) const
00444 {return (const char *)text == string;};
00445
00451 inline bool operator!=(const char *string) const
00452 {return (const char *)text != string;};
00453
00458 inline ucs4_t operator*() const
00459 {return utf8::codepoint((const char *)text);};
00460
00465 inline char *c_str(void) const
00466 {return (char *)text;};
00467
00472 inline operator char*() const
00473 {return (char *)text;};
00474
00479 inline size_t len(void) const
00480 {return utf8::count((const char *)text);};
00481 };
00482
00486 typedef UString ustring_t;
00487
00491 typedef utf8_pointer utf8_t;
00492
00493 END_NAMESPACE
00494
00495 #endif