module Unicode
Constants
- VERSION
Public Class Methods
abbr_categories(p1)
click to toggle source
/*
 * Unicode.abbr_categories(str)
 *
 * Decodes +str+ into a wide-character buffer and runs
 * get_categories_internal() on it with the catname_abbr name table,
 * under rb_ensure() so that the cleanup callback is invoked even when
 * the body raises or exits non-locally.
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 *
 * Returns whatever get_categories_internal() returns.
 */
VALUE
unicode_get_abbr_categories(VALUE obj, VALUE str)
{
  WString wstr;
  /* NOTE(review): param captures `str` before CONVERT_TO_UTF8() runs, so
   * the callback may see the caller's original string object -- confirm
   * this is intended. */
  get_categories_param param = { &wstr, str, catname_abbr };

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));

  /* wstr is handed to get_categories_ensure(), which is expected to
   * release it; do not free it here. */
  return rb_ensure(get_categories_internal, (VALUE)&param,
                   get_categories_ensure, (VALUE)&wstr);
}
capitalize(p1)
click to toggle source
endif
/*
 * Unicode.capitalize(str)
 *
 * Decodes +str+ into wide characters, runs capitalize_internal() on
 * them, re-encodes the result and returns it as a new Ruby string
 * passed through the ENC_ and TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_capitalize(VALUE obj, VALUE str)
{
  WString decoded;   /* input as wide characters */
  WString mapped;    /* output of capitalize_internal() */
  UString encoded;   /* mapped text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&mapped);
  capitalize_internal(&decoded, &mapped);
  /* No canonical reordering is applied after the case mapping. */
  WStr_free(&decoded);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&mapped, &encoded);
  WStr_free(&mapped);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
categories(p1)
click to toggle source
/*
 * Unicode.categories(str)
 *
 * Decodes +str+ into a wide-character buffer and runs
 * get_categories_internal() on it with the catname_long name table,
 * under rb_ensure() so that the cleanup callback is invoked even when
 * the body raises or exits non-locally.
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 *
 * Returns whatever get_categories_internal() returns.
 */
VALUE
unicode_get_categories(VALUE obj, VALUE str)
{
  WString wstr;
  /* NOTE(review): param captures `str` before CONVERT_TO_UTF8() runs, so
   * the callback may see the caller's original string object -- confirm
   * this is intended. */
  get_categories_param param = { &wstr, str, catname_long };

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));

  /* wstr is handed to get_categories_ensure(), which is expected to
   * release it; do not free it here. */
  return rb_ensure(get_categories_internal, (VALUE)&param,
                   get_categories_ensure, (VALUE)&wstr);
}
compose(p1)
click to toggle source
/*
 * Unicode.compose(str)
 *
 * Decodes +str+ into wide characters, puts them through
 * sort_canonical(), then runs compose_internal() and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_compose(VALUE obj, VALUE str)
{
  WString decoded;    /* input as wide characters, sorted in place */
  WString composed;   /* output of compose_internal() */
  UString encoded;    /* composed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  /* The input is reordered before composition, not after. */
  sort_canonical(&decoded);
  WStr_alloc(&composed);
  compose_internal(&decoded, &composed);
  WStr_free(&decoded);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&composed, &encoded);
  WStr_free(&composed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
decompose(p1)
click to toggle source
/*
 * Unicode.decompose(str)
 *
 * Decodes +str+ into wide characters, runs decompose_internal() on
 * them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
decompose_compat(p1)
click to toggle source
/*
 * Unicode.decompose_compat(str)
 *
 * Decodes +str+ into wide characters, runs decompose_compat_internal()
 * on them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose_compat(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_compat_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
decompose_safe(p1)
click to toggle source
/*
 * Unicode.decompose_safe(str)
 *
 * Decodes +str+ into wide characters, runs decompose_safe_internal()
 * on them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose_safe(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_safe_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
downcase(p1)
click to toggle source
/*
 * Unicode.downcase(str)
 *
 * Decodes +str+ into wide characters, runs downcase_internal() on
 * them, re-encodes the result and returns it as a new Ruby string
 * passed through the ENC_ and TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_downcase(VALUE obj, VALUE str)
{
  WString decoded;   /* input as wide characters */
  WString mapped;    /* output of downcase_internal() */
  UString encoded;   /* mapped text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&mapped);
  downcase_internal(&decoded, &mapped);
  /* No canonical reordering is applied after the case mapping. */
  WStr_free(&decoded);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&mapped, &encoded);
  WStr_free(&mapped);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
nfc(p1)
click to toggle source
/*
 * Unicode.normalize_C(str)
 *
 * Three-stage pipeline over the wide-character form of +str+:
 * decompose_internal(), then sort_canonical(), then compose_internal().
 * The composed text is re-encoded and returned as a new Ruby string
 * passed through the ENC_ and TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_normalize_C(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_internal(), sorted in place */
  WString composed;     /* output of compose_internal() */
  UString encoded;      /* composed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose. */
  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&decoded, &decomposed);
  WStr_free(&decoded);

  /* Stage 2: reorder, then compose. */
  sort_canonical(&decomposed);
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: convert back and wrap in a Ruby string. */
  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&composed, &encoded);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
nfc_safe(p1)
click to toggle source
/*
 * Unicode.normalize_C_safe(str)
 *
 * Three-stage pipeline over the wide-character form of +str+:
 * decompose_safe_internal(), then sort_canonical(), then
 * compose_internal(). The composed text is re-encoded and returned as
 * a new Ruby string passed through the ENC_ and TO_ macros (defined
 * elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_normalize_safe(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_safe_internal(), sorted in place */
  WString composed;     /* output of compose_internal() */
  UString encoded;      /* composed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose. */
  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&decoded, &decomposed);
  WStr_free(&decoded);

  /* Stage 2: reorder, then compose. */
  sort_canonical(&decomposed);
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: convert back and wrap in a Ruby string. */
  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&composed, &encoded);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
nfd(p1)
click to toggle source
/*
 * Unicode.nfd(str) -- listed on this page with the source of
 * unicode_decompose().
 *
 * Decodes +str+ into wide characters, runs decompose_internal() on
 * them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
nfd_safe(p1)
click to toggle source
/*
 * Unicode.nfd_safe(str) -- listed on this page with the source of
 * unicode_decompose_safe().
 *
 * Decodes +str+ into wide characters, runs decompose_safe_internal()
 * on them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose_safe(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_safe_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
nfkc(p1)
click to toggle source
/*
 * Unicode.nfkc(str) -- listed on this page with the source of
 * unicode_normalize_KC().
 *
 * Three-stage pipeline over the wide-character form of +str+:
 * decompose_compat_internal(), then sort_canonical(), then
 * compose_internal(). The composed text is re-encoded and returned as
 * a new Ruby string passed through the ENC_ and TO_ macros (defined
 * elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_normalize_KC(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_compat_internal(), sorted in place */
  WString composed;     /* output of compose_internal() */
  UString encoded;      /* composed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: compatibility decomposition. */
  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&decoded, &decomposed);
  WStr_free(&decoded);

  /* Stage 2: reorder, then compose. */
  sort_canonical(&decomposed);
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: convert back and wrap in a Ruby string. */
  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&composed, &encoded);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
nfkd(p1)
click to toggle source
/*
 * Unicode.nfkd(str) -- listed on this page with the source of
 * unicode_decompose_compat().
 *
 * Decodes +str+ into wide characters, runs decompose_compat_internal()
 * on them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose_compat(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_compat_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
normalize_C(p1)
click to toggle source
/*
 * Unicode.normalize_C(str)
 *
 * Three-stage pipeline over the wide-character form of +str+:
 * decompose_internal(), then sort_canonical(), then compose_internal().
 * The composed text is re-encoded and returned as a new Ruby string
 * passed through the ENC_ and TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_normalize_C(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_internal(), sorted in place */
  WString composed;     /* output of compose_internal() */
  UString encoded;      /* composed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose. */
  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&decoded, &decomposed);
  WStr_free(&decoded);

  /* Stage 2: reorder, then compose. */
  sort_canonical(&decomposed);
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: convert back and wrap in a Ruby string. */
  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&composed, &encoded);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
normalize_C_safe(p1)
click to toggle source
/*
 * Unicode.normalize_C_safe(str)
 *
 * Three-stage pipeline over the wide-character form of +str+:
 * decompose_safe_internal(), then sort_canonical(), then
 * compose_internal(). The composed text is re-encoded and returned as
 * a new Ruby string passed through the ENC_ and TO_ macros (defined
 * elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_normalize_safe(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_safe_internal(), sorted in place */
  WString composed;     /* output of compose_internal() */
  UString encoded;      /* composed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: decompose. */
  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&decoded, &decomposed);
  WStr_free(&decoded);

  /* Stage 2: reorder, then compose. */
  sort_canonical(&decomposed);
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: convert back and wrap in a Ruby string. */
  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&composed, &encoded);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
normalize_D(p1)
click to toggle source
/*
 * Unicode.normalize_D(str) -- listed on this page with the source of
 * unicode_decompose().
 *
 * Decodes +str+ into wide characters, runs decompose_internal() on
 * them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
normalize_D_safe(p1)
click to toggle source
/*
 * Unicode.normalize_D_safe(str) -- listed on this page with the source
 * of unicode_decompose_safe().
 *
 * Decodes +str+ into wide characters, runs decompose_safe_internal()
 * on them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose_safe(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_safe_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_safe_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
normalize_KC(p1)
click to toggle source
/*
 * Unicode.normalize_KC(str)
 *
 * Three-stage pipeline over the wide-character form of +str+:
 * decompose_compat_internal(), then sort_canonical(), then
 * compose_internal(). The composed text is re-encoded and returned as
 * a new Ruby string passed through the ENC_ and TO_ macros (defined
 * elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_normalize_KC(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_compat_internal(), sorted in place */
  WString composed;     /* output of compose_internal() */
  UString encoded;      /* composed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  /* Stage 1: compatibility decomposition. */
  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&decoded, &decomposed);
  WStr_free(&decoded);

  /* Stage 2: reorder, then compose. */
  sort_canonical(&decomposed);
  WStr_alloc(&composed);
  compose_internal(&decomposed, &composed);
  WStr_free(&decomposed);

  /* Stage 3: convert back and wrap in a Ruby string. */
  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&composed, &encoded);
  WStr_free(&composed);
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
normalize_KD(p1)
click to toggle source
/*
 * Unicode.normalize_KD(str) -- listed on this page with the source of
 * unicode_decompose_compat().
 *
 * Decodes +str+ into wide characters, runs decompose_compat_internal()
 * on them, passes the output through sort_canonical(), and returns the
 * re-encoded result as a new Ruby string passed through the ENC_ and
 * TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_decompose_compat(VALUE obj, VALUE str)
{
  WString decoded;      /* input as wide characters */
  WString decomposed;   /* output of decompose_compat_internal(), sorted in place */
  UString encoded;      /* decomposed text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&decomposed);
  decompose_compat_internal(&decoded, &decomposed);
  WStr_free(&decoded);
  sort_canonical(&decomposed);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&decomposed, &encoded);
  WStr_free(&decomposed);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
strcmp(p1, p2)
click to toggle source
/*
 * Unicode.strcmp(str1, str2)
 *
 * Compares two strings after running each through decompose_internal()
 * and sort_canonical(), then converting back to a byte string.
 *
 * The comparison is bytewise over the full length of both buffers.
 * The previous implementation appended a NUL and called strcmp(),
 * which stops at the first NUL byte, so inputs containing U+0000 were
 * only compared up to that point.
 *
 * obj:  receiver (unused).
 * str1: must be a String; Check_Type() raises TypeError otherwise.
 * str2: must be a String; Check_Type() raises TypeError otherwise.
 *
 * Returns a Fixnum: negative if str1 sorts first, zero if equal,
 * positive if str2 sorts first.
 */
static VALUE
unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
{
  WString wstr1;
  WString wstr2;
  WString result1;
  WString result2;
  UString ustr1;
  UString ustr2;
  size_t len1;
  size_t len2;
  size_t common;
  int ret;

  Check_Type(str1, T_STRING);
  Check_Type(str2, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str1);
  CONVERT_TO_UTF8(str2);
#endif
  WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
  WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
  WStr_alloc(&result1);
  WStr_alloc(&result2);
  decompose_internal(&wstr1, &result1);
  decompose_internal(&wstr2, &result2);
  WStr_free(&wstr1);
  WStr_free(&wstr2);
  sort_canonical(&result1);
  sort_canonical(&result2);
  UniStr_alloc(&ustr1);
  UniStr_alloc(&ustr2);
  WStr_convertIntoUString(&result1, &ustr1);
  WStr_convertIntoUString(&result2, &ustr2);
  WStr_free(&result1);
  WStr_free(&result2);

  /* Compare the shared prefix bytewise; memcmp() orders bytes as
   * unsigned char, the same ordering strcmp() uses. */
  len1 = (size_t)ustr1.len;
  len2 = (size_t)ustr2.len;
  common = len1 < len2 ? len1 : len2;
  ret = common > 0 ? memcmp(ustr1.str, ustr2.str, common) : 0;
  /* Equal prefixes: the shorter string sorts first. */
  if (ret == 0 && len1 != len2)
    ret = len1 < len2 ? -1 : 1;

  UniStr_free(&ustr1);
  UniStr_free(&ustr2);

  return INT2FIX(ret);
}
strcmp_compat(p1, p2)
click to toggle source
/*
 * Unicode.strcmp_compat(str1, str2)
 *
 * Compares two strings after running each through
 * decompose_compat_internal() and sort_canonical(), then converting
 * back to a byte string.
 *
 * The comparison is bytewise over the full length of both buffers.
 * The previous implementation appended a NUL and called strcmp(),
 * which stops at the first NUL byte, so inputs containing U+0000 were
 * only compared up to that point.
 *
 * obj:  receiver (unused).
 * str1: must be a String; Check_Type() raises TypeError otherwise.
 * str2: must be a String; Check_Type() raises TypeError otherwise.
 *
 * Returns a Fixnum: negative if str1 sorts first, zero if equal,
 * positive if str2 sorts first.
 */
static VALUE
unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
{
  WString wstr1;
  WString wstr2;
  WString result1;
  WString result2;
  UString ustr1;
  UString ustr2;
  size_t len1;
  size_t len2;
  size_t common;
  int ret;

  Check_Type(str1, T_STRING);
  Check_Type(str2, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str1);
  CONVERT_TO_UTF8(str2);
#endif
  WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
  WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
  WStr_alloc(&result1);
  WStr_alloc(&result2);
  decompose_compat_internal(&wstr1, &result1);
  decompose_compat_internal(&wstr2, &result2);
  WStr_free(&wstr1);
  WStr_free(&wstr2);
  sort_canonical(&result1);
  sort_canonical(&result2);
  UniStr_alloc(&ustr1);
  UniStr_alloc(&ustr2);
  WStr_convertIntoUString(&result1, &ustr1);
  WStr_convertIntoUString(&result2, &ustr2);
  WStr_free(&result1);
  WStr_free(&result2);

  /* Compare the shared prefix bytewise; memcmp() orders bytes as
   * unsigned char, the same ordering strcmp() uses. */
  len1 = (size_t)ustr1.len;
  len2 = (size_t)ustr2.len;
  common = len1 < len2 ? len1 : len2;
  ret = common > 0 ? memcmp(ustr1.str, ustr2.str, common) : 0;
  /* Equal prefixes: the shorter string sorts first. */
  if (ret == 0 && len1 != len2)
    ret = len1 < len2 ? -1 : 1;

  UniStr_free(&ustr1);
  UniStr_free(&ustr2);

  return INT2FIX(ret);
}
text_elements(p1)
click to toggle source
/*
 * Unicode.text_elements(str)
 *
 * Decodes +str+ into a wide-character buffer and runs
 * get_text_elements_internal() on it under rb_ensure(), so that the
 * cleanup callback is invoked even when the body raises or exits
 * non-locally.
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 *
 * Returns whatever get_text_elements_internal() returns.
 */
VALUE
unicode_get_text_elements(VALUE obj, VALUE str)
{
  WString wstr;
  /* NOTE(review): param captures `str` before CONVERT_TO_UTF8() runs, so
   * the callback may see the caller's original string object -- confirm
   * this is intended. */
  get_text_elements_param param = { &wstr, str };

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));

  /* wstr will be freed in get_text_elements_ensure(); do not free it
   * here. */
  return rb_ensure(get_text_elements_internal, (VALUE)&param,
                   get_text_elements_ensure, (VALUE)&wstr);
}
upcase(p1)
click to toggle source
/*
 * Unicode.upcase(str)
 *
 * Decodes +str+ into wide characters, runs upcase_internal() on them,
 * re-encodes the result and returns it as a new Ruby string passed
 * through the ENC_ and TO_ macros (defined elsewhere).
 *
 * obj: receiver (unused).
 * str: must be a String; Check_Type() raises TypeError otherwise.
 */
static VALUE
unicode_upcase(VALUE obj, VALUE str)
{
  WString decoded;   /* input as wide characters */
  WString mapped;    /* output of upcase_internal() */
  UString encoded;   /* mapped text converted back to bytes */
  VALUE out;

  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif

  WStr_allocWithUTF8L(&decoded, RSTRING_PTR(str), RSTRING_LEN(str));
  WStr_alloc(&mapped);
  upcase_internal(&decoded, &mapped);
  /* No canonical reordering is applied after the case mapping. */
  WStr_free(&decoded);

  UniStr_alloc(&encoded);
  WStr_convertIntoUString(&mapped, &encoded);
  WStr_free(&mapped);

  /* Copy the bytes into a Ruby string before the buffer is released. */
  out = TO_(str, ENC_(rb_str_new((char*)encoded.str, encoded.len)));
  UniStr_free(&encoded);

  return out;
}
width(p1, p2 = v2)
click to toggle source
/*
 * Unicode.width(str, cjk = false)
 *
 * Sums a per-character column count over the wide-character form of
 * +str+:
 *   - a control character (1..31 or 0x7f..0x9f) makes the whole result
 *     -1 and stops the scan;
 *   - characters classified as zero width add nothing;
 *   - Fullwidth/Wide characters, the listed reserved CJK ranges, and
 *     (only when +cjk+ is truthy) East Asian Ambiguous characters add 2;
 *   - everything else adds 1.
 *
 * argc/argv: one required argument (the String) and one optional
 *            argument (the cjk flag), parsed with rb_scan_args("11").
 * obj:       receiver (unused).
 *
 * Returns the total as a Fixnum, or -1 as described above. Raises
 * TypeError via Check_Type() if the first argument is not a String.
 */
VALUE
unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
{
  WString chars;
  VALUE str;
  VALUE cjk;
  int ambiguous_is_wide = 0;
  int columns = 0;
  int given;
  int pos;

  given = rb_scan_args(argc, argv, "11", &str, &cjk);
  if (given > 1)
    ambiguous_is_wide = RTEST(cjk);
  Check_Type(str, T_STRING);
#ifdef HAVE_RUBY_ENCODING_H
  CONVERT_TO_UTF8(str);
#endif
  WStr_allocWithUTF8L(&chars, RSTRING_PTR(str), RSTRING_LEN(str));

  for (pos = 0; pos < chars.len; pos++) {
    int cp = chars.str[pos];
    int gencat = get_gencat(cp);
    int eawidth = get_eawidth(cp);

    /* Control characters: the width is undefined, report -1. */
    if ((cp > 0 && cp < 32) || (cp >= 0x7f && cp < 0xa0)) {
      columns = -1;
      break;
    }

    /* Zero width: everything below except SOFT HYPHEN (U+00AD). */
    if (cp != 0x00ad &&
        (gencat == c_Mn || gencat == c_Me ||   /* Non-spacing Marks */
         gencat == c_Cf ||                     /* Format */
         cp == 0 ||                            /* NUL */
         (cp >= 0x1160 && cp <= 0x11ff)))      /* HANGUL JUNGSEONG/JONGSEONG */
      continue;

    if (eawidth == w_F || eawidth == w_W ||    /* Fullwidth or Wide */
        (cp >= 0x4db6 && cp <= 0x4dbf) ||      /* CJK Reserved */
        (cp >= 0x9fcd && cp <= 0x9fff) ||      /* CJK Reserved */
        (cp >= 0xfa6e && cp <= 0xfa6f) ||      /* CJK Reserved */
        (cp >= 0xfada && cp <= 0xfaff) ||      /* CJK Reserved */
        (cp >= 0x2a6d7 && cp <= 0x2a6ff) ||    /* CJK Reserved */
        (cp >= 0x2b735 && cp <= 0x2b73f) ||    /* CJK Reserved */
        (cp >= 0x2b81e && cp <= 0x2f7ff) ||    /* CJK Reserved */
        (cp >= 0x2fa1e && cp <= 0x2fffd) ||    /* CJK Reserved */
        (cp >= 0x30000 && cp <= 0x3fffd) ||    /* CJK Reserved */
        (ambiguous_is_wide && eawidth == w_A)) /* East Asian Ambiguous */
      columns += 2;
    else
      columns += 1;                            /* Halfwidth or Neutral */
  }

  WStr_free(&chars);

  return INT2FIX(columns);
}