类 Encoding::Converter
Encoding
转换类。
常量
- AFTER_OUTPUT
-
在已产生部分输出、但尚未消耗全部输入时停止转换。有关示例,请参阅
primitive_convert
。
- CRLF_NEWLINE_DECORATOR
-
用于将 LF 转换为 CRLF 的装饰器
- CR_NEWLINE_DECORATOR
-
用于将 LF 转换为 CR 的装饰器
- INVALID_MASK
-
无效字节序列的掩码
- INVALID_REPLACE
-
替换无效字节序列
- LF_NEWLINE_DECORATOR
-
用于在写入时将 CRLF 和 CR 转换为 LF 的装饰器
- PARTIAL_INPUT
-
指示源可能是较大字符串的一部分。有关示例,请参阅
primitive_convert
。
- UNDEF_HEX_CHARREF
-
使用 XML 十六进制字符引用替换目标编码中未定义的字节序列。这对于 XML 转换是有效的。
- UNDEF_MASK
-
源编码中有效字符的掩码,但在目标编码中没有相关字符。
- UNDEF_REPLACE
-
替换目标编码中未定义的字节序列。
- UNIVERSAL_NEWLINE_DECORATOR
-
用于将 CRLF 和 CR 转换为 LF 的装饰器
- XML_ATTR_CONTENT_DECORATOR
-
转义为 XML AttValue
- XML_ATTR_QUOTE_DECORATOR
-
转义为 XML AttValue
- XML_TEXT_DECORATOR
-
转义为 XML CharData
公共类方法
返回相应的 ASCII 兼容编码。
如果参数是 ASCII 兼容编码,则返回 nil。
“相应的 ASCII 兼容编码”是可以准确表示与给定的 ASCII 不兼容编码相同字符的 ASCII 兼容编码。因此,在两种编码之间转换时不会发生转换未定义错误。
Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP> Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8> Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
/*
 * Encoding::Converter.asciicompat_encoding(arg)
 *
 * Resolves +arg+ through enc_arg() to an encoding name, asks
 * rb_econv_asciicompat_encoding() for the matching ASCII-compatible
 * encoding name, and returns that encoding as an Encoding object.
 * Returns nil when no such name is reported.
 */
static VALUE
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
{
    const char *given_name;
    const char *compat_name;
    rb_encoding *given_enc;

    enc_arg(&arg, &given_name, &given_enc);

    compat_name = rb_econv_asciicompat_encoding(given_name);
    if (!compat_name) {
        return Qnil;
    }

    return rb_enc_from_encoding(make_encoding(compat_name));
}
可能的选项元素
hash form: :invalid => nil # raise error on invalid byte sequence (default) :invalid => :replace # replace invalid byte sequence :undef => nil # raise error on undefined conversion (default) :undef => :replace # replace undefined conversion :replace => string # replacement string ("?" or "\uFFFD" if not specified) :newline => :universal # decorator for converting CRLF and CR to LF :newline => :lf # decorator for converting CRLF and CR to LF when writing :newline => :crlf # decorator for converting LF to CRLF :newline => :cr # decorator for converting LF to CR :universal_newline => true # decorator for converting CRLF and CR to LF :crlf_newline => true # decorator for converting LF to CRLF :cr_newline => true # decorator for converting LF to CR :lf_newline => true # decorator for converting CRLF and CR to LF when writing :xml => :text # escape as XML CharData. :xml => :attr # escape as XML AttValue integer form: Encoding::Converter::INVALID_REPLACE Encoding::Converter::UNDEF_REPLACE Encoding::Converter::UNDEF_HEX_CHARREF Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR Encoding::Converter::LF_NEWLINE_DECORATOR Encoding::Converter::CRLF_NEWLINE_DECORATOR Encoding::Converter::CR_NEWLINE_DECORATOR Encoding::Converter::XML_TEXT_DECORATOR Encoding::Converter::XML_ATTR_CONTENT_DECORATOR Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
Encoding::Converter.new
创建 Encoding::Converter
的实例。
source_encoding 和 destination_encoding
应为字符串或 Encoding
对象。
opt 应为 nil、哈希或整数。
convpath 应为数组。convpath 可能包含
-
包含编码或编码名称的两元素数组,或
-
表示装饰器名称的字符串。
Encoding::Converter.new
可选择采用一个选项。该选项应为哈希或整数。选项哈希可以包含 :invalid => nil 等。选项整数应为常量的逻辑或,例如 Encoding::Converter::INVALID_REPLACE
等。
- :invalid => nil
-
在无效字节序列上引发错误。这是默认行为。
- :invalid => :replace
-
用替换字符串替换无效字节序列。
- :undef => nil
-
如果
source_encoding
中的字符未在 destination_encoding 中定义,则引发错误。这是默认行为。
- :undef => :replace
-
用替换字符串替换
destination_encoding
中未定义的字符。
- :replace => string
-
指定替换字符串。如果未指定,则 Unicode 编码使用 "\uFFFD",其他编码使用 "?"。
- :universal_newline => true
-
将 CRLF 和 CR 转换为 LF。
- :crlf_newline => true
-
将 LF 转换为 CRLF。
- :cr_newline => true
-
将 LF 转换为 CR。
- :lf_newline => true
-
将 CRLF 和 CR 转换为 LF(写入时)。
- :xml => :text
-
转义为 XML CharData。此表单可用作 HTML 4.0 PCDATA。
-
‘&’ -> ‘&amp;’
-
‘<’ -> ‘&lt;’
-
‘>’ -> ‘&gt;’
-
destination_encoding
中未定义的字符 -> 十六进制 CharRef,例如 &#xHH;
-
- :xml => :attr
-
转义为 XML AttValue。转换结果会用双引号括起来(即 "…")。此形式可用作 HTML 4.0 属性值。
-
‘&’ -> ‘&amp;’
-
‘<’ -> ‘&lt;’
-
‘>’ -> ‘&gt;’
-
‘"’ -> ‘&quot;’
-
destination_encoding
中未定义的字符 -> 十六进制 CharRef,例如 &#xHH;
-
示例
# UTF-16BE to UTF-8 ec = Encoding::Converter.new("UTF-16BE", "UTF-8") # Usually, decorators such as newline conversion are inserted last. ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], # "universal_newline"] # But, if the last encoding is ASCII incompatible, # decorators are inserted before the last conversion. ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) p ec.convpath #=> ["crlf_newline", # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] # Conversion path can be specified directly. ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) p ec.convpath #=> ["universal_newline", # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
/*
 * Encoding::Converter#initialize
 *
 * Accepts either a single Array argument (an explicit conversion path,
 * handled by rb_econv_init_by_convpath()) or the arguments understood by
 * econv_args() (source and destination encodings plus optional options),
 * opens the converter and stores it in +self+.
 *
 * Raises TypeError ("already initialized") when +self+ already carries a
 * converter, and raises the exception built by rb_econv_open_exc() when no
 * converter could be opened.
 */
static VALUE
econv_init(int argc, VALUE *argv, VALUE self)
{
    VALUE ecopts;
    /*
     * Only econv_args() assigns these two.  They are still handed to
     * RB_GC_GUARD() on the convpath-array path, so give them a defined
     * value instead of leaving them indeterminate.
     */
    VALUE snamev = Qnil, dnamev = Qnil;
    const char *sname, *dname;
    rb_encoding *senc, *denc;
    rb_econv_t *ec;
    int ecflags;
    VALUE convpath;

    if (rb_check_typeddata(self, &econv_data_type)) {
        rb_raise(rb_eTypeError, "already initialized");
    }

    if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
        /* Explicit conversion path: no flags or option hash apply. */
        ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
        ecflags = 0;
        ecopts = Qnil;
    }
    else {
        econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
        ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
    }

    if (!ec) {
        VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
        /* sname/dname may point into these strings; keep them alive. */
        RB_GC_GUARD(snamev);
        RB_GC_GUARD(dnamev);
        rb_exc_raise(exc);
    }

    if (!DECORATOR_P(sname, dname)) {
        /* Fall back to dummy encodings for names without an rb_encoding. */
        if (!senc)
            senc = make_dummy_encoding(sname);
        if (!denc)
            denc = make_dummy_encoding(dname);
        RB_GC_GUARD(snamev);
        RB_GC_GUARD(dnamev);
    }

    ec->source_encoding = senc;
    ec->destination_encoding = denc;

    DATA_PTR(self) = ec;

    return self;
}
返回转换路径。
p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) or p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], # "universal_newline"] p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) or p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # "universal_newline", # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
/*
 * Encoding::Converter.search_convpath(source, destination, opt = nil)
 *
 * Parses the arguments with econv_args(), collects the conversion steps
 * via transcode_search_path() and then applies decorate_convpath() for the
 * requested flags.  Returns the resulting array.
 *
 * Raises the exception built by rb_econv_open_exc() when no path was
 * collected or when decorating the path fails.
 */
static VALUE
econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
{
    VALUE src_name_v, dst_name_v;
    const char *src_name, *dst_name;
    rb_encoding *src_enc, *dst_enc;
    int flags;
    VALUE opts;
    VALUE path = Qnil;

    econv_args(argc, argv, &src_name_v, &dst_name_v, &src_name, &dst_name,
               &src_enc, &dst_enc, &flags, &opts);

    transcode_search_path(src_name, dst_name, search_convpath_i, &path);

    /* Decoration is only attempted once a path has actually been found. */
    if (NIL_P(path) || decorate_convpath(path, flags) == -1) {
        VALUE exc = rb_econv_open_exc(src_name, dst_name, flags);
        RB_GC_GUARD(src_name_v);
        RB_GC_GUARD(dst_name_v);
        rb_exc_raise(exc);
    }

    return path;
}
公共实例方法
/*
 * NULL-tolerant C-string comparison: returns non-zero when +a+ and +b+
 * differ.  Identical pointers (including two NULLs) compare equal; a NULL
 * paired with a non-NULL string compares different without calling strcmp().
 */
static int
econv_cstr_differ_p(const char *a, const char *b)
{
    if (a == b) return 0;
    if (a == NULL || b == NULL) return 1;
    return strcmp(a, b) != 0;
}

/*
 * Encoding::Converter#==
 *
 * Returns nil when +other+ is not a converter object, false when it has
 * no converter attached or when any compared attribute differs, and true
 * otherwise.  Compared attributes: source/destination encoding names,
 * flags, replacement encoding/length/bytes, and the transcoder of every
 * conversion step.
 */
static VALUE
econv_equal(VALUE self, VALUE other)
{
    rb_econv_t *ec1 = check_econv(self);
    rb_econv_t *ec2;
    int i;

    if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
        return Qnil;
    }
    ec2 = DATA_PTR(other);
    if (!ec2) return Qfalse;

    if (econv_cstr_differ_p(ec1->source_encoding_name, ec2->source_encoding_name))
        return Qfalse;
    if (econv_cstr_differ_p(ec1->destination_encoding_name, ec2->destination_encoding_name))
        return Qfalse;
    if (ec1->flags != ec2->flags) return Qfalse;

    /*
     * NOTE(review): the replacement fields look lazily initialised
     * (econv_get_replacement() calls make_replacement() before reading
     * them), so one side may still be NULL here.  Treat "set vs. unset"
     * as a difference rather than passing NULL to strcmp()/memcmp().
     */
    if (econv_cstr_differ_p(ec1->replacement_enc, ec2->replacement_enc))
        return Qfalse;
    if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
    if (ec1->replacement_str != ec2->replacement_str) {
        if (!ec1->replacement_str || !ec2->replacement_str)
            return Qfalse;
        if (memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
            return Qfalse;
    }

    if (ec1->num_trans != ec2->num_trans) return Qfalse;
    for (i = 0; i < ec1->num_trans; i++) {
        if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
            return Qfalse;
    }
    return Qtrue;
}
转换 source_string 并返回 destination_string。
source_string 被假定为 source 的一部分。即:在内部指定 :partial_input=>true。finish 方法应最后使用。
ec = Encoding::Converter.new("utf-8", "euc-jp") puts ec.convert("\u3042").dump #=> "\xA4\xA2" puts ec.finish.dump #=> "" ec = Encoding::Converter.new("euc-jp", "utf-8") puts ec.convert("\xA4").dump #=> "" puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" puts ec.finish.dump #=> "" ec = Encoding::Converter.new("utf-8", "iso-2022-jp") puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
如果发生转换错误,则会引发 Encoding::UndefinedConversionError
或 Encoding::InvalidByteSequenceError
。 Encoding::Converter#convert
不提供从这些异常中恢复或重新开始的方法。当您想要处理这些转换错误时,请使用 Encoding::Converter#primitive_convert
。
/*
 * Encoding::Converter#convert(source_string)
 *
 * Runs econv_primitive_convert() on a copy of +source_string+ with
 * ECONV_PARTIAL_INPUT set and returns the freshly created output string.
 *
 * Raises the exception from make_econv_exception() for the
 * :invalid_byte_sequence, :undefined_conversion and :incomplete_input
 * results, and ArgumentError when the converter reports :finished.
 * Any result other than :source_buffer_empty at that point is an
 * internal error (rb_bug).
 */
static VALUE
econv_convert(VALUE self, VALUE source_string)
{
    rb_econv_t *ec = check_econv(self);
    VALUE args[5];
    VALUE status;
    VALUE out;

    StringValue(source_string);

    out = rb_str_new(NULL, 0);

    /* Work on a copy: primitive_convert consumes its source buffer. */
    args[0] = rb_str_dup(source_string);
    args[1] = out;
    args[2] = Qnil;
    args[3] = Qnil;
    args[4] = INT2NUM(ECONV_PARTIAL_INPUT);

    status = econv_primitive_convert(5, args, self);

    if (status == sym_invalid_byte_sequence ||
        status == sym_undefined_conversion ||
        status == sym_incomplete_input) {
        rb_exc_raise(make_econv_exception(ec));
    }

    if (status == sym_finished) {
        rb_raise(rb_eArgError, "converter already finished");
    }

    if (status != sym_source_buffer_empty) {
        rb_bug("unexpected result of econv_primitive_convert");
    }

    return out;
}
返回 ec 的转换路径。
结果是转换的数组。
ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) p ec.convpath #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], # "crlf_newline"]
数组的每个元素要么是一对编码,要么是一个字符串。一对编码表示一次编码转换,字符串表示装饰器。
在上面的示例中,[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] 表示从 ISO-8859-1 到 UTF-8 的转换;"crlf_newline" 表示将 LF 转换为 CRLF 的换行装饰器。
/*
 * Encoding::Converter#convpath
 *
 * Builds an array with one entry per conversion step of the converter:
 * a decorator step becomes a String holding the transcoder's destination
 * name, any other step becomes a two-element array of encoding objects
 * made from the transcoder's source and destination names.
 */
static VALUE
econv_convpath(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE path = rb_ary_new();
    int idx;

    for (idx = 0; idx < ec->num_trans; idx++) {
        const rb_transcoder *tr = ec->elems[idx].tc->transcoder;
        VALUE step;

        if (DECORATOR_P(tr->src_encoding, tr->dst_encoding)) {
            step = rb_str_new_cstr(tr->dst_encoding);
        }
        else {
            step = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
        }
        rb_ary_push(path, step);
    }

    return path;
}
将目标编码作为 Encoding
对象返回。
/*
 * Encoding::Converter#destination_encoding
 *
 * Returns the converter's destination encoding, wrapped by
 * econv_get_encoding().
 */
static VALUE
econv_destination_encoding(VALUE self)
{
    return econv_get_encoding(check_econv(self)->destination_encoding);
}
完成转换器。它返回已转换字符串的最后部分。
ec = Encoding::Converter.new("utf-8", "iso-2022-jp") p ec.convert("\u3042") #=> "\e$B$\"" p ec.finish #=> "\e(B"
/*
 * Encoding::Converter#finish
 *
 * Calls econv_primitive_convert() with a nil source and no flags and
 * returns the output string it filled.
 *
 * Raises the exception from make_econv_exception() for the
 * :invalid_byte_sequence, :undefined_conversion and :incomplete_input
 * results.  Any other result than :finished is an internal error (rb_bug).
 */
static VALUE
econv_finish(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE args[5];
    VALUE status;
    VALUE tail = rb_str_new(NULL, 0);

    args[0] = Qnil;         /* no further input */
    args[1] = tail;
    args[2] = Qnil;
    args[3] = Qnil;
    args[4] = INT2FIX(0);   /* no flags */

    status = econv_primitive_convert(5, args, self);

    if (status == sym_invalid_byte_sequence ||
        status == sym_undefined_conversion ||
        status == sym_incomplete_input) {
        rb_exc_raise(make_econv_exception(ec));
    }

    if (status != sym_finished) {
        rb_bug("unexpected result of econv_primitive_convert");
    }

    return tail;
}
将字符串插入编码转换器。该字符串将被转换为目标编码并在以后的转换中输出。
如果目标编码是有状态的,则根据状态转换字符串并更新状态。
仅当发生转换错误时才应使用此方法。
ec = Encoding::Converter.new("utf-8", "iso-8859-1") src = "HIRAGANA LETTER A is \u{3042}." dst = "" p ec.primitive_convert(src, dst) #=> :undefined_conversion puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] ec.insert_output("<err>") p ec.primitive_convert(src, dst) #=> :finished puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""] ec = Encoding::Converter.new("utf-8", "iso-2022-jp") src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp dst = "" p ec.primitive_convert(src, dst) #=> :undefined_conversion puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] ec.insert_output "?" # state change required to output "?". p ec.primitive_convert(src, dst) #=> :finished puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
/*
 * Encoding::Converter#insert_output(string)
 *
 * Re-encodes +string+ into the encoding named by
 * rb_econv_encoding_to_insert_output() and hands the bytes to
 * rb_econv_insert_output().  Always returns nil.
 *
 * Raises ArgumentError ("too big string") when the insertion reports -1.
 */
static VALUE
econv_insert_output(VALUE self, VALUE string)
{
    rb_econv_t *ec = check_econv(self);
    const char *target_name;
    VALUE target_enc;
    int status;

    StringValue(string);

    target_name = rb_econv_encoding_to_insert_output(ec);
    target_enc = rb_enc_from_encoding(rb_enc_find(target_name));
    string = rb_str_encode(string, target_enc, 0, Qnil);

    status = rb_econv_insert_output(ec,
                                    (const unsigned char *)RSTRING_PTR(string),
                                    RSTRING_LEN(string),
                                    target_name);
    if (status == -1) {
        rb_raise(rb_eArgError, "too big string");
    }

    return Qnil;
}
返回 ec 的可打印版本
ec = Encoding::Converter.new("iso-8859-1", "utf-8") puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
/*
 * Encoding::Converter#inspect
 *
 * Returns "#<ClassName: uninitialized>" when no converter is attached to
 * +self+; otherwise "#<ClassName: " followed by the text appended by
 * econv_description() and a closing ">".
 */
static VALUE
econv_inspect(VALUE self)
{
    const char *cname = rb_obj_classname(self);
    rb_econv_t *ec;
    VALUE desc;

    TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);

    if (!ec) {
        return rb_sprintf("#<%s: uninitialized>", cname);
    }

    desc = rb_sprintf("#<%s: ", cname);
    econv_description(ec->source_encoding_name, ec->destination_encoding_name, ec->flags, desc);
    rb_str_cat2(desc, ">");

    return desc;
}
返回上次转换的异常对象。如果上次转换未产生错误,则返回 nil。
这里的“错误”是指:对于 Encoding::Converter#convert
,是 Encoding::InvalidByteSequenceError
和 Encoding::UndefinedConversionError
;对于 Encoding::Converter#primitive_convert
,是 :invalid_byte_sequence、:incomplete_input 和 :undefined_conversion。
ec = Encoding::Converter.new("utf-8", "iso-8859-1") p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full p ec.last_error #=> nil
/*
 * Encoding::Converter#last_error
 *
 * Returns whatever make_econv_exception() produces for the converter:
 * an exception object, or nil when there is none.
 */
static VALUE
econv_last_error(VALUE self)
{
    return make_econv_exception(check_econv(self));
}
可能的opt元素
hash form: :partial_input => true # source buffer may be part of larger source :after_output => true # stop conversion after output before input integer form: Encoding::Converter::PARTIAL_INPUT Encoding::Converter::AFTER_OUTPUT
可能的结果
:invalid_byte_sequence :incomplete_input :undefined_conversion :after_output :destination_buffer_full :source_buffer_empty :finished
primitive_convert
将source_buffer转换为destination_buffer。
source_buffer应为字符串或nil。nil表示空字符串。
destination_buffer应为字符串。
destination_byteoffset应为整数或nil。nil表示destination_buffer的末尾。如果省略,则假定为nil。
destination_bytesize应为整数或nil。nil表示无限制。如果省略,则假定为nil。
opt应为nil、哈希或整数。nil表示无标志。如果省略,则假定为nil。
primitive_convert
将source_buffer的内容从开头转换,并将结果存储到destination_buffer中。
destination_byteoffset和destination_bytesize指定存储转换结果的区域。destination_byteoffset以字节为单位指定destination_buffer中的起始位置。如果destination_byteoffset为nil,则destination_buffer.bytesize用于附加结果。destination_bytesize指定最大字节数。如果destination_bytesize为nil,则目标大小不受限制。转换后,destination_buffer的大小调整为destination_byteoffset + 实际产生的字节数。此外,destination_buffer的编码设置为destination_encoding。
primitive_convert
删除source_buffer的已转换部分。已删除的部分在destination_buffer中转换或缓存在Encoding::Converter
对象中。
primitive_convert
在满足以下条件之一时停止转换。
-
在源缓冲区中找到无效字节序列(:invalid_byte_sequence)。
primitive_errinfo
和 last_error
方法返回错误的详细信息。
-
源缓冲区意外结束(:incomplete_input)。仅在未指定 :partial_input 时发生。
primitive_errinfo
和 last_error
方法返回错误的详细信息。
-
字符无法在输出编码中表示(:undefined_conversion)。
primitive_errinfo
和 last_error
方法返回错误的详细信息。
-
已生成部分输出,但输入尚未处理完(:after_output)。仅当指定 :after_output 时才会发生这种情况。
-
目标缓冲区已满(:destination_buffer_full)。仅当 destination_bytesize 不为 nil 时才会发生这种情况。
-
源缓冲区为空(:source_buffer_empty)。仅当指定 :partial_input 时才会发生这种情况。
-
转换已完成(:finished)
示例
ec = Encoding::Converter.new("UTF-8", "UTF-16BE") ret = ec.primitive_convert(src="pi", dst="", nil, 100) p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] ec = Encoding::Converter.new("UTF-8", "UTF-16BE") ret = ec.primitive_convert(src="pi", dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:finished, "", "i"]
/*
 * Encoding::Converter#primitive_convert(source_buffer, destination_buffer
 *     [, destination_byteoffset [, destination_bytesize [, opt]]])
 *
 * Converts bytes from +input+ into +output+ via rb_econv_convert() and
 * returns the result as a symbol (econv_result_to_symbol()).
 *
 * - input may be nil; otherwise it is coerced to a String and the
 *   consumed prefix is dropped from it after the conversion.
 * - output is coerced to a String, made modifiable, resized as needed,
 *   truncated to the bytes actually produced and, if the converter has a
 *   destination encoding, associated with it.
 * - A nil byte offset means "append at the current end of output".
 * - A nil byte size means "no caller limit": the function picks an
 *   initial size and doubles it whenever the destination fills up.
 * - Options are given either as an integer flag value or as a hash with
 *   :partial_input / :after_output keys, not both.
 *
 * Raises ArgumentError for negative or too-large offset/size values and
 * when the automatically grown size would exceed LONG_MAX.
 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
    VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
    rb_econv_t *ec = check_econv(self);
    rb_econv_result_t res;
    const unsigned char *ip, *is;
    unsigned char *op, *os;
    long output_byteoffset, output_bytesize;
    unsigned long output_byteend;
    int flags;

    /* 2 required, 3 optional positional arguments, plus an option hash. */
    argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);

    if (NIL_P(output_byteoffset_v))
        output_byteoffset = 0; /* placeholder; replaced after the retry label */
    else
        output_byteoffset = NUM2LONG(output_byteoffset_v);

    if (NIL_P(output_bytesize_v))
        output_bytesize = 0; /* placeholder; replaced below from the buffer capacity */
    else
        output_bytesize = NUM2LONG(output_bytesize_v);

    if (!NIL_P(flags_v)) {
        /* Integer flags and an option hash are mutually exclusive. */
        if (!NIL_P(opt)) {
            rb_error_arity(argc + 1, 2, 5);
        }
        flags = NUM2INT(rb_to_int(flags_v));
    }
    else if (!NIL_P(opt)) {
        VALUE v;
        flags = 0;
        v = rb_hash_aref(opt, sym_partial_input);
        if (RTEST(v))
            flags |= ECONV_PARTIAL_INPUT;
        v = rb_hash_aref(opt, sym_after_output);
        if (RTEST(v))
            flags |= ECONV_AFTER_OUTPUT;
    }
    else {
        flags = 0;
    }

    StringValue(output);
    if (!NIL_P(input))
        StringValue(input);
    rb_str_modify(output);

    /* No caller limit: start from the larger of output capacity and input length. */
    if (NIL_P(output_bytesize_v)) {
        output_bytesize = rb_str_capacity(output);

        if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
            output_bytesize = RSTRING_LEN(input);
    }

  retry:

    /* nil offset: write after whatever output currently holds. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = RSTRING_LEN(output);

    if (output_byteoffset < 0)
        rb_raise(rb_eArgError, "negative output_byteoffset");

    if (RSTRING_LEN(output) < output_byteoffset)
        rb_raise(rb_eArgError, "output_byteoffset too big");

    if (output_bytesize < 0)
        rb_raise(rb_eArgError, "negative output_bytesize");

    /* Sum in unsigned arithmetic so wrap-around can be detected. */
    output_byteend = (unsigned long)output_byteoffset +
                     (unsigned long)output_bytesize;

    if (output_byteend < (unsigned long)output_byteoffset ||
        LONG_MAX < output_byteend)
        rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");

    if (rb_str_capacity(output) < output_byteend)
        rb_str_resize(output, output_byteend);

    if (NIL_P(input)) {
        ip = is = NULL;
    }
    else {
        ip = (const unsigned char *)RSTRING_PTR(input);
        is = ip + RSTRING_LEN(input);
    }

    op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
    os = op + output_bytesize;

    res = rb_econv_convert(ec, &ip, is, &op, os, flags);
    /* op now points past the last byte written; ip past the last byte read. */
    rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
    if (!NIL_P(input)) {
        rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
    }

    if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
        if (LONG_MAX / 2 < output_bytesize)
            rb_raise(rb_eArgError, "too long conversion result");
        output_bytesize *= 2;
        /* Forget any explicit offset so the next pass appends to the output so far. */
        output_byteoffset_v = Qnil;
        goto retry;
    }

    if (ec->destination_encoding) {
        rb_enc_associate(output, ec->destination_encoding);
    }

    return econv_result_to_symbol(res);
}
primitive_errinfo
将有关最后一个错误的重要信息作为 5 元素数组返回
[result, enc1, enc2, error_bytes, readagain_bytes]
result 是 primitive_convert 的最后一个结果。
当 result 为 :invalid_byte_sequence、:incomplete_input 或 :undefined_conversion 时,其他元素才有意义。
enc1 和 enc2 将转换步骤表示为一对字符串。例如,从 EUC-JP 到 ISO-8859-1 的转换器将字符串转换如下:EUC-JP -> UTF-8 -> ISO-8859-1。因此,[enc1, enc2] 为 [“EUC-JP”, “UTF-8”] 或 [“UTF-8”, “ISO-8859-1”]。
error_bytes 和 readagain_bytes 指示导致错误的字节序列。error_bytes 是被丢弃的部分。readagain_bytes 是在下次转换时再次读取的缓冲部分。
示例
# \xff is invalid as EUC-JP. ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") ec.primitive_convert(src="\xff", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""] # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. # Since this error is occur in UTF-8 to ISO-8859-1 conversion, # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) p ec.primitive_errinfo #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] # partial character is invalid ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4", dst="", nil, 10) p ec.primitive_errinfo #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by # partial characters. ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) p ec.primitive_errinfo #=> [:source_buffer_empty, nil, nil, nil, nil] # \xd8\x00\x00@ is invalid as UTF-16BE because # no low surrogate after high surrogate (\xd8\x00). # It is detected by 3rd byte (\00) which is part of next character. # So the high surrogate (\xd8\x00) is discarded and # the 3rd byte is read again later. # Since the byte is buffered in ec, it is dropped from src. ec = Encoding::Converter.new("UTF-16BE", "UTF-8") ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] p src #=> "@" # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. # The problem is detected by 4th byte. ec = Encoding::Converter.new("UTF-16LE", "UTF-8") ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] p src #=> ""
/*
 * Encoding::Converter#primitive_errinfo
 *
 * Returns a 5-element array describing the converter's last_error record:
 *   [0] result symbol (econv_result_to_symbol())
 *   [1] source encoding name, or nil when not recorded
 *   [2] destination encoding name, or nil when not recorded
 *   [3] error bytes, or nil when no error bytes are recorded
 *   [4] read-again bytes (the bytes that follow the error bytes), or nil
 */
static VALUE
econv_primitive_errinfo(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE info = rb_ary_new2(5);

    rb_ary_store(info, 0, econv_result_to_symbol(ec->last_error.result));
    /* Storing at the last index pads the slots in between with nil. */
    rb_ary_store(info, 4, Qnil);

    if (ec->last_error.source_encoding) {
        rb_ary_store(info, 1, rb_str_new2(ec->last_error.source_encoding));
    }

    if (ec->last_error.destination_encoding) {
        rb_ary_store(info, 2, rb_str_new2(ec->last_error.destination_encoding));
    }

    if (ec->last_error.error_bytes_start) {
        const char *bytes = (const char *)ec->last_error.error_bytes_start;

        rb_ary_store(info, 3, rb_str_new(bytes, ec->last_error.error_bytes_len));
        rb_ary_store(info, 4, rb_str_new(bytes + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
    }

    return info;
}
放回将被转换的字节。
字节由 invalid_byte_sequence 错误导致。当出现 invalid_byte_sequence 错误时,一些字节会被丢弃,一些字节会被缓冲以供以后转换。后一个字节可以放回。可以通过 Encoding::InvalidByteSequenceError#readagain_bytes
和 Encoding::Converter#primitive_errinfo
观察到。
ec = Encoding::Converter.new("utf-16le", "iso-8859-1") src = "\x00\xd8\x61\x00" dst = "" p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] p ec.putback #=> "a\x00" p ec.putback #=> "" # no more bytes to put back
/*
 * Encoding::Converter#putback(max_numbytes = nil)
 *
 * Creates a string of the number of bytes to put back, lets
 * rb_econv_putback() fill it, tags it with the converter's source
 * encoding when one is set, and returns it.
 *
 * Without an argument (or with nil) the count is
 * rb_econv_putbackable(); otherwise it is the smaller of the given
 * integer and rb_econv_putbackable().
 */
static VALUE
econv_putback(int argc, VALUE *argv, VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE limit = Qnil;
    VALUE bytes;
    int count;

    if (rb_check_arity(argc, 0, 1) != 0) {
        limit = argv[0];
    }

    if (NIL_P(limit)) {
        count = rb_econv_putbackable(ec);
    }
    else {
        int available;

        count = NUM2INT(limit);
        available = rb_econv_putbackable(ec);
        if (available < count) {
            count = available;
        }
    }

    bytes = rb_str_new(NULL, count);
    rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(bytes), count);

    if (ec->source_encoding) {
        rb_enc_associate(bytes, ec->source_encoding);
    }

    return bytes;
}
返回替换字符串。
ec = Encoding::Converter.new("euc-jp", "us-ascii") p ec.replacement #=> "?" ec = Encoding::Converter.new("euc-jp", "utf-8") p ec.replacement #=> "\uFFFD"
/*
 * Encoding::Converter#replacement
 *
 * Ensures the replacement is set up via make_replacement() and returns
 * the replacement bytes as a String in the replacement encoding.
 *
 * Raises Encoding::UndefinedConversionError when make_replacement()
 * reports -1.
 */
static VALUE
econv_get_replacement(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    rb_encoding *repl_enc;

    if (make_replacement(ec) == -1) {
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    repl_enc = rb_enc_find(ec->replacement_enc);

    return rb_enc_str_new((const char *)ec->replacement_str,
                          (long)ec->replacement_len,
                          repl_enc);
}
设置替换字符串。
ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) ec.replacement = "<undef>" p ec.convert("a \u3042 b") #=> "a <undef> b"
/*
 * Encoding::Converter#replacement=(string)
 *
 * Coerces the argument to a String and passes its bytes, length and
 * encoding name to rb_econv_set_replacement().  Returns the original
 * argument.
 *
 * Raises Encoding::UndefinedConversionError when the setup reports -1.
 */
static VALUE
econv_set_replacement(VALUE self, VALUE arg)
{
    rb_econv_t *ec = check_econv(self);
    VALUE replacement = arg;
    rb_encoding *repl_enc;
    int status;

    StringValue(replacement);
    repl_enc = rb_enc_get(replacement);

    status = rb_econv_set_replacement(ec,
                                      (const unsigned char *)RSTRING_PTR(replacement),
                                      RSTRING_LEN(replacement),
                                      rb_enc_name(repl_enc));
    if (status == -1) {
        /* xxx: rb_eInvalidByteSequenceError? */
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    return arg;
}
将源编码作为 Encoding
对象返回。
/*
 * Encoding::Converter#source_encoding
 *
 * Returns the converter's source encoding, wrapped by
 * econv_get_encoding().
 */
static VALUE
econv_source_encoding(VALUE self)
{
    return econv_get_encoding(check_econv(self)->source_encoding);
}