From 6d5ef97a32f74917efaa53049815873c4ae00aa1 Mon Sep 17 00:00:00 2001 From: duerst Date: Thu, 21 Feb 2008 08:42:10 +0000 Subject: [PATCH] Thu Feb 21 17:15:15 2008 Martin Duerst * transcode.c: Added basic support for passing options to String#encode via a hash. Currently only one option, with one value, is supported: invalid: :ignore (dropping invalid byte sequences instead of producing an error). Option naming is not yet stable! * test/ruby/test_transcode.rb: Added a single test for the invalid: :ignore option. No further tests yet, because most data does not yet distinguish between INVALID and UNDEF. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15565 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 11 +++++++++++ test/ruby/test_transcode.rb | 7 +++++-- transcode.c | 33 +++++++++++++++++++++++++++++---- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4f6b92c619c536..e9ebc1e74edb9b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +Thu Feb 21 17:15:15 2008 Martin Duerst + + * transcode.c: Added basic support for passing options to String#encode + via a hash. Currently only one option, with one value, is supported: + invalid: :ignore (dropping invalid byte sequences instead of + producing an error). Option naming is not yet stable! + + * test/ruby/test_transcode.rb: Added a single test for the invalid: :ignore + option. No further tests yet, because most data does not yet distinguish + between INVALID and UNDEF. + Thu Feb 21 16:35:26 2008 Nobuyoshi Nakada * array.c (rb_ary_unshift_m): expands enough for argc. 
[ruby-dev:33880] diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 957d852c19ff19..9edf30882be83e 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -19,8 +19,6 @@ def setup # trick to create all the necessary encodings end def test_errors - # we don't have semantics for conversion without attribute yet - # maybe 'convert to UTF-8' would be nice :-) assert_raise(ArgumentError) { 'abc'.encode } assert_raise(ArgumentError) { 'abc'.encode! } assert_raise(ArgumentError) { 'abc'.encode('foo', 'bar') } @@ -241,4 +239,9 @@ def test_utf_32 check_utf_32_both_ways("\u{8FF00}", "\x00\x08\xFF\x00") check_utf_32_both_ways("\u{F00FF}", "\x00\x0F\x00\xFF") end + + def test_invalid_ignore + # arguments only + 'abc'.encode('utf-8', invalid: :ignore) + end end diff --git a/transcode.c b/transcode.c index 4173df9dc17337..ed01374f5b8067 100644 --- a/transcode.c +++ b/transcode.c @@ -15,6 +15,9 @@ #include "transcode_data.h" #include +static VALUE sym_invalid, sym_ignore; +#define INVALID_IGNORE 0x1 + /* * Dispatch data and logic */ @@ -132,7 +135,8 @@ static void transcode_loop(unsigned char **in_pos, unsigned char **out_pos, unsigned char *in_stop, unsigned char *out_stop, const rb_transcoder *my_transcoder, - rb_transcoding *my_transcoding) + rb_transcoding *my_transcoding, + const int opt) { unsigned char *in_p = *in_pos, *out_p = *out_pos; const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start; @@ -211,14 +215,17 @@ transcode_loop(unsigned char **in_pos, unsigned char **out_pos, case INVALID: goto invalid; case UNDEF: - /* todo: add code for alternative behaviors */ + /* todo: add code for alternate behaviors */ rb_raise(rb_eRuntimeError /*@@@change exception*/, "conversion undefined for byte sequence (maybe invalid byte sequence)"); continue; } continue; invalid: /* deal with invalid byte sequence */ - /* todo: add code for alternative behaviors */ + /* todo: add more alternative behaviors */ + if 
(opt&INVALID_IGNORE) { + continue; + } rb_raise(rb_eRuntimeError /*change exception*/, "invalid byte sequence"); continue; } @@ -254,7 +261,22 @@ str_transcode(int argc, VALUE *argv, VALUE *self) const rb_transcoder *my_transcoder; rb_transcoding my_transcoding; int final_encoding = 0; + VALUE opt; + int options = 0; + + opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); + if (!NIL_P(opt)) { + VALUE v; + argc--; + v = rb_hash_aref(opt, sym_invalid); + if (NIL_P(v)) { + rb_raise(rb_eArgError, "unknown value for invalid: setting"); + } + else if (v==sym_ignore) { + options |= INVALID_IGNORE; + } + } if (argc < 1 || argc > 2) { rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); } @@ -325,7 +347,7 @@ str_transcode(int argc, VALUE *argv, VALUE *self) my_transcoding.ruby_string_dest = dest; my_transcoding.flush_func = str_transcoding_resize; - transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding); + transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding, options); if (fromp != sp+slen) { rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp); } @@ -426,6 +448,9 @@ Init_transcode(void) transcoder_lib_table = st_init_strcasetable(); init_transcoder_table(); + sym_invalid = ID2SYM(rb_intern("invalid")); + sym_ignore = ID2SYM(rb_intern("ignore")); + rb_define_method(rb_cString, "encode", rb_str_transcode, -1); rb_define_method(rb_cString, "encode!", rb_str_transcode_bang, -1); }