actually encode to UTF-8 when input charset is non UTF-8 + basic specs

ttyard · Mar 8, 2014 · 5e01c52 · 5e01c52
1 parent fa27dc9
commit 5e01c52
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 24 deletions.
diff --git a/Makefile b/Makefile
@@ -43,7 +43,7 @@ default:
 	@echo "  tarball -- builds the tarball package"
 	@echo "  tarball-test -- runs the test suite against the tarball package"
 
-TESTS=$(wildcard spec/inputs/file.rb spec/inputs/gelf.rb spec/inputs/imap.rb spec/support/*.rb spec/filters/*.rb spec/examples/*.rb spec/codecs/*.rb spec/conditionals/*.rb spec/event.rb spec/jar.rb)
+TESTS=$(wildcard spec/inputs/file.rb spec/inputs/gelf.rb spec/inputs/imap.rb spec/util/*.rb spec/support/*.rb spec/filters/*.rb spec/examples/*.rb spec/codecs/*.rb spec/conditionals/*.rb spec/event.rb spec/jar.rb)
 
 # The 'version' is generated based on the logstash version, git revision, etc.
 .VERSION.mk: REVISION=$(shell git rev-parse --short HEAD | tr -d ' ')
@@ -105,7 +105,7 @@ clean:
 
 .PHONY: vendor-clean
 vendor-clean:
-	-$(QUIET)rm -rf vendor/kibana vendor/geoip vendor/collectd 
+	-$(QUIET)rm -rf vendor/kibana vendor/geoip vendor/collectd
 	-$(QUIET)rm -rf vendor/jar vendor/ua-parser
 
 .PHONY: clean-vendor

diff --git a/lib/logstash/util/charset.rb b/lib/logstash/util/charset.rb
@@ -4,36 +4,31 @@
 
 class LogStash::Util::Charset
   attr_accessor :logger
+
   def initialize(charset)
     @charset = charset
   end
 
   def convert(data)
     data.force_encoding(@charset)
-    if @charset == "UTF-8"
-      # Some users don't know the charset of their logs or just don't know they
-      # can set the charset setting.
-      if !data.valid_encoding?
-        @logger.warn("Received an event that has a different character encoding than you configured.", :text => data.inspect[1..-2], :expected_charset => @charset)
-        #if @force_lossy_charset_conversion
-          ## Janky hack to force ruby to re-encode UTF-8 with replacement chars.
-          #data.force_encoding("CP65001")
-          #data = data.encode("UTF-8", :invalid => :replace, :undef => :replace)
-        #else
-        #end
 
-        # A silly hack to help convert some of the unknown bytes to
-        # somewhat-readable escape codes. The [1..-2] is to trim the quotes
-        # ruby puts on the value.
-        data = data.inspect[1..-2]
-      else
-        # The user has declared the character encoding of this data is
-        # something other than UTF-8. Let's convert it (as cleanly as possible)
-        # into UTF-8 so we can use it with JSON, etc.
-        data = data.encode("UTF-8", :invalid => :replace, :undef => :replace)
+    # NON UTF-8 charset declared.
+    # Let's convert it (as cleanly as possible) into UTF-8 so we can use it with JSON, etc.
+    return data.encode("UTF-8", :invalid => :replace, :undef => :replace) unless @charset == "UTF-8"
+
+    # UTF-8 charset declared.
+    # Some users don't know the charset of their logs or just don't know they
+    # can set the charset setting.
+    unless data.valid_encoding?
+      # A silly hack to help convert some of the unknown bytes to
+      # somewhat-readable escape codes. The [1..-2] is to trim the quotes
+      # ruby puts on the value.
+      return data.inspect[1..-2].tap do |escaped|
+        @logger.warn("Received an event that has a different character encoding than you configured.", :text => escaped, :expected_charset => @charset)
       end
     end
-    return data
+
+    data
   end # def convert
-end # class LogStash::Util::Charset
 
+end # class LogStash::Util::Charset
diff --git a/spec/util/charset_spec.rb b/spec/util/charset_spec.rb
@@ -0,0 +1,74 @@
+# encoding: utf-8
+
+require "test_utils"
+require "logstash/util/charset"
+
+describe LogStash::Util::Charset do
+  let(:logger) { double("logger") }
+
+  context "with valid UTF-8 source encoding" do
+    subject {LogStash::Util::Charset.new("UTF-8")}
+
+    it "should return untouched data" do
+      ["foobar", "κόσμε"].each do |data|
+        insist { data.encoding.name } == "UTF-8"
+        insist { subject.convert(data) } == data
+        insist { subject.convert(data).encoding.name } == "UTF-8"
+      end
+    end
+  end
+
+  context "with invalid UTF-8 source encoding" do
+    subject do
+      LogStash::Util::Charset.new("UTF-8").tap do |charset|
+        charset.logger = logger
+      end
+    end
+
+    it "should escape invalid sequences" do
+      ["foo \xED\xB9\x81\xC3", "bar \xAD"].each do |data|
+        insist { data.encoding.name } == "UTF-8"
+        insist { data.valid_encoding? } == false
+        logger.should_receive(:warn).twice
+        insist { subject.convert(data) } == data.inspect[1..-2]
+        insist { subject.convert(data).encoding.name } == "UTF-8"
+      end
+    end
+
+  end
+
+  context "with valid non UTF-8 source encoding" do
+    subject {LogStash::Util::Charset.new("ISO-8859-1")}
+
+    it "should encode to UTF-8" do
+      samples = [
+        ["foobar", "foobar"],
+        ["\xE0 Montr\xE9al", "à Montréal"],
+      ]
+      samples.map{|(a, b)| [a.force_encoding("ISO-8859-1"), b]}.each do |(a, b)|
+        insist { a.encoding.name } == "ISO-8859-1"
+        insist { b.encoding.name } == "UTF-8"
+        insist { a.valid_encoding? } == true
+        insist { subject.convert(a).encoding.name } == "UTF-8"
+        insist { subject.convert(a) } == b
+      end
+    end
+  end
+
+  context "with invalid non UTF-8 source encoding" do
+    subject {LogStash::Util::Charset.new("ASCII-8BIT")}
+
+    it "should encode to UTF-8 and replace invalid chars" do
+      samples = [
+        ["\xE0 Montr\xE9al", "� Montr�al"],
+        ["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "����������"],
+      ]
+      samples.map{|(a, b)| [a.force_encoding("ASCII-8BIT"), b]}.each do |(a, b)|
+        insist { a.encoding.name } == "ASCII-8BIT"
+        insist { b.encoding.name } == "UTF-8"
+        insist { subject.convert(a).encoding.name } == "UTF-8"
+        insist { subject.convert(a) } == b
+      end
+    end
+  end
+end