Skip to content

Commit

Permalink
actually encode to UTF-8 when input charset is non UTF-8 + basic specs
Browse files Browse the repository at this point in the history
  • Loading branch information
colinsurprenant committed Mar 8, 2014
1 parent fa27dc9 commit 5e01c52
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 24 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ default:
@echo " tarball -- builds the tarball package"
@echo " tarball-test -- runs the test suite against the tarball package"

TESTS=$(wildcard spec/inputs/file.rb spec/inputs/gelf.rb spec/inputs/imap.rb spec/support/*.rb spec/filters/*.rb spec/examples/*.rb spec/codecs/*.rb spec/conditionals/*.rb spec/event.rb spec/jar.rb)
TESTS=$(wildcard spec/inputs/file.rb spec/inputs/gelf.rb spec/inputs/imap.rb spec/util/*.rb spec/support/*.rb spec/filters/*.rb spec/examples/*.rb spec/codecs/*.rb spec/conditionals/*.rb spec/event.rb spec/jar.rb)

# The 'version' is generated based on the logstash version, git revision, etc.
.VERSION.mk: REVISION=$(shell git rev-parse --short HEAD | tr -d ' ')
Expand Down Expand Up @@ -105,7 +105,7 @@ clean:

.PHONY: vendor-clean
vendor-clean:
-$(QUIET)rm -rf vendor/kibana vendor/geoip vendor/collectd
-$(QUIET)rm -rf vendor/kibana vendor/geoip vendor/collectd
-$(QUIET)rm -rf vendor/jar vendor/ua-parser

.PHONY: clean-vendor
Expand Down
39 changes: 17 additions & 22 deletions lib/logstash/util/charset.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,31 @@

class LogStash::Util::Charset
attr_accessor :logger

def initialize(charset)
@charset = charset
end

def convert(data)
data.force_encoding(@charset)
if @charset == "UTF-8"
# Some users don't know the charset of their logs or just don't know they
# can set the charset setting.
if !data.valid_encoding?
@logger.warn("Received an event that has a different character encoding than you configured.", :text => data.inspect[1..-2], :expected_charset => @charset)
#if @force_lossy_charset_conversion
## Janky hack to force ruby to re-encode UTF-8 with replacement chars.
#data.force_encoding("CP65001")
#data = data.encode("UTF-8", :invalid => :replace, :undef => :replace)
#else
#end

# A silly hack to help convert some of the unknown bytes to
# somewhat-readable escape codes. The [1..-2] is to trim the quotes
# ruby puts on the value.
data = data.inspect[1..-2]
else
# The user has declared the character encoding of this data is
# something other than UTF-8. Let's convert it (as cleanly as possible)
# into UTF-8 so we can use it with JSON, etc.
data = data.encode("UTF-8", :invalid => :replace, :undef => :replace)
# NON UTF-8 charset declared.
# Let's convert it (as cleanly as possible) into UTF-8 so we can use it with JSON, etc.
return data.encode("UTF-8", :invalid => :replace, :undef => :replace) unless @charset == "UTF-8"

# UTF-8 charset declared.
# Some users don't know the charset of their logs or just don't know they
# can set the charset setting.
unless data.valid_encoding?
# A silly hack to help convert some of the unknown bytes to
# somewhat-readable escape codes. The [1..-2] is to trim the quotes
# ruby puts on the value.
return data.inspect[1..-2].tap do |escaped|
@logger.warn("Received an event that has a different character encoding than you configured.", :text => escaped, :expected_charset => @charset)
end
end
return data

data
end # def convert
end # class LogStash::Util::Charset

end # class LogStash::Util::Charset
74 changes: 74 additions & 0 deletions spec/util/charset_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# encoding: utf-8

require "test_utils"
require "logstash/util/charset"

describe LogStash::Util::Charset do
let(:logger) { double("logger") }

context "with valid UTF-8 source encoding" do
subject {LogStash::Util::Charset.new("UTF-8")}

it "should return untouched data" do
["foobar", "κόσμε"].each do |data|
insist { data.encoding.name } == "UTF-8"
insist { subject.convert(data) } == data
insist { subject.convert(data).encoding.name } == "UTF-8"
end
end
end

context "with invalid UTF-8 source encoding" do
subject do
LogStash::Util::Charset.new("UTF-8").tap do |charset|
charset.logger = logger
end
end

it "should escape invalid sequences" do
["foo \xED\xB9\x81\xC3", "bar \xAD"].each do |data|
insist { data.encoding.name } == "UTF-8"
insist { data.valid_encoding? } == false
logger.should_receive(:warn).twice
insist { subject.convert(data) } == data.inspect[1..-2]
insist { subject.convert(data).encoding.name } == "UTF-8"
end
end

end

context "with valid non UTF-8 source encoding" do
subject {LogStash::Util::Charset.new("ISO-8859-1")}

it "should encode to UTF-8" do
samples = [
["foobar", "foobar"],
["\xE0 Montr\xE9al", "à Montréal"],
]
samples.map{|(a, b)| [a.force_encoding("ISO-8859-1"), b]}.each do |(a, b)|
insist { a.encoding.name } == "ISO-8859-1"
insist { b.encoding.name } == "UTF-8"
insist { a.valid_encoding? } == true
insist { subject.convert(a).encoding.name } == "UTF-8"
insist { subject.convert(a) } == b
end
end
end

context "with invalid non UTF-8 source encoding" do
subject {LogStash::Util::Charset.new("ASCII-8BIT")}

it "should encode to UTF-8 and replace invalid chars" do
samples = [
["\xE0 Montr\xE9al", "� Montr�al"],
["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "����������"],
]
samples.map{|(a, b)| [a.force_encoding("ASCII-8BIT"), b]}.each do |(a, b)|
insist { a.encoding.name } == "ASCII-8BIT"
insist { b.encoding.name } == "UTF-8"
insist { subject.convert(a).encoding.name } == "UTF-8"
insist { subject.convert(a) } == b
end
end
end
end

0 comments on commit 5e01c52

Please sign in to comment.