Skip to content

Commit

Permalink
fixed encoding issues in js-gherkin
Browse files Browse the repository at this point in the history
  • Loading branch information
ldegen committed Aug 28, 2013
1 parent 1551c60 commit 9116a95
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 14 deletions.
84 changes: 84 additions & 0 deletions js/test/test.de.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
var assert = require("assert");
var Lexer = require('../lib/gherkin').Lexer('de');

/**
 * Creates a listener object that records every lexer callback it receives.
 *
 * Each callback appends one plain object to a private array; the object
 * carries a `token` field naming the callback plus the callback's arguments.
 * The array is exposed as the `records` property of the returned listener.
 *
 * @returns {Object} a listener exposing the callbacks comment, tag, feature,
 *   background, scenario, scenario_outline, examples, step, doc_string, row
 *   and eof, and the `records` array they write to.
 */
var Recorder = function() {
  // Declared locally so every recorder owns its array. Without `var` this
  // was an implicit global shared (and overwritten) by all recorders, and a
  // ReferenceError in strict mode.
  var records = [];

  return {
    comment: function(value, line) {
      records.push({token: 'comment', value: value, line: line});
    },
    tag: function(value, line) {
      records.push({token: 'tag', value: value, line: line});
    },
    feature: function(keyword, name, description, line) {
      records.push({token: 'feature', keyword: keyword, name: name, description: description, line: line});
    },
    background: function(keyword, name, description, line) {
      records.push({token: 'background', keyword: keyword, name: name, description: description, line: line});
    },
    scenario: function(keyword, name, description, line) {
      records.push({token: 'scenario', keyword: keyword, name: name, description: description, line: line});
    },
    scenario_outline: function(keyword, name, description, line) {
      records.push({token: 'scenario_outline', keyword: keyword, name: name, description: description, line: line});
    },
    examples: function(keyword, name, description, line) {
      records.push({token: 'examples', keyword: keyword, name: name, description: description, line: line});
    },
    step: function(keyword, name, line) {
      records.push({token: 'step', keyword: keyword, name: name, line: line});
    },
    doc_string: function(content_type, string, line) {
      records.push({token: 'doc_string', content_type: content_type, string: string, line: line});
    },
    row: function(row, line) {
      records.push({token: 'row', row: row, line: line});
    },
    eof: function() {
      records.push({token: 'eof'});
    },
    records: records
  };
};


describe("Lexer", function() {
  /**
   * Builds a fresh Recorder/Lexer pair, lets `feed` push input into the
   * lexer (which must not throw), then checks that exactly two records were
   * produced and that the first one is the German feature line.
   */
  var expectGermanFeature = function(feed) {
    var recorder = Recorder();
    var lexer = new Lexer(recorder);

    assert.doesNotThrow(function() {
      feed(lexer);
    });

    assert.equal(recorder.records.length, 2);
    var firstRecord = recorder.records[0];
    assert.equal(firstRecord.token, "feature");
    assert.equal(firstRecord.keyword, "Funktionalität");
    assert.equal(firstRecord.name, "Jede Menge €€€s!");
  };

  describe('#scan with String', function() {
    it("should accept keywords that include non-ascii characters", function() {
      expectGermanFeature(function(lexer) {
        lexer.scan("Funktionalität: Jede Menge €€€s!");
      });
    });
  });

  describe('#scan with Buffer', function() {
    it("should accept keywords that include non-ascii characters", function() {
      expectGermanFeature(function(lexer) {
        // UTF-8 octets of the feature line used in the String test,
        // followed by a trailing line feed (0x0a).
        var utf8_data = [
          0x46, 0x75, 0x6e, 0x6b, 0x74, 0x69, 0x6f, 0x6e,
          0x61, 0x6c, 0x69, 0x74, 0xc3, 0xa4, 0x74, 0x3a,
          0x20, 0x4a, 0x65, 0x64, 0x65, 0x20, 0x4d, 0x65,
          0x6e, 0x67, 0x65, 0x20, 0xe2, 0x82, 0xac, 0xe2,
          0x82, 0xac, 0xe2, 0x82, 0xac, 0x73, 0x21, 0x0a
        ];

        lexer.scan(new Buffer(utf8_data));
      });
    });
  });
});
104 changes: 90 additions & 14 deletions ragel/lexer.js.rl.erb
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@
%% write data;
%% access this.;
%% variable data data;
%% getkey signedCharValue(data[p]);



var Lexer = function(listener) {
// Check that listener has the required functions
Expand Down Expand Up @@ -157,7 +160,7 @@ Lexer.prototype.scan = function(data) {

this.line_number = 1;
this.last_newline = 0;

// Reinterprets an octet value as a signed 8-bit value: anything above 127
// is shifted down by 256, everything else is returned unchanged.
var signedCharValue = function(octet) {
  if (octet > 127) {
    return octet - 256;
  }
  return octet;
};
%% write init;
%% write exec;
};
Expand All @@ -166,21 +169,93 @@ Lexer.prototype.bytesToString = function(bytes) {
if(typeof bytes.write == 'function') {
// Node.js
return bytes.toString('utf-8');
} else {
var result = "";
for(var b in bytes) {
result += String.fromCharCode(bytes[b]);
}
return result;
}
}
// Hand-written UTF-8 decoder: walks `bytes` one code point at a time and
// appends the corresponding UTF-16 code unit(s) to `result`.
// NOTE(review): presumably this is the path taken when `bytes` is not a
// Node.js Buffer (the `typeof bytes.write` check precedes it) - confirm.
// NOTE(review): `count` is assigned below but not declared with `var` in the
// visible lines, so it would be an implicit global - confirm and declare it.
// NOTE(review): failures are thrown as bare strings, not Error objects, so
// callers get no stack trace.
//console.log("DEBUG: using mirabilos");

var result = "";
var i = 0;
var wc;
var c;

while (i < bytes.length) {
/* parse as UTF-8 lead byte */
wc = bytes[i++];
if (wc < 0x80) {
// single-octet (ASCII) sequence: no trail bytes follow
count = 0;
} else if (wc < 0xC2 || wc >= 0xF8) {
// values 0x80..0xC1 and 0xF8..0xFF are rejected as lead octets
throw "input is not a valid UTF-8 lead octet";
} else if (wc < 0xE0) {
// two-octet sequence: keep the low 5 payload bits of the lead octet
count = 1;
wc = (wc & 0x1F) << 6;
} else if (wc < 0xF0) {
// three-octet sequence: keep the low 4 payload bits of the lead octet
count = 2;
wc = (wc & 0x0F) << 12;
} else /* wc < 0xF8 */ {
// four-octet sequence: keep the low 3 payload bits of the lead octet
count = 3;
wc = (wc & 0x07) << 18;
}

/* parse trail bytes, if any */
while (count) {
if (!(i < bytes.length)) {
// input ended in the middle of a multi-octet sequence
throw "short read";
}
// XOR with 0x80 maps a trail octet 0x80..0xBF onto its 6 payload bits
// (0x00..0x3F); any other octet yields a value above 0x3F.
if ((c = bytes[i++] ^ 0x80) > 0x3F) {
throw "input is not a valid UTF-8 trail octet";
}
// `count` is decremented first, so the payload lands at bit 6 * (remaining trail octets)
wc |= c << (6 * --count);
// rejects values below 1 << (5 * count + 6) for the remaining `count`
if (wc < (1 << (5 * count + 6))) {
throw "invalid non-minimal encoded input";
}
}

/* handle conversion to UTF-16 if needed */
if (wc > 0xFFFF) {
// above the BMP: emit the high surrogate here, leave the low surrogate in wc
wc -= 0x10000;
result += String.fromCharCode(0xD800 + (wc >> 10));
wc = 0xDC00 + (wc & 0x3FF);
}
result += String.fromCharCode(wc);
}

return result;

};

/**
 * Encodes a JavaScript string (UTF-16 code units) as an array of UTF-8 octets.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary-plane code point and written as four octets. A surrogate
 * that is not part of such a pair is written as a three-octet sequence, like
 * any other code unit in the 0x800..0xFFFF range.
 *
 * @param {string} string text to encode.
 * @returns {number[]} the octets (each 0..255), empty for an empty string.
 */
Lexer.prototype.stringToBytes = function(string) {
  var bytes = [];
  var i = 0;
  var j = 0;
  var wc;
  // Number of trail octets still to emit for the current code point.
  // Declared here so it does not leak as an implicit global.
  var count;

  while (i < string.length) {
    wc = string.charCodeAt(i++);
    if (wc >= 0xD800 && wc <= 0xDBFF &&
        i < string.length &&
        string.charCodeAt(i) >= 0xDC00 &&
        string.charCodeAt(i) <= 0xDFFF) {
      /* decode UTF-16 */
      wc = 0x10000 + ((wc & 0x3FF) << 10) +
          (string.charCodeAt(i++) & 0x3FF);
    }

    // Emit the lead octet and note how many trail octets follow.
    if (wc < 0x80) {
      bytes[j++] = wc;
      count = 0;
    } else if (wc < 0x800) {
      bytes[j++] = 0xC0 | (wc >> 6);
      count = 1;
    } else if (wc < 0x10000) {
      bytes[j++] = 0xE0 | (wc >> 12);
      count = 2;
    } else {
      /* SMP: 21-bit Unicode */
      bytes[j++] = 0xF0 | (wc >> 18);
      count = 3;
    }

    // Each trail octet carries 6 payload bits, most significant group first.
    while (count) {
      bytes[j++] = 0x80 | ((wc >> (6 * --count)) & 0x3F);
    }
  }
  return bytes;
};

Lexer.prototype.unindent = function(startcol, text) {
Expand All @@ -202,7 +277,8 @@ Lexer.prototype.store_keyword_content = function(event, data, p, eof) {
};

/**
 * Returns the trimmed text of the line that starts at `this.last_newline`.
 *
 * The line runs from `this.last_newline` up to, but not including, the next
 * line feed (octet 10), or to the end of `data` when no line feed follows.
 * The octets are decoded with `this.bytesToString`.
 *
 * @param {Array|Buffer} data array-like sequence of octets being scanned.
 * @param {number} p current scan position; unused here, kept for callers.
 * @returns {string} the decoded line with surrounding whitespace removed.
 */
Lexer.prototype.current_line_content = function(data, p) {
  // Array.prototype.slice copies any array-like (plain array or Buffer) into
  // a real array. No end index: an end of -1 would drop the last octet.
  var rest = Array.prototype.slice.call(data, this.last_newline);

  // indexOf reports "not found" as -1 and a line feed at the very start as
  // 0, so neither case can be handled with `|| -1`.
  var end = rest.indexOf(10);
  if (end === -1) {
    end = rest.length;
  }
  return this.bytesToString(rest.slice(0, end)).trim();
};
Expand Down

0 comments on commit 9116a95

Please sign in to comment.