fixed encoding issues in js-gherkin

See cucumber/gherkin#225
anay · Aug 28, 2013 · 9116a95
1 parent 1551c60
commit 9116a95
Show file tree

Hide file tree

Showing 2 changed files with 174 additions and 14 deletions.
diff --git a/js/test/test.de.js b/js/test/test.de.js
@@ -0,0 +1,84 @@
+var assert = require("assert");
+var Lexer = require('../lib/gherkin').Lexer('de');
+
+var Recorder=function(){ // Builds a listener object whose handlers append one plain record per lexer callback.
+  records=[]; // NOTE(review): assigned without var, so in non-strict code this is an implicit global; handlers of earlier recorders push into the newest array.
+  return {
+    comment: function(value, line) { // Every handler pushes {token:<handler name>, ...its own arguments}.
+      records.push({token:'comment',value:value,line:line});
+    },
+    tag: function(value, line) {
+      records.push({token:'tag',value:value,line:line});
+    },
+    feature: function(keyword, name, description, line) { // feature, background, scenario, scenario_outline and examples all take these four arguments.
+      records.push({token:'feature',keyword:keyword,name:name,description:description,line:line});
+    },
+    background: function(keyword, name, description, line) {
+      records.push({token:'background',keyword:keyword,name:name,description:description,line:line});
+    },
+    scenario: function(keyword, name, description, line) {
+      records.push({token:'scenario',keyword:keyword,name:name,description:description,line:line});
+    },
+    scenario_outline: function(keyword, name, description, line) {
+      records.push({token:'scenario_outline',keyword:keyword,name:name,description:description,line:line});
+    },
+    examples: function(keyword, name, description, line) {
+      records.push({token:'examples',keyword:keyword,name:name,description:description,line:line});
+    },
+    step: function(keyword, name, line) { // Steps carry no description.
+      records.push({token:'step',keyword:keyword,name:name,line:line});
+    },
+    doc_string: function(content_type, string, line) {
+      records.push({token:'doc_string',content_type:content_type,string:string,line:line});
+    },
+    row: function(row, line) {
+      records.push({token:'row',row:row,line:line});
+    },
+    eof: function() { // The eof record holds only the token name.
+      records.push({token:'eof'});
+    },
+    records:records // The array as bound when this object was created; the tests read it to inspect what was recorded.
+  };
+};
+
+
+describe("Lexer",function(){ // Scans the same feature line with the 'de' lexer twice: as a JS string and as UTF-8 bytes.
+  describe('#scan with String',function(){
+    it("should accept keywords that include non-ascii characters",function(){
+      var recorder=Recorder();
+      var lexer=new Lexer(recorder);
+      assert.doesNotThrow(function(){
+        lexer.scan("Funktionalität: Jede Menge €€€s!"); // No trailing newline here, unlike the byte data in the Buffer test.
+      });
+      assert.equal(recorder.records.length,2); // Only records[0] is inspected; the second record is presumably eof — TODO confirm.
+      assert.equal(recorder.records[0].token,"feature");
+      assert.equal(recorder.records[0].keyword,"Funktionalität");
+      assert.equal(recorder.records[0].name,"Jede Menge €€€s!");
+    });
+  });
+
+  describe('#scan with Buffer',function(){
+    it("should accept keywords that include non-ascii characters",function(){
+      var recorder=Recorder();
+      var lexer=new Lexer(recorder);
+      assert.doesNotThrow(function(){
+        // Same text as in the String test, UTF-8 encoded, followed by one 0x0a newline octet.
+	var utf8_data = [
+          0x46, 0x75, 0x6e, 0x6b, 0x74, 0x69, 0x6f, 0x6e,  
+          0x61, 0x6c, 0x69, 0x74, 0xc3, 0xa4, 0x74, 0x3a,
+          0x20, 0x4a, 0x65, 0x64, 0x65, 0x20, 0x4d, 0x65,  
+          0x6e, 0x67, 0x65, 0x20, 0xe2, 0x82, 0xac, 0xe2,
+          0x82, 0xac, 0xe2, 0x82, 0xac, 0x73, 0x21, 0x0a
+        ]; 
+
+        lexer.scan(new Buffer(utf8_data)); // NOTE(review): new Buffer(array) is deprecated in current Node.js; Buffer.from(array) is the documented replacement.
+      });
+      assert.equal(recorder.records.length,2); // Same expectations as the String test: decoded keyword and name must match.
+      assert.equal(recorder.records[0].token,"feature");
+      assert.equal(recorder.records[0].keyword,"Funktionalität");
+      assert.equal(recorder.records[0].name,"Jede Menge €€€s!");
+    });
+  });
+
+
+});
diff --git a/ragel/lexer.js.rl.erb b/ragel/lexer.js.rl.erb
@@ -128,6 +128,9 @@
 %% write data;
 %% access this.;
 %% variable data data;
+%% getkey signedCharValue(data[p]);
+// getkey: the generated scanner reads each input element as signedCharValue(data[p]); that helper is a local
+// of Lexer.prototype.scan and maps values above 127 to v-256 (presumably to match signed-char table keys — confirm).
 
 var Lexer = function(listener) {
   // Check that listener has the required functions
@@ -157,7 +160,7 @@ Lexer.prototype.scan = function(data) {
 
   this.line_number = 1;
   this.last_newline = 0;
-
+  var signedCharValue=function(v){return v > 127 ? v-256 : v; };
   %% write init;
   %% write exec;
 };
@@ -166,21 +169,93 @@ Lexer.prototype.bytesToString = function(bytes) {
   if(typeof bytes.write == 'function') {
     // Node.js
     return bytes.toString('utf-8');
-  } else {
-    var result = "";
-    for(var b in bytes) {
-      result += String.fromCharCode(bytes[b]);
-    }
-    return result;
-  }
+  } 
+//console.log("DEBUG: using mirabilos");
+
+	var result = "";
+	var i = 0;
+	var wc;
+	var c;
+
+	while (i < bytes.length) {
+		/* parse as UTF-8 lead byte */
+		wc = bytes[i++];
+		if (wc < 0x80) {
+			count = 0;
+		} else if (wc < 0xC2 || wc >= 0xF8) {
+			throw "input is not a valid UTF-8 lead octet";
+		} else if (wc < 0xE0) {
+			count = 1;
+			wc = (wc & 0x1F) << 6;
+		} else if (wc < 0xF0) {
+			count = 2;
+			wc = (wc & 0x0F) << 12;
+		} else /* wc < 0xF8 */ {
+			count = 3;
+			wc = (wc & 0x07) << 18;
+		}
+
+		/* parse trail bytes, if any */
+		while (count) {
+			if (!(i < bytes.length)) {
+				throw "short read";
+			}
+			if ((c = bytes[i++] ^ 0x80) > 0x3F) {
+				throw "input is not a valid UTF-8 trail octet";
+			}
+			wc |= c << (6 * --count);
+			if (wc < (1 << (5 * count + 6))) {
+				throw "invalid non-minimal encoded input";
+			}
+		}
+
+		/* handle conversion to UTF-16 if needed */
+		if (wc > 0xFFFF) {
+			wc -= 0x10000;
+			result += String.fromCharCode(0xD800 + (wc >> 10));
+			wc = 0xDC00 + (wc & 0x3FF);
+		}
+		result += String.fromCharCode(wc);
+	}
+
+	return result;
+
 };
 
 Lexer.prototype.stringToBytes = function(string) {
-  var bytes = [];
-  for(var i = 0; i < string.length; i++) {
-    bytes[i] = string.charCodeAt(i);
-  }
-  return bytes;
+	var bytes = []; // Result: the UTF-8 octets of `string`, one number per array element.
+	var i = 0; // Index of the next UTF-16 code unit to read from `string`.
+	var j = 0; // Index of the next free slot in `bytes`.
+	var wc;
+	while (i < string.length) {
+		wc = string.charCodeAt(i++);
+		if (wc >= 0xD800 && wc <= 0xDBFF &&
+		    i < string.length &&
+		    string.charCodeAt(i) >= 0xDC00 &&
+		    string.charCodeAt(i) <= 0xDFFF) {
+			/* high surrogate followed by a low surrogate: combine both into one code point */
+			wc = 0x10000 + ((wc & 0x3FF) << 10) +
+			    (string.charCodeAt(i++) & 0x3FF);
+		}
+		if (wc < 0x80) { // Write the lead octet; count = number of trail octets still to emit.
+			bytes[j++] = wc;
+			count = 0; // NOTE(review): count is never declared in this function, so it is an implicit global.
+		} else if (wc < 0x800) {
+			bytes[j++] = 0xC0 | (wc >> 6);
+			count = 1;
+		} else if (wc < 0x10000) { // Also taken by an unpaired surrogate, which is encoded as-is in three octets.
+			bytes[j++] = 0xE0 | (wc >> 12);
+			count = 2;
+		} else {
+			/* SMP: 21-bit Unicode */
+			bytes[j++] = 0xF0 | (wc >> 18);
+			count = 3;
+		}
+		while (count) { // Trail octets: 0x80 plus six bits each, most significant group first.
+			bytes[j++] = 0x80 | ((wc >> (6 * --count)) & 0x3F);
+		}
+	}
+	return bytes;
 };
 
 Lexer.prototype.unindent = function(startcol, text) {
@@ -202,7 +277,8 @@ Lexer.prototype.store_keyword_content = function(event, data, p, eof) {
 };
 
 Lexer.prototype.current_line_content = function(data, p) {
-  var rest = data.slice(this.last_newline, -1);
+  var rest = Array.prototype.slice.call(data,this.last_newline, -1); // Array slice via call(), so `rest` is a plain array for any array-like `data`; the -1 end index leaves out the final element.
+  // NOTE(review): in the next statement `indexOf(10) || -1` turns index 0 (newline is the first element) into -1, the same value as "not found".
   var end = rest.indexOf(10) || -1;
   return this.bytesToString(rest.slice(0, end)).trim();
 };