Skip to content

Commit

Permalink
Pay attention to single_quote flag during CSV guessing
Browse files Browse the repository at this point in the history
Guess col splits & counts either looking at or ignoring single-quotes
based on the setup flag.
  • Loading branch information
cliffclick committed Nov 27, 2013
1 parent c0a44fb commit cfbd584
Showing 1 changed file with 17 additions and 16 deletions.
33 changes: 17 additions & 16 deletions src/main/java/water/parser/CsvParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -508,13 +508,13 @@ public CsvParser clone(){
/** Dermines the number of separators in given line. Correctly handles quoted
* tokens.
*/
private static int[] determineSeparatorCounts(String from) {
private static int[] determineSeparatorCounts(String from, int single_quote) {
int[] result = new int[separators.length];
byte[] bits = from.getBytes();
boolean in_quote = false;
for( int j=0; j< bits.length; j++ ) {
byte c = bits[j];
if( (c == CHAR_SINGLE_QUOTE) || (c == CHAR_DOUBLE_QUOTE) )
if( (c == single_quote) || (c == CHAR_DOUBLE_QUOTE) )
in_quote ^= true;
if( !in_quote || c == HIVE_SEP )
for( int i = 0; i < separators.length; ++i)
Expand All @@ -527,7 +527,7 @@ private static int[] determineSeparatorCounts(String from) {
/** Determines the tokens that are inside a line and returns them as strings
* in an array. Assumes the given separator.
*/
private static String[] determineTokens(String from, byte separator) {
private static String[] determineTokens(String from, byte separator, int single_quote) {
ArrayList<String> tokens = new ArrayList();
byte[] bits = from.getBytes();
int offset = 0;
Expand All @@ -537,7 +537,7 @@ private static String[] determineTokens(String from, byte separator) {
if(offset == bits.length)break;
StringBuilder t = new StringBuilder();
byte c = bits[offset];
if ((c == '"') || (c == '\'')) {
if ((c == CHAR_DOUBLE_QUOTE) || (c == single_quote)) {
quotes = c;
++offset;
}
Expand Down Expand Up @@ -588,18 +588,18 @@ private static boolean hasHeader(String[] l1, String[] l2) {
return allStrings(l1) && !allStrings(l2);
}

private static byte guessSeparator(String l1, String l2){
int[] s1 = determineSeparatorCounts(l1);
int[] s2 = determineSeparatorCounts(l2);
private static byte guessSeparator(String l1, String l2, int single_quote){
int[] s1 = determineSeparatorCounts(l1, single_quote);
int[] s2 = determineSeparatorCounts(l2, single_quote);
// now we have the counts - if both lines have the same number of separators
// the we assume it is the separator. Separators are ordered by their
// likelyhoods. If no separators have same counts, space will be used as the
// default one
for (int i = 0; i < s1.length; ++i)
if (((s1[i] == s2[i]) && (s1[i] != 0)) || (i == separators.length-1)) {
try {
String[] t1 = determineTokens(l1, separators[i]);
String[] t2 = determineTokens(l2, separators[i]);
String[] t1 = determineTokens(l1, separators[i], single_quote);
String[] t2 = determineTokens(l2, separators[i], single_quote);
if (t1.length == 0 || t1.length != t2.length)
continue;
return separators[i];
Expand Down Expand Up @@ -658,6 +658,7 @@ public static CustomParser.PSetupGuess guessSetup(byte[] bits, ParserSetup setup
if(lines.isEmpty())
return new PSetupGuess(new ParserSetup(ParserType.AUTO,CsvParser.AUTO_SEP,0,false,null,setup._singleQuotes),0,0,null,null);
boolean hasHeader = false;
final int single_quote = setup._singleQuotes ? CHAR_SINGLE_QUOTE : -1;
byte sep = setup._separator;
final String [][] data = new String[lines.size()][];
int ncols;
Expand All @@ -674,20 +675,20 @@ else if(lines.get(0).split(" ").length > 2)
}
}
if(lines.size() == 1)
data[0] = determineTokens(lines.get(0), sep);
data[0] = determineTokens(lines.get(0), sep, single_quote);
ncols = (setup._ncols > 0)?setup._ncols:data[0].length;
hasHeader = (checkHeader && allStrings(data[0])) || setup._header;
} else {
if(setup._separator == AUTO_SEP){ // first guess the separator{
sep = guessSeparator(lines.get(0), lines.get(1));
if(setup._separator == AUTO_SEP){ // first guess the separator
sep = guessSeparator(lines.get(0), lines.get(1), single_quote);
if(sep == AUTO_SEP && lines.size() > 2){
if(sep == AUTO_SEP)sep = guessSeparator(lines.get(1), lines.get(2));
if(sep == AUTO_SEP)sep = guessSeparator(lines.get(0), lines.get(2));
if(sep == AUTO_SEP)sep = guessSeparator(lines.get(1), lines.get(2), single_quote);
if(sep == AUTO_SEP)sep = guessSeparator(lines.get(0), lines.get(2), single_quote);
}
if(sep == AUTO_SEP)sep = (byte)' ';
}
for(int i = 0; i < lines.size(); ++i)
data[i] = determineTokens(lines.get(i), sep);
data[i] = determineTokens(lines.get(i), sep, single_quote);
// we do not have enough lines to decide
ncols = (setup._ncols > 0)?setup._ncols:guessNcols(setup,data);
if(checkHeader){
Expand All @@ -708,7 +709,7 @@ else if(lines.get(0).split(" ").length > 2)
int ilines = 0;
for(int i = 0; i < data.length; ++i){
if(data[i].length != resSetup._ncols){
errors.add("error at line " + i + " : incompatoble line length. Got " + data[i].length + " columns.");
errors.add("error at line " + i + " : incompatible line length. Got " + data[i].length + " columns.");
++ilines;
}
}
Expand Down

0 comments on commit cfbd584

Please sign in to comment.