
Commit 56640a3

Authored Mar 2, 2018
Escaped the BOMs during a read_csv. (apple#279)
* Escaped the BOMs during a read_csv.
* Fixed an issue with the BOM char comparison.
* Added pushback to the correct location.
1 parent 15ef2d8 commit 56640a3
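For readers skimming the diff below: the fix hinges on probing the first three bytes of each input stream for the UTF-8 byte-order mark (0xEF 0xBB 0xBF) and restoring those bytes when they are anything else. A minimal standalone sketch of the same idea, using std::istream in place of Turi's general_ifstream (skip_bom and the sample strings here are illustrative, not the project's code):

    #include <iostream>
    #include <sstream>
    #include <string>

    // Consume a UTF-8 BOM if the stream starts with one; otherwise restore
    // the three probed bytes so nothing is lost.
    void skip_bom(std::istream& in) {
      char b0 = in.get();
      char b1 = in.get();
      char b2 = in.get();
      bool is_bom = (b0 == (char)0xEF) && (b1 == (char)0xBB) && (b2 == (char)0xBF);
      if (!is_bom) {
        // putback() runs in reverse so the stream replays b0, b1, b2 in order
        in.putback(b2);
        in.putback(b1);
        in.putback(b0);
      }
    }

    int main() {
      std::istringstream with_bom("\xEF\xBB\xBF" "col1,col2");
      skip_bom(with_bom);
      std::string header;
      std::getline(with_bom, header);
      std::cout << header << "\n";  // prints: col1,col2
    }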

File tree: 2 files changed (+140, -103 lines)

src/sframe/parallel_csv_parser.cpp (+121 -102)
@@ -29,7 +29,23 @@ namespace turi {
 using fileio::file_status;
 
 /**
- * Code from
+ * Escape BOMs
+ */
+void skip_BOM(general_ifstream& fin) {
+  char fChar, sChar, tChar;
+  fChar = fin.get();
+  sChar = fin.get();
+  tChar = fin.get();
+  bool isBOM = ((fChar == (char)0xEF) && (sChar == (char)0xBB) && (tChar == (char)0xBF));
+  if (!isBOM) {
+    fin.putback(tChar);
+    fin.putback(sChar);
+    fin.putback(fChar);
+  }
+}
+
+/**
+ * Code from
  * http://stackoverflow.com/questions/6089231/getting-std-ifstream-to-handle-lf-cr-and-crlf
  *
  * A getline implementation which supports '\n', '\r' and '\r\n'
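One subtlety in the new skip_BOM() above, and the reason for the commit's "Added pushback to the correct location" bullet: when the three probed bytes are not a BOM, they must be pushed back in reverse order so the parser re-reads them exactly as they appeared. Exercising that path with the hypothetical skip_bom sketch from the note above:

    std::istringstream no_bom("abc,def");
    skip_bom(no_bom);            // probes 'a', 'b', 'c'; not a BOM, so puts them back
    std::string line;
    std::getline(no_bom, line);  // line == "abc,def" -- no bytes were lost

(In the hunks that follow, -/+ pairs that look identical differ only in trailing whitespace; much of this commit is whitespace cleanup.)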
@@ -70,13 +86,13 @@ std::istream& eol_safe_getline(std::istream& is, std::string& t) {
 /**
  * Reads until the eol string is encountered
  */
-std::istream& custom_eol_getline(std::istream& is, 
-                                 std::string& t, 
+std::istream& custom_eol_getline(std::istream& is,
+                                 std::string& t,
                                  const std::string& eol) {
   t.clear();
   if (eol.empty()) {
     // read the entire stream
-    t = std::string(std::istreambuf_iterator<char>(is), 
+    t = std::string(std::istreambuf_iterator<char>(is),
                     std::istreambuf_iterator<char>());
     return is;
   } else {
@@ -108,12 +124,12 @@ std::istream& custom_eol_getline(std::istream& is,
 }
 
 /**
- * if eol == "\n", this will get read a line until the next 
+ * if eol == "\n", this will get read a line until the next
  * "\n", "\r" or "\r\n" sequence.
  * Otherwise, it will read until the eol string appears
  */
-std::istream& eol_getline(std::istream& is, 
-                          std::string& t, 
+std::istream& eol_getline(std::istream& is,
+                          std::string& t,
                           const std::string& eol) {
   if (eol == "\n") {
     return eol_safe_getline(is, t);
@@ -138,11 +154,11 @@ class parallel_csv_parser {
   * the type of each column. Generally this should the same
   * number of columns as in the CSV. If only a subset if
   * (say 3 out of 4) columns are to be stored, this should
-  * contain the types of the output columns, and 
+  * contain the types of the output columns, and
   * column_output_order is used to map the CSV columns to
-  * the output columns. 
+  * the output columns.
   * \param tokenizer The tokenizer rules to use
-  * \param continue_on_failure Whether to keep going even when an error is 
+  * \param continue_on_failure Whether to keep going even when an error is
   *                            encountered.
   * \param store_errors Whether to store bad lines in a separate SArray
   * \param row_limit Maximum number of rows to read
@@ -151,30 +167,30 @@ class parallel_csv_parser {
   *                            if output_order[i] == -1, the column is ignored.
   *                            If output_order is empty (default), this is equivalent
   *                            to the having output_order[i] == i.
-  * \param num_threads Amount of parallelism to use. 
+  * \param num_threads Amount of parallelism to use.
   */
-  parallel_csv_parser(std::vector<flex_type_enum> column_types, 
+  parallel_csv_parser(std::vector<flex_type_enum> column_types,
                       csv_line_tokenizer tokenizer,
                       bool continue_on_failure,
                       bool store_errors,
                       size_t row_limit,
                       std::vector<size_t> column_output_order = std::vector<size_t>(),
                       size_t num_threads = thread_pool::get_instance().size()):
-      nthreads(std::max<size_t>(num_threads, 2) - 1), 
-      parsed_buffer(nthreads), parsed_buffer_last_elem(nthreads), 
-      writing_buffer(nthreads), writing_buffer_last_elem(nthreads), 
+      nthreads(std::max<size_t>(num_threads, 2) - 1),
+      parsed_buffer(nthreads), parsed_buffer_last_elem(nthreads),
+      writing_buffer(nthreads), writing_buffer_last_elem(nthreads),
       error_buffer(nthreads), writing_error_buffer(nthreads),
-      thread_local_tokenizer(nthreads, tokenizer), 
-      read_group(thread_pool::get_instance()), 
+      thread_local_tokenizer(nthreads, tokenizer),
+      read_group(thread_pool::get_instance()),
       write_group(thread_pool::get_instance()), column_types(column_types),
       column_output_order(column_output_order),
-      row_limit(row_limit), 
-      continue_on_failure(continue_on_failure), 
+      row_limit(row_limit),
+      continue_on_failure(continue_on_failure),
       store_errors(store_errors),
       line_terminator(tokenizer.line_terminator),
       is_regular_line_terminator(line_terminator == "\n") {
   };
- 
+
   /**
    * Sets the total size of all inputs. Required if multiple output segments
    * are desired. Otherwise all outputs will go to segment 0.
@@ -185,16 +201,16 @@ class parallel_csv_parser {
   /**
    * Parses an input file into an output frame
    */
-  void parse(general_ifstream& fin, 
-             sframe& output_frame, 
+  void parse(general_ifstream& fin,
+             sframe& output_frame,
              sarray<flexible_type>& errors) {
     size_t num_output_segments = output_frame.num_segments();
     size_t current_input_file_size = fin.file_size();
     try {
       timer ti;
       bool fill_buffer_is_good = true;
-      while(fin.good() && fill_buffer_is_good && 
-            (row_limit == 0 || lines_read.value < row_limit)) { 
+      while(fin.good() && fill_buffer_is_good &&
+            (row_limit == 0 || lines_read.value < row_limit)) {
         fill_buffer_is_good = fill_buffer(fin);
         if (buffer.size() == 0) break;
 
@@ -234,10 +250,10 @@ class parallel_csv_parser {
       if (total_input_file_sizes > 0) {
         // compute the current output segment
         // It really is simply.
-        // current_output_segment = 
-        //     (fin.get_bytes_read() + cumulative_file_read_sizes) 
+        // current_output_segment =
+        //     (fin.get_bytes_read() + cumulative_file_read_sizes)
         //      * num_output_segments / total_input_file_sizes;
-        // But a lot of sanity checking is required because 
+        // But a lot of sanity checking is required because
         //  - fin.get_bytes_read() may fail.
         //  - files on disk may change after I last computed the file sizes, so
         //    there is no guarantee that cumulatively, they will all add up.
@@ -252,7 +268,7 @@ class parallel_csv_parser {
           read_pos += cumulative_file_read_sizes;
         }
         next_output_segment = read_pos * num_output_segments / total_input_file_sizes;
-        // sanity boundary check 
+        // sanity boundary check
         if (next_output_segment >= num_output_segments) next_output_segment = num_output_segments - 1;
         // we never go back
         current_output_segment = std::max(current_output_segment, next_output_segment);
@@ -275,9 +291,9 @@ class parallel_csv_parser {
 
       cumulative_file_read_sizes += current_input_file_size;
     } catch (...) {
-      try { read_group.join(); } catch (...) { } 
-      try { write_group.join(); } catch (...) { } 
-      // even on a failure, we still increment the cumulative read count 
+      try { read_group.join(); } catch (...) { }
+      try { write_group.join(); } catch (...) { }
+      // even on a failure, we still increment the cumulative read count
       cumulative_file_read_sizes += current_input_file_size;
       throw;
     }
@@ -292,7 +308,7 @@ class parallel_csv_parser {
 
   /**
    * Returns the number of CSV lines read
-   */ 
+   */
   size_t num_lines_read() const {
     return lines_read.value;
   }
@@ -304,7 +320,7 @@ class parallel_csv_parser {
     if (column_output_order.empty()) return column_types.size();
     else return column_output_order.size();
   }
-  
+
   /**
    * Returns the number of output columns in the CSV file
    */
@@ -372,7 +388,7 @@ class parallel_csv_parser {
 
   inline bool is_end_line_str(char* c, char* cend) const {
     if (is_regular_line_terminator) return (*c) == '\n' || (*c) == '\r';
-    else if (line_terminator.empty() == false && 
+    else if (line_terminator.empty() == false &&
              cend - c >= (int)(line_terminator.length())) {
       for (char nl : line_terminator) {
         if (nl != (*c)) return false;
@@ -400,7 +416,7 @@ class parallel_csv_parser {
         newline_was_matched = true;
         return c + 1;
       } else if ((*c) == '\r') {
-        // its a \r. It could be just a \r, or a \r\n. 
+        // its a \r. It could be just a \r, or a \r\n.
         // check for \r\n
         if (c + 1 < cend && (*(c+1)) == '\n') {
           // its a \r\n, advance past and return
@@ -429,7 +445,7 @@ class parallel_csv_parser {
         }
         ++c;
       }
-    } 
+    }
     newline_was_matched = false;
     return cend;
   }
@@ -448,13 +464,13 @@ class parallel_csv_parser {
         local_tokens[i].reset(column_types[i]);
       }
     }
-    const std::vector<size_t>* ptr_to_output_order = 
+    const std::vector<size_t>* ptr_to_output_order =
         column_output_order.empty() ? nullptr : &column_output_order;
 
-    size_t num_tokens_parsed = 
+    size_t num_tokens_parsed =
         thread_local_tokenizer[threadid].
         tokenize_line(pstart, pnext - pstart,
-                      local_tokens, 
+                      local_tokens,
                       true /*permit undefined*/,
                       ptr_to_output_order);
 
@@ -473,17 +489,17 @@ class parallel_csv_parser {
       if (num_failures.value < 10) {
         std::string badline = std::string(pstart, pnext - pstart);
         if (badline.length() > 256) badline=badline.substr(0, 256) + "...";
-        logprogress_stream << std::string("Unable to parse line \"") + 
+        logprogress_stream << std::string("Unable to parse line \"") +
             badline + "\"" << std::endl;
       }
       ++num_failures;
     } else {
-      log_and_throw(std::string("Unable to parse line \"") + 
+      log_and_throw(std::string("Unable to parse line \"") +
                     std::string(pstart, pnext - pstart) + "\"\n" +
                     "Set error_bad_lines=False to skip bad lines");
     }
-    } 
-  } 
+    }
+  }
   }
   /**
    * Performs the parse on a section of the buffer (threadid in nthreads)
@@ -500,7 +516,7 @@ class parallel_csv_parser {
     if (threadid == nthreads - 1) pend = bufend;
 
     // ok, this is important. Pay attention.
-    // We are sweeping from 
+    // We are sweeping from
     //  - the first line which begins AFTER pstart, but before pend
     //  - And we are finishing on the last line which ends AFTER pend.
     //  (if we are the last thread, something special happens and
@@ -515,10 +531,10 @@ class parallel_csv_parser {
     // hello, world abcd
     // 1, 2 abcd
     // 3, 4 abcd
-    // 
+    //
     // Then whichever range includes a "d" handles the line after that.
     //
-    // This is a little subtle when the line_terminator may be multiple 
+    // This is a little subtle when the line_terminator may be multiple
     // characters.
     //
 
@@ -531,34 +547,34 @@ class parallel_csv_parser {
     bool start_position_found = (threadid == 0);
     if (threadid > 0) {
       // find the first line beginning after pstart but before pend
-      
-      // if we have a multicharacter line terminator, we have to be a bit 
-      // intelligent. to match the "last character" of the terminator, 
-      // we need to shift the newline search backwards by 
+
+      // if we have a multicharacter line terminator, we have to be a bit
+      // intelligent. to match the "last character" of the terminator,
+      // we need to shift the newline search backwards by
       // line_terminator.length() - 1 characters
-      if (!is_regular_line_terminator && 
+      if (!is_regular_line_terminator &&
           line_terminator.length() > 1 &&
           // make sure there is enough room to shift backwards
-          pstart - bufstart >= int(line_terminator.length() - 1)) { 
+          pstart - bufstart >= int(line_terminator.length() - 1)) {
         pstart -= line_terminator.length() - 1;
       }
       bool newline_was_matched;
       pstart = advance_past_newline(pstart, pend, newline_was_matched);
       if (newline_was_matched) {
         start_position_found = true;
       }
-    } 
+    }
     if (start_position_found) {
       /**************************************************************************/
       /*                                                                        */
       /*                         Find the End Position                          */
       /*                                                                        */
       /**************************************************************************/
       // find the end position
-      if (!is_regular_line_terminator && 
+      if (!is_regular_line_terminator &&
          line_terminator.length() > 1 &&
          // make sure there is enough room to shift backwards
-         pend - bufstart >= int(line_terminator.length() - 0)) { 
+         pend - bufstart >= int(line_terminator.length() - 0)) {
         pend -= line_terminator.length() - 1;
       }
       bool newline_was_matched_unused;
@@ -572,7 +588,7 @@ class parallel_csv_parser {
     char* pnext = pstart;
 
     // the rule that every line must end with a terminator is wrong when
-    // the line terminator is empty. some special handling is needed for this 
+    // the line terminator is empty. some special handling is needed for this
     // case.
     if (line_terminator.empty()) {
       parse_line(pstart, pend, threadid);
@@ -596,17 +612,17 @@ class parallel_csv_parser {
   }
 
   /**
-   * Adds a line terminator to the buffer if it does not already 
-   * end with a line terminator. Used by the buffer reading routines on 
+   * Adds a line terminator to the buffer if it does not already
+   * end with a line terminator. Used by the buffer reading routines on
    * EOF so that the parser is always guaranteed that every line
-   * ends with a line terminator, even the last line. 
+   * ends with a line terminator, even the last line.
    */
   void add_line_terminator_to_buffer() {
-    if (is_regular_line_terminator && 
-        buffer[buffer.length() - 1] != '\n' && 
+    if (is_regular_line_terminator &&
+        buffer[buffer.length() - 1] != '\n' &&
         buffer[buffer.length() - 1] != '\r') {
       buffer.push_back('\n');
-    } else if (!is_regular_line_terminator && 
+    } else if (!is_regular_line_terminator &&
                buffer.length() >= line_terminator.length() &&
                buffer.substr(buffer.length() - line_terminator.length()) != line_terminator) {
       buffer += line_terminator;
@@ -649,7 +665,7 @@ class parallel_csv_parser {
     // parse buffer in parallel
     mutex last_parsed_token_lock;
     char* last_parsed_token = &(buffer[0]);
-    
+
     for (size_t threadid = 0; threadid < nthreads; ++threadid) {
       read_group.launch(
           [=,&last_parsed_token_lock,&last_parsed_token](void) {
@@ -668,13 +684,13 @@ class parallel_csv_parser {
   }
 
   /**
-   * Spins up a background thread to write parse results from parallel_parse 
-   * to the output frame. First the parsed_buffer is swapped into the 
+   * Spins up a background thread to write parse results from parallel_parse
+   * to the output frame. First the parsed_buffer is swapped into the
    * writing_buffer, thus permitting the parsed_buffer to be used again in
    * a different thread.
    */
-  void start_background_write(sframe& output_frame, 
-                              sarray<flexible_type>& errors_array, 
+  void start_background_write(sframe& output_frame,
+                              sarray<flexible_type>& errors_array,
                               size_t output_segment) {
     // switch the parse buffer with the write buffer
     writing_buffer.swap(parsed_buffer);
@@ -688,17 +704,17 @@ class parallel_csv_parser {
     write_group.launch([&, output_segment] {
       auto iter = output_frame.get_output_iterator(output_segment);
       for (size_t i = 0; i < writing_buffer.size(); ++i) {
-        std::copy(writing_buffer[i].begin(), 
+        std::copy(writing_buffer[i].begin(),
                   writing_buffer[i].begin() + writing_buffer_last_elem[i], iter);
         lines_read.inc(writing_buffer_last_elem[i]);
       }
-      if (store_errors) { 
+      if (store_errors) {
         auto errors_iter = errors_array.get_output_iterator(0);
         for (auto& chunk_errors : writing_error_buffer) {
           std::copy(chunk_errors.begin(), chunk_errors.end(), errors_iter);
          chunk_errors.clear();
        }
-      } 
+      }
       background_thread_running = false;
     });
   }
@@ -718,12 +734,12 @@ class parallel_csv_parser {
  *
  * e.g.
  * {"A", "A", "A.1"} --> {"A", "A.2", "A.1"}
- * 
+ *
  * \param column_names The set of column names to be renamed. The vector
  *                     will be modified in place.
  */
 void make_unique_column_names(std::vector<std::string>& column_names) {
-  // this is the set of column names to the left of the column we 
+  // this is the set of column names to the left of the column we
   // are current inspected. i.e. these column names are already validated to
   // be correct.
   log_func_entry();
@@ -738,7 +754,7 @@ void make_unique_column_names(std::vector<std::string>& column_names) {
       // already exists.
       std::set<std::string> all_column_names(column_names.begin(),
                                              column_names.end());
-      // start incrementing at A.1, A.2, etc. 
+      // start incrementing at A.1, A.2, etc.
       size_t number = 1;
       std::string new_column_name;
       while(1) {
@@ -783,6 +799,7 @@ void read_csv_header(csv_info& info,
   if (!probe_fin.good()) {
     log_and_throw("Fail reading " + sanitize_url(path));
   }
+  skip_BOM(probe_fin);
 
   // skip skip_rows lines
   std::string skip_string;
@@ -794,8 +811,8 @@ void read_csv_header(csv_info& info,
   while (first_line_tokens.size() == 0 && probe_fin.good()) {
     eol_getline(probe_fin, first_line, tokenizer.line_terminator);
     boost::algorithm::trim(first_line);
-    tokenizer.tokenize_line(&(first_line[0]), 
-                            first_line.length(), 
+    tokenizer.tokenize_line(&(first_line[0]),
+                            first_line.length(),
                             first_line_tokens);
   }
 
@@ -829,7 +846,7 @@
 /*  - column_types.size() == column_names.size() == ncols                 */
 /*                                                                        */
 /**************************************************************************/
-void get_column_types(csv_info& info, 
+void get_column_types(csv_info& info,
                       std::map<std::string, flex_type_enum> column_type_hints) {
   info.column_types.resize(info.ncols, flex_type_enum::STRING);
 
@@ -838,7 +855,7 @@ void get_column_types(csv_info& info,
   } else if (column_type_hints.count("__X0__")) {
     if (column_type_hints.size() != info.column_types.size()) {
       std::stringstream warning_msg;
-      warning_msg << "column_type_hints has different size from actual number of columns: " 
+      warning_msg << "column_type_hints has different size from actual number of columns: "
                   << "column_type_hints.size()=" << column_type_hints.size()
                   << ";number of columns=" << info.ncols
                   << std::endl;
@@ -874,18 +891,19 @@ void get_column_types(csv_info& info,
 
 } // anonymous namespace
 
+
 /**
  * Parsed a CSV file to an SFrame.
 *
 * \param path The file to open as a csv
- * \param tokenizer The tokenizer configuration to use. This should be 
+ * \param tokenizer The tokenizer configuration to use. This should be
 *                  filled with all the tokenization rules (like what
- *                  separator character to use, what quoting character to use, 
+ *                  separator character to use, what quoting character to use,
 *                  etc.)
 * \param writer The sframe writer to use.
- * \param frame_sidx_file Where to write the frame to 
+ * \param frame_sidx_file Where to write the frame to
 * \param parallel_csv_parser A parallel_csv_parser
- * \param errors A reference to a map in which to store an sarray of bad lines 
+ * \param errors A reference to a map in which to store an sarray of bad lines
 *               for each input file.
 */
 void parse_csv_to_sframe(
@@ -907,14 +925,15 @@ void parse_csv_to_sframe(
 {
   general_ifstream fin(path);
   if (!fin.good()) log_and_throw("Cannot open " + sanitize_url(path));
+  skip_BOM(fin);
 
   // skip skip_rows lines
   std::string skip_string;
   for (size_t i = 0;i < skip_rows; ++i) {
     eol_getline(fin, skip_string, tokenizer.line_terminator);
   }
 
-  // if use_header, we keep throwing away empty or comment lines until we 
+  // if use_header, we keep throwing away empty or comment lines until we
   // get one good line
   if (use_header) {
     std::vector<std::string> first_line_tokens;
@@ -924,16 +943,16 @@ void parse_csv_to_sframe(
       eol_getline(fin, line, tokenizer.line_terminator);
       tokenizer.tokenize_line(&(line[0]), line.length(), first_line_tokens);
     }
-    // if we are going to store errors, we don't do early skippng on 
+    // if we are going to store errors, we don't do early skippng on
     // mismatched files
-    if (!store_errors && 
+    if (!store_errors &&
         first_line_tokens.size() != parser.num_input_columns()) {
       logprogress_stream << "Unexpected number of columns found in " << path
                          << ". Skipping this file." << std::endl;
       return;
     }
   }
-  
+
   // store errors for this particular file in an sarray
   auto file_errors = std::make_shared<sarray<flexible_type>>();
   if (store_errors) {
@@ -950,8 +969,8 @@ void parse_csv_to_sframe(
   }
 
   if (continue_on_failure && parser.num_lines_failed() > 0) {
-    logprogress_stream << parser.num_lines_failed() 
-                       << " lines failed to parse correctly" 
+    logprogress_stream << parser.num_lines_failed()
+                       << " lines failed to parse correctly"
                        << std::endl;
   }
 
@@ -980,14 +999,14 @@ std::map<std::string, std::shared_ptr<sarray<flexible_type>>> parse_csvs_to_sframe(
   auto output_columns = options.output_columns;
   auto row_limit = options.row_limit;
   auto skip_rows = options.skip_rows;
-  
+
   if (store_errors) continue_on_failure = true;
-  // otherwise, check that url is valid directory, and get its listing if no 
+  // otherwise, check that url is valid directory, and get its listing if no
   // pattern present
   std::vector<std::string> files;
   bool found_zero_byte_files = false;
   std::vector<std::pair<std::string, file_status>> file_and_status = fileio::get_glob_files(url);
-  
+
   for (auto p : file_and_status) {
     if (p.second == file_status::REGULAR_FILE) {
       // throw away empty files
@@ -1007,7 +1026,7 @@ std::map<std::string, std::shared_ptr<sarray<flexible_type>>> parse_csvs_to_sframe(
                            << std::endl;
       }
 
-      logstream(LOG_INFO) << "Adding CSV file " 
+      logstream(LOG_INFO) << "Adding CSV file "
                           << sanitize_url(p.first)
                           << " to list of files to parse"
                           << std::endl;
@@ -1016,7 +1035,7 @@ std::map<std::string, std::shared_ptr<sarray<flexible_type>>> parse_csvs_to_sframe(
   }
 
   file_and_status.clear(); // don't need these anymore
-  
+
   // ensure that we actually found some valid files
   if (files.empty()) {
     if (found_zero_byte_files) {
@@ -1056,11 +1075,11 @@ std::map<std::string, std::shared_ptr<sarray<flexible_type>>> parse_csvs_to_sframe(
     for (size_t i = 0;i < output_columns.size(); ++i) {
       const auto& outcol = output_columns[i];
       auto iter = std::find(info.column_names.begin(),
-                            info.column_names.end(), 
+                            info.column_names.end(),
                             outcol);
       // Cannot find this column in the talble?
       // is output_columns a positional type? i.e. "X" something
-      if (iter == info.column_names.end() && 
+      if (iter == info.column_names.end() &&
           outcol.length() > 1 && outcol[i] == 'X') {
         size_t colnumber = stoull(outcol.substr(1));
         // column number is 1 based
@@ -1097,8 +1116,8 @@ std::map<std::string, std::shared_ptr<sarray<flexible_type>>> parse_csvs_to_sframe(
   if (!frame.is_opened_for_write()) {
     // open as many segments as there are temp directories.
     // But at least one segment
-    frame.open_for_write(info.column_names, info.column_types, 
-                         frame_sidx_file, 
+    frame.open_for_write(info.column_names, info.column_types,
+                         frame_sidx_file,
                          std::max<size_t>(1, num_temp_directories()));
   }
 
@@ -1109,19 +1128,19 @@ std::map<std::string, std::shared_ptr<sarray<flexible_type>>> parse_csvs_to_sframe(
   parser.start_timer();
 
   for (auto file : files) {
-    // check that we've read < row_limit 
-    if (parser.num_lines_read() < row_limit || row_limit == 0) { 
-      parse_csv_to_sframe(file, tokenizer, options, frame, 
+    // check that we've read < row_limit
+    if (parser.num_lines_read() < row_limit || row_limit == 0) {
+      parse_csv_to_sframe(file, tokenizer, options, frame,
                           frame_sidx_file, parser, errors);
     } else break;
   }
-  
+
   logprogress_stream << "Parsing completed. Parsed " << parser.num_lines_read()
                      << " lines in " << parser.get_time_elapsed() << " secs." << std::endl;
 
-  
+
   if (frame.is_opened_for_write()) frame.close();
-  
+
   return errors;
 }
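Taken together, the C++ changes invoke skip_BOM() at both places a fresh input stream is opened: once on the probe stream in read_csv_header(), and once per file in parse_csv_to_sframe(), since each open yields a new stream positioned at byte 0. The call pattern, condensed from the two hunks above:

    general_ifstream fin(path);
    if (!fin.good()) log_and_throw("Cannot open " + sanitize_url(path));
    skip_BOM(fin);  // drop a leading UTF-8 BOM, if any, before skipping rows or reading the header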

src/unity/python/turicreate/test/test_sframe.py (+19 -1)
@@ -196,6 +196,25 @@ def test_creation_from_dataframe(self):
         sf = SFrame(data=original_p)
         self.__test_equal(sf, original_p)
 
+    def test_auto_parse_csv_with_bom(self):
+        with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile:
+            df = pd.DataFrame({'float_data': self.float_data,
+                               'int_data': self.int_data,
+                               'string_data': self.a_to_z[:len(self.int_data)]})
+            df.to_csv(csvfile, index=False)
+            csvfile.close()
+
+            import codecs
+            with open(csvfile.name) as f:
+                content = f.read()
+            with open(csvfile.name, 'w') as f:
+                f.write(codecs.BOM_UTF8)
+                f.write(content)
+
+            sf = SFrame.read_csv(csvfile.name, header=True)
+            self.assertEqual(sf.dtype, [float, int, str])
+            self.__test_equal(sf, df)
+
     def test_auto_parse_csv(self):
         with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile:
             df = pd.DataFrame({'float_data': self.float_data,
@@ -209,7 +228,6 @@ def test_auto_parse_csv(self):
             self.assertEqual(sf.dtype, [float, int, str])
             self.__test_equal(sf, df)
 
-
     def test_parse_csv(self):
         with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile:
             self.dataframe.to_csv(csvfile, index=False)
