Skip to content

Commit 1418f97

Browse files
committed
File: Re-add support to skip CR (\r) in File::get_as_text
This was removed in godotengine#63481, and we confirmed that it's better like this, but we add back the possibility to strip CR as an option, to optionally restore the previous behavior. For performance this is done directly in `String::parse_utf8`. Also fixes Android `FileAccess::get_line()` as this one _should_ strip CR. Supersedes godotengine#63717.
1 parent 14828c3 commit 1418f97

15 files changed

+74
-13
lines changed

.gitattributes

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ thirdparty/* linguist-vendored
77
* text=auto eol=lf
88
# Except for bat files, which are Windows only files
99
*.bat eol=crlf
10+
# And some test files where the EOL matters
11+
*.test.txt -text
1012

1113
# The above only works properly for Git 2.10+, so for older versions
1214
# we need to manually list the binary files we don't want modified.

core/core_bind.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -1227,13 +1227,13 @@ Vector<uint8_t> File::get_buffer(int64_t p_length) const {
12271227
return data;
12281228
}
12291229

1230-
String File::get_as_text() const {
1230+
String File::get_as_text(bool p_skip_cr) const {
12311231
ERR_FAIL_COND_V_MSG(f.is_null(), String(), "File must be opened before use, or is lacking read-write permission.");
12321232

12331233
uint64_t original_pos = f->get_position();
12341234
const_cast<FileAccess *>(*f)->seek(0);
12351235

1236-
String text = f->get_as_utf8_string();
1236+
String text = f->get_as_utf8_string(p_skip_cr);
12371237

12381238
const_cast<FileAccess *>(*f)->seek(original_pos);
12391239

@@ -1430,7 +1430,7 @@ void File::_bind_methods() {
14301430
ClassDB::bind_method(D_METHOD("get_buffer", "length"), &File::get_buffer);
14311431
ClassDB::bind_method(D_METHOD("get_line"), &File::get_line);
14321432
ClassDB::bind_method(D_METHOD("get_csv_line", "delim"), &File::get_csv_line, DEFVAL(","));
1433-
ClassDB::bind_method(D_METHOD("get_as_text"), &File::get_as_text);
1433+
ClassDB::bind_method(D_METHOD("get_as_text", "skip_cr"), &File::get_as_text, DEFVAL(false));
14341434
ClassDB::bind_method(D_METHOD("get_md5", "path"), &File::get_md5);
14351435
ClassDB::bind_method(D_METHOD("get_sha256", "path"), &File::get_sha256);
14361436
ClassDB::bind_method(D_METHOD("is_big_endian"), &File::is_big_endian);

core/core_bind.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ class File : public RefCounted {
411411
Vector<uint8_t> get_buffer(int64_t p_length) const; // Get an array of bytes.
412412
String get_line() const;
413413
Vector<String> get_csv_line(const String &p_delim = ",") const;
414-
String get_as_text() const;
414+
String get_as_text(bool p_skip_cr = false) const;
415415
String get_md5(const String &p_path) const;
416416
String get_sha256(const String &p_path) const;
417417

core/io/file_access.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ uint64_t FileAccess::get_buffer(uint8_t *p_dst, uint64_t p_length) const {
377377
return i;
378378
}
379379

380-
String FileAccess::get_as_utf8_string() const {
380+
String FileAccess::get_as_utf8_string(bool p_skip_cr) const {
381381
Vector<uint8_t> sourcef;
382382
uint64_t len = get_length();
383383
sourcef.resize(len + 1);
@@ -388,7 +388,7 @@ String FileAccess::get_as_utf8_string() const {
388388
w[len] = 0;
389389

390390
String s;
391-
s.parse_utf8((const char *)w);
391+
s.parse_utf8((const char *)w, -1, p_skip_cr);
392392
return s;
393393
}
394394

core/io/file_access.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ class FileAccess : public RefCounted {
113113
virtual String get_line() const;
114114
virtual String get_token() const;
115115
virtual Vector<String> get_csv_line(const String &p_delim = ",") const;
116-
virtual String get_as_utf8_string() const;
116+
virtual String get_as_utf8_string(bool p_skip_cr = false) const;
117117

118118
/**
119119
* Use this for files WRITTEN in _big_ endian machines (ie, amiga/mac)

core/string/ustring.cpp

+9-1
Original file line numberDiff line numberDiff line change
@@ -1656,7 +1656,7 @@ String String::utf8(const char *p_utf8, int p_len) {
16561656
return ret;
16571657
}
16581658

1659-
Error String::parse_utf8(const char *p_utf8, int p_len) {
1659+
Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
16601660
if (!p_utf8) {
16611661
return ERR_INVALID_DATA;
16621662
}
@@ -1689,6 +1689,10 @@ Error String::parse_utf8(const char *p_utf8, int p_len) {
16891689
uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
16901690

16911691
if (skip == 0) {
1692+
if (p_skip_cr && c == '\r') {
1693+
ptrtmp++;
1694+
continue;
1695+
}
16921696
/* Determine the number of characters in sequence */
16931697
if ((c & 0x80) == 0) {
16941698
skip = 0;
@@ -1753,6 +1757,10 @@ Error String::parse_utf8(const char *p_utf8, int p_len) {
17531757
uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8);
17541758

17551759
if (skip == 0) {
1760+
if (p_skip_cr && c == '\r') {
1761+
p_utf8++;
1762+
continue;
1763+
}
17561764
/* Determine the number of characters in sequence */
17571765
if ((c & 0x80) == 0) {
17581766
*(dst++) = c;

core/string/ustring.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ class String {
377377

378378
CharString ascii(bool p_allow_extended = false) const;
379379
CharString utf8() const;
380-
Error parse_utf8(const char *p_utf8, int p_len = -1);
380+
Error parse_utf8(const char *p_utf8, int p_len = -1, bool p_skip_cr = false);
381381
static String utf8(const char *p_utf8, int p_len = -1);
382382

383383
Char16String utf16() const;

doc/classes/File.xml

+3-2
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,10 @@
115115
</method>
116116
<method name="get_as_text" qualifiers="const">
117117
<return type="String" />
118+
<argument index="0" name="skip_cr" type="bool" default="false" />
118119
<description>
119-
Returns the whole file as a [String].
120-
Text is interpreted as being UTF-8 encoded.
120+
Returns the whole file as a [String]. Text is interpreted as being UTF-8 encoded.
121+
If [code]skip_cr[/code] is [code]true[/code], carriage return characters ([code]\r[/code], CR) will be ignored when parsing the UTF-8, so that only line feed characters ([code]\n[/code], LF) represent a new line (Unix convention).
121122
</description>
122123
</method>
123124
<method name="get_buffer" qualifiers="const">

misc/scripts/file_format.sh

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ while IFS= read -rd '' f; do
3737
continue
3838
elif [[ "$f" == *"-so_wrap."* ]]; then
3939
continue
40+
elif [[ "$f" == *".test.txt" ]]; then
41+
continue
4042
fi
4143
# Ensure that files are UTF-8 formatted.
4244
recode UTF-8 "$f" 2> /dev/null

platform/android/file_access_filesystem_jandroid.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,11 @@
2929
/*************************************************************************/
3030

3131
#include "file_access_filesystem_jandroid.h"
32+
3233
#include "core/os/os.h"
3334
#include "core/templates/local_vector.h"
3435
#include "thread_jandroid.h"
36+
3537
#include <unistd.h>
3638

3739
jobject FileAccessFilesystemJAndroid::file_access_handler = nullptr;
@@ -198,15 +200,15 @@ String FileAccessFilesystemJAndroid::get_line() const {
198200
if (elem == '\n' || elem == '\0') {
199201
// Found the end of the line
200202
const_cast<FileAccessFilesystemJAndroid *>(this)->seek(start_position + line_buffer_position + 1);
201-
if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position)) {
203+
if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position, true)) {
202204
return String();
203205
}
204206
return result;
205207
}
206208
}
207209
}
208210

209-
if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position)) {
211+
if (result.parse_utf8((const char *)line_buffer.ptr(), line_buffer_position, true)) {
210212
return String();
211213
}
212214
return result;

tests/core/io/test_file_access.h

+23
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,29 @@ TEST_CASE("[FileAccess] CSV read") {
7878
CHECK(row5[1] == "tab separated");
7979
CHECK(row5[2] == "lines, good?");
8080
}
81+
82+
TEST_CASE("[FileAccess] Get as UTF-8 String") {
83+
Ref<FileAccess> f_lf = FileAccess::open(TestUtils::get_data_path("line_endings_lf.test.txt"), FileAccess::READ);
84+
String s_lf = f_lf->get_as_utf8_string();
85+
f_lf->seek(0);
86+
String s_lf_nocr = f_lf->get_as_utf8_string(true);
87+
CHECK(s_lf == "Hello darkness\nMy old friend\nI've come to talk\nWith you again\n");
88+
CHECK(s_lf_nocr == "Hello darkness\nMy old friend\nI've come to talk\nWith you again\n");
89+
90+
Ref<FileAccess> f_crlf = FileAccess::open(TestUtils::get_data_path("line_endings_crlf.test.txt"), FileAccess::READ);
91+
String s_crlf = f_crlf->get_as_utf8_string();
92+
f_crlf->seek(0);
93+
String s_crlf_nocr = f_crlf->get_as_utf8_string(true);
94+
CHECK(s_crlf == "Hello darkness\r\nMy old friend\r\nI've come to talk\r\nWith you again\r\n");
95+
CHECK(s_crlf_nocr == "Hello darkness\nMy old friend\nI've come to talk\nWith you again\n");
96+
97+
Ref<FileAccess> f_cr = FileAccess::open(TestUtils::get_data_path("line_endings_cr.test.txt"), FileAccess::READ);
98+
String s_cr = f_cr->get_as_utf8_string();
99+
f_cr->seek(0);
100+
String s_cr_nocr = f_cr->get_as_utf8_string(true);
101+
CHECK(s_cr == "Hello darkness\rMy old friend\rI've come to talk\rWith you again\r");
102+
CHECK(s_cr_nocr == "Hello darknessMy old friendI've come to talkWith you again");
103+
}
81104
} // namespace TestFileAccess
82105

83106
#endif // TEST_FILE_ACCESS_H

tests/core/string/test_string.h

+14
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,20 @@ TEST_CASE("[String] UTF16 with BOM") {
152152
CHECK(String::utf16(cs) == s);
153153
}
154154

155+
TEST_CASE("[String] UTF8 with CR") {
156+
const String base = U"Hello darkness\r\nMy old friend\nI've come to talk\rWith you again";
157+
158+
String keep_cr;
159+
Error err = keep_cr.parse_utf8(base.utf8().get_data());
160+
CHECK(err == OK);
161+
CHECK(keep_cr == base);
162+
163+
String no_cr;
164+
err = no_cr.parse_utf8(base.utf8().get_data(), -1, true); // Skip CR.
165+
CHECK(err == OK);
166+
CHECK(no_cr == base.replace("\r", ""));
167+
}
168+
155169
TEST_CASE("[String] Invalid UTF8 (non-standard)") {
156170
ERR_PRINT_OFF
157171
static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };

tests/data/line_endings_cr.test.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Hello darknessMy old friendI've come to talkWith you again

tests/data/line_endings_crlf.test.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Hello darkness
2+
My old friend
3+
I've come to talk
4+
With you again

tests/data/line_endings_lf.test.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Hello darkness
2+
My old friend
3+
I've come to talk
4+
With you again

0 commit comments

Comments
 (0)