diff --git a/builtin-functions/kphp-light/stdlib/file-functions.txt b/builtin-functions/kphp-light/stdlib/file-functions.txt
index 7f998987a7..c71f7739fc 100644
--- a/builtin-functions/kphp-light/stdlib/file-functions.txt
+++ b/builtin-functions/kphp-light/stdlib/file-functions.txt
@@ -51,6 +51,8 @@ function is_file ($name ::: string) ::: bool;
 
 function fgets ($stream ::: mixed, $length ::: int = -1) ::: string | false;
 
+function fgetcsv ($stream ::: mixed, $length ::: int = 0, $delimiter ::: string = ",", $enclosure ::: string = "\"", $escape ::: string = "\\") ::: mixed[] | false;
+
 // === UNSUPPORTED ===
 /** @kphp-extern-func-info stub generation-required */
 function chmod ($name ::: string, $mode ::: int) ::: bool;
@@ -75,10 +77,6 @@ function scandir ($directory ::: string) ::: string[] | false;
 /** @kphp-extern-func-info stub generation-required */
 function tempnam ($dir ::: string, $prefix ::: string) ::: string | false;
 
-
- /** @kphp-extern-func-info stub generation-required */
-function fgetcsv ($stream ::: mixed, $length ::: int = 0, $delimiter ::: string = ",", $enclosure ::: string = "\"", $escape ::: string = "\\") ::: mixed[] | false;
-
 define('SEEK_SET', 0);
 define('SEEK_END', 1);
 define('SEEK_CUR', 2);
diff --git a/runtime-common/stdlib/file/file-functions.h b/runtime-common/stdlib/file/file-functions.h
new file mode 100644
index 0000000000..8ec7c07efc
--- /dev/null
+++ b/runtime-common/stdlib/file/file-functions.h
@@ -0,0 +1,61 @@
+// Compiler for PHP (aka KPHP)
+// Copyright (c) 2026 LLC «V Kontakte»
+// Distributed under the GPL v3 License, see LICENSE.notice.txt
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cwchar>
+
+#include "runtime-common/core/utils/kphp-assert-core.h"
+
+namespace kphp::fs::details {
+// Returns a pointer to the trailing line terminator ("\r\n", "\n" or "\r") of the range [ptr, ptr + len),
+// or to the position where scanning stopped if the range does not end with one.
+// The range is scanned one multibyte character at a time; `ps` is the conversion state passed to mbrlen
+// and must not be null.
+//
+// this function is imported from https://github.com/php/php-src/blob/master/ext/standard/file.c,
+// function php_fgetcsv_lookup_trailing_spaces
+inline const char* fgetcsv_lookup_trailing_spaces(const char* ptr, size_t len, mbstate_t* ps) noexcept {
+  php_assert(ps != nullptr);
+
+  int32_t inc_len{};
+  unsigned char last_chars[2]{0, 0};
+
+  while (len > 0) {
+    // SAFETY: mbrlen is thread-safe if ps != nullptr, and ps != nullptr because there is an assertion at the beginning of the function
+    inc_len = (*ptr == '\0' ? 1 : mbrlen(ptr, len, ps)); // NOLINT
+    switch (inc_len) {
+    case -2:
+    case -1:
+      // invalid or incomplete multibyte sequence: skip one byte and reset the conversion state
+      // (php-src calls php_mb_reset() here), so that the next mbrlen call starts from the initial state
+      inc_len = 1;
+      *ps = mbstate_t{};
+      break;
+    case 0:
+      goto quit_loop;
+    case 1:
+    default:
+      last_chars[0] = last_chars[1];
+      last_chars[1] = *ptr;
+      break;
+    }
+    ptr += inc_len;
+    len -= inc_len;
+  }
+quit_loop:
+  switch (last_chars[1]) {
+  case '\n':
+    if (last_chars[0] == '\r') {
+      return ptr - 2;
+    }
+    /* fallthrough */
+  case '\r':
+    return ptr - 1;
+  }
+  return ptr;
+}
+} // namespace kphp::fs::details
diff --git a/runtime-light/stdlib/file/file-system-functions.cpp b/runtime-light/stdlib/file/file-system-functions.cpp
index 905dbda5c3..8b7f33543b 100644
--- a/runtime-light/stdlib/file/file-system-functions.cpp
+++ b/runtime-light/stdlib/file/file-system-functions.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include <cwchar>
 #include
 #include
 #include
@@ -21,6 +22,7 @@
 
 #include "runtime-common/core/allocator/script-malloc-interface.h"
 #include "runtime-common/core/runtime-core.h"
+#include "runtime-common/stdlib/file/file-functions.h"
 #include "runtime-light/k2-platform/k2-api.h"
 #include "runtime-light/stdlib/diagnostics/logs.h"
 #include "runtime-light/stdlib/file/file-system-state.h"
@@ -28,6 +30,7 @@
 
 namespace {
 
+constexpr int32_t PHP_CSV_NO_ESCAPE{EOF};
 constexpr size_t MIN_FILE_SIZE{12};
 constexpr size_t MAX_READ_SIZE{3 * 256 + 64};
 
@@ -430,3 +433,291 @@ Optional<string> f$fgets(const resource& stream, int64_t length) noexcept {
   res.shrink(static_cast(read_res));
   return res;
 }
+
+namespace {
+// Common csv-parsing functionality for
+// * fgetcsv
+// The function is similar to `php_fgetcsv` function from https://github.com/php/php-src/blob/master/ext/standard/file.c
+//
+// Splits `buffer` (a line that was already read from `stream`) into fields using `delimiter` and `enclosure`.
+// If an enclosed field is still open at the end of the line, the next line is read with f$fgets and parsing
+// continues inside the same field; when `stream` is null no further lines are read.
+// `escape` is a byte value or PHP_CSV_NO_ESCAPE, so it is taken as int32_t: the sentinel must not be narrowed to char.
+// `ps` is the conversion state shared by all mbrlen calls and must not be null.
+Optional<array<mixed>> getcsv(const resource& stream, string buffer, char delimiter, char enclosure, int32_t escape, mbstate_t* ps) noexcept {
+  kphp::log::assertion(ps != nullptr);
+
+  array<mixed> answer{};
+  int32_t current_id{0};
+  string_buffer tmp_buffer{};
+  // Following part is imported from `php_fgetcsv`
+  char const* buf{buffer.c_str()};
+  char const* bptr{buf};
+  size_t buf_len{buffer.size()};
+  char const* tptr{kphp::fs::details::fgetcsv_lookup_trailing_spaces(buf, buf_len, ps)};
+  size_t line_end_len{buf_len - (tptr - buf)};
+  char const* line_end{tptr};
+  char const* limit{tptr};
+  bool first_field{true};
+  size_t temp_len{buf_len};
+  int32_t inc_len{};
+  do {
+    char const* hunk_begin{};
+
+    // SAFETY: mbrlen is thread-safe if ps != nullptr, and ps != nullptr because there is an assertion at the beginning of the function
+    inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0); // NOLINT
+    if (inc_len == 1) {
+      char const* tmp{bptr};
+      while ((*tmp != delimiter) && isspace(static_cast<unsigned char>(*tmp))) {
+        tmp++;
+      }
+      if (*tmp == enclosure) {
+        bptr = tmp;
+      }
+    }
+
+    if (first_field && bptr == line_end) {
+      answer.set_value(current_id++, mixed{});
+      break;
+    }
+    first_field = false;
+    /* 2. Read field, leaving bptr pointing at start of next field */
+    if (inc_len != 0 && *bptr == enclosure) {
+      int32_t state{0};
+
+      bptr++; /* move on to first character in field */
+      hunk_begin = bptr;
+
+      /* 2A. handle enclosure delimited field */
+      for (;;) {
+        switch (inc_len) {
+        case 0:
+          switch (state) {
+          case 2:
+            tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin - 1));
+            hunk_begin = bptr;
+            goto quit_loop_2;
+
+          case 1:
+            tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin));
+            hunk_begin = bptr;
+            /* fallthrough */
+          case 0: {
+
+            if (hunk_begin != line_end) {
+              tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin));
+              hunk_begin = bptr;
+            }
+
+            /* add the embedded line end to the field */
+            tmp_buffer.append(line_end, line_end_len);
+            string new_buffer{};
+
+            if (stream.is_null()) {
+              goto quit_loop_2;
+            } else {
+              Optional<string> new_buffer_optional{f$fgets(stream)};
+              if (!new_buffer_optional.has_value()) {
+                if (temp_len > static_cast<size_t>(limit - buf)) {
+                  goto quit_loop_2;
+                }
+                return answer;
+              }
+              new_buffer = new_buffer_optional.val();
+            }
+            temp_len += new_buffer.size();
+            buf_len = new_buffer.size();
+            buffer = new_buffer;
+            buf = bptr = buffer.c_str();
+            hunk_begin = buf;
+
+            line_end = limit = kphp::fs::details::fgetcsv_lookup_trailing_spaces(buf, buf_len, ps);
+            line_end_len = buf_len - static_cast<size_t>(limit - buf);
+
+            state = 0;
+          } break;
+          default:
+            kphp::log::error("unreachable case");
+            break;
+          }
+          break;
+
+        case -2:
+        case -1:
+          /* invalid or incomplete sequence: reset the conversion state, as php_mb_reset() does in php-src */
+          *ps = mbstate_t{};
+          [[fallthrough]];
+        case 1:
+          /* we need to determine if the enclosure is
+           * 'real' or is it escaped */
+          switch (state) {
+          case 1: /* escaped */
+            bptr++;
+            state = 0;
+            break;
+          case 2: /* embedded enclosure ? let's check it */
+            if (*bptr != enclosure) {
+              /* real enclosure */
+              tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin - 1));
+              hunk_begin = bptr;
+              goto quit_loop_2;
+            }
+            tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin));
+            bptr++;
+            hunk_begin = bptr;
+            state = 0;
+            break;
+          default:
+            if (*bptr == enclosure) {
+              state = 2;
+            } else if (escape != PHP_CSV_NO_ESCAPE && *bptr == escape) {
+              state = 1;
+            }
+            bptr++;
+            break;
+          }
+          break;
+
+        default:
+          switch (state) {
+          case 2:
+            /* real enclosure */
+            tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin - 1));
+            hunk_begin = bptr;
+            goto quit_loop_2;
+          case 1:
+            bptr += inc_len;
+            tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin));
+            hunk_begin = bptr;
+            state = 0;
+            break;
+          default:
+            bptr += inc_len;
+            break;
+          }
+          break;
+        }
+        // SAFETY: mbrlen is thread-safe if ps != nullptr, and ps != nullptr because there is an assertion at the beginning of the function
+        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0); // NOLINT
+      }
+
+    quit_loop_2:
+      /* look up for a delimiter */
+      for (;;) {
+        switch (inc_len) {
+        case 0:
+          goto quit_loop_3;
+
+        case -2:
+        case -1:
+          inc_len = 1;
+          *ps = mbstate_t{}; /* reset the conversion state, as php_mb_reset() does in php-src */
+          /* fallthrough */
+        case 1:
+          if (*bptr == delimiter) {
+            goto quit_loop_3;
+          }
+          break;
+        default:
+          break;
+        }
+        bptr += inc_len;
+
+        // SAFETY: mbrlen is thread-safe if ps != nullptr, and ps != nullptr because there is an assertion at the beginning of the function
+        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0); // NOLINT
+      }
+
+    quit_loop_3:
+      tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin));
+      bptr += inc_len;
+    } else {
+      /* 2B. Handle non-enclosure field */
+
+      hunk_begin = bptr;
+
+      for (;;) {
+        switch (inc_len) {
+        case 0:
+          goto quit_loop_4;
+        case -2:
+        case -1:
+          inc_len = 1;
+          *ps = mbstate_t{}; /* reset the conversion state, as php_mb_reset() does in php-src */
+          /* fallthrough */
+        case 1:
+          if (*bptr == delimiter) {
+            goto quit_loop_4;
+          }
+          break;
+        default:
+          break;
+        }
+        bptr += inc_len;
+
+        // SAFETY: mbrlen is thread-safe if ps != nullptr, and ps != nullptr because there is an assertion at the beginning of the function
+        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0); // NOLINT
+      }
+    quit_loop_4:
+      tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin));
+
+      char const* comp_end{kphp::fs::details::fgetcsv_lookup_trailing_spaces(tmp_buffer.c_str(), tmp_buffer.size(), ps)};
+      tmp_buffer.set_pos(comp_end - tmp_buffer.c_str());
+      if (*bptr == delimiter) {
+        bptr++;
+      }
+    }
+
+    /* 3. Now pass our field back to php */
+    answer.set_value(current_id++, tmp_buffer.str());
+    tmp_buffer.clean();
+  } while (inc_len > 0);
+
+  return answer;
+}
+} // namespace
+
+// Reads one line from `stream` with f$fgets and parses it as CSV.
+// Returns false if `delimiter` or `enclosure` is empty, if `length` is negative or if no line could be read.
+// Only the first character of `delimiter`, `enclosure` and `escape` is used; an empty `escape` disables escaping.
+// don't forget to add "interruptible" to file-functions.txt when this function becomes a coroutine
+Optional<array<mixed>> f$fgetcsv(const resource& stream, int64_t length, string delimiter, string enclosure, string escape) noexcept {
+  if (delimiter.empty()) {
+    kphp::log::warning("delimiter must be a character");
+    return false;
+  }
+  if (delimiter.size() > 1) {
+    kphp::log::warning("delimiter must be a single character");
+  }
+  if (enclosure.empty()) {
+    kphp::log::warning("enclosure must be a character");
+    return false;
+  }
+  if (enclosure.size() > 1) {
+    kphp::log::warning("enclosure must be a single character");
+  }
+  int32_t escape_char{PHP_CSV_NO_ESCAPE};
+  if (!escape.empty()) {
+    if (escape.size() > 1) {
+      kphp::log::warning("escape_char must be a single character");
+    }
+    escape_char = static_cast<int32_t>(static_cast<unsigned char>(escape[0]));
+  }
+
+  const char delimiter_char{delimiter[0]};
+  const char enclosure_char{enclosure[0]};
+
+  if (length < 0) {
+    kphp::log::warning("length parameter may not be negative");
+    return false;
+  }
+  if (length == 0) {
+    length = -2; // this is necessary to pass a negative number to fgets
+  }
+  Optional<string> line_optional{f$fgets(stream, length + 1)};
+  if (!line_optional.has_value()) {
+    return false;
+  }
+
+  mbstate_t ps{};
+  return getcsv(stream, line_optional.val(), delimiter_char, enclosure_char, escape_char, std::addressof(ps));
+}
diff --git a/runtime-light/stdlib/file/file-system-functions.h b/runtime-light/stdlib/file/file-system-functions.h
index 0d476b8e60..bdc3ece9c7 100644
--- a/runtime-light/stdlib/file/file-system-functions.h
+++ b/runtime-light/stdlib/file/file-system-functions.h
@@ -277,3 +277,7 @@ mixed f$getimagesize(const string& name) noexcept;
 
 // don't forget to add "interruptible" to file-functions.txt when this function becomes a coroutine
 Optional<string> f$fgets(const resource& stream, int64_t length = -1) noexcept;
+
+// don't forget to add "interruptible" to file-functions.txt when this function becomes a coroutine
+Optional<array<mixed>> f$fgetcsv(const resource& stream, int64_t length = 0, string delimiter = string{",", 1}, string enclosure = string{"\"", 1},
+                                 string escape = string{"\\", 1}) noexcept;
diff --git a/runtime/streams.cpp b/runtime/streams.cpp
index 794c0b00b1..1573f22553 100644
--- a/runtime/streams.cpp
+++ b/runtime/streams.cpp
@@ -6,9 +6,11 @@
 
 #include
 #include
+#include <cwchar>
 #include
 
 #include "common/kprintf.h"
+#include "runtime-common/stdlib/file/file-functions.h"
 #include "runtime-common/stdlib/string/string-functions.h"
 #include "runtime/allocator.h"
 #include "runtime/array_functions.h"
@@ -470,48 +472,11 @@ Optional<int64_t> f$fputcsv(const Stream& stream, const array<mixed>& fields, st
   return f$fwrite(stream, csvline.str());
 }
 
-// this function is imported from https://github.com/php/php-src/blob/master/ext/standard/file.c,
-// function php_fgetcsv_lookup_trailing_spaces
-static const char* fgetcsv_lookup_trailing_spaces(const char* ptr, size_t len) {
-  int inc_len;
-  unsigned char last_chars[2] = {0, 0};
-
-  while (len > 0) {
-    inc_len = (*ptr == '\0' ? 1 : mblen(ptr, len));
-    switch (inc_len) {
-    case -2:
-    case -1:
-      inc_len = 1;
-      break;
-    case 0:
-      goto quit_loop;
-    case 1:
-    default:
-      last_chars[0] = last_chars[1];
-      last_chars[1] = *ptr;
-      break;
-    }
-    ptr += inc_len;
-    len -= inc_len;
-  }
-quit_loop:
-  switch (last_chars[1]) {
-  case '\n':
-    if (last_chars[0] == '\r') {
-      return ptr - 2;
-    }
-    /* fallthrough */
-  case '\r':
-    return ptr - 1;
-  }
-  return ptr;
-}
-
 // Common csv-parsing functionality for
 // * fgetcsv
 // * str_getcsv
 // The function is similar to `php_fgetcsv` function from https://github.com/php/php-src/blob/master/ext/standard/file.c
-Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimiter, char enclosure, char escape) {
+Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimiter, char enclosure, char escape, mbstate_t* ps) {
   array<mixed> answer;
   int current_id = 0;
   string_buffer tmp_buffer;
@@ -519,7 +484,7 @@ Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimite
   char const* buf = buffer.c_str();
   char const* bptr = buf;
   size_t buf_len = buffer.size();
-  char const* tptr = fgetcsv_lookup_trailing_spaces(buf, buf_len);
+  char const* tptr = kphp::fs::details::fgetcsv_lookup_trailing_spaces(buf, buf_len, ps);
   size_t line_end_len = buf_len - (tptr - buf);
   char const *line_end = tptr, *limit = tptr;
   bool first_field = true;
@@ -528,7 +493,7 @@ Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimite
   do {
     char const* hunk_begin;
 
-    inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mblen(bptr, limit - bptr)) : 0);
+    inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0);
     if (inc_len == 1) {
       char const* tmp = bptr;
       while ((*tmp != delimiter) && isspace((int)*(unsigned char*)tmp)) {
@@ -594,7 +559,7 @@ Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimite
             buf = bptr = buffer.c_str();
             hunk_begin = buf;
 
-            line_end = limit = fgetcsv_lookup_trailing_spaces(buf, buf_len);
+            line_end = limit = kphp::fs::details::fgetcsv_lookup_trailing_spaces(buf, buf_len, ps);
             line_end_len = buf_len - (size_t)(limit - buf);
 
             state = 0;
@@ -655,7 +620,7 @@ Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimite
           }
           break;
         }
-        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mblen(bptr, limit - bptr)) : 0);
+        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0);
       }
 
     quit_loop_2:
@@ -678,7 +643,7 @@ Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimite
           break;
         }
         bptr += inc_len;
-        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mblen(bptr, limit - bptr)) : 0);
+        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0);
       }
 
     quit_loop_3:
@@ -706,12 +671,12 @@ Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimite
           break;
         }
         bptr += inc_len;
-        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mblen(bptr, limit - bptr)) : 0);
+        inc_len = (bptr < limit ? (*bptr == '\0' ? 1 : mbrlen(bptr, limit - bptr, ps)) : 0);
       }
     quit_loop_4:
       tmp_buffer.append(hunk_begin, static_cast<size_t>(bptr - hunk_begin));
 
-      char const* comp_end = (char*)fgetcsv_lookup_trailing_spaces(tmp_buffer.c_str(), tmp_buffer.size());
+      char const* comp_end = (char*)kphp::fs::details::fgetcsv_lookup_trailing_spaces(tmp_buffer.c_str(), tmp_buffer.size(), ps);
       tmp_buffer.set_pos(comp_end - tmp_buffer.c_str());
       if (*bptr == delimiter) {
         bptr++;
@@ -757,7 +722,8 @@ Optional<array<mixed>> f$fgetcsv(const Stream& stream, int64_t length, string de
   if (!buf_optional.has_value()) {
     return false;
   }
-  return getcsv(stream, buf_optional.val(), delimiter_char, enclosure_char, escape_char);
+  mbstate_t ps{};
+  return getcsv(stream, buf_optional.val(), delimiter_char, enclosure_char, escape_char, &ps);
 }
 
 Optional<string> f$file_get_contents(const string& stream) {
diff --git a/runtime/streams.h b/runtime/streams.h
index 78959e0c51..1e12c1847d 100644
--- a/runtime/streams.h
+++ b/runtime/streams.h
@@ -4,6 +4,8 @@
 
 #pragma once
 
+#include <cwchar>
+
 #include "runtime-common/core/runtime-core.h"
 
 using Stream = mixed;
@@ -87,7 +89,7 @@ Optional<int64_t> f$vfprintf(const Stream& stream, const string& format, const a
 Optional<int64_t> f$fputcsv(const Stream& stream, const array<mixed>& fields, string delimiter = string(",", 1), string enclosure = string("\"", 1),
                             string escape_char = string("\\", 1));
 
-Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimiter, char enclosure, char escape);
+Optional<array<mixed>> getcsv(const Stream& stream, string buffer, char delimiter, char enclosure, char escape, mbstate_t* ps);
 
 Optional<array<mixed>> f$fgetcsv(const Stream& stream, int64_t length = 0, string delimiter = string(",", 1), string enclosure = string("\"", 1),
                                  string escape_char = string("\\", 1));
diff --git a/runtime/string_functions.cpp b/runtime/string_functions.cpp
index 9e2cb5a313..62faaad12b 100644
--- a/runtime/string_functions.cpp
+++ b/runtime/string_functions.cpp
@@ -4,6 +4,8 @@
 
 #include "runtime/string_functions.h"
 
+#include <cwchar>
+
 #include "common/unicode/unicode-utils.h"
 #include "runtime-common/stdlib/string/string-functions.h"
 #include "runtime/interface.h"
@@ -73,5 +75,6 @@ Optional<array<mixed>> f$str_getcsv(const string& str, const string& delimiter,
     escape_char = escape[0];
   }
 
-  return getcsv(mixed() /* null */, str, delimiter_char, enclosure_char, escape_char);
+  mbstate_t ps{};
+  return getcsv(mixed() /* null */, str, delimiter_char, enclosure_char, escape_char, &ps);
 }
diff --git a/tests/phpt/streams/fgetcsv.php b/tests/phpt/streams/fgetcsv.php
index 37938ec7f0..44412c09fb 100644
--- a/tests/phpt/streams/fgetcsv.php
+++ b/tests/phpt/streams/fgetcsv.php
@@ -1,4 +1,4 @@
-@ok k2_skip
+@ok