Skip to content

Commit 251b512

Browse files
authored
Use PyCapsule for internal datetime functions (#51525)
1 parent e9ea582 commit 251b512

33 files changed

+748
-223
lines changed

MANIFEST.in

+2
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,5 @@ prune pandas/tests/io/parser/data
5858
# Selectively re-add *.cxx files that were excluded above
5959
graft pandas/_libs/src
6060
graft pandas/_libs/tslibs/src
61+
include pandas/_libs/pd_parser.h
62+
include pandas/_libs/pd_parser.c

pandas/_libs/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
]
1111

1212

13+
# The imports below need to happen first to ensure the pandas top-level
14+
# module gets monkeypatched with the pandas_datetime_CAPI
15+
# see pandas_datetime_exec in pd_datetime.c
16+
import pandas._libs.pandas_parser # noqa # isort: skip # type: ignore[reportUnusedImport]
17+
import pandas._libs.pandas_datetime # noqa # isort: skip # type: ignore[reportUnusedImport]
1318
from pandas._libs.interval import Interval
1419
from pandas._libs.tslibs import (
1520
NaT,

pandas/_libs/index.pyx

+5
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,12 @@ from pandas._libs.tslibs.nattype cimport c_NaT as NaT
2020
from pandas._libs.tslibs.np_datetime cimport (
2121
NPY_DATETIMEUNIT,
2222
get_unit_from_dtype,
23+
import_pandas_datetime,
2324
)
25+
26+
import_pandas_datetime()
27+
28+
2429
from pandas._libs.tslibs.period cimport is_period_object
2530
from pandas._libs.tslibs.timedeltas cimport _Timedelta
2631
from pandas._libs.tslibs.timestamps cimport _Timestamp

pandas/_libs/lib.pyx

+4-2
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,11 @@ cdef extern from "numpy/arrayobject.h":
8888
cdef extern from "numpy/ndarrayobject.h":
8989
bint PyArray_CheckScalar(obj) nogil
9090

91-
92-
cdef extern from "src/parse_helper.h":
91+
cdef extern from "pd_parser.h":
9392
int floatify(object, float64_t *result, int *maybe_int) except -1
93+
void PandasParser_IMPORT()
94+
95+
PandasParser_IMPORT
9496

9597
from pandas._libs cimport util
9698
from pandas._libs.util cimport (

pandas/_libs/missing.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,11 @@ from pandas._libs.tslibs.np_datetime cimport (
3434
get_datetime64_unit,
3535
get_datetime64_value,
3636
get_timedelta64_value,
37+
import_pandas_datetime,
3738
)
3839

40+
import_pandas_datetime()
41+
3942
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
4043

4144
cdef:

pandas/_libs/parsers.pyx

+53-20
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,9 @@ cdef extern from "parser/tokenizer.h":
229229
int64_t skip_first_N_rows
230230
int64_t skipfooter
231231
# pick one, depending on whether the converter requires GIL
232-
float64_t (*double_converter)(const char *, char **,
233-
char, char, char,
234-
int, int *, int *) nogil
232+
double (*double_converter)(const char *, char **,
233+
char, char, char,
234+
int, int *, int *) nogil
235235

236236
# error handling
237237
char *warn_msg
@@ -249,6 +249,16 @@ cdef extern from "parser/tokenizer.h":
249249
int seen_uint
250250
int seen_null
251251

252+
void COLITER_NEXT(coliter_t, const char *) nogil
253+
254+
cdef extern from "pd_parser.h":
255+
void *new_rd_source(object obj) except NULL
256+
257+
int del_rd_source(void *src)
258+
259+
void* buffer_rd_bytes(void *source, size_t nbytes,
260+
size_t *bytes_read, int *status, const char *encoding_errors)
261+
252262
void uint_state_init(uint_state *self)
253263
int uint64_conflict(uint_state *self)
254264

@@ -279,26 +289,49 @@ cdef extern from "parser/tokenizer.h":
279289
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
280290
uint64_t uint_max, int *error, char tsep) nogil
281291

282-
float64_t xstrtod(const char *p, char **q, char decimal,
292+
double xstrtod(const char *p, char **q, char decimal,
293+
char sci, char tsep, int skip_trailing,
294+
int *error, int *maybe_int) nogil
295+
double precise_xstrtod(const char *p, char **q, char decimal,
296+
char sci, char tsep, int skip_trailing,
297+
int *error, int *maybe_int) nogil
298+
double round_trip(const char *p, char **q, char decimal,
283299
char sci, char tsep, int skip_trailing,
284300
int *error, int *maybe_int) nogil
285-
float64_t precise_xstrtod(const char *p, char **q, char decimal,
286-
char sci, char tsep, int skip_trailing,
287-
int *error, int *maybe_int) nogil
288-
float64_t round_trip(const char *p, char **q, char decimal,
289-
char sci, char tsep, int skip_trailing,
290-
int *error, int *maybe_int) nogil
291301

292302
int to_boolean(const char *item, uint8_t *val) nogil
293303

304+
void PandasParser_IMPORT()
294305

295-
cdef extern from "parser/io.h":
296-
void *new_rd_source(object obj) except NULL
306+
PandasParser_IMPORT
297307

298-
int del_rd_source(void *src)
308+
# When not invoked directly but rather assigned as a function,
309+
# cdef extern'ed declarations seem to leave behind an undefined symbol
310+
cdef double xstrtod_wrapper(const char *p, char **q, char decimal,
                            char sci, char tsep, int skip_trailing,
                            int *error, int *maybe_int) nogil:
    # Thin pass-through to the extern xstrtod so that a locally defined
    # function (rather than the extern declaration itself) can be stored in
    # parser.double_converter for the "legacy" float_precision option.
    cdef double parsed
    parsed = xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)
    return parsed
299314

300-
void* buffer_rd_bytes(void *source, size_t nbytes,
301-
size_t *bytes_read, int *status, const char *encoding_errors)
315+
316+
cdef double precise_xstrtod_wrapper(const char *p, char **q, char decimal,
                                    char sci, char tsep, int skip_trailing,
                                    int *error, int *maybe_int) nogil:
    # Thin pass-through to the extern precise_xstrtod; this is the converter
    # stored in parser.double_converter when float_precision is "high" or None.
    cdef double parsed
    parsed = precise_xstrtod(
        p, q, decimal, sci, tsep, skip_trailing, error, maybe_int
    )
    return parsed
320+
321+
322+
cdef double round_trip_wrapper(const char *p, char **q, char decimal,
                               char sci, char tsep, int skip_trailing,
                               int *error, int *maybe_int) nogil:
    # Thin pass-through to the extern round_trip; this is the converter
    # stored in parser.double_converter when float_precision is "round_trip".
    cdef double parsed
    parsed = round_trip(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)
    return parsed
326+
327+
328+
cdef void* buffer_rd_bytes_wrapper(void *source, size_t nbytes,
                                   size_t *bytes_read, int *status,
                                   const char *encoding_errors) noexcept:
    # Thin pass-through to the extern buffer_rd_bytes; installed as the
    # parser's cb_io callback. All arguments are forwarded unchanged.
    cdef void *chunk
    chunk = buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors)
    return chunk
332+
333+
cdef int del_rd_source_wrapper(void *src) noexcept:
    # Thin pass-through to the extern del_rd_source; installed as the
    # parser's cb_cleanup callback.
    cdef int rc
    rc = del_rd_source(src)
    return rc
302335

303336

304337
cdef class TextReader:
@@ -487,11 +520,11 @@ cdef class TextReader:
487520

488521
if float_precision == "round_trip":
489522
# see gh-15140
490-
self.parser.double_converter = round_trip
523+
self.parser.double_converter = round_trip_wrapper
491524
elif float_precision == "legacy":
492-
self.parser.double_converter = xstrtod
525+
self.parser.double_converter = xstrtod_wrapper
493526
elif float_precision == "high" or float_precision is None:
494-
self.parser.double_converter = precise_xstrtod
527+
self.parser.double_converter = precise_xstrtod_wrapper
495528
else:
496529
raise ValueError(f"Unrecognized float_precision option: "
497530
f"{float_precision}")
@@ -610,8 +643,8 @@ cdef class TextReader:
610643

611644
ptr = new_rd_source(source)
612645
self.parser.source = ptr
613-
self.parser.cb_io = &buffer_rd_bytes
614-
self.parser.cb_cleanup = &del_rd_source
646+
self.parser.cb_io = buffer_rd_bytes_wrapper
647+
self.parser.cb_cleanup = del_rd_source_wrapper
615648

616649
cdef _get_header(self, list prelim_header):
617650
# header is now a list of lists, so field_count should use header[0]

pandas/_libs/pd_parser.c

+178
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
/*
2+
3+
Copyright (c) 2023, PyData Development Team
4+
All rights reserved.
5+
6+
Distributed under the terms of the BSD Simplified License.
7+
8+
*/
9+
#define _PANDAS_PARSER_IMPL
10+
11+
#include "pd_parser.h"
12+
#include "src/parser/io.h"
13+
14+
/*
 * Parse the NUL-terminated string `item` into `*p_value`.
 *
 * Delegates to precise_xstrtod (GH 31364) with `decimal` and `sci` as given,
 * no thousands separator ('\0') and skip_trailing enabled. `maybe_int` is
 * passed straight through to the parser.
 *
 * Returns 1 when the parser reported no error and consumed the whole string,
 * 0 otherwise. `*p_value` is written in both cases.
 */
static int to_double(char *item, double *p_value, char sci, char decimal,
                     int *maybe_int) {
  int parse_error = 0;
  char *rest = NULL;

  *p_value = precise_xstrtod(item, &rest, decimal, sci, '\0', 1, &parse_error,
                             maybe_int);

  if (parse_error != 0) {
    return 0;
  }
  /* Success only if nothing is left unparsed. */
  return *rest == '\0';
}
25+
26+
/*
 * Convert a Python bytes or str object to a C double.
 *
 * bytes input is read in place; str input is first encoded to UTF-8. The
 * text is parsed by to_double() with 'E' as the exponent character and '.'
 * as the decimal point. If that fails, an optionally signed, case-insensitive
 * "inf" or "infinity" is accepted and stored as +/-HUGE_VAL with *maybe_int
 * set to 0.
 *
 * Returns 0 on success. Returns -1 with a Python exception set on failure:
 * TypeError for any other object type, ValueError when the text cannot be
 * parsed, or the error left by PyUnicode_AsUTF8String.
 */
static int floatify(PyObject *str, double *result, int *maybe_int) {
  PyObject *encoded = NULL; /* owned UTF-8 copy; only set for str input */
  char *text;
  int rc = 0;

  if (PyBytes_Check(str)) {
    text = PyBytes_AS_STRING(str);
  } else if (PyUnicode_Check(str)) {
    encoded = PyUnicode_AsUTF8String(str);
    if (encoded == NULL) {
      return -1;
    }
    text = PyBytes_AS_STRING(encoded);
  } else {
    PyErr_SetString(PyExc_TypeError, "Invalid object type");
    return -1;
  }

  if (!to_double(text, result, 'E', '.', maybe_int)) {
    /* handle inf/-inf/+inf and infinity/-infinity/+infinity */
    const char *word = text;
    int negative = 0;

    if (*word == '-') {
      negative = 1;
      word++;
    } else if (*word == '+') {
      word++;
    }

    if (strcasecmp(word, "inf") == 0 || strcasecmp(word, "infinity") == 0) {
      *result = negative ? -HUGE_VAL : HUGE_VAL;
      *maybe_int = 0;
    } else {
      /* `text` may point into `encoded`, so format before releasing it. */
      PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", text);
      rc = -1;
    }
  }

  Py_XDECREF(encoded);
  return rc;
}
97+
98+
99+
/*
 * Capsule destructor: releases the PandasParser_CAPI table that
 * pandas_parser_exec allocated with PyMem_Malloc and stored in the capsule.
 */
static void pandas_parser_destructor(PyObject *op) {
  PyMem_Free(PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME));
}
103+
104+
static int pandas_parser_exec(PyObject *module) {
105+
PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI));
106+
if (capi == NULL) {
107+
PyErr_NoMemory();
108+
return -1;
109+
}
110+
111+
capi->to_double = to_double;
112+
capi->floatify = floatify;
113+
capi->new_rd_source = new_rd_source;
114+
capi->del_rd_source = del_rd_source;
115+
capi->buffer_rd_bytes = buffer_rd_bytes;
116+
capi->uint_state_init = uint_state_init;
117+
capi->uint64_conflict = uint64_conflict;
118+
capi->coliter_setup = coliter_setup;
119+
capi->parser_new = parser_new;
120+
capi->parser_init = parser_init;
121+
capi->parser_free = parser_free;
122+
capi->parser_del = parser_del;
123+
capi->parser_add_skiprow = parser_add_skiprow;
124+
capi->parser_set_skipfirstnrows = parser_set_skipfirstnrows;
125+
capi->parser_set_default_options = parser_set_default_options;
126+
capi->parser_consume_rows = parser_consume_rows;
127+
capi->parser_trim_buffers = parser_trim_buffers;
128+
capi->tokenize_all_rows = tokenize_all_rows;
129+
capi->tokenize_nrows = tokenize_nrows;
130+
capi->str_to_int64 = str_to_int64;
131+
capi->str_to_uint64 = str_to_uint64;
132+
capi->xstrtod = xstrtod;
133+
capi->precise_xstrtod = precise_xstrtod;
134+
capi->round_trip = round_trip;
135+
capi->to_boolean = to_boolean;
136+
137+
PyObject *capsule =
138+
PyCapsule_New(capi, PandasParser_CAPSULE_NAME, pandas_parser_destructor);
139+
if (capsule == NULL) {
140+
PyMem_Free(capi);
141+
return -1;
142+
}
143+
144+
// Monkeypatch the top level pandas module to have an attribute for the
145+
// C-API. This is required because Python capsules do not support setting
146+
// this attribute on anything but the top level package. Ideally not
147+
// done when cpython gh-6898 gets implemented
148+
PyObject *pandas = PyImport_ImportModule("pandas");
149+
if (!pandas) {
150+
PyErr_SetString(PyExc_ImportError,
151+
"pd_parser.c could not import module pandas");
152+
Py_DECREF(capsule);
153+
return -1;
154+
}
155+
156+
if (PyModule_AddObject(pandas, "_pandas_parser_CAPI", capsule) < 0) {
157+
Py_DECREF(capsule);
158+
return -1;
159+
}
160+
161+
return 0;
162+
}
163+
164+
static PyModuleDef_Slot pandas_parser_slots[] = {
165+
{Py_mod_exec, pandas_parser_exec}, {0, NULL}};
166+
167+
static struct PyModuleDef pandas_parsermodule = {
168+
PyModuleDef_HEAD_INIT,
169+
.m_name = "pandas._libs.pandas_parser",
170+
171+
.m_doc = "Internal module with parser support for other extensions",
172+
.m_size = 0,
173+
.m_methods = NULL,
174+
.m_slots = pandas_parser_slots};
175+
176+
PyMODINIT_FUNC PyInit_pandas_parser(void) {
177+
return PyModuleDef_Init(&pandas_parsermodule);
178+
}

0 commit comments

Comments
 (0)