From 508aa9d23e16a4a358ce5fc978de3c82d7602809 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Mon, 26 Aug 2024 22:54:07 -0700 Subject: [PATCH 1/7] gh-120754: Refactor _io to stash whole stat Multiple places in the I/O stack optimize common cases by using the information from stat. Currently individual members are extracted from the stat and stored into the fileio struct. Refactor the code to store the whole stat struct instead. --- Modules/_io/fileio.c | 67 +++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 5d9d87d6118a75..3b96604b7ac0a0 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -74,8 +74,7 @@ typedef struct { signed int seekable : 2; /* -1 means unknown */ unsigned int closefd : 1; char finalizing; - unsigned int blksize; - Py_off_t estimated_size; + struct _Py_stat_struct *stat_atopen; PyObject *weakreflist; PyObject *dict; } fileio; @@ -199,8 +198,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds) self->writable = 0; self->appending = 0; self->seekable = -1; - self->blksize = 0; - self->estimated_size = -1; + self->stat_atopen = NULL; self->closefd = 1; self->weakreflist = NULL; } @@ -256,7 +254,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #elif !defined(MS_WINDOWS) int *atomic_flag_works = NULL; #endif - struct _Py_stat_struct fdfstat; int fstat_result; int async_err = 0; @@ -454,9 +451,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #endif } - self->blksize = DEFAULT_BUFFER_SIZE; + self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1); + if (self->stat_atopen == NULL) { + PyErr_Format(PyExc_MemoryError, "Unable to allocate space for stat result"); + goto error; + } Py_BEGIN_ALLOW_THREADS - fstat_result = _Py_fstat_noraise(self->fd, &fdfstat); + fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen); Py_END_ALLOW_THREADS if (fstat_result 
< 0) { /* Tolerate fstat() errors other than EBADF. See Issue #25717, where @@ -471,25 +472,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #endif goto error; } + + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; } else { #if defined(S_ISDIR) && defined(EISDIR) /* On Unix, open will succeed for directories. In Python, there should be no file objects referring to directories, so we need a check. */ - if (S_ISDIR(fdfstat.st_mode)) { + if (S_ISDIR(self->stat_atopen->st_mode)) { errno = EISDIR; PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj); goto error; } #endif /* defined(S_ISDIR) */ -#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE - if (fdfstat.st_blksize > 1) - self->blksize = fdfstat.st_blksize; -#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ - if (fdfstat.st_size < PY_SSIZE_T_MAX) { - self->estimated_size = (Py_off_t)fdfstat.st_size; - } } #if defined(MS_WINDOWS) || defined(__CYGWIN__) @@ -521,6 +518,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, internal_close(self); _PyErr_ChainExceptions1(exc); } + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } done: #ifdef MS_WINDOWS @@ -553,6 +554,10 @@ fileio_dealloc(fileio *self) if (_PyIOBase_finalize((PyObject *) self) < 0) return; _PyObject_GC_UNTRACK(self); + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } if (self->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *) self); (void)fileio_clear(self); @@ -725,7 +730,12 @@ _io_FileIO_readall_impl(fileio *self) return err_closed(); } - end = self->estimated_size; + if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX) { + end = (Py_off_t)self->stat_atopen->st_size; + } + else { + end = -1; + } if (end <= 0) { /* Use a default size and resize as needed. 
*/ bufsize = SMALLCHUNK; @@ -1094,11 +1104,13 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj) return NULL; } - /* Sometimes a large file is truncated. While estimated_size is used as a - estimate, that it is much larger than the actual size can result in a - significant over allocation and sometimes a MemoryError / running out of - memory. */ - self->estimated_size = pos; + /* Since the file was truncated, its size at open is no longer accurate + as an estimate. Clear out the stat result, and rely on dynamic resize + code if a readall is requested. */ + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } return posobj; } @@ -1229,16 +1241,27 @@ get_mode(fileio *self, void *closure) return PyUnicode_FromString(mode_string(self)); } +static PyObject * +get_blksize(fileio *self, void *closure) +{ +#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE + if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1) { + return PyLong_FromLong(self->stat_atopen->st_blksize); + } +#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ + return PyLong_FromLong(DEFAULT_BUFFER_SIZE); +} + static PyGetSetDef fileio_getsetlist[] = { {"closed", (getter)get_closed, NULL, "True if the file is closed"}, {"closefd", (getter)get_closefd, NULL, "True if the file descriptor will be closed by close()."}, {"mode", (getter)get_mode, NULL, "String giving the file mode"}, + {"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"}, {NULL}, }; static PyMemberDef fileio_members[] = { - {"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0}, {"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0}, {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY}, {"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY}, From 9d849ce137f16ef2c72cf9889cd2e747fdabdd74 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Mon, 26 Aug 2024 23:15:15 -0700 Subject: [PATCH 2/7] gh-120754: Refactor _pyio to 
stash whole stat Parallels the changes to _io. The `stat` Python object doesn't allow changing members, so rather than modifying estimated_size, just clear the value. --- Lib/_pyio.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 75b5ad1b1a47d2..ba0d844d2e78e7 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -242,14 +242,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, buffering = -1 line_buffering = True if buffering < 0: - buffering = DEFAULT_BUFFER_SIZE - try: - bs = os.fstat(raw.fileno()).st_blksize - except (OSError, AttributeError): - pass - else: - if bs > 1: - buffering = bs + buffering = raw._blksize if buffering < 0: raise ValueError("invalid buffering size") if buffering == 0: @@ -1565,19 +1558,15 @@ def __init__(self, file, mode='r', closefd=True, opener=None): os.set_inheritable(fd, False) self._closefd = closefd - fdfstat = os.fstat(fd) + self._stat_atopen = os.fstat(fd) try: - if stat.S_ISDIR(fdfstat.st_mode): + if stat.S_ISDIR(self._stat_atopen.st_mode): raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), file) except AttributeError: # Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR # don't exist. 
pass - self._blksize = getattr(fdfstat, 'st_blksize', 0) - if self._blksize <= 1: - self._blksize = DEFAULT_BUFFER_SIZE - self._estimated_size = fdfstat.st_size if _setmode: # don't translate newlines (\r\n <=> \n) @@ -1623,6 +1612,14 @@ def __repr__(self): return ('<%s name=%r mode=%r closefd=%r>' % (class_name, name, self.mode, self._closefd)) + @property + def _blksize(self): + if self._stat_atopen: + res = getattr(self._stat_atopen, "st_blksize", 0) + + # WASI sets blksize to 0 + return res if res > 0 else DEFAULT_BUFFER_SIZE + def _checkReadable(self): if not self._readable: raise UnsupportedOperation('File not open for reading') @@ -1655,16 +1652,16 @@ def readall(self): """ self._checkClosed() self._checkReadable() - if self._estimated_size <= 0: + if not self._stat_atopen or self._stat_atopen.st_size <= 0: bufsize = DEFAULT_BUFFER_SIZE else: - bufsize = self._estimated_size + 1 + bufsize = self._stat_atopen.st_size + 1 - if self._estimated_size > 65536: + if self._stat_atopen.st_size > 65536: try: pos = os.lseek(self._fd, 0, SEEK_CUR) - if self._estimated_size >= pos: - bufsize = self._estimated_size - pos + 1 + if self._stat_atopen.st_size >= pos: + bufsize = self._stat_atopen.st_size - pos + 1 except OSError: pass @@ -1742,7 +1739,7 @@ def truncate(self, size=None): if size is None: size = self.tell() os.ftruncate(self._fd, size) - self._estimated_size = size + self._stat_atopen = None return size def close(self): From bfcfcf295d1a197bc4a5aa20de417cea8926add2 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 28 Aug 2024 17:20:36 -0700 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Victor Stinner --- Lib/_pyio.py | 2 +- Modules/_io/fileio.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index ba0d844d2e78e7..8884196b6e2ba5 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1652,7 +1652,7 @@ def readall(self): """ self._checkClosed() self._checkReadable() - if not 
self._stat_atopen or self._stat_atopen.st_size <= 0: + if self._stat_atopen is None or self._stat_atopen.st_size <= 0: bufsize = DEFAULT_BUFFER_SIZE else: bufsize = self._stat_atopen.st_size + 1 diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 3b96604b7ac0a0..9045347c368639 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -453,7 +453,7 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1); if (self->stat_atopen == NULL) { - PyErr_Format(PyExc_MemoryError, "Unable to allocate space for stat result"); + PyErr_NoMemory(); goto error; } Py_BEGIN_ALLOW_THREADS From 312266561bde06d511d5f64cf85bdc2e0dced42e Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 28 Aug 2024 17:35:16 -0700 Subject: [PATCH 4/7] Add comments around stat_atopen and why bufsize is + 1 --- Lib/_pyio.py | 4 ++++ Modules/_io/fileio.c | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 8884196b6e2ba5..605fc995681470 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1655,6 +1655,10 @@ def readall(self): if self._stat_atopen is None or self._stat_atopen.st_size <= 0: bufsize = DEFAULT_BUFFER_SIZE else: + # In order to detect end of file, need a read() of at least 1 + # byte which returns size 0. Oversize the buffer by 1 byte so the + # I/O can be completed with two read() calls (one for all data, one + # for EOF) without needing to resize the buffer. bufsize = self._stat_atopen.st_size + 1 if self._stat_atopen.st_size > 65536: diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 9045347c368639..23e1a834561fbe 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -74,6 +74,11 @@ typedef struct { signed int seekable : 2; /* -1 means unknown */ unsigned int closefd : 1; char finalizing; + /* Stat result which was grabbed at file open, useful for optimizing common + File I/O patterns to be more efficient. 
This is only guidance / an estimate, + as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU) issues / bugs. + Both the underlying file descriptor and file may be modified outside of the + fileio object / Python (ex. gh-90102, GH-121941, gh-109523). */ struct _Py_stat_struct *stat_atopen; PyObject *weakreflist; PyObject *dict; From 8f5cfe434863e4b4160d3991b3ae4e920554d477 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 28 Aug 2024 17:37:21 -0700 Subject: [PATCH 5/7] Apply review changes for _pyio _blksize --- Lib/_pyio.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 605fc995681470..18849b309b8605 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1614,11 +1614,14 @@ def __repr__(self): @property def _blksize(self): - if self._stat_atopen: - res = getattr(self._stat_atopen, "st_blksize", 0) - - # WASI sets blksize to 0 - return res if res > 0 else DEFAULT_BUFFER_SIZE + if self._stat_atopen is None: + return DEFAULT_BUFFER_SIZE + + blksize = getattr(self._stat_atopen, "st_blksize", 0) + # WASI sets blksize to 0 + if not blksize: + return DEFAULT_BUFFER_SIZE + return blksize def _checkReadable(self): if not self._readable: From d18a82d78a24612d8a3e74ddc44b759a07aaf176 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 28 Aug 2024 23:01:48 -0700 Subject: [PATCH 6/7] Fix comment formatting --- Modules/_io/fileio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 23e1a834561fbe..7b040387a90cc7 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -75,10 +75,11 @@ typedef struct { unsigned int closefd : 1; char finalizing; /* Stat result which was grabbed at file open, useful for optimizing common - File I/O patterns to be more efficient. This is only guidance / an estimate, - as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU) issues / bugs. 
- Both the underlying file descriptor and file may be modified outside of the - fileio object / Python (ex. gh-90102, GH-121941, gh-109523). */ + File I/O patterns to be more efficient. This is only guidance / an + estimate, as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU) + issues / bugs. Both the underlying file descriptor and file may be + modified outside of the fileio object / Python (ex. gh-90102, GH-121941, + gh-109523). */ struct _Py_stat_struct *stat_atopen; PyObject *weakreflist; PyObject *dict; From c55d10e2fce68da61677786240efaa8bc4dbffe2 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Thu, 29 Aug 2024 15:28:25 -0700 Subject: [PATCH 7/7] Add _pyio +1 comment to fileio for better clarity --- Modules/_io/fileio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 7b040387a90cc7..865b0e3634f3b4 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -747,14 +747,16 @@ _io_FileIO_readall_impl(fileio *self) bufsize = SMALLCHUNK; } else { - /* This is probably a real file, so we try to allocate a - buffer one byte larger than the rest of the file. If the - calculation is right then we should get EOF without having - to enlarge the buffer. */ + /* This is probably a real file. */ if (end > _PY_READ_MAX - 1) { bufsize = _PY_READ_MAX; } else { + /* In order to detect end of file, need a read() of at + least 1 byte which returns size 0. Oversize the buffer + by 1 byte so the I/O can be completed with two read() + calls (one for all data, one for EOF) without needing + to resize the buffer. */ bufsize = (size_t)end + 1; }