| # SPDX-FileCopyrightText: 2015 Eric Larson |
| # |
| # SPDX-License-Identifier: Apache-2.0 |
| |
| from tempfile import NamedTemporaryFile |
| import mmap |
| |
| |
class CallbackFileWrapper(object):
    """
    Small wrapper around a fp object which will tee everything read into a
    buffer, and when that file is closed it will execute a callback with the
    contents of that buffer.

    All attributes are proxied to the underlying file object.

    This class uses members with a double underscore (__) leading prefix so as
    not to accidentally shadow an attribute.

    The data is stored in a temporary file until it is all available. As long
    as the temporary files directory is disk-based (sometimes it's a
    memory-backed ``tmpfs`` on Linux), data will be unloaded to disk if memory
    pressure is high. For small files the disk usually won't be used at all,
    it'll all be in the filesystem memory cache, so there should be no
    performance impact.

    The callback is invoked at most once. It receives ``b""`` when nothing
    was buffered, otherwise a ``memoryview`` over a read-only ``mmap`` of the
    temporary file.
    """

    def __init__(self, fp, callback):
        """
        Wrap ``fp`` and remember ``callback``.

        :param fp: the file-like object whose reads are teed into the buffer.
        :param callback: callable taking the buffered body as its only
            argument; a falsy value disables the callback.
        """
        self.__buf = NamedTemporaryFile("rb+", delete=True)
        self.__fp = fp
        self.__callback = callback

    def __getattr__(self, name):
        """Proxy any attribute not found on the wrapper to the wrapped fp."""
        # The vagaries of garbage collection mean that self.__fp is
        # not always set. Using __getattribute__ with the private
        # (mangled) name[0] allows looking up the attribute value and
        # raising an AttributeError when it doesn't exist. This stops
        # things from infinitely recursing calls to getattr in the case
        # where self.__fp hasn't been set.
        #
        # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
        fp = self.__getattribute__("_CallbackFileWrapper__fp")
        return getattr(fp, name)

    def __is_fp_closed(self):
        """
        Report whether the wrapped fp is closed.

        Checks ``fp.fp is None`` first, then ``fp.closed``; returns False
        when the wrapped object exposes neither attribute.
        """
        try:
            return self.__fp.fp is None

        except AttributeError:
            pass

        try:
            return self.__fp.closed

        except AttributeError:
            pass

        # We just don't cache it then.
        # TODO: Add some logging here...
        return False

    def _close(self):
        """
        Hand the buffered data to the callback and release the buffer.

        Safe to call repeatedly: the callback is cleared after the first
        call and closing the temporary file again is a no-op. Any exception
        raised by the callback propagates to the caller, but the cleanup
        below still runs.
        """
        try:
            if self.__callback:
                if self.__buf.tell() == 0:
                    # Empty file:
                    result = b""
                else:
                    # Return the data without actually loading it into
                    # memory, relying on Python's buffer API and mmap().
                    # mmap() just gives a view directly into the
                    # filesystem's memory cache, so it doesn't result in
                    # duplicate memory use.
                    self.__buf.seek(0, 0)
                    result = memoryview(
                        mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ)
                    )
                self.__callback(result)
        finally:
            # We assign this to None here, because otherwise we can get into
            # really tricky problems where the CPython interpreter dead locks
            # because the callback is holding a reference to something which
            # has a __del__ method. Setting this to None breaks the cycle
            # and allows the garbage collector to do its thing normally.
            self.__callback = None

            # Closing the temporary file releases memory and frees disk
            # space. Important when caching big files. Doing this in a
            # ``finally`` keeps a failing callback from leaking the file.
            self.__buf.close()

    def read(self, amt=None):
        """
        Read from the wrapped fp, teeing the data into the buffer.

        :param amt: passed through unchanged to ``fp.read``.
        :returns: whatever ``fp.read(amt)`` returned.
        """
        data = self.__fp.read(amt)
        if data:
            # We may be dealing with b'', a sign that things are over:
            # it's passed e.g. after we've already closed self.__buf.
            self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data

    def _safe_read(self, amt):
        """
        Call ``_safe_read`` on the wrapped fp, teeing the data into the buffer.

        :param amt: passed through unchanged to ``fp._safe_read``.
        :returns: whatever ``fp._safe_read(amt)`` returned.
        """
        data = self.__fp._safe_read(amt)
        if amt == 2 and data == b"\r\n":
            # urllib executes this read to toss the CRLF at the end
            # of the chunk.
            return data

        if data:
            # Same guard as read(): an empty result may arrive after
            # self.__buf has already been closed, and writing to a closed
            # file would raise.
            self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data