From a8407c7b6a56b31cd4fee2c8767e7defd8867dd5 2020-03-27 11:35:45 From: Brett Smith Date: 2020-03-27 11:35:45 Subject: [PATCH] rtutil: Add RTLinkCache class to cache links to disk. This will greatly reduce RT requests across multiple runs and speed up link checking/conversion. --- diff --git a/conservancy_beancount/config.py b/conservancy_beancount/config.py index fded3c852fedd378de048db13be858724d0a60c5..de9b1f42d3e44a70ff51635c2a3ed1f11b9b38c5 100644 --- a/conservancy_beancount/config.py +++ b/conservancy_beancount/config.py @@ -126,8 +126,16 @@ class Config: wrapper_client = self.rt_client(credentials, client) if wrapper_client is None: return None + cache_dir_path = self.cache_dir_path() + if cache_dir_path is None: + cache_db = None else: - return rtutil.RT(wrapper_client) + cache_name = '{}@{}.sqlite3'.format( + credentials.user, + urlparse.quote(str(credentials.server), ''), + ) + cache_db = rtutil.RTLinkCache.setup(cache_dir_path / cache_name) + return rtutil.RT(wrapper_client, cache_db) def rt_wrapper(self, credentials: RTCredentials=None, diff --git a/conservancy_beancount/rtutil.py b/conservancy_beancount/rtutil.py index f1548456fae820629b46285a4586344d7dbb08db..906e79582beba0b1c8e85f1738b2b620e79ea3a8 100644 --- a/conservancy_beancount/rtutil.py +++ b/conservancy_beancount/rtutil.py @@ -17,17 +17,142 @@ import functools import mimetypes import re +import sqlite3 import urllib.parse as urlparse import rt +from pathlib import Path + from typing import ( + Callable, + Iterator, + MutableMapping, Optional, + Set, Tuple, Union, ) RTId = Union[int, str] +TicketAttachmentIds = Tuple[str, Optional[str]] +_LinkCache = MutableMapping[TicketAttachmentIds, Optional[str]] + +class RTLinkCache(_LinkCache): + """Cache RT links to disk + + This class provides a dict-like interface to a cache of RT links. + Once an object is in RT, a link to it should never change. + The only exception is when objects get shredded, and those objects + shouldn't be referenced in books anyway. + + This implementation is backed by a sqlite database. You can call:: + + db = RTLinkCache.setup(path) + + This method will try to open a sqlite database at the given path, + and set up necessary tables, etc. + If it succeeds, it returns a database connection you can use to + initialize the cache. + If it fails, it returns None, and the caller should use some other + dict-like object (like a normal dict) for caching. + You can give the result to the RT utility class either way, + and it will do the right thing for itself:: + + rt = RT(rt_client, db) + """ + + CREATE_TABLE_SQL = """CREATE TABLE IF NOT EXISTS RTLinkCache( + ticket_id TEXT NOT NULL, + attachment_id TEXT, + url TEXT NOT NULL, + PRIMARY KEY (ticket_id, attachment_id) +)""" + + @classmethod + def setup(cls, cache_path: Path) -> Optional[sqlite3.Connection]: + try: + db = sqlite3.connect(cache_path, isolation_level=None) + cursor = db.cursor() + cursor.execute(cls.CREATE_TABLE_SQL) + cursor.execute('SELECT url FROM RTLinkCache LIMIT 1') + have_data = cursor.fetchone() is not None + except sqlite3.OperationalError: + # If we couldn't get this far, sqlite provides no benefit. + return None + try: + # There shouldn't be any records where url is NULL, so running this + # DELETE pulls double duty for us: it tells us whether or not we + # can write to the database and it enforces database integrity. + cursor.execute('DELETE FROM RTLinkCache WHERE url IS NULL') + except sqlite3.OperationalError: + can_write = False + else: + can_write = True + print(cache_path, have_data, can_write) + if not (can_write or have_data): + # If there's nothing to read and no way to write, sqlite provides + # no benefit. + return None + elif not can_write: + # Set up an in-memory database that we can write to, seeded with + # the data available to read. + try: + cursor.close() + db.close() + db = sqlite3.connect(':memory:', isolation_level=None) + cursor = db.cursor() + cursor.execute('ATTACH DATABASE ? AS readsource', + ('{}?mode=ro'.format(cache_path.as_uri()),)) + cursor.execute(cls.CREATE_TABLE_SQL) + cursor.execute('INSERT INTO RTLinkCache SELECT * FROM readsource.RTLinkCache') + cursor.execute('DETACH DATABASE readsource') + except sqlite3.OperationalError as error: + # We're back to the case of having nothing to read and no way + # to write. + return None + cursor.close() + db.commit() + return db + + def __init__(self, cache_db: sqlite3.Connection) -> None: + self._db = cache_db + self._nourls: Set[TicketAttachmentIds] = set() + + def __iter__(self) -> Iterator[TicketAttachmentIds]: + yield from self._db.execute('SELECT ticket_id, attachment_id FROM RTLinkCache') + yield from self._nourls + + def __len__(self) -> int: + cursor = self._db.execute('SELECT COUNT(*) FROM RTLinkCache') + return cursor.fetchone()[0] + len(self._nourls) + + def __getitem__(self, key: TicketAttachmentIds) -> Optional[str]: + if key in self._nourls: + return None + cursor = self._db.execute( + 'SELECT url FROM RTLinkCache WHERE ticket_id = ? AND attachment_id IS ?', + key, + ) + retval = cursor.fetchone() + if retval is None: + raise KeyError(key) + else: + return retval[0] + + def __setitem__(self, key: TicketAttachmentIds, value: Optional[str]) -> None: + if value is None: + self._nourls.add(key) + else: + ticket_id, attachment_id = key + self._db.execute( + 'INSERT INTO RTLinkCache VALUES(?, ?, ?)', + (ticket_id, attachment_id, value), + ) + + def __delitem__(self, key: TicketAttachmentIds) -> None: + raise NotImplementedError("RTLinkCache.__delitem__") + class RT: """RT utility wrapper class @@ -38,7 +163,9 @@ class RT: * Parse links * Verify that they refer to extant objects in RT * Convert metadata links to RT web links - * Cache results, to reduce network requests + * Cache results, to reduce network requests. + You can set up an RTLinkCache to cache links to disks over multiple runs. + Refer to RTLinkCache's docstring for details and instructions. """ PARSE_REGEXPS = [ @@ -46,8 +173,7 @@ class RT: re.compile(r'^rt://ticket/([0-9]+)(?:/attachments?/([0-9]+))?/?$'), ] - def __init__(self, rt_client: rt.Rt) -> None: - self.rt = rt_client + def __init__(self, rt_client: rt.Rt, cache_db: Optional[sqlite3.Connection]=None) -> None: urlparts = urlparse.urlparse(rt_client.url) try: index = urlparts.path.rindex('/REST/') @@ -56,6 +182,32 @@ class RT: else: base_path = urlparts.path[:index + 1] self.url_base = urlparts._replace(path=base_path) + self.rt = rt_client + self._cache: _LinkCache + if cache_db is None: + self._cache = {} + else: + self._cache = RTLinkCache(cache_db) + + # mypy complains that the first argument isn't self, but this isn't meant + # to be a method, it's just an internal decrator. + def _cache_method(func: Callable) -> Callable: # type:ignore[misc] + @functools.wraps(func) + def caching_wrapper(self, + ticket_id: RTId, + attachment_id: Optional[RTId]=None, + ) -> str: + cache_key = (str(ticket_id), attachment_id and str(attachment_id)) + try: + url = self._cache[cache_key] + except KeyError: + if attachment_id is None: + url = func(self, ticket_id) + else: + url = func(self, ticket_id, attachment_id) + self._cache[cache_key] = url + return url + return caching_wrapper def _extend_url(self, path_tail: str, @@ -84,7 +236,7 @@ class RT: fragment = 'txn-{}'.format(txn_id) return self._extend_url('Ticket/Display.html', fragment, id=str(ticket_id)) - @functools.lru_cache() + @_cache_method def attachment_url(self, ticket_id: RTId, attachment_id: RTId) -> Optional[str]: attachment = self.rt.get_attachment(ticket_id, attachment_id) if attachment is None: @@ -118,7 +270,7 @@ class RT: return (ticket_id, attachment_id) return None - @functools.lru_cache() + @_cache_method def ticket_url(self, ticket_id: RTId) -> Optional[str]: if self.rt.get_ticket(ticket_id) is None: return None diff --git a/tests/test_config.py b/tests/test_config.py index e503b8e11166515e3f36ac1c0842d3e3d96f6906..ef3a6438907bde57e1f244a71ab2c7621fad7df8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -207,6 +207,22 @@ def test_rt_wrapper_cache_responds_to_external_credential_changes(rt_environ): rt2 = config.rt_wrapper(None, testutil.RTClient) assert rt1 is not rt2 +def test_rt_wrapper_has_cache(tmp_path): + with update_environ(XDG_CACHE_DIR=tmp_path): + config = config_mod.Config() + rt = config.rt_wrapper(None, testutil.RTClient) + rt.exists(1) + expected = 'conservancy_beancount/{}@*.sqlite3'.format(RT_FILE_CREDS[1]) + assert any(tmp_path.glob(expected)) + +def test_rt_wrapper_without_cache(tmp_path): + tmp_path.chmod(0) + with update_environ(XDG_CACHE_DIR=tmp_path): + config = config_mod.Config() + rt = config.rt_wrapper(None, testutil.RTClient) + tmp_path.chmod(0o600) + assert not any(tmp_path.iterdir()) + def test_cache_mkdir(tmp_path): expected = tmp_path / 'TESTcache' with update_environ(XDG_CACHE_DIR=tmp_path): diff --git a/tests/test_rtutil.py b/tests/test_rtutil.py index 042634c270bd5cff11c698b22ebd94a018bb4967..0ca15fe9af646d972c5571275663395d9032adce 100644 --- a/tests/test_rtutil.py +++ b/tests/test_rtutil.py @@ -14,6 +14,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import contextlib + import pytest from . import testutil @@ -46,6 +48,14 @@ def new_client(): TICKET_DATA = testutil.RTClient.TICKET_DATA.copy() return RTClient() +def new_cache(database=':memory:'): + db = rtutil.RTLinkCache.setup(database) + if db is None: + print("NOTE: did not set up database cache at {}".format(database)) + return contextlib.nullcontext(db) + else: + return contextlib.closing(db) + @pytest.mark.parametrize('ticket_id,attachment_id,expected', EXPECTED_URLS) def test_url(rt, ticket_id, attachment_id, expected): if expected is not None: @@ -125,3 +135,47 @@ def test_uncommon_server_url_parsing(): client = testutil.RTClient(url + 'REST/1.0/') rt = rtutil.RT(client) assert rt.url(1).startswith(url) + +def test_shared_cache(new_client): + ticket_id, _, expected = EXPECTED_URLS[0] + expected = DEFAULT_RT_URL + expected + with new_cache() as cachedb: + rt1 = rtutil.RT(new_client, cachedb) + assert rt1.url(ticket_id) == expected + new_client.TICKET_DATA.clear() + rt2 = rtutil.RT(new_client, cachedb) + assert rt2.url(ticket_id) == expected + assert not rt2.exists(ticket_id + 1) + assert rt1 is not rt2 + +def test_no_shared_cache(new_client): + with new_cache() as cache1, new_cache() as cache2: + rt1 = rtutil.RT(new_client, cache1) + rt2 = rtutil.RT(new_client, cache2) + assert rt1.exists(1) + new_client.TICKET_DATA.clear() + assert not rt2.exists(1) + assert rt1.exists(1) + +def test_read_only_cache(new_client, tmp_path): + db_path = tmp_path / 'test.db' + ticket_id, _, expected = EXPECTED_URLS[0] + expected = DEFAULT_RT_URL + expected + with new_cache(db_path) as cache1: + rt1 = rtutil.RT(new_client, cache1) + assert rt1.url(ticket_id) == expected + new_client.TICKET_DATA.clear() + db_path.chmod(0o400) + with new_cache(db_path) as cache2: + rt2 = rtutil.RT(new_client, cache2) + assert rt2.url(ticket_id) == expected + assert rt2.url(ticket_id + 1) is None + +def test_results_not_found_only_in_transient_cache(new_client): + with new_cache() as cache: + rt1 = rtutil.RT(new_client, cache) + rt2 = rtutil.RT(new_client, cache) + assert not rt1.exists(9) + new_client.TICKET_DATA['9'] = [('99', '(Unnamed)', 'text/plain', '0b')] + assert not rt1.exists(9) + assert rt2.exists(9)