Changeset - a8407c7b6a56
[Not reviewed]
0 4 0
Brett Smith - 4 years ago 2020-03-27 11:35:45
brettcsmith@brettcsmith.org
rtutil: Add RTLinkCache class to cache links to disk.

This will greatly reduce RT requests across multiple runs
and speed up link checking/conversion.
4 files changed with 236 insertions and 6 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/config.py
Show inline comments
...
 
@@ -128,4 +128,12 @@ class Config:
 
            return None
 
        cache_dir_path = self.cache_dir_path()
 
        if cache_dir_path is None:
 
            cache_db = None
 
        else:
 
            return rtutil.RT(wrapper_client)
 
            cache_name = '{}@{}.sqlite3'.format(
 
                credentials.user,
 
                urlparse.quote(str(credentials.server), ''),
 
            )
 
            cache_db = rtutil.RTLinkCache.setup(cache_dir_path / cache_name)
 
        return rtutil.RT(wrapper_client, cache_db)
 

	
conservancy_beancount/rtutil.py
Show inline comments
...
 
@@ -19,2 +19,3 @@ import mimetypes
 
import re
 
import sqlite3
 
import urllib.parse as urlparse
...
 
@@ -23,4 +24,10 @@ import rt
 

	
 
from pathlib import Path
 

	
 
from typing import (
 
    Callable,
 
    Iterator,
 
    MutableMapping,
 
    Optional,
 
    Set,
 
    Tuple,
...
 
@@ -30,2 +37,120 @@ from typing import (
 
RTId = Union[int, str]
 
TicketAttachmentIds = Tuple[str, Optional[str]]
 
_LinkCache = MutableMapping[TicketAttachmentIds, Optional[str]]
 

	
 
class RTLinkCache(_LinkCache):
 
    """Cache RT links to disk
 

	
 
    This class provides a dict-like interface to a cache of RT links.
 
    Once an object is in RT, a link to it should never change.
 
    The only exception is when objects get shredded, and those objects
 
    shouldn't be referenced in books anyway.
 

	
 
    This implementation is backed by a sqlite database. You can call::
 

	
 
        db = RTLinkCache.setup(path)
 

	
 
    This method will try to open a sqlite database at the given path,
 
    and set up necessary tables, etc.
 
    If it succeeds, it returns a database connection you can use to
 
    initialize the cache.
 
    If it fails, it returns None, and the caller should use some other
 
    dict-like object (like a normal dict) for caching.
 
    You can give the result to the RT utility class either way,
 
    and it will do the right thing for itself::
 

	
 
        rt = RT(rt_client, db)
 
    """
 

	
 
    CREATE_TABLE_SQL = """CREATE TABLE IF NOT EXISTS RTLinkCache(
 
 ticket_id TEXT NOT NULL,
 
 attachment_id TEXT,
 
 url TEXT NOT NULL,
 
 PRIMARY KEY (ticket_id, attachment_id)
 
)"""
 

	
 
    @classmethod
 
    def setup(cls, cache_path: Path) -> Optional[sqlite3.Connection]:
 
        try:
 
            db = sqlite3.connect(cache_path, isolation_level=None)
 
            cursor = db.cursor()
 
            cursor.execute(cls.CREATE_TABLE_SQL)
 
            cursor.execute('SELECT url FROM RTLinkCache LIMIT 1')
 
            have_data = cursor.fetchone() is not None
 
        except sqlite3.OperationalError:
 
            # If we couldn't get this far, sqlite provides no benefit.
 
            return None
 
        try:
 
            # There shouldn't be any records where url is NULL, so running this
 
            # DELETE pulls double duty for us: it tells us whether or not we
 
            # can write to the database and it enforces database integrity.
 
            cursor.execute('DELETE FROM RTLinkCache WHERE url IS NULL')
 
        except sqlite3.OperationalError:
 
            can_write = False
 
        else:
 
            can_write = True
 
        print(cache_path, have_data, can_write)
 
        if not (can_write or have_data):
 
            # If there's nothing to read and no way to write, sqlite provides
 
            # no benefit.
 
            return None
 
        elif not can_write:
 
            # Set up an in-memory database that we can write to, seeded with
 
            # the data available to read.
 
            try:
 
                cursor.close()
 
                db.close()
 
                db = sqlite3.connect(':memory:', isolation_level=None)
 
                cursor = db.cursor()
 
                cursor.execute('ATTACH DATABASE ? AS readsource',
 
                               ('{}?mode=ro'.format(cache_path.as_uri()),))
 
                cursor.execute(cls.CREATE_TABLE_SQL)
 
                cursor.execute('INSERT INTO RTLinkCache SELECT * FROM readsource.RTLinkCache')
 
                cursor.execute('DETACH DATABASE readsource')
 
            except sqlite3.OperationalError as error:
 
                # We're back to the case of having nothing to read and no way
 
                # to write.
 
                return None
 
        cursor.close()
 
        db.commit()
 
        return db
 

	
 
    def __init__(self, cache_db: sqlite3.Connection) -> None:
 
        self._db = cache_db
 
        self._nourls: Set[TicketAttachmentIds] = set()
 

	
 
    def __iter__(self) -> Iterator[TicketAttachmentIds]:
 
        yield from self._db.execute('SELECT ticket_id, attachment_id FROM RTLinkCache')
 
        yield from self._nourls
 

	
 
    def __len__(self) -> int:
 
        cursor = self._db.execute('SELECT COUNT(*) FROM RTLinkCache')
 
        return cursor.fetchone()[0] + len(self._nourls)
 

	
 
    def __getitem__(self, key: TicketAttachmentIds) -> Optional[str]:
 
        if key in self._nourls:
 
            return None
 
        cursor = self._db.execute(
 
            'SELECT url FROM RTLinkCache WHERE ticket_id = ? AND attachment_id IS ?',
 
            key,
 
        )
 
        retval = cursor.fetchone()
 
        if retval is None:
 
            raise KeyError(key)
 
        else:
 
            return retval[0]
 

	
 
    def __setitem__(self, key: TicketAttachmentIds, value: Optional[str]) -> None:
 
        if value is None:
 
            self._nourls.add(key)
 
        else:
 
            ticket_id, attachment_id = key
 
            self._db.execute(
 
                'INSERT INTO RTLinkCache VALUES(?, ?, ?)',
 
                (ticket_id, attachment_id, value),
 
            )
 

	
 
    def __delitem__(self, key: TicketAttachmentIds) -> None:
 
        raise NotImplementedError("RTLinkCache.__delitem__")
 

	
 

	
...
 
@@ -40,3 +165,5 @@ class RT:
 
    * Convert metadata links to RT web links
 
    * Cache results, to reduce network requests
 
    * Cache results, to reduce network requests.
 
      You can set up an RTLinkCache to cache links to disks over multiple runs.
 
      Refer to RTLinkCache's docstring for details and instructions.
 
    """
...
 
@@ -48,4 +175,3 @@ class RT:
 

	
 
    def __init__(self, rt_client: rt.Rt) -> None:
 
        self.rt = rt_client
 
    def __init__(self, rt_client: rt.Rt, cache_db: Optional[sqlite3.Connection]=None) -> None:
 
        urlparts = urlparse.urlparse(rt_client.url)
...
 
@@ -58,2 +184,28 @@ class RT:
 
        self.url_base = urlparts._replace(path=base_path)
 
        self.rt = rt_client
 
        self._cache: _LinkCache
 
        if cache_db is None:
 
            self._cache = {}
 
        else:
 
            self._cache = RTLinkCache(cache_db)
 

	
 
    # mypy complains that the first argument isn't self, but this isn't meant
 
    # to be a method, it's just an internal decrator.
 
    def _cache_method(func: Callable) -> Callable:  # type:ignore[misc]
 
        @functools.wraps(func)
 
        def caching_wrapper(self,
 
                            ticket_id: RTId,
 
                            attachment_id: Optional[RTId]=None,
 
        ) -> str:
 
            cache_key = (str(ticket_id), attachment_id and str(attachment_id))
 
            try:
 
                url = self._cache[cache_key]
 
            except KeyError:
 
                if attachment_id is None:
 
                    url = func(self, ticket_id)
 
                else:
 
                    url = func(self, ticket_id, attachment_id)
 
                self._cache[cache_key] = url
 
            return url
 
        return caching_wrapper
 

	
...
 
@@ -86,3 +238,3 @@ class RT:
 

	
 
    @functools.lru_cache()
 
    @_cache_method
 
    def attachment_url(self, ticket_id: RTId, attachment_id: RTId) -> Optional[str]:
...
 
@@ -120,3 +272,3 @@ class RT:
 

	
 
    @functools.lru_cache()
 
    @_cache_method
 
    def ticket_url(self, ticket_id: RTId) -> Optional[str]:
tests/test_config.py
Show inline comments
...
 
@@ -209,2 +209,18 @@ def test_rt_wrapper_cache_responds_to_external_credential_changes(rt_environ):
 

	
 
def test_rt_wrapper_has_cache(tmp_path):
 
    with update_environ(XDG_CACHE_DIR=tmp_path):
 
        config = config_mod.Config()
 
        rt = config.rt_wrapper(None, testutil.RTClient)
 
        rt.exists(1)
 
    expected = 'conservancy_beancount/{}@*.sqlite3'.format(RT_FILE_CREDS[1])
 
    assert any(tmp_path.glob(expected))
 

	
 
def test_rt_wrapper_without_cache(tmp_path):
 
    tmp_path.chmod(0)
 
    with update_environ(XDG_CACHE_DIR=tmp_path):
 
        config = config_mod.Config()
 
        rt = config.rt_wrapper(None, testutil.RTClient)
 
    tmp_path.chmod(0o600)
 
    assert not any(tmp_path.iterdir())
 

	
 
def test_cache_mkdir(tmp_path):
tests/test_rtutil.py
Show inline comments
...
 
@@ -16,2 +16,4 @@
 

	
 
import contextlib
 

	
 
import pytest
...
 
@@ -48,2 +50,10 @@ def new_client():
 

	
 
def new_cache(database=':memory:'):
 
    db = rtutil.RTLinkCache.setup(database)
 
    if db is None:
 
        print("NOTE: did not set up database cache at {}".format(database))
 
        return contextlib.nullcontext(db)
 
    else:
 
        return contextlib.closing(db)
 

	
 
@pytest.mark.parametrize('ticket_id,attachment_id,expected', EXPECTED_URLS)
...
 
@@ -127 +137,45 @@ def test_uncommon_server_url_parsing():
 
    assert rt.url(1).startswith(url)
 

	
 
def test_shared_cache(new_client):
 
    ticket_id, _, expected = EXPECTED_URLS[0]
 
    expected = DEFAULT_RT_URL + expected
 
    with new_cache() as cachedb:
 
        rt1 = rtutil.RT(new_client, cachedb)
 
        assert rt1.url(ticket_id) == expected
 
        new_client.TICKET_DATA.clear()
 
        rt2 = rtutil.RT(new_client, cachedb)
 
        assert rt2.url(ticket_id) == expected
 
        assert not rt2.exists(ticket_id + 1)
 
        assert rt1 is not rt2
 

	
 
def test_no_shared_cache(new_client):
 
    with new_cache() as cache1, new_cache() as cache2:
 
        rt1 = rtutil.RT(new_client, cache1)
 
        rt2 = rtutil.RT(new_client, cache2)
 
        assert rt1.exists(1)
 
        new_client.TICKET_DATA.clear()
 
        assert not rt2.exists(1)
 
        assert rt1.exists(1)
 

	
 
def test_read_only_cache(new_client, tmp_path):
 
    db_path = tmp_path / 'test.db'
 
    ticket_id, _, expected = EXPECTED_URLS[0]
 
    expected = DEFAULT_RT_URL + expected
 
    with new_cache(db_path) as cache1:
 
        rt1 = rtutil.RT(new_client, cache1)
 
        assert rt1.url(ticket_id) == expected
 
    new_client.TICKET_DATA.clear()
 
    db_path.chmod(0o400)
 
    with new_cache(db_path) as cache2:
 
        rt2 = rtutil.RT(new_client, cache2)
 
        assert rt2.url(ticket_id) == expected
 
        assert rt2.url(ticket_id + 1) is None
 

	
 
def test_results_not_found_only_in_transient_cache(new_client):
 
    with new_cache() as cache:
 
        rt1 = rtutil.RT(new_client, cache)
 
        rt2 = rtutil.RT(new_client, cache)
 
        assert not rt1.exists(9)
 
        new_client.TICKET_DATA['9'] = [('99', '(Unnamed)', 'text/plain', '0b')]
 
        assert not rt1.exists(9)
 
        assert rt2.exists(9)
0 comments (0 inline, 0 general)