Changeset - a8407c7b6a56
[Not reviewed]
0 4 0
Brett Smith - 4 years ago 2020-03-27 11:35:45
brettcsmith@brettcsmith.org
rtutil: Add RTLinkCache class to cache links to disk.

This will greatly reduce RT requests across multiple runs
and speed up link checking/conversion.
4 files changed with 236 insertions and 6 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/config.py
Show inline comments
...
 
@@ -126,8 +126,16 @@ class Config:
 
        wrapper_client = self.rt_client(credentials, client)
 
        if wrapper_client is None:
 
            return None
 
        cache_dir_path = self.cache_dir_path()
 
        if cache_dir_path is None:
 
            cache_db = None
 
        else:
 
            return rtutil.RT(wrapper_client)
 
            cache_name = '{}@{}.sqlite3'.format(
 
                credentials.user,
 
                urlparse.quote(str(credentials.server), ''),
 
            )
 
            cache_db = rtutil.RTLinkCache.setup(cache_dir_path / cache_name)
 
        return rtutil.RT(wrapper_client, cache_db)
 

	
 
    def rt_wrapper(self,
 
                  credentials: RTCredentials=None,
conservancy_beancount/rtutil.py
Show inline comments
...
 
@@ -17,17 +17,142 @@
 
import functools
 
import mimetypes
 
import re
 
import sqlite3
 
import urllib.parse as urlparse
 

	
 
import rt
 

	
 
from pathlib import Path
 

	
 
from typing import (
 
    Callable,
 
    Iterator,
 
    MutableMapping,
 
    Optional,
 
    Set,
 
    Tuple,
 
    Union,
 
)
 

	
 
RTId = Union[int, str]
 
TicketAttachmentIds = Tuple[str, Optional[str]]
 
_LinkCache = MutableMapping[TicketAttachmentIds, Optional[str]]
 

	
 
class RTLinkCache(_LinkCache):
 
    """Cache RT links to disk
 

	
 
    This class provides a dict-like interface to a cache of RT links.
 
    Once an object is in RT, a link to it should never change.
 
    The only exception is when objects get shredded, and those objects
 
    shouldn't be referenced in books anyway.
 

	
 
    This implementation is backed by a sqlite database. You can call::
 

	
 
        db = RTLinkCache.setup(path)
 

	
 
    This method will try to open a sqlite database at the given path,
 
    and set up necessary tables, etc.
 
    If it succeeds, it returns a database connection you can use to
 
    initialize the cache.
 
    If it fails, it returns None, and the caller should use some other
 
    dict-like object (like a normal dict) for caching.
 
    You can give the result to the RT utility class either way,
 
    and it will do the right thing for itself::
 

	
 
        rt = RT(rt_client, db)
 
    """
 

	
 
    CREATE_TABLE_SQL = """CREATE TABLE IF NOT EXISTS RTLinkCache(
 
 ticket_id TEXT NOT NULL,
 
 attachment_id TEXT,
 
 url TEXT NOT NULL,
 
 PRIMARY KEY (ticket_id, attachment_id)
 
)"""
 

	
 
    @classmethod
 
    def setup(cls, cache_path: Path) -> Optional[sqlite3.Connection]:
 
        try:
 
            db = sqlite3.connect(cache_path, isolation_level=None)
 
            cursor = db.cursor()
 
            cursor.execute(cls.CREATE_TABLE_SQL)
 
            cursor.execute('SELECT url FROM RTLinkCache LIMIT 1')
 
            have_data = cursor.fetchone() is not None
 
        except sqlite3.OperationalError:
 
            # If we couldn't get this far, sqlite provides no benefit.
 
            return None
 
        try:
 
            # There shouldn't be any records where url is NULL, so running this
 
            # DELETE pulls double duty for us: it tells us whether or not we
 
            # can write to the database and it enforces database integrity.
 
            cursor.execute('DELETE FROM RTLinkCache WHERE url IS NULL')
 
        except sqlite3.OperationalError:
 
            can_write = False
 
        else:
 
            can_write = True
 
        print(cache_path, have_data, can_write)
 
        if not (can_write or have_data):
 
            # If there's nothing to read and no way to write, sqlite provides
 
            # no benefit.
 
            return None
 
        elif not can_write:
 
            # Set up an in-memory database that we can write to, seeded with
 
            # the data available to read.
 
            try:
 
                cursor.close()
 
                db.close()
 
                db = sqlite3.connect(':memory:', isolation_level=None)
 
                cursor = db.cursor()
 
                cursor.execute('ATTACH DATABASE ? AS readsource',
 
                               ('{}?mode=ro'.format(cache_path.as_uri()),))
 
                cursor.execute(cls.CREATE_TABLE_SQL)
 
                cursor.execute('INSERT INTO RTLinkCache SELECT * FROM readsource.RTLinkCache')
 
                cursor.execute('DETACH DATABASE readsource')
 
            except sqlite3.OperationalError as error:
 
                # We're back to the case of having nothing to read and no way
 
                # to write.
 
                return None
 
        cursor.close()
 
        db.commit()
 
        return db
 

	
 
    def __init__(self, cache_db: sqlite3.Connection) -> None:
 
        self._db = cache_db
 
        self._nourls: Set[TicketAttachmentIds] = set()
 

	
 
    def __iter__(self) -> Iterator[TicketAttachmentIds]:
 
        yield from self._db.execute('SELECT ticket_id, attachment_id FROM RTLinkCache')
 
        yield from self._nourls
 

	
 
    def __len__(self) -> int:
 
        cursor = self._db.execute('SELECT COUNT(*) FROM RTLinkCache')
 
        return cursor.fetchone()[0] + len(self._nourls)
 

	
 
    def __getitem__(self, key: TicketAttachmentIds) -> Optional[str]:
 
        if key in self._nourls:
 
            return None
 
        cursor = self._db.execute(
 
            'SELECT url FROM RTLinkCache WHERE ticket_id = ? AND attachment_id IS ?',
 
            key,
 
        )
 
        retval = cursor.fetchone()
 
        if retval is None:
 
            raise KeyError(key)
 
        else:
 
            return retval[0]
 

	
 
    def __setitem__(self, key: TicketAttachmentIds, value: Optional[str]) -> None:
 
        if value is None:
 
            self._nourls.add(key)
 
        else:
 
            ticket_id, attachment_id = key
 
            self._db.execute(
 
                'INSERT INTO RTLinkCache VALUES(?, ?, ?)',
 
                (ticket_id, attachment_id, value),
 
            )
 

	
 
    def __delitem__(self, key: TicketAttachmentIds) -> None:
 
        raise NotImplementedError("RTLinkCache.__delitem__")
 

	
 

	
 
class RT:
 
    """RT utility wrapper class
...
 
@@ -38,7 +163,9 @@ class RT:
 
    * Parse links
 
    * Verify that they refer to extant objects in RT
 
    * Convert metadata links to RT web links
 
    * Cache results, to reduce network requests
 
    * Cache results, to reduce network requests.
 
      You can set up an RTLinkCache to cache links to disks over multiple runs.
 
      Refer to RTLinkCache's docstring for details and instructions.
 
    """
 

	
 
    PARSE_REGEXPS = [
...
 
@@ -46,8 +173,7 @@ class RT:
 
        re.compile(r'^rt://ticket/([0-9]+)(?:/attachments?/([0-9]+))?/?$'),
 
    ]
 

	
 
    def __init__(self, rt_client: rt.Rt) -> None:
 
        self.rt = rt_client
 
    def __init__(self, rt_client: rt.Rt, cache_db: Optional[sqlite3.Connection]=None) -> None:
 
        urlparts = urlparse.urlparse(rt_client.url)
 
        try:
 
            index = urlparts.path.rindex('/REST/')
...
 
@@ -56,6 +182,32 @@ class RT:
 
        else:
 
            base_path = urlparts.path[:index + 1]
 
        self.url_base = urlparts._replace(path=base_path)
 
        self.rt = rt_client
 
        self._cache: _LinkCache
 
        if cache_db is None:
 
            self._cache = {}
 
        else:
 
            self._cache = RTLinkCache(cache_db)
 

	
 
    # mypy complains that the first argument isn't self, but this isn't meant
 
    # to be a method, it's just an internal decrator.
 
    def _cache_method(func: Callable) -> Callable:  # type:ignore[misc]
 
        @functools.wraps(func)
 
        def caching_wrapper(self,
 
                            ticket_id: RTId,
 
                            attachment_id: Optional[RTId]=None,
 
        ) -> str:
 
            cache_key = (str(ticket_id), attachment_id and str(attachment_id))
 
            try:
 
                url = self._cache[cache_key]
 
            except KeyError:
 
                if attachment_id is None:
 
                    url = func(self, ticket_id)
 
                else:
 
                    url = func(self, ticket_id, attachment_id)
 
                self._cache[cache_key] = url
 
            return url
 
        return caching_wrapper
 

	
 
    def _extend_url(self,
 
                    path_tail: str,
...
 
@@ -84,7 +236,7 @@ class RT:
 
            fragment = 'txn-{}'.format(txn_id)
 
        return self._extend_url('Ticket/Display.html', fragment, id=str(ticket_id))
 

	
 
    @functools.lru_cache()
 
    @_cache_method
 
    def attachment_url(self, ticket_id: RTId, attachment_id: RTId) -> Optional[str]:
 
        attachment = self.rt.get_attachment(ticket_id, attachment_id)
 
        if attachment is None:
...
 
@@ -118,7 +270,7 @@ class RT:
 
                return (ticket_id, attachment_id)
 
        return None
 

	
 
    @functools.lru_cache()
 
    @_cache_method
 
    def ticket_url(self, ticket_id: RTId) -> Optional[str]:
 
        if self.rt.get_ticket(ticket_id) is None:
 
            return None
tests/test_config.py
Show inline comments
...
 
@@ -207,6 +207,22 @@ def test_rt_wrapper_cache_responds_to_external_credential_changes(rt_environ):
 
        rt2 = config.rt_wrapper(None, testutil.RTClient)
 
    assert rt1 is not rt2
 

	
 
def test_rt_wrapper_has_cache(tmp_path):
 
    with update_environ(XDG_CACHE_DIR=tmp_path):
 
        config = config_mod.Config()
 
        rt = config.rt_wrapper(None, testutil.RTClient)
 
        rt.exists(1)
 
    expected = 'conservancy_beancount/{}@*.sqlite3'.format(RT_FILE_CREDS[1])
 
    assert any(tmp_path.glob(expected))
 

	
 
def test_rt_wrapper_without_cache(tmp_path):
 
    tmp_path.chmod(0)
 
    with update_environ(XDG_CACHE_DIR=tmp_path):
 
        config = config_mod.Config()
 
        rt = config.rt_wrapper(None, testutil.RTClient)
 
    tmp_path.chmod(0o600)
 
    assert not any(tmp_path.iterdir())
 

	
 
def test_cache_mkdir(tmp_path):
 
    expected = tmp_path / 'TESTcache'
 
    with update_environ(XDG_CACHE_DIR=tmp_path):
tests/test_rtutil.py
Show inline comments
...
 
@@ -14,6 +14,8 @@
 
# You should have received a copy of the GNU Affero General Public License
 
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
 

	
 
import contextlib
 

	
 
import pytest
 

	
 
from . import testutil
...
 
@@ -46,6 +48,14 @@ def new_client():
 
        TICKET_DATA = testutil.RTClient.TICKET_DATA.copy()
 
    return RTClient()
 

	
 
def new_cache(database=':memory:'):
 
    db = rtutil.RTLinkCache.setup(database)
 
    if db is None:
 
        print("NOTE: did not set up database cache at {}".format(database))
 
        return contextlib.nullcontext(db)
 
    else:
 
        return contextlib.closing(db)
 

	
 
@pytest.mark.parametrize('ticket_id,attachment_id,expected', EXPECTED_URLS)
 
def test_url(rt, ticket_id, attachment_id, expected):
 
    if expected is not None:
...
 
@@ -125,3 +135,47 @@ def test_uncommon_server_url_parsing():
 
    client = testutil.RTClient(url + 'REST/1.0/')
 
    rt = rtutil.RT(client)
 
    assert rt.url(1).startswith(url)
 

	
 
def test_shared_cache(new_client):
 
    ticket_id, _, expected = EXPECTED_URLS[0]
 
    expected = DEFAULT_RT_URL + expected
 
    with new_cache() as cachedb:
 
        rt1 = rtutil.RT(new_client, cachedb)
 
        assert rt1.url(ticket_id) == expected
 
        new_client.TICKET_DATA.clear()
 
        rt2 = rtutil.RT(new_client, cachedb)
 
        assert rt2.url(ticket_id) == expected
 
        assert not rt2.exists(ticket_id + 1)
 
        assert rt1 is not rt2
 

	
 
def test_no_shared_cache(new_client):
 
    with new_cache() as cache1, new_cache() as cache2:
 
        rt1 = rtutil.RT(new_client, cache1)
 
        rt2 = rtutil.RT(new_client, cache2)
 
        assert rt1.exists(1)
 
        new_client.TICKET_DATA.clear()
 
        assert not rt2.exists(1)
 
        assert rt1.exists(1)
 

	
 
def test_read_only_cache(new_client, tmp_path):
 
    db_path = tmp_path / 'test.db'
 
    ticket_id, _, expected = EXPECTED_URLS[0]
 
    expected = DEFAULT_RT_URL + expected
 
    with new_cache(db_path) as cache1:
 
        rt1 = rtutil.RT(new_client, cache1)
 
        assert rt1.url(ticket_id) == expected
 
    new_client.TICKET_DATA.clear()
 
    db_path.chmod(0o400)
 
    with new_cache(db_path) as cache2:
 
        rt2 = rtutil.RT(new_client, cache2)
 
        assert rt2.url(ticket_id) == expected
 
        assert rt2.url(ticket_id + 1) is None
 

	
 
def test_results_not_found_only_in_transient_cache(new_client):
 
    with new_cache() as cache:
 
        rt1 = rtutil.RT(new_client, cache)
 
        rt2 = rtutil.RT(new_client, cache)
 
        assert not rt1.exists(9)
 
        new_client.TICKET_DATA['9'] = [('99', '(Unnamed)', 'text/plain', '0b')]
 
        assert not rt1.exists(9)
 
        assert rt2.exists(9)
0 comments (0 inline, 0 general)