Handle collections correctly

Fix #132

Passing the collections parameter used to mean that the storage should
append its value to the URL or path. This was a leaky abstraction for
the reasons explained in #132.

The new behavior removes this meaning from this parameter. Vdirsyncer
now maintains a cache of discovered collections.
This commit is contained in:
Markus Unterwaditzer 2014-12-13 18:20:13 +01:00
parent 1e8e931464
commit 06a701bc10
9 changed files with 376 additions and 149 deletions

View file

@ -148,22 +148,30 @@ class SupportsCollections(object):
def test_discover(self, get_storage_args, get_item):
expected = set()
items = {}
for i in range(1, 5):
# Create collections, but use the "collection" attribute because
# Radicale requires file extensions in their names.
expected.add(
self.storage_class(
**get_storage_args(collection='test{}'.format(i))
).collection
)
collection = 'test{}'.format(i)
s = self.storage_class(**get_storage_args(collection=collection))
items[s.collection] = [s.upload(get_item())]
expected.add(s.collection)
d = self.storage_class.discover(
**get_storage_args(collection=None))
actual = set(s.collection for s in d)
actual = set(args['collection'] for args in d)
assert actual >= expected
for storage_args in d:
collection = storage_args['collection']
if collection not in expected:
continue
s = self.storage_class(**storage_args)
rv = list(s.list())
assert rv == items[collection]
def test_discover_collection_arg(self, get_storage_args):
args = get_storage_args(collection='test2')
with pytest.raises(TypeError) as excinfo:

View file

@ -110,6 +110,7 @@ class ServerMixin(object):
url = 'http://127.0.0.1/bob/'
if collection is not None:
collection += self.storage_class.fileext
url = url.rstrip('/') + '/' + collection
rv = {'url': url, 'username': 'bob', 'password': 'bob',
'collection': collection, 'unsafe_href_chars': ''}

View file

@ -31,22 +31,22 @@ class TestFilesystemStorage(StorageTests):
path = self.tmpdir
if collection is not None:
os.makedirs(os.path.join(path, collection))
path = os.path.join(path, collection)
return {'path': path, 'fileext': '.txt', 'collection': collection}
return inner
def test_create_is_false(self, tmpdir):
with pytest.raises(IOError):
self.storage_class(str(tmpdir), '.txt', collection='lol',
create=False)
self.storage_class(str(tmpdir) + '/lol', '.txt', create=False)
def test_is_not_directory(self, tmpdir):
with pytest.raises(IOError):
f = tmpdir.join('hue')
f.write('stub')
self.storage_class(str(tmpdir), '.txt', collection='hue')
self.storage_class(str(tmpdir) + '/hue', '.txt')
def test_create_is_true(self, tmpdir):
self.storage_class(str(tmpdir), '.txt', collection='asd')
self.storage_class(str(tmpdir) + '/asd', '.txt')
assert tmpdir.listdir() == [tmpdir.join('asd')]
def test_broken_data(self, tmpdir):

View file

@ -79,11 +79,9 @@ def test_parse_pairs_args():
assert sorted(
cli.parse_pairs_args(['foo/foocoll', 'one', 'eins'], pairs)
) == [
('eins', None),
('foo', 'foocoll'),
('one', 'a'),
('one', 'b'),
('one', 'c')
('eins', set()),
('foo', {'foocoll'}),
('one', set()),
]
@ -111,21 +109,12 @@ def test_simple_run(tmpdir):
runner = CliRunner(env={'VDIRSYNCER_CONFIG': str(config_file)})
result = runner.invoke(cli.app, ['sync'])
assert not result.exception
assert result.output.lower().strip() == 'syncing my_pair'
tmpdir.join('path_a/haha.txt').write('UID:haha')
result = runner.invoke(cli.app, ['sync'])
assert result.output == ('Syncing my_pair\n'
'Copying (uploading) item haha to my_b\n')
assert 'Copying (uploading) item haha to my_b' in result.output
assert tmpdir.join('path_b/haha.txt').read() == 'UID:haha'
result = runner.invoke(cli.app, ['sync', 'my_pair', 'my_pair'])
assert set(result.output.splitlines()) == set([
'Syncing my_pair',
'warning: Already prepared my_pair, skipping'
])
assert not result.exception
def test_empty_storage(tmpdir):
config_file = tmpdir.join('config')
@ -151,7 +140,6 @@ def test_empty_storage(tmpdir):
runner = CliRunner(env={'VDIRSYNCER_CONFIG': str(config_file)})
result = runner.invoke(cli.app, ['sync'])
assert not result.exception
assert result.output.lower().strip() == 'syncing my_pair'
tmpdir.join('path_a/haha.txt').write('UID:haha')
result = runner.invoke(cli.app, ['sync'])
@ -244,3 +232,120 @@ def test_invalid_storage_name():
cli.load_config(f)
assert 'invalid characters' in str(excinfo.value).lower()
def test_deprecated_item_status(tmpdir):
    # Regression test for the status-file migration: writes an item-status
    # file in the *legacy* format (one JSON ``[ident, value]`` pair per
    # line) and checks that ``load_status`` still understands it, and that
    # a save/load round trip through the new format preserves the data.
    f = tmpdir.join('mypair.items')
    f.write(dedent('''
        ["ident", ["href_a", "etag_a", "href_b", "etag_b"]]
        ["ident_two", ["href_a", "etag_a", "href_b", "etag_b"]]
        ''').strip())

    # Expected parse result of the legacy file above.
    data = {
        'ident': ['href_a', 'etag_a', 'href_b', 'etag_b'],
        'ident_two': ['href_a', 'etag_a', 'href_b', 'etag_b']
    }

    # Legacy line-based file is readable through the new API ...
    assert cli.load_status(str(tmpdir), 'mypair', data_type='items') == data
    # ... and survives a round trip via the new single-JSON-document format.
    cli.save_status(str(tmpdir), 'mypair', data_type='items', data=data)
    assert cli.load_status(str(tmpdir), 'mypair', data_type='items') == data
def test_collections_cache_invalidation(tmpdir):
    # The discovered-collections cache is keyed on a hash of the pair and
    # storage configuration. This test syncs once, then changes storage
    # "bar" to point at a new path (bar2) and removes the status dir; the
    # second sync must re-discover collections and write into bar2 instead
    # of reusing stale cached paths for bar.
    cfg = tmpdir.join('config')
    cfg.write(dedent('''
    [general]
    status_path = {0}/status/

    [storage foo]
    type = filesystem
    path = {0}/foo/
    fileext = .txt

    [storage bar]
    type = filesystem
    path = {0}/bar/
    fileext = .txt

    [pair foobar]
    a = foo
    b = bar
    collections = a, b, c
    ''').format(str(tmpdir)))

    foo = tmpdir.mkdir('foo')
    bar = tmpdir.mkdir('bar')
    # One item in collection "a" of storage foo; it should propagate to bar.
    foo.mkdir('a').join('itemone.txt').write('UID:itemone')

    runner = CliRunner()
    result = runner.invoke(cli.app, ['sync'],
                           env={'VDIRSYNCER_CONFIG': str(cfg)})
    assert not result.exception

    rv = bar.join('a').listdir()
    assert len(rv) == 1
    assert rv[0].basename == 'itemone.txt'

    # Same config except storage bar now lives at {0}/bar2/ -- this changes
    # the cache key, so discovery must run again.
    cfg.write(dedent('''
    [general]
    status_path = {0}/status/

    [storage foo]
    type = filesystem
    path = {0}/foo/
    fileext = .txt

    [storage bar]
    type = filesystem
    path = {0}/bar2/
    fileext = .txt

    [pair foobar]
    a = foo
    b = bar
    collections = a, b, c
    ''').format(str(tmpdir)))

    # Drop the status directory so the second sync starts from scratch.
    tmpdir.join('status').remove()
    bar2 = tmpdir.mkdir('bar2')

    result = runner.invoke(cli.app, ['sync'],
                           env={'VDIRSYNCER_CONFIG': str(cfg)})
    assert not result.exception

    # Old bar is untouched; the item was synced into the new bar2 path.
    rv = bar.join('a').listdir()
    rv2 = bar2.join('a').listdir()
    assert len(rv) == len(rv2) == 1
    assert rv[0].basename == rv2[0].basename == 'itemone.txt'
def test_invalid_pairs_as_cli_arg(tmpdir):
    # Asking the CLI to sync ``foobar/d`` must fail with a clear error,
    # because collection "d" is not among the pair's configured
    # collections (a, b, c).
    cfg = tmpdir.join('config')
    cfg.write(dedent('''
    [general]
    status_path = {0}/status/

    [storage foo]
    type = filesystem
    path = {0}/foo/
    fileext = .txt

    [storage bar]
    type = filesystem
    path = {0}/bar/
    fileext = .txt

    [pair foobar]
    a = foo
    b = bar
    collections = a, b, c
    ''').format(str(tmpdir)))

    tmpdir.mkdir('foo')
    tmpdir.mkdir('bar')

    runner = CliRunner()
    result = runner.invoke(cli.app, ['sync', 'foobar/d'],
                           env={'VDIRSYNCER_CONFIG': str(cfg)})
    # The command fails and the error message names the pair and the
    # missing collection.
    assert result.exception
    assert 'pair foobar: collection d not found' in result.output.lower()

View file

@ -246,7 +246,8 @@ def test_get_class_init_args_on_storage():
from vdirsyncer.storage.memory import MemoryStorage
all, required = utils.get_class_init_args(MemoryStorage)
assert all == set(['fileext', 'collection', 'read_only', 'instance_name'])
assert all == set(['fileext', 'collection', 'read_only', 'instance_name',
'collection_human'])
assert not required

View file

@ -9,6 +9,7 @@
import errno
import functools
import hashlib
import json
import os
import string
@ -78,6 +79,115 @@ def get_status_name(pair, collection):
return pair + '/' + collection
def get_collections_cache_key(pair_options, config_a, config_b):
    '''Return a stable fingerprint of a pair's discovery-relevant config.

    The pair options and both storage configurations are serialized to
    canonical JSON (``sort_keys=True``, so dict ordering cannot affect the
    result) and hashed with SHA-256. The hex digest changes whenever any
    part of the configuration changes, which is what invalidates the
    collections cache.
    '''
    serialized = json.dumps([pair_options, config_a, config_b],
                            sort_keys=True)
    return hashlib.sha256(serialized.encode('utf-8')).hexdigest()
def collections_for_pair(status_path, name_a, name_b, pair_name, config_a,
                         config_b, pair_options, skip_cache=False):
    '''Determine all configured collections for a given pair. Takes care of
    shortcut expansion and result caching.

    :param status_path: The path to the status directory.
    :param name_a: The config name of storage A.
    :param name_b: The config name of storage B.
    :param pair_name: The config name of the pair.
    :param config_a: The configuration for storage A, with pair-defined
        defaults.
    :param config_b: The configuration for storage B, with pair-defined
        defaults.
    :param pair_options: Pair-specific options.
    :param skip_cache: Whether to skip the cached data and always do discovery.
        Even with this option enabled, the new cache is written.
    :returns: iterable of ``(collection, (a_args, b_args))``.
    '''
    rv = load_status(status_path, pair_name, data_type='collections')
    # The cache key hashes everything that influences discovery, so any
    # config change invalidates the cached result automatically.
    cache_key = get_collections_cache_key(pair_options, config_a, config_b)
    if rv and not skip_cache:
        if rv.get('cache_key', None) == cache_key:
            # Cache hit. Falling back to ``rv`` itself presumably covers a
            # cache written without the 'collections' wrapper -- TODO
            # confirm against older cache layouts.
            return rv.get('collections', rv)
        elif rv:
            # Cache exists but was written for a different configuration.
            cli_logger.info('Detected change in config file, discovering '
                            'collections for {}'.format(pair_name))

    cli_logger.info('Discovering collections for pair {}'
                    .format(pair_name))

    # We have to use a list here because the special None/null value would get
    # mangled to string (because JSON objects always have string keys).
    rv = list(_collections_for_pair_impl(status_path, name_a, name_b,
                                         pair_name, config_a, config_b,
                                         pair_options))
    # Always refresh the cache, even when skip_cache forced rediscovery.
    save_status(status_path, pair_name, data_type='collections',
                data={'collections': rv, 'cache_key': cache_key})
    return rv
def _collections_for_pair_impl(status_path, name_a, name_b, pair_name,
                               config_a, config_b, pair_options):
    '''Uncached implementation of :func:`collections_for_pair`.

    Generator yielding ``(collection, (a_args, b_args))`` tuples, where the
    args dicts are the storage configurations to instantiate for each side.
    '''

    def _discover_from_config(config):
        # Run the storage class's discovery and index the results by
        # collection name. The 'type' key is re-added because
        # storage_class_from_config strips it from the config.
        storage_type = config['type']
        cls, config = storage_class_from_config(config)
        try:
            discovered = list(cls.discover(**config))
        except Exception:
            return handle_storage_init_error(cls, config)
        else:
            rv = {}
            for args in discovered:
                args['type'] = storage_type
                rv[args['collection']] = args
            return rv

    def _get_coll(discovered, collection, storage_name, config):
        # Prefer the discovered args; otherwise ask the storage class to
        # derive a location for the collection via join_collection.
        try:
            return discovered[collection]
        except KeyError:
            storage_type = config['type']
            cls, config = storage_class_from_config(config)
            try:
                rv = cls.join_collection(collection=collection, **config)
                rv['type'] = storage_type
                return rv
            except NotImplementedError:
                # NOTE: ``shortcut`` is the enclosing loop's variable; this
                # helper is only called from inside that loop.
                raise CliError(
                    'Unable to find collection {collection!r} for storage '
                    '{storage_name!r}.\n For pair {pair_name!r}, you wanted '
                    'to use the collections {shortcut!r}, which yielded '
                    '{collection!r} (amongst others). Vdirsyncer was unable '
                    'to find an equivalent collection for the other '
                    'storage.'.format(
                        collection=collection, shortcut=shortcut,
                        storage_name=storage_name, pair_name=pair_name
                    )
                )

    shortcuts = set(_parse_old_config_list_value(pair_options, 'collections'))
    if not shortcuts:
        # No collections configured: sync the storages directly, using the
        # special None collection.
        yield None, (config_a, config_b)
    else:
        a_discovered = _discover_from_config(config_a)
        b_discovered = _discover_from_config(config_b)

        for shortcut in shortcuts:
            if shortcut in ('from a', 'from b'):
                # Shortcut: expand to everything discovered on that side.
                collections = (a_discovered if shortcut == 'from a'
                               else b_discovered)
            else:
                collections = [shortcut]

            for collection in collections:
                a_args = _get_coll(a_discovered, collection, name_a, config_a)
                b_args = _get_coll(b_discovered, collection, name_b, config_b)
                yield collection, (a_args, b_args)
def validate_general_section(general_config):
if general_config is None:
raise CliError(
@ -143,19 +253,43 @@ def load_config(f, pair_options=('collections', 'conflict_resolution')):
return general, pairs, storages
def load_status(path, status_name):
full_path = expand_path(os.path.join(path, status_name))
if not os.path.exists(full_path):
return {}
with open(full_path) as f:
return dict(json.loads(line) for line in f)
def load_status(base_path, pair, collection=None, data_type=None):
    '''Load a status file for the given pair/collection.

    :param base_path: The status directory.
    :param pair: The pair's config name.
    :param collection: The collection name, or None.
    :param data_type: Required; appended to the filename as an extension
        (e.g. ``'items'`` or ``'collections'``).
    :returns: The parsed dict, ``None`` if no status file exists yet, or
        ``{}`` if a file exists but matches neither known format.
    '''
    assert data_type is not None
    status_name = get_status_name(pair, collection)
    path = expand_path(os.path.join(base_path, status_name))
    # Legacy layout: item status lived in a file without the '.items'
    # extension. Migrate it in place before loading.
    if os.path.isfile(path) and data_type == 'items':
        new_path = path + '.items'
        cli_logger.warning('Migrating statuses: Renaming {} to {}'
                           .format(path, new_path))
        os.rename(path, new_path)

    path += '.' + data_type
    if not os.path.exists(path):
        return None
    with open(path) as f:
        # New format: one JSON document for the whole file.
        try:
            return dict(json.load(f))
        except ValueError:
            pass
        f.seek(0)
        # Legacy format: one JSON ``[key, value]`` pair per line.
        try:
            return dict(json.loads(line) for line in f)
        except ValueError:
            pass
    return {}
def save_status(path, status_name, status):
full_path = expand_path(os.path.join(path, status_name))
base_path = os.path.dirname(full_path)
def save_status(base_path, pair, collection=None, data_type=None, data=None):
assert data_type is not None
assert data is not None
status_name = get_status_name(pair, collection)
path = expand_path(os.path.join(base_path, status_name)) + '.' + data_type
base_path = os.path.dirname(path)
if os.path.isfile(base_path):
if collection is not None and os.path.isfile(base_path):
raise CliError('{} is probably a legacy file and could be removed '
'automatically, but this choice is left to the '
'user. If you think this is an error, please file '
@ -167,10 +301,8 @@ def save_status(path, status_name, status):
if e.errno != errno.EEXIST:
raise
with safe_write(full_path, 'w+') as f:
for k, v in status.items():
json.dump((k, v), f)
f.write('\n')
with safe_write(path, 'w+') as f:
json.dump(data, f)
def storage_class_from_config(config):
@ -226,33 +358,24 @@ def handle_storage_init_error(cls, config):
def parse_pairs_args(pairs_args, all_pairs):
'''
Expand the various CLI shortforms ("pair, pair/collection") to an iterable
of (pair, collection).
of (pair, collections).
'''
if not pairs_args:
pairs_args = list(all_pairs)
for pair_and_collection in pairs_args:
rv = {}
for pair_and_collection in (pairs_args or all_pairs):
pair, collection = pair_and_collection, None
if '/' in pair:
pair, collection = pair.split('/')
try:
a_name, b_name, pair_options, storage_defaults = \
all_pairs[pair]
except KeyError:
if pair not in all_pairs:
raise CliError('Pair not found: {}\n'
'These are the pairs found: {}'
.format(pair, list(all_pairs)))
if collection is None:
collections = set(
_parse_old_config_list_value(pair_options, 'collections')
or [None]
)
else:
collections = [collection]
collections = rv.setdefault(pair, set())
if collection:
collections.add(collection)
for c in collections:
yield pair, c
return rv.items()
# We create the app inside a factory and destroy that factory after first use
# to avoid pollution of the module namespace.
@ -337,12 +460,12 @@ def _create_app():
wq = WorkerQueue(max_workers)
wq.handled_jobs = set()
for pair_name, collection in parse_pairs_args(pairs, all_pairs):
for pair_name, collections in parse_pairs_args(pairs, all_pairs):
wq.spawn_worker()
wq.put(
functools.partial(prepare_sync, pair_name=pair_name,
collection=collection, general=general,
all_pairs=all_pairs,
functools.partial(sync_pair, pair_name=pair_name,
collections_to_sync=collections,
general=general, all_pairs=all_pairs,
all_storages=all_storages,
force_delete=force_delete))
@ -354,76 +477,40 @@ app = main = _create_app()
del _create_app
def expand_collection(pair_name, collection, general, all_pairs, all_storages):
a_name, b_name, pair_options, storage_defaults = all_pairs[pair_name]
if collection in ('from a', 'from b'):
# from_name: name of the storage which should be used for discovery
# other_name: the other storage's name
if collection == 'from a':
from_name, other_name = a_name, b_name
else:
from_name, other_name = b_name, a_name
cli_logger.info('Syncing {}: Discovering collections from {}'
.format(pair_name, from_name))
config = dict(storage_defaults)
config.update(all_storages[from_name])
cls, config = storage_class_from_config(config)
try:
storages = list(cls.discover(**config))
except Exception:
handle_storage_init_error(cls, config)
for storage in storages:
config = dict(storage_defaults)
config.update(all_storages[other_name])
config['collection'] = actual_collection = storage.collection
other_storage = storage_instance_from_config(config)
if collection == 'from a':
a, b = storage, other_storage
else:
b, a = storage, other_storage
yield actual_collection, a, b
else:
config = dict(storage_defaults)
config.update(all_storages[a_name])
config['collection'] = collection
a = storage_instance_from_config(config)
config = dict(storage_defaults)
config.update(all_storages[b_name])
config['collection'] = collection
b = storage_instance_from_config(config)
yield collection, a, b
def prepare_sync(wq, pair_name, collection, general, all_pairs, all_storages,
force_delete):
key = ('prepare', pair_name, collection)
def sync_pair(wq, pair_name, collections_to_sync, general, all_pairs,
all_storages, force_delete):
key = ('prepare', pair_name)
if key in wq.handled_jobs:
status_name = get_status_name(pair_name, collection)
cli_logger.warning('Already prepared {}, skipping'.format(status_name))
cli_logger.warning('Already prepared {}, skipping'.format(pair_name))
return
wq.handled_jobs.add(key)
a_name, b_name, pair_options, storage_defaults = all_pairs[pair_name]
jobs = list(expand_collection(pair_name, collection, general, all_pairs,
all_storages))
for i in range(len(jobs) - 1):
# spawn one worker less because we can reuse the current one
all_collections = dict(collections_for_pair(
general['status_path'], a_name, b_name, pair_name,
all_storages[a_name], all_storages[b_name], pair_options
))
# spawn one worker less because we can reuse the current one
new_workers = -1
for collection in (collections_to_sync or all_collections):
try:
config_a, config_b = all_collections[collection]
except KeyError:
raise CliError('Pair {}: Collection {} not found. These are the '
'configured collections:\n{}'.format(
pair_name, collection, list(all_collections)))
new_workers += 1
wq.put(functools.partial(
sync_collection, pair_name=pair_name, collection=collection,
config_a=config_a, config_b=config_b, pair_options=pair_options,
general=general, force_delete=force_delete
))
for i in range(new_workers):
wq.spawn_worker()
for collection, a, b in jobs:
wq.put(functools.partial(sync_collection, pair_name=pair_name,
collection=collection, a=a, b=b,
pair_options=pair_options, general=general,
force_delete=force_delete))
def handle_cli_error(status_name='sync'):
try:
@ -460,8 +547,8 @@ def handle_cli_error(status_name='sync'):
return True
def sync_collection(wq, pair_name, collection, a, b, pair_options, general,
force_delete):
def sync_collection(wq, pair_name, collection, config_a, config_b,
pair_options, general, force_delete):
status_name = get_status_name(pair_name, collection)
key = ('sync', pair_name, collection)
@ -473,8 +560,12 @@ def sync_collection(wq, pair_name, collection, a, b, pair_options, general,
try:
cli_logger.info('Syncing {}'.format(status_name))
status = load_status(general['status_path'], status_name)
status = load_status(general['status_path'], pair_name,
collection, data_type='items') or {}
cli_logger.debug('Loaded status for {}'.format(status_name))
a = storage_instance_from_config(config_a)
b = storage_instance_from_config(config_b)
sync(
a, b, status,
conflict_resolution=pair_options.get('conflict_resolution', None),
@ -484,7 +575,8 @@ def sync_collection(wq, pair_name, collection, a, b, pair_options, general,
if not handle_cli_error(status_name):
raise JobFailed()
save_status(general['status_path'], status_name, status)
save_status(general['status_path'], pair_name, collection,
data_type='items', data=status)
class WorkerQueue(object):

View file

@ -30,10 +30,6 @@ class Storage(object):
Strings can be either unicode strings or bytestrings. If bytestrings, an
ASCII encoding is assumed.
:param collection: If None, the given URL or path is already directly
referring to a collection. Otherwise it will be treated as a basepath
to many collections (e.g. a vdir) and the given collection name will be
looked for.
:param read_only: Whether the synchronization algorithm should avoid writes
to this storage. Some storages accept no value other than ``True``.
'''
@ -44,10 +40,16 @@ class Storage(object):
# overridden by subclasses.
storage_name = None
# The string used in the config to denote a particular instance. Should be
# The string used in the config to denote a particular instance. Will be
# overridden during instantiation.
instance_name = None
# The machine-readable name of this collection.
collection = None
# The human-readable name of this collection.
collection_human = None
# A value of True means the storage does not support write-methods such as
# upload, update and delete. A value of False means the storage does
# support those methods.
@ -56,7 +58,8 @@ class Storage(object):
# The attribute values to show in the representation of the storage.
_repr_attributes = ()
def __init__(self, instance_name=None, read_only=None, collection=None):
def __init__(self, instance_name=None, read_only=None, collection=None,
collection_human=None):
if read_only is None:
read_only = self.read_only
if self.read_only and not read_only:
@ -66,6 +69,7 @@ class Storage(object):
instance_name = '{}/{}'.format(instance_name, collection)
self.instance_name = instance_name
self.collection = collection
self.collection_human = collection_human
@classmethod
def discover(cls, **kwargs):
@ -74,8 +78,21 @@ class Storage(object):
:param **kwargs: Keyword arguments to additionally pass to the storage
instances returned. You shouldn't pass `collection` here, otherwise
TypeError will be raised.
:returns: Iterable of storages which represent the discovered
collections, all of which are passed kwargs during initialization.
:returns: iterable of ``storage_args``.
``storage_args`` is a dictionary of ``**kwargs`` to pass to this
class to obtain a storage instance pointing to this collection. It
also must contain a ``"collection"`` key. That key's value is used
to match two collections together for synchronization. IOW it is a
machine-readable identifier for the collection, usually obtained
from the last segment of a URL or filesystem path.
'''
raise NotImplementedError()
@classmethod
def join_collection(cls, collection, **kwargs):
    '''Append the collection to the URL or path specified in ``**kwargs``
    and return the new arguments.

    Storages that can derive a collection's location from its name
    override this. The base implementation raises
    :exc:`NotImplementedError` so callers can detect that the storage
    offers no such mapping.
    '''
    raise NotImplementedError()

View file

@ -250,9 +250,6 @@ class DavStorage(Storage):
super(DavStorage, self).__init__(**kwargs)
url = url.rstrip('/') + '/'
collection = kwargs.get('collection')
if collection is not None:
url = utils.urlparse.urljoin(url, collection)
self.session = DavSession(url, username, password, verify, auth,
useragent, verify_fingerprint)
self.unsafe_href_chars = unsafe_href_chars
@ -271,10 +268,12 @@ class DavStorage(Storage):
))
d = cls.discovery_class(DavSession(url=url, **discover_args))
for c in d.discover():
base, collection = c['href'].rstrip('/').rsplit('/', 1)
s = cls(url=base, collection=collection, **kwargs)
s.displayname = c['displayname']
yield s
url = c['href']
_, collection = url.rstrip('/').rsplit('/', 1)
storage_args = {'url': url, 'collection': collection,
'collection_human': c['displayname']}
storage_args.update(kwargs)
yield storage_args
def _normalize_href(self, *args, **kwargs):
return _normalize_href(self.session.url, *args, **kwargs)

View file

@ -40,9 +40,6 @@ class FilesystemStorage(Storage):
def __init__(self, path, fileext, encoding='utf-8', create=True, **kwargs):
super(FilesystemStorage, self).__init__(**kwargs)
path = expand_path(path)
collection = kwargs.get('collection')
if collection is not None:
path = os.path.join(path, collection)
checkdir(path, create=create)
self.path = path
self.encoding = encoding
@ -54,9 +51,16 @@ class FilesystemStorage(Storage):
raise TypeError('collection argument must not be given.')
path = expand_path(path)
for collection in os.listdir(path):
if os.path.isdir(os.path.join(path, collection)):
s = cls(path=path, collection=collection, **kwargs)
yield s
collection_path = os.path.join(path, collection)
if os.path.isdir(collection_path):
args = dict(collection=collection, path=collection_path,
**kwargs)
yield args
@classmethod
def join_collection(cls, collection, **kwargs):
    '''Return storage args pointing directly at ``collection``.

    Appends the collection name to the configured base path; all other
    keyword arguments are passed through unchanged, so the result can be
    fed back into the constructor.
    '''
    kwargs['path'] = os.path.join(kwargs['path'], collection)
    return kwargs
def _get_filepath(self, href):
return os.path.join(self.path, href)