Move item hashing into rust

This commit is contained in:
Markus Unterwaditzer 2017-10-10 00:50:35 +02:00
parent c17fa308fb
commit bdbfc360ff
9 changed files with 127 additions and 96 deletions

View file

@ -9,7 +9,8 @@ name = "vdirsyncer_rustext"
crate-type = ["cdylib"]
[dependencies]
vobject = "0.2.0"
vobject = "0.3.0"
ring = "0.12.1"
[build-dependencies]
cbindgen = "0.1"

View file

@ -1,10 +1,13 @@
extern crate vobject;
extern crate ring;
use std::ffi::{CStr, CString};
use std::os::raw::c_char;
use std::mem;
use std::ptr;
use std::fmt::Write;
const EMPTY_STRING: *const c_char = b"\0" as *const u8 as *const c_char;
#[repr(C)]
@ -100,3 +103,53 @@ pub unsafe extern "C" fn vdirsyncer_clone_component(c: *mut VdirsyncerComponent)
pub unsafe extern "C" fn vdirsyncer_write_component(c: *mut VdirsyncerComponent) -> *const c_char {
CString::new(vobject::write_component(&(*c).0)).unwrap().into_raw()
}
#[no_mangle]
pub unsafe extern "C" fn vdirsyncer_hash_component(c: *mut VdirsyncerComponent) -> *const c_char {
CString::new(safe_hash_component(&(*c).0)).unwrap().into_raw()
}
fn safe_hash_component(c: &vobject::Component) -> String {
let mut new_c = c.clone();
{
let mut stack = vec![&mut new_c];
while let Some(component) = stack.pop() {
// PRODID is changed by radicale for some reason after upload
component.remove("PRODID");
// Sometimes METHOD:PUBLISH is added by WebCAL providers, for us it doesn't make a difference
component.remove("METHOD");
// X-RADICALE-NAME is used by radicale, because hrefs don't really exist in their filesystem backend
component.remove("X-RADICALE-NAME");
// Apparently this is set by Horde?
// https://github.com/pimutils/vdirsyncer/issues/318
component.remove("X-WR-CALNAME");
// Those are from the VCARD specification and is supposed to change when the
// item does -- however, we can determine that ourselves
component.remove("REV");
component.remove("LAST-MODIFIED");
component.remove("CREATED");
// Some iCalendar HTTP calendars generate the DTSTAMP at request time, so
// this property always changes when the rest of the item didn't. Some do
// the same with the UID.
//
// - Google's read-only calendar links
// - http://www.feiertage-oesterreich.at/
component.remove("DTSTAMP");
component.remove("UID");
if component.name == "VCALENDAR" {
component.subcomponents.retain(|ref c| c.name != "VTIMEZONE");
}
stack.extend(component.subcomponents.iter_mut());
}
}
// FIXME: Possible optimization: Stream component to hasher instead of allocating new string
let digest = ring::digest::digest(&ring::digest::SHA256, vobject::write_component(&new_c).as_bytes());
let mut rv = String::new();
for &byte in digest.as_ref() {
write!(&mut rv, "{:x}", byte).unwrap();
}
rv
}

View file

@ -7,7 +7,7 @@ import random
import hypothesis.strategies as st
from vdirsyncer.vobject import Item, normalize_item
from vdirsyncer.vobject import Item
import urllib3
import urllib3.exceptions
@ -20,7 +20,7 @@ def blow_up(*a, **kw):
def assert_item_equals(a, b):
assert normalize_item(a) == normalize_item(b)
assert a.hash == b.hash
VCARD_TEMPLATE = u'''BEGIN:VCARD

View file

@ -15,7 +15,7 @@ from vdirsyncer.storage.base import normalize_meta_value
from vdirsyncer.vobject import Item
from .. import EVENT_TEMPLATE, TASK_TEMPLATE, VCARD_TEMPLATE, \
assert_item_equals, format_item, normalize_item, \
assert_item_equals, format_item, \
printable_characters_strategy
@ -348,4 +348,4 @@ class StorageTests(object):
href, etag = s.upload(item)
item2, etag2 = s.get(href)
assert normalize_item(item) == normalize_item(item2)
assert item.hash == item2.hash

View file

@ -6,14 +6,8 @@ import os
import pytest
import requests
import requests.exceptions
from tests import assert_item_equals
from vdirsyncer import exceptions
from vdirsyncer.vobject import Item
from .. import StorageTests, get_server_mixin
@ -24,14 +18,6 @@ ServerMixin = get_server_mixin(dav_server)
class DAVStorageTests(ServerMixin, StorageTests):
dav_server = dav_server
@pytest.mark.skipif(dav_server == 'radicale',
reason='Radicale is very tolerant.')
def test_dav_broken_item(self, s):
item = Item(u'HAHA:YES')
with pytest.raises((exceptions.Error, requests.exceptions.HTTPError)):
s.upload(item)
assert not list(s.list())
def test_dav_empty_get_multi_performance(self, s, monkeypatch):
def breakdown(*a, **kw):
raise AssertionError('Expected not to be called.')

View file

@ -4,10 +4,9 @@ import pytest
from requests import Response
from tests import normalize_item
from vdirsyncer.exceptions import UserError
from vdirsyncer.storage.http import HttpStorage, prepare_auth
from vdirsyncer.vobject import Item
def test_list(monkeypatch):
@ -56,9 +55,9 @@ def test_list(monkeypatch):
item, etag2 = s.get(href)
assert item.uid is not None
assert etag2 == etag
found_items[normalize_item(item)] = href
found_items[item.hash] = href
expected = set(normalize_item(u'BEGIN:VCALENDAR\n' + x + '\nEND:VCALENDAR')
expected = set(Item(u'BEGIN:VCALENDAR\n' + x + '\nEND:VCALENDAR').hash
for x in items)
assert set(found_items) == expected
@ -67,7 +66,7 @@ def test_list(monkeypatch):
item, etag2 = s.get(href)
assert item.uid is not None
assert etag2 == etag
assert found_items[normalize_item(item)] == href
assert found_items[item.hash] == href
def test_readonly_param():

View file

@ -9,7 +9,7 @@ from hypothesis.stateful import Bundle, RuleBasedStateMachine, rule
import pytest
from tests import BARE_EVENT_TEMPLATE, EVENT_TEMPLATE, \
EVENT_WITH_TIMEZONE_TEMPLATE, VCARD_TEMPLATE, normalize_item, \
EVENT_WITH_TIMEZONE_TEMPLATE, VCARD_TEMPLATE, \
uid_strategy
import vdirsyncer.vobject as vobject
@ -31,8 +31,8 @@ _simple_joined = u'\r\n'.join(
def test_split_collection_simple(benchmark):
given = benchmark(lambda: list(vobject.split_collection(_simple_joined)))
assert [normalize_item(item) for item in given] == \
[normalize_item(item) for item in _simple_split]
assert [vobject.Item(item).hash for item in given] == \
[vobject.Item(item).hash for item in _simple_split]
assert [x.splitlines() for x in given] == \
[x.splitlines() for x in _simple_split]
@ -47,8 +47,8 @@ def test_split_collection_multiple_wrappers(benchmark):
)
given = benchmark(lambda: list(vobject.split_collection(joined)))
assert [normalize_item(item) for item in given] == \
[normalize_item(item) for item in _simple_split]
assert [vobject.Item(item).hash for item in given] == \
[vobject.Item(item).hash for item in _simple_split]
assert [x.splitlines() for x in given] == \
[x.splitlines() for x in _simple_split]
@ -56,7 +56,7 @@ def test_split_collection_multiple_wrappers(benchmark):
def test_join_collection_simple(benchmark):
given = benchmark(lambda: vobject.join_collection(_simple_split))
assert normalize_item(given) == normalize_item(_simple_joined)
assert vobject.Item(given).hash == vobject.Item(_simple_joined).hash
assert given.splitlines() == _simple_joined.splitlines()
@ -123,12 +123,12 @@ def test_split_collection_timezones():
[timezone, u'END:VCALENDAR']
)
given = set(normalize_item(item)
given = set(vobject.Item(item).hash
for item in vobject.split_collection(full))
expected = set(
normalize_item(u'\r\n'.join((
vobject.Item(u'\r\n'.join((
u'BEGIN:VCALENDAR', item, timezone, u'END:VCALENDAR'
)))
))).hash
for item in items
)
@ -146,11 +146,11 @@ def test_split_contacts():
with_wrapper.splitlines()
def test_hash_item():
def test_hash_item2():
a = EVENT_TEMPLATE.format(r=1, uid=1)
b = u'\n'.join(line for line in a.splitlines()
if u'PRODID' not in line)
assert vobject.hash_item(a) == vobject.hash_item(b)
assert vobject.Item(a).hash == vobject.Item(b).hash
def test_multiline_uid(benchmark):
@ -351,3 +351,47 @@ def test_component_contains():
with pytest.raises(ValueError):
42 in item
def test_hash_item():
item1 = vobject.Item(
'BEGIN:FOO\r\n'
'X-RADICALE-NAME:YES\r\n'
'END:FOO\r\n'
)
item2 = vobject.Item(
'BEGIN:FOO\r\n'
'X-RADICALE-NAME:NO\r\n'
'END:FOO\r\n'
)
assert item1.hash == item2.hash
item2 = vobject.Item(
'BEGIN:FOO\r\n'
'X-RADICALE-NAME:NO\r\n'
'OTHER-PROP:YAY\r\n'
'END:FOO\r\n'
)
assert item1.hash != item2.hash
def test_hash_item_timezones():
item1 = vobject.Item(
'BEGIN:VCALENDAR\r\n'
'HELLO:HAHA\r\n'
'BEGIN:VTIMEZONE\r\n'
'PROP:YES\r\n'
'END:VTIMEZONE\r\n'
'END:VCALENDAR\r\n'
)
item2 = vobject.Item(
'BEGIN:VCALENDAR\r\n'
'HELLO:HAHA\r\n'
'END:VCALENDAR\r\n'
)
assert item1.hash == item2.hash

View file

@ -39,3 +39,7 @@ def _component_rv(c):
def clone_component(c):
return _component_rv(lib.vdirsyncer_clone_component(c))
def hash_component(c):
return _string_rv(lib.vdirsyncer_hash_component(c))

View file

@ -1,40 +1,11 @@
# -*- coding: utf-8 -*-
import hashlib
from itertools import chain, tee
from .utils import cached_property, uniq
from . import exceptions, native
IGNORE_PROPS = (
# PRODID is changed by radicale for some reason after upload
'PRODID',
# Sometimes METHOD:PUBLISH is added by WebCAL providers, for us it doesn't
# make a difference
'METHOD',
# X-RADICALE-NAME is used by radicale, because hrefs don't really exist in
# their filesystem backend
'X-RADICALE-NAME',
# Apparently this is set by Horde?
# https://github.com/pimutils/vdirsyncer/issues/318
'X-WR-CALNAME',
# Those are from the VCARD specification and is supposed to change when the
# item does -- however, we can determine that ourselves
'REV',
'LAST-MODIFIED',
'CREATED',
# Some iCalendar HTTP calendars generate the DTSTAMP at request time, so
# this property always changes when the rest of the item didn't. Some do
# the same with the UID.
#
# - Google's read-only calendar links
# - http://www.feiertage-oesterreich.at/
'DTSTAMP',
'UID',
)
class Item(object):
'''Immutable wrapper class for VCALENDAR (VEVENT, VTODO) and
@ -76,8 +47,11 @@ class Item(object):
@cached_property
def hash(self):
'''Hash of self.raw, used for etags.'''
return hash_item(self.raw)
'''Used for etags.'''
if not self.is_valid:
raise ValueError('Item malformed.')
return native.hash_component(self._component)
@cached_property
def ident(self):
@ -107,36 +81,6 @@ class Item(object):
return bool(self._component)
def normalize_item(item, ignore_props=IGNORE_PROPS):
'''Create syntactically invalid mess that is equal for similar items.'''
if not isinstance(item, Item):
item = Item(item)
item = _strip_timezones(item)
x = _Component('TEMP', item.raw.splitlines(), [])
for prop in IGNORE_PROPS:
del x[prop]
x.props.sort()
return u'\r\n'.join(filter(bool, (line.strip() for line in x.props)))
def _strip_timezones(item):
parsed = item.parsed
if not parsed or parsed.name != 'VCALENDAR':
return item
parsed.subcomponents = [c for c in parsed.subcomponents
if c.name != 'VTIMEZONE']
return Item('\r\n'.join(parsed.dump_lines()))
def hash_item(text):
return hashlib.sha256(normalize_item(text).encode('utf-8')).hexdigest()
def split_collection(text):
assert isinstance(text, str)
inline = []