From 71a518780254bf66e91a972a22718a3e9ee7fd25 Mon Sep 17 00:00:00 2001 From: Bertold Van den Bergh Date: Wed, 14 Aug 2019 00:54:40 +0200 Subject: [PATCH] Reduce filesize further by not encoding points multiple times (not well tested, do not use) --- database/builder/builder.cpp | 356 ++++++++++++++++++++++++++--------- database/builder/makedb.sh | 6 +- library/zonedetect.c | 80 +++++++- 3 files changed, 348 insertions(+), 94 deletions(-) diff --git a/database/builder/builder.cpp b/database/builder/builder.cpp index 1f71d23..1cea9b5 100644 --- a/database/builder/builder.cpp +++ b/database/builder/builder.cpp @@ -84,26 +84,62 @@ int encodeVariableLength(std::vector& output, int64_t valueIn, bool han return bytesUsed; } +uint64_t encodePointTo64(int64_t lat, int64_t lon){ + assert(lat || lon, "Tried to encode 0,0. This is not allowed"); + + uint64_t latu=encodeSignedToUnsigned(lat); + uint64_t lonu=encodeSignedToUnsigned(lon); + + assert(latu < (uint64_t)1<<32, "Unsigned lat overflow"); + assert(lonu < (uint64_t)1<<32, "Unsigned lat overflow"); + + uint64_t point = 0; + for(uint8_t i=31; i<=31; i--){ + point <<= 2; + if(latu & (1< pointMap_; + struct Point { - Point(double lat = 0, double lon = 0, unsigned int precision = 32) + static Point* GetPoint(double dlat = 0, double dlon = 0, unsigned int precision = 32){ + int64_t lat = doubleToFixedPoint(dlat, 90, precision); + int64_t lon = doubleToFixedPoint(dlon, 180, precision); + + uint64_t key = encodePointTo64(lat, lon); + if(pointMap_.count(key)){ + return pointMap_[key]; + } + + Point* p = new Point(lat, lon); + p->key_ = key; + pointMap_[key] = p; + return p; + } + + Point(int64_t lat = 0, int64_t lon = 0) { - lat_ = doubleToFixedPoint(lat, 90, precision); - lon_ = doubleToFixedPoint(lon, 180, precision); + lat_ = lat; + lon_ = lon; } std::tuple value() @@ -121,125 +157,267 @@ struct Point { int64_t lat_; int64_t lon_; + uint64_t key_; + PolygonData* parent_ = nullptr; + int index_ = 0; + bool encoded_ = false; + 
uint64_t encodedOffset_ = 0; }; struct PolygonData { Point boundingMin; Point boundingMax; - std::vector points_; + std::vector points_; unsigned long fileIndex_ = 0; unsigned long metadataId_; + Point* lastPoint_ = nullptr; - void processPoint(const Point& p) + void processPoint(Point* p) { - if(p.lat_ < boundingMin.lat_) { - boundingMin.lat_ = p.lat_; + if(p->lat_ < boundingMin.lat_) { + boundingMin.lat_ = p->lat_; } - if(p.lon_ < boundingMin.lon_) { - boundingMin.lon_ = p.lon_; + if(p->lon_ < boundingMin.lon_) { + boundingMin.lon_ = p->lon_; } - if(p.lat_ > boundingMax.lat_) { - boundingMax.lat_ = p.lat_; + if(p->lat_ > boundingMax.lat_) { + boundingMax.lat_ = p->lat_; } - if(p.lon_ > boundingMax.lon_) { - boundingMax.lon_ = p.lon_; + if(p->lon_ > boundingMax.lon_) { + boundingMax.lon_ = p->lon_; } + /* Don't encode duplicate points */ + if(lastPoint_ == p){ + return; + } + lastPoint_ = p; + points_.push_back(p); } PolygonData(unsigned long id): - boundingMin(Inf, Inf), - boundingMax(-Inf, -Inf), + boundingMin(INT64_MAX, INT64_MAX), + boundingMax(INT64_MIN, INT64_MIN), metadataId_(id) { } - uint64_t encodePointTo64(int64_t lat, int64_t lon){ - assert(lat || lon, "Tried to encode 0,0. 
This is not allowed"); - uint64_t latu=encodeSignedToUnsigned(lat); - uint64_t lonu=encodeSignedToUnsigned(lon); - - assert(latu < (uint64_t)1<<32, "Unsigned lat overflow"); - assert(lonu < (uint64_t)1<<32, "Unsigned lat overflow"); - - uint64_t point = 0; - for(uint8_t i=31; i<=31; i--){ - point <<= 2; - if(latu & (1< points_; + Point* prevPoint_; + PolygonData* parent_; - bool sameDirection(int64_t x1, int64_t y1, int64_t x2, int64_t y2){ - if((x1 > 0 && x2 < 0) || (x1 < 0 && x2 > 0)){ - return false; - } - if((y1 > 0 && y2 < 0) || (y1 < 0 && y2 > 0)){ - return false; + bool sameDirection(int64_t x1, int64_t y1, int64_t x2, int64_t y2){ + if(!x2 && !y2){ + return false; + } + + if((x1 > 0 && x2 < 0) || (x1 < 0 && x2 > 0)){ + return false; + } + if((y1 > 0 && y2 < 0) || (y1 < 0 && y2 > 0)){ + return false; + } + + if(x1 == 0){ + return x2 == 0; + } + + return y2 == (y1*x2/x1); } - if(x1 == 0){ - return x2 == 0; - } + + void encodeDelta(std::vector& output, PolygonData* mark = nullptr, int start = 0, int end = -1){ + if(end < 0){ + end = points_.size()-1; + } - return y2 == (y1*x2/x1); - } + int64_t accDiffLat = 0, accDiffLon = 0; + int64_t prevDiffLat = 0, prevDiffLon = 0; + + int64_t prevLat, prevLon; + + Point* prevPoint = prevPoint_; + if(start > 0){ + prevPoint = points_[start-1]; + } + + std::tie(prevLat, prevLon) = prevPoint->value(); + + auto encodePoint = [&](){ + /* Encode accumulator. 
+ * After this the position is equal to that of the previous point */ + if(accDiffLat || accDiffLon){ + encodeVariableLength(output, encodePointTo64(accDiffLat, accDiffLon), false); + } + /* Mark points as encoded if we mark and we are the parent */ + if(mark && prevPoint->parent_ == mark){ + prevPoint->encoded_ = true; + prevPoint->encodedOffset_ = output.size(); + } + + /* Reset accumulator */ + accDiffLat = 0; + accDiffLon = 0; + }; + + for(int i = start; i<=end; i++){ + Point* point = points_[i]; + + int64_t lat, lon; + std::tie(lat, lon) = point->value(); + + /* Calculate difference */ + int64_t diffLat = lat - prevLat; + int64_t diffLon = lon - prevLon; + + /* Encode delta */ + if(!sameDirection(diffLat, diffLon, prevDiffLat, prevDiffLon)){ + encodePoint(); + } + + accDiffLat += diffLat; + accDiffLon += diffLon; + + /* Store previous values */ + prevDiffLat = diffLat; + prevDiffLon = diffLon; + prevLat = lat; + prevLon = lon; + prevPoint = point; + } + + /* Encode remainder if needed */ + encodePoint(); + } + + bool encodeReference(std::vector& output){ + /* Search for first marked point */ + int end = -1, start = -1; + for(int i=0; iencoded_){ + start = i; + break; + } + } + + for(int i=points_.size()-1; i>=0; i--){ + if(points_[i]->encoded_){ + end = i; + break; + } + } + + if(end < 0 || start < 0){ + /* Only unencoded points, then we can only delta encode it ourselves */ + return false; + } + + /* Encode delta up to the first point we can refer to */ + encodeDelta(output, nullptr, 0, start); + + + /* Add reference marker if it is still needed */ + if(start != end){ + uint64_t startRef = points_[start]->encodedOffset_; + uint64_t endRef = points_[end]->encodedOffset_; + + output.push_back(0); + output.push_back(1); + encodeVariableLength(output, startRef, false); + encodeVariableLength(output, endRef - startRef, true); + } + + /* Encode delta till the end of the segment */ + encodeDelta(output, nullptr, end+1); + + return true; + } + }; long encodeBinaryData(std::vector& 
output) { - bool first = true; - int64_t latFixedPoint = 0, lonFixedPoint = 0; - int64_t latFixedPointPrev, lonFixedPointPrev; + std::vector lines_; + PolygonData* currentParent = nullptr; + LineSegment* segment = nullptr; - int64_t diffLatAcc = 0, diffLonAcc = 0, diffLatPrev = 0, diffLonPrev = 0; + /* Step 1: Encode first point */ + Point* prevPoint = points_[0]; + encodeVariableLength(output, prevPoint->key_, false); - for(Point point: points_){ - /* The points should first be rounded, and then the integer value is differentiated */ - latFixedPointPrev = latFixedPoint; - lonFixedPointPrev = lonFixedPoint; - std::tie(latFixedPoint, lonFixedPoint) = point.value(); + int direction = 0; + /* Step 2: Go through the list of points and check which ones already exist. + * We skip the first and last one since the first one is already encoded + * and the last one is identical to the first */ + for(int i=1; iparent_){ + point->parent_ = this; + point->index_ = i; + } + + bool newSegment = false; - if(first) { - /* First point is always encoded */ - encodeVariableLength(output, encodePointTo64(latFixedPoint, lonFixedPoint), false); - - first = false; - } else { - if(!sameDirection(diffLat, diffLon, diffLatPrev, diffLonPrev)) { - /* Encode accumulator */ - if(diffLatAcc || diffLonAcc){ - encodeVariableLength(output, encodePointTo64(diffLatAcc, diffLonAcc), false); - - diffLatAcc = 0; - diffLonAcc = 0; + if(point->parent_ == currentParent){ + if(direction == 0){ + direction = point->index_ - prevPoint->index_; + if(direction > 1 || direction < -1){ + newSegment = true; + } + }else{ + if(point->index_ != prevPoint->index_ + direction){ + newSegment = true; } } - - diffLatAcc += diffLat; - diffLonAcc += diffLon; } - diffLatPrev = diffLat; - diffLonPrev = diffLon; + if(point->parent_ != currentParent || newSegment){ + if(segment){ + lines_.push_back(segment); + } + + currentParent = point->parent_; + + segment = new LineSegment(); + segment->prevPoint_ = prevPoint; + 
segment->parent_ = currentParent; + direction = 0; + } + + segment->points_.push_back(point); + + prevPoint = point; } - - /* Encode final point if needed */ - if(diffLonAcc || diffLatAcc) { - encodeVariableLength(output, encodePointTo64(diffLatAcc, diffLonAcc), false); + if(segment){ + lines_.push_back(segment); } - /* Encode stop marker */ + /* Step 3: Encode segments */ + for(LineSegment* segment: lines_){ + if(segment->parent_ == this){ + /* If we are the parent of the segment we must encode and mark it */ + segment->encodeDelta(output, this); + }else{ + /* We are not the parent, we can encode it or refer to it, depending on + * which takes fewer bytes. In any case we should not mark it. */ + std::vector delta; + segment->encodeDelta(delta); + + std::vector reference; + bool possible = segment->encodeReference(reference); + + if(!possible || delta.size() <= reference.size()){ + output.insert(std::end(output), std::begin(delta), std::end(delta)); + }else{ + output.insert(std::end(output), std::begin(reference), std::end(reference)); + } + } + } + + /* Step 4: Write end marker */ output.push_back(0); output.push_back(0); @@ -520,7 +698,7 @@ int main(int argc, char ** argv ) } } - Point p(shapeObject->padfY[j], shapeObject->padfX[j], precision); + Point* p = Point::GetPoint(shapeObject->padfY[j], shapeObject->padfX[j], precision); polygonData->processPoint(p); } diff --git a/database/builder/makedb.sh b/database/builder/makedb.sh index c59a39b..88421a6 100755 --- a/database/builder/makedb.sh +++ b/database/builder/makedb.sh @@ -8,14 +8,14 @@ mkdir -p naturalearth; cd naturalearth #wget https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries_lakes.zip #unzip ne_10m_admin_0_countries_lakes.zip cd .. -#./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country16.bin 16 "Made with Natural Earth, placed in the Public Domain." 
-#./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country21.bin 21 "Made with Natural Earth, placed in the Public Domain." +./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country16.bin 16 "Made with Natural Earth, placed in the Public Domain." +./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country21.bin 21 "Made with Natural Earth, placed in the Public Domain." mkdir timezone; cd timezone #wget https://github.com/evansiroky/timezone-boundary-builder/releases/download/2018i/timezones.shapefile.zip #unzip timezones.shapefile.zip cd .. -#./builder T timezone/dist/combined-shapefile ./out/timezone16.bin 16 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)." +./builder T timezone/dist/combined-shapefile ./out/timezone16.bin 16 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)." ./builder T timezone/dist/combined-shapefile ./out/timezone21.bin 21 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)." 
#rm -rf naturalearth #zip db.zip out/* diff --git a/library/zonedetect.c b/library/zonedetect.c index 260ee5c..f63ca21 100644 --- a/library/zonedetect.c +++ b/library/zonedetect.c @@ -138,6 +138,34 @@ static unsigned int ZDDecodeVariableLengthUnsigned(const ZoneDetect *library, ui return i; } +static unsigned int ZDDecodeVariableLengthUnsignedReverse(const ZoneDetect *library, uint32_t *index, uint64_t *result){ + uint32_t i = *index; + + if(library->mapping[i] & UINT8_C(0x80)){ + printf("BUG, reverse mapping final byte is not the end of stream\n"); + return 0; + } + + if(!i){ + return 0; + } + i--; + + while(library->mapping[i] & UINT8_C(0x80)){ + if(!i){ + return 0; + } + i--; + } + + *index = i; + + i++; + + uint32_t i2 = i; + return ZDDecodeVariableLengthUnsigned(library, &i2, result); +} + static int64_t ZDDecodeUnsignedToSigned(uint64_t value){ return (value & 1) ? -(int64_t)(value / 2) : (int64_t)(value / 2); } @@ -306,20 +334,64 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg int prevQuadrant = 0, winding = 0; uint8_t done = 0, first = 1; + uint32_t referenceStart=0, referenceEnd=0; + int32_t referenceDirection = 0; + do{ uint64_t point; - if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &point)) return ZD_LOOKUP_PARSE_ERROR; + uint8_t referenceDone = 0; + if(!referenceDirection){ + if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &point)) return ZD_LOOKUP_PARSE_ERROR; + }else{ + if(referenceDirection > 0){ + /* Read reference forward */ + if(!ZDDecodeVariableLengthUnsigned(library, &referenceStart, &point)) return ZD_LOOKUP_PARSE_ERROR; + if(referenceStart >= referenceEnd){ + referenceDone = 1; + } + }else if(referenceDirection < 0){ + /* Read reference backwards */ + //TODO: This code is wrong (doh) + if(!ZDDecodeVariableLengthUnsignedReverse(library, &referenceStart, &point)) return ZD_LOOKUP_PARSE_ERROR; + if(referenceStart <= referenceEnd){ + referenceDone = 1; + } + } + } + //TODO: special marker 
during reference mode is an error if(!point){ /* This is a special marker */ + if(referenceDirection){ + printf("BUG, marker in reference mode?\n"); + exit(10); + } + uint64_t value; if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &value)) return ZD_LOOKUP_PARSE_ERROR; if(value == 0){ done = 1; + }else if(value == 1){ + int32_t diff; + int64_t start; + if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, (uint64_t*)&start)) return ZD_LOOKUP_PARSE_ERROR; + if(!ZDDecodeVariableLengthSigned(library, &polygonIndex, &diff)) return ZD_LOOKUP_PARSE_ERROR; + + referenceStart = library->dataOffset+(uint32_t)start; + referenceEnd = library->dataOffset+(uint32_t)(start + diff); + referenceDirection = diff; + if(diff < 0){ + referenceStart--; + } + continue; } }else{ ZDDecodePoint(point, &diffLat, &diffLon); + if(referenceDirection < 0){ + diffLat = -diffLat; + diffLon = -diffLon; + } } if(!done){ @@ -330,7 +402,7 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg firstLon = pointLon; } } else { - /* The polygons should be closed, but just in case */ + /* Close the polygon (the closing point is not encoded) */ pointLat = firstLat; pointLon = firstLon; } @@ -451,6 +523,10 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg if(first){ first = 0; } + + if(referenceDone){ + referenceDirection = 0; + } }while(!done); if(winding == -4) {