Reduce filesize further by not encoding points multiple times (not well tested, do not use)

This commit is contained in:
Bertold Van den Bergh 2019-08-14 00:54:40 +02:00
parent 4269644520
commit 71a5187802
3 changed files with 348 additions and 94 deletions

View file

@ -84,77 +84,6 @@ int encodeVariableLength(std::vector<uint8_t>& output, int64_t valueIn, bool han
return bytesUsed; return bytesUsed;
} }
/* Convert a floating point coordinate to a signed fixed point integer.
 *
 * input:     value to convert. +Inf and -Inf saturate to INT64_MAX and INT64_MIN.
 * scale:     value of "input" that corresponds to full scale
 * precision: number of bits of the result, sign bit included, so that
 *            input == scale produces 2^(precision-1)
 *
 * Returns (input / scale) * 2^(precision-1), truncated toward zero. */
int64_t doubleToFixedPoint(double input, double scale, unsigned int precision = 32)
{
    if(input == Inf){
        return INT64_MAX;
    }
    if(input == -Inf){
        return INT64_MIN;
    }

    const double inputScaled = input / scale;

    /* The exponent must be signed: the unsigned expression "precision-1" wraps
     * around to UINT_MAX for a precision of 0, which made the factor infinite. */
    const int exponent = static_cast<int>(precision) - 1;

    /* ldexp scales by an exact power of two; the cast truncates toward zero */
    return static_cast<int64_t>(std::ldexp(inputScaled, exponent));
}
/* A coordinate stored as two signed fixed point integers */
struct Point {
    /* Fixed point latitude and longitude */
    int64_t lat_;
    int64_t lon_;

    /* Build a point from floating point coordinates.
     * Latitude is converted with a scale of 90, longitude with a scale of 180. */
    Point(double lat = 0, double lon = 0, unsigned int precision = 32):
        lat_(doubleToFixedPoint(lat, 90, precision)),
        lon_(doubleToFixedPoint(lon, 180, precision))
    {
    }

    /* Returns the fixed point pair as (latitude, longitude) */
    std::tuple<int64_t, int64_t> value()
    {
        return std::tuple<int64_t, int64_t>(lat_, lon_);
    }

    /* Appends the latitude followed by the longitude to "output" using the
     * variable length encoding and returns the sum of both return values
     * of encodeVariableLength() */
    int encodePointBinary(std::vector<uint8_t>& output)
    {
        const int latBytes = encodeVariableLength(output, lat_);
        const int lonBytes = encodeVariableLength(output, lon_);
        return latBytes + lonBytes;
    }
};
struct PolygonData {
Point boundingMin;
Point boundingMax;
std::vector<Point> points_;
unsigned long fileIndex_ = 0;
unsigned long metadataId_;
/* Append a point to the polygon, growing the bounding box where needed */
void processPoint(const Point& p)
{
    /* Lower corner: keep the smallest latitude and longitude seen so far */
    boundingMin.lat_ = (p.lat_ < boundingMin.lat_) ? p.lat_ : boundingMin.lat_;
    boundingMin.lon_ = (p.lon_ < boundingMin.lon_) ? p.lon_ : boundingMin.lon_;

    /* Upper corner: keep the largest latitude and longitude seen so far */
    boundingMax.lat_ = (p.lat_ > boundingMax.lat_) ? p.lat_ : boundingMax.lat_;
    boundingMax.lon_ = (p.lon_ > boundingMax.lon_) ? p.lon_ : boundingMax.lon_;

    points_.push_back(p);
}
/* Create an empty polygon that refers to metadata entry "id".
 * The bounding box starts out inverted: the minimum corner is built from
 * (Inf, Inf) and the maximum corner from (-Inf, -Inf), which
 * doubleToFixedPoint() turns into INT64_MAX and INT64_MIN. The comparisons
 * in processPoint() then move both corners as soon as points arrive. */
PolygonData(unsigned long id):
boundingMin(Inf, Inf),
boundingMax(-Inf, -Inf),
metadataId_(id)
{
}
uint64_t encodePointTo64(int64_t lat, int64_t lon){ uint64_t encodePointTo64(int64_t lat, int64_t lon){
assert(lat || lon, "Tried to encode 0,0. This is not allowed"); assert(lat || lon, "Tried to encode 0,0. This is not allowed");
@ -178,7 +107,113 @@ struct PolygonData {
return point; return point;
} }
/* Convert a floating point coordinate to a signed fixed point integer.
 *
 * input:     value to convert
 * scale:     value of "input" that corresponds to full scale
 * precision: number of bits of the result, sign bit included, so that
 *            input == scale produces 2^(precision-1)
 *
 * Returns (input / scale) * 2^(precision-1), truncated toward zero. */
int64_t doubleToFixedPoint(double input, double scale, unsigned int precision = 32)
{
    const double inputScaled = input / scale;

    /* The exponent must be signed: the unsigned expression "precision-1" wraps
     * around to UINT_MAX for a precision of 0, which made the factor infinite. */
    const int exponent = static_cast<int>(precision) - 1;

    /* ldexp scales by an exact power of two; the cast truncates toward zero */
    return static_cast<int64_t>(std::ldexp(inputScaled, exponent));
}
struct Point;
struct PolygonData;
std::unordered_map<uint64_t, Point*> pointMap_;
/* A coordinate stored as two signed fixed point integers.
 *
 * Points obtained through GetPoint() are shared: pointMap_ holds one instance
 * per distinct key, so two polygons that pass the same coordinate receive the
 * same pointer. Nothing in this struct frees the instances it allocates. */
struct Point {
    /* Return the shared Point for the given coordinate, creating it on first use.
     *
     * dlat, dlon: coordinate; converted with a scale of 90 and 180 respectively
     * precision:  number of fixed point bits passed on to doubleToFixedPoint()
     *
     * The lookup key is encodePointTo64() of the fixed point pair. */
    static Point* GetPoint(double dlat = 0, double dlon = 0, unsigned int precision = 32){
        int64_t lat = doubleToFixedPoint(dlat, 90, precision);
        int64_t lon = doubleToFixedPoint(dlon, 180, precision);

        uint64_t key = encodePointTo64(lat, lon);

        /* A single find() replaces the count() + operator[] pair */
        auto existing = pointMap_.find(key);
        if(existing != pointMap_.end()){
            return existing->second;
        }

        Point* p = new Point(lat, lon);
        p->key_ = key;
        pointMap_.emplace(key, p);
        return p;
    }

    /* Build a standalone point from fixed point values (not registered in pointMap_) */
    Point(int64_t lat = 0, int64_t lon = 0)
    {
        lat_ = lat;
        lon_ = lon;
    }

    /* Returns the fixed point pair as (latitude, longitude) */
    std::tuple<int64_t, int64_t> value()
    {
        return std::make_tuple(lat_, lon_);
    }

    /* Appends the latitude followed by the longitude to "output" using the
     * variable length encoding and returns the sum of both return values
     * of encodeVariableLength() */
    int encodePointBinary(std::vector<uint8_t>& output)
    {
        int bytesUsed = encodeVariableLength(output, lat_);
        bytesUsed += encodeVariableLength(output, lon_);
        return bytesUsed;
    }

    /* Fixed point latitude and longitude */
    int64_t lat_;
    int64_t lon_;

    /* Key under which GetPoint() registered this point. It is now initialized
     * to 0 so that points built directly through the constructor no longer
     * carry an indeterminate value. */
    uint64_t key_ = 0;

    /* Bookkeeping written by the polygon encoder, not by this struct */
    PolygonData* parent_ = nullptr;
    int index_ = 0;
    bool encoded_ = false;
    uint64_t encodedOffset_ = 0;
};
struct PolygonData {
Point boundingMin;
Point boundingMax;
std::vector<Point*> points_;
unsigned long fileIndex_ = 0;
unsigned long metadataId_;
Point* lastPoint_ = nullptr;
/* Add a shared point to the polygon.
 * The bounding box is always updated; the point itself is only stored when it
 * differs from the point that was stored last. */
void processPoint(Point* p)
{
    const int64_t lat = p->lat_;
    const int64_t lon = p->lon_;

    /* Latitude extent */
    if(lat < boundingMin.lat_) {
        boundingMin.lat_ = lat;
    }
    if(lat > boundingMax.lat_) {
        boundingMax.lat_ = lat;
    }

    /* Longitude extent */
    if(lon < boundingMin.lon_) {
        boundingMin.lon_ = lon;
    }
    if(lon > boundingMax.lon_) {
        boundingMax.lon_ = lon;
    }

    /* Points are shared instances, so comparing pointers is enough to detect
     * an immediate repetition, which is not stored again */
    if(p != lastPoint_){
        lastPoint_ = p;
        points_.push_back(p);
    }
}
/* Create an empty polygon that refers to metadata entry "id".
 * The bounding box starts out inverted (minimum corner at INT64_MAX, maximum
 * corner at INT64_MIN) so that the comparisons in processPoint() move both
 * corners as soon as points arrive. */
PolygonData(unsigned long id):
boundingMin(INT64_MAX, INT64_MAX),
boundingMax(INT64_MIN, INT64_MIN),
metadataId_(id)
{
}
struct LineSegment {
std::vector<Point*> points_;
Point* prevPoint_;
PolygonData* parent_;
bool sameDirection(int64_t x1, int64_t y1, int64_t x2, int64_t y2){ bool sameDirection(int64_t x1, int64_t y1, int64_t x2, int64_t y2){
if(!x2 && !y2){
return false;
}
if((x1 > 0 && x2 < 0) || (x1 < 0 && x2 > 0)){ if((x1 > 0 && x2 < 0) || (x1 < 0 && x2 > 0)){
return false; return false;
} }
@ -193,53 +228,196 @@ struct PolygonData {
return y2 == (y1*x2/x1); return y2 == (y1*x2/x1);
} }
/* Append points_[start..end] to "output" as a sequence of deltas.
 *
 * output: buffer the encoded deltas are appended to
 * mark:   when not null, every point at which a delta run ends and whose
 *         parent_ equals "mark" is flagged as encoded_ and remembers the
 *         size of "output" at that moment in encodedOffset_
 * start:  index of the first point to encode. The deltas are relative to
 *         points_[start-1], or to prevPoint_ when start is 0.
 * end:    index of the last point to encode (inclusive); a negative value
 *         selects the last element of points_
 *
 * Consecutive deltas for which sameDirection() returns true are summed in an
 * accumulator and written as one delta. */
void encodeDelta(std::vector<uint8_t>& output, PolygonData* mark = nullptr, int start = 0, int end = -1){
if(end < 0){
end = points_.size()-1;
}
/* Sum of the deltas that have not been written yet */
int64_t accDiffLat = 0, accDiffLon = 0;
/* Delta of the previous iteration, used for the direction comparison */
int64_t prevDiffLat = 0, prevDiffLon = 0;
int64_t prevLat, prevLon;
/* Point the first delta is measured from */
Point* prevPoint = prevPoint_;
if(start > 0){
prevPoint = points_[start-1];
}
std::tie(prevLat, prevLon) = prevPoint->value();
/* Flushes the accumulator. Everything is captured by reference, so the
 * lambda acts on whatever prevPoint and the accumulator hold when called. */
auto encodePoint = [&](){
/* Encode accumulator.
* After this the position is equal to that of the previous point */
if(accDiffLat || accDiffLon){
encodeVariableLength(output, encodePointTo64(accDiffLat, accDiffLon), false);
}
/* Mark points as encoded if we mark and we are the parent */
/* This runs even when the accumulator was empty and nothing was written */
if(mark && prevPoint->parent_ == mark){
prevPoint->encoded_ = true;
prevPoint->encodedOffset_ = output.size();
}
/* Reset accumulator */
accDiffLat = 0;
accDiffLon = 0;
};
for(int i = start; i<=end; i++){
Point* point = points_[i];
int64_t lat, lon;
std::tie(lat, lon) = point->value();
/* Calculate difference */
int64_t diffLat = lat - prevLat;
int64_t diffLon = lon - prevLon;
/* Encode delta */
/* The flush happens before the new delta is added, so it covers the run
 * that ended at prevPoint */
if(!sameDirection(diffLat, diffLon, prevDiffLat, prevDiffLon)){
encodePoint();
}
accDiffLat += diffLat;
accDiffLon += diffLon;
/* Store previous values */
prevDiffLat = diffLat;
prevDiffLon = diffLon;
prevLat = lat;
prevLon = lon;
prevPoint = point;
}
/* Encode remainder if needed */
encodePoint();
}
/* Encode this segment by referring to data that was written earlier.
 *
 * The points before the first already encoded point and after the last
 * already encoded point are delta encoded; the stretch in between is replaced
 * by a reference marker (bytes 0, 1) followed by the offset of the first
 * encoded point and the signed distance to the offset of the last one.
 *
 * Returns false, leaving "output" untouched, when none of the points has
 * been encoded yet. */
bool encodeReference(std::vector<uint8_t>& output){
    const int count = points_.size();

    /* Locate the first point that has already been encoded */
    int firstEncoded = 0;
    while(firstEncoded < count && !points_[firstEncoded]->encoded_){
        firstEncoded++;
    }
    if(firstEncoded == count){
        /* Nothing to refer to, the caller has to delta encode the segment */
        return false;
    }

    /* Locate the last one; the search stops at firstEncoded at the latest */
    int lastEncoded = count - 1;
    while(!points_[lastEncoded]->encoded_){
        lastEncoded--;
    }

    /* Leading part, up to and including the first referable point */
    encodeDelta(output, nullptr, 0, firstEncoded);

    /* A single referable point needs no marker, it has just been delta encoded */
    if(firstEncoded != lastEncoded){
        const uint64_t refBegin = points_[firstEncoded]->encodedOffset_;
        const uint64_t refEnd = points_[lastEncoded]->encodedOffset_;

        output.push_back(0);
        output.push_back(1);
        encodeVariableLength(output, refBegin, false);
        encodeVariableLength(output, refEnd - refBegin, true);
    }

    /* Trailing part, everything after the last referable point */
    encodeDelta(output, nullptr, lastEncoded + 1);

    return true;
}
};
long encodeBinaryData(std::vector<uint8_t>& output) long encodeBinaryData(std::vector<uint8_t>& output)
{ {
bool first = true; std::vector<LineSegment*> lines_;
int64_t latFixedPoint = 0, lonFixedPoint = 0; PolygonData* currentParent = nullptr;
int64_t latFixedPointPrev, lonFixedPointPrev; LineSegment* segment = nullptr;
int64_t diffLatAcc = 0, diffLonAcc = 0, diffLatPrev = 0, diffLonPrev = 0; /* Step 1: Encode first point */
Point* prevPoint = points_[0];
encodeVariableLength(output, prevPoint->key_, false);
for(Point point: points_){ int direction = 0;
/* The points should first be rounded, and then the integer value is differentiated */ /* Step 2: Go through the list of points and check which ones already exist.
latFixedPointPrev = latFixedPoint; * We skip the first and last one since the first one is already encoded
lonFixedPointPrev = lonFixedPoint; * and the last one is identical to the first */
std::tie(latFixedPoint, lonFixedPoint) = point.value(); for(int i=1; i<points_.size()-1; i++){
Point* point = points_[i];
int64_t diffLat = latFixedPoint - latFixedPointPrev; if(!point->parent_){
int64_t diffLon = lonFixedPoint - lonFixedPointPrev; point->parent_ = this;
point->index_ = i;
}
if(first) { bool newSegment = false;
/* First point is always encoded */
encodeVariableLength(output, encodePointTo64(latFixedPoint, lonFixedPoint), false);
first = false; if(point->parent_ == currentParent){
if(direction == 0){
direction = point->index_ - prevPoint->index_;
if(direction > 1 || direction < -1){
newSegment = true;
}
}else{ }else{
if(!sameDirection(diffLat, diffLon, diffLatPrev, diffLonPrev)) { if(point->index_ != prevPoint->index_ + direction){
/* Encode accumulator */ newSegment = true;
if(diffLatAcc || diffLonAcc){ }
encodeVariableLength(output, encodePointTo64(diffLatAcc, diffLonAcc), false);
diffLatAcc = 0;
diffLonAcc = 0;
} }
} }
diffLatAcc += diffLat; if(point->parent_ != currentParent || newSegment){
diffLonAcc += diffLon; if(segment){
lines_.push_back(segment);
} }
diffLatPrev = diffLat; currentParent = point->parent_;
diffLonPrev = diffLon;
segment = new LineSegment();
segment->prevPoint_ = prevPoint;
segment->parent_ = currentParent;
direction = 0;
} }
/* Encode final point if needed */ segment->points_.push_back(point);
if(diffLonAcc || diffLatAcc) {
encodeVariableLength(output, encodePointTo64(diffLatAcc, diffLonAcc), false); prevPoint = point;
}
if(segment){
lines_.push_back(segment);
} }
/* Encode stop marker */ /* Step 3: Encode segments */
for(LineSegment* segment: lines_){
if(segment->parent_ == this){
/* If we are the parent of the segment we must encode and mark it */
segment->encodeDelta(output, this);
}else{
/* We are not the parent, we can encode it or refer to it, depending on
* which takes less bytes. In any case we should not mark it. */
std::vector<uint8_t> delta;
segment->encodeDelta(delta);
std::vector<uint8_t> reference;
bool possible = segment->encodeReference(reference);
if(!possible || delta.size() <= reference.size()){
output.insert(std::end(output), std::begin(delta), std::end(delta));
}else{
output.insert(std::end(output), std::begin(reference), std::end(reference));
}
}
}
/* Step 4: Write end marker */
output.push_back(0); output.push_back(0);
output.push_back(0); output.push_back(0);
@ -520,7 +698,7 @@ int main(int argc, char ** argv )
} }
} }
Point p(shapeObject->padfY[j], shapeObject->padfX[j], precision); Point* p = Point::GetPoint(shapeObject->padfY[j], shapeObject->padfX[j], precision);
polygonData->processPoint(p); polygonData->processPoint(p);
} }

View file

@ -8,14 +8,14 @@ mkdir -p naturalearth; cd naturalearth
#wget https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries_lakes.zip #wget https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries_lakes.zip
#unzip ne_10m_admin_0_countries_lakes.zip #unzip ne_10m_admin_0_countries_lakes.zip
cd .. cd ..
#./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country16.bin 16 "Made with Natural Earth, placed in the Public Domain." ./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country16.bin 16 "Made with Natural Earth, placed in the Public Domain."
#./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country21.bin 21 "Made with Natural Earth, placed in the Public Domain." ./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country21.bin 21 "Made with Natural Earth, placed in the Public Domain."
mkdir timezone; cd timezone mkdir timezone; cd timezone
#wget https://github.com/evansiroky/timezone-boundary-builder/releases/download/2018i/timezones.shapefile.zip #wget https://github.com/evansiroky/timezone-boundary-builder/releases/download/2018i/timezones.shapefile.zip
#unzip timezones.shapefile.zip #unzip timezones.shapefile.zip
cd .. cd ..
#./builder T timezone/dist/combined-shapefile ./out/timezone16.bin 16 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)." ./builder T timezone/dist/combined-shapefile ./out/timezone16.bin 16 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)."
./builder T timezone/dist/combined-shapefile ./out/timezone21.bin 21 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)." ./builder T timezone/dist/combined-shapefile ./out/timezone21.bin 21 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)."
#rm -rf naturalearth #rm -rf naturalearth
#zip db.zip out/* #zip db.zip out/*

View file

@ -138,6 +138,34 @@ static unsigned int ZDDecodeVariableLengthUnsigned(const ZoneDetect *library, ui
return i; return i;
} }
/* Decode the variable length value that ends at *index, walking backwards.
 *
 * On entry *index must refer to a byte without the continuation bit (0x80).
 * The function steps back over all preceding bytes that carry the
 * continuation bit, stores the index where it stopped (a byte without the
 * bit) in *index, and decodes forward from the byte after it into *result.
 *
 * Returns the value returned by ZDDecodeVariableLengthUnsigned(), or 0 when
 * the entry byte has the continuation bit set or when index 0 is reached
 * while stepping back. */
static unsigned int ZDDecodeVariableLengthUnsignedReverse(const ZoneDetect *library, uint32_t *index, uint64_t *result){
    uint32_t pos = *index;

    if(library->mapping[pos] & UINT8_C(0x80)){
        printf("BUG, reverse mapping final byte is not the end of stream\n");
        return 0;
    }

    if(pos == 0){
        return 0;
    }

    /* Step back until a byte without the continuation bit is found */
    for(pos--; library->mapping[pos] & UINT8_C(0x80); pos--){
        if(pos == 0){
            return 0;
        }
    }

    *index = pos;

    /* Decode forward on a private copy so *index keeps the position found above */
    uint32_t forward = pos + 1;
    return ZDDecodeVariableLengthUnsigned(library, &forward, result);
}
static int64_t ZDDecodeUnsignedToSigned(uint64_t value){ static int64_t ZDDecodeUnsignedToSigned(uint64_t value){
return (value & 1) ? -(int64_t)(value / 2) : (int64_t)(value / 2); return (value & 1) ? -(int64_t)(value / 2) : (int64_t)(value / 2);
} }
@ -306,20 +334,64 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg
int prevQuadrant = 0, winding = 0; int prevQuadrant = 0, winding = 0;
uint8_t done = 0, first = 1; uint8_t done = 0, first = 1;
uint32_t referenceStart=0, referenceEnd=0;
int32_t referenceDirection = 0;
do{ do{
uint64_t point; uint64_t point;
uint8_t referenceDone = 0;
if(!referenceDirection){
if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &point)) return ZD_LOOKUP_PARSE_ERROR; if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &point)) return ZD_LOOKUP_PARSE_ERROR;
}else{
if(referenceDirection > 0){
/* Read reference forward */
if(!ZDDecodeVariableLengthUnsigned(library, &referenceStart, &point)) return ZD_LOOKUP_PARSE_ERROR;
if(referenceStart >= referenceEnd){
referenceDone = 1;
}
}else if(referenceDirection < 0){
/* Read reference backwards */
//TODO: This code is wrong (doh)
if(!ZDDecodeVariableLengthUnsignedReverse(library, &referenceStart, &point)) return ZD_LOOKUP_PARSE_ERROR;
if(referenceStart <= referenceEnd){
referenceDone = 1;
}
}
}
//TODO: special marker during reference mode is an error
if(!point){ if(!point){
/* This is a special marker */ /* This is a special marker */
if(referenceDirection){
printf("BUG, marker in reference mode?\n");
exit(10);
}
uint64_t value; uint64_t value;
if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &value)) return ZD_LOOKUP_PARSE_ERROR; if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &value)) return ZD_LOOKUP_PARSE_ERROR;
if(value == 0){ if(value == 0){
done = 1; done = 1;
}else if(value == 1){
int32_t diff;
int64_t start;
if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, (uint64_t*)&start)) return ZD_LOOKUP_PARSE_ERROR;
if(!ZDDecodeVariableLengthSigned(library, &polygonIndex, &diff)) return ZD_LOOKUP_PARSE_ERROR;
referenceStart = library->dataOffset+(uint32_t)start;
referenceEnd = library->dataOffset+(uint32_t)(start + diff);
referenceDirection = diff;
if(diff < 0){
referenceStart--;
}
continue;
} }
}else{ }else{
ZDDecodePoint(point, &diffLat, &diffLon); ZDDecodePoint(point, &diffLat, &diffLon);
if(referenceDirection < 0){
diffLat = -diffLat;
diffLon = -diffLon;
}
} }
if(!done){ if(!done){
@ -330,7 +402,7 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg
firstLon = pointLon; firstLon = pointLon;
} }
} else { } else {
/* The polygons should be closed, but just in case */ /* Close the polygon (the closing point is not encoded) */
pointLat = firstLat; pointLat = firstLat;
pointLon = firstLon; pointLon = firstLon;
} }
@ -451,6 +523,10 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg
if(first){ if(first){
first = 0; first = 0;
} }
if(referenceDone){
referenceDirection = 0;
}
}while(!done); }while(!done);
if(winding == -4) { if(winding == -4) {