Reduce filesize further by not encoding points multiple times (not well tested, do not use)

This commit is contained in:
Bertold Van den Bergh 2019-08-14 00:54:40 +02:00
parent 4269644520
commit 71a5187802
3 changed files with 348 additions and 94 deletions

View file

@ -84,26 +84,62 @@ int encodeVariableLength(std::vector<uint8_t>& output, int64_t valueIn, bool han
return bytesUsed;
}
/*
 * Pack a latitude/longitude pair into a single 64-bit value by interleaving bits.
 *
 * Both inputs are first mapped to unsigned values with encodeSignedToUnsigned().
 * Bit i of the latitude ends up in bit 2*i of the result and bit i of the
 * longitude in bit 2*i+1, so small coordinates produce small encoded values.
 *
 * lat, lon: signed fixed point values; each must fit in 32 bits once converted
 *           to unsigned, and they may not both be zero (the pair 0,0 would
 *           encode to 0 — presumably reserved, since the stream uses a zero
 *           value as a marker; confirm against the decoder).
 * Returns the interleaved value.
 */
uint64_t encodePointTo64(int64_t lat, int64_t lon){
    assert(lat || lon, "Tried to encode 0,0. This is not allowed");

    uint64_t latu=encodeSignedToUnsigned(lat);
    uint64_t lonu=encodeSignedToUnsigned(lon);

    assert(latu < (uint64_t)1<<32, "Unsigned lat overflow");
    assert(lonu < (uint64_t)1<<32, "Unsigned lon overflow");

    uint64_t point = 0;
    /* Walk from the most significant bit down so that it lands highest in the result */
    for(int bit=31; bit>=0; bit--){
        /* Build the mask in 64 bits; a plain int 1<<31 would shift into the sign bit */
        const uint64_t mask = (uint64_t)1 << bit;

        point <<= 2;
        if(latu & mask){
            point |= 1;
        }
        if(lonu & mask){
            point |= 2;
        }
    }

    return point;
}
/*
 * Convert a floating point coordinate to signed fixed point.
 *
 * input:     the value to convert
 * scale:     the magnitude of input that maps to full scale
 * precision: number of bits of the fixed point format; full scale is 2^(precision-1)
 *
 * +Inf and -Inf saturate to INT64_MAX and INT64_MIN. Any other value is
 * computed as input / scale * 2^(precision-1) and truncated towards zero by
 * the conversion to an integer.
 */
int64_t doubleToFixedPoint(double input, double scale, unsigned int precision = 32)
{
    /* Infinite inputs saturate instead of going through the arithmetic below */
    if(input == Inf){
        return INT64_MAX;
    }
    if(input == -Inf){
        return INT64_MIN;
    }

    const double unitFraction = input / scale;
    const double fullScale = pow(2, precision-1);

    return static_cast<int64_t>(unitFraction * fullScale);
}
/* Forward declarations: Point and PolygonData refer to each other through pointers */
struct Point;
struct PolygonData;
/* Registry of every Point created through Point::GetPoint(), indexed by the
 * point's encodePointTo64() key, so that equal coordinates resolve to one
 * shared Point object. */
std::unordered_map<uint64_t, Point*> pointMap_;
struct Point {
Point(double lat = 0, double lon = 0, unsigned int precision = 32)
/*
 * Return the shared Point for the given coordinates, creating it on first use.
 *
 * The coordinates are converted to fixed point (latitude scaled by 90,
 * longitude by 180) and combined into a key with encodePointTo64(). If
 * pointMap_ already holds a Point under that key it is returned, otherwise a
 * new Point is allocated, tagged with the key and registered in pointMap_.
 */
static Point* GetPoint(double dlat = 0, double dlon = 0, unsigned int precision = 32){
    const int64_t lat = doubleToFixedPoint(dlat, 90, precision);
    const int64_t lon = doubleToFixedPoint(dlon, 180, precision);
    const uint64_t key = encodePointTo64(lat, lon);

    /* Reuse the existing instance when these coordinates were seen before */
    auto known = pointMap_.find(key);
    if(known != pointMap_.end()){
        return known->second;
    }

    Point* created = new Point(lat, lon);
    created->key_ = key;
    pointMap_.emplace(key, created);
    return created;
}
Point(int64_t lat = 0, int64_t lon = 0)
{
lat_ = doubleToFixedPoint(lat, 90, precision);
lon_ = doubleToFixedPoint(lon, 180, precision);
lat_ = lat;
lon_ = lon;
}
std::tuple<int64_t, int64_t> value()
@ -121,125 +157,267 @@ struct Point {
int64_t lat_;
int64_t lon_;
uint64_t key_;
PolygonData* parent_ = nullptr;
int index_ = 0;
bool encoded_ = false;
uint64_t encodedOffset_ = 0;
};
struct PolygonData {
Point boundingMin;
Point boundingMax;
std::vector<Point> points_;
std::vector<Point*> points_;
unsigned long fileIndex_ = 0;
unsigned long metadataId_;
Point* lastPoint_ = nullptr;
void processPoint(const Point& p)
void processPoint(Point* p)
{
if(p.lat_ < boundingMin.lat_) {
boundingMin.lat_ = p.lat_;
if(p->lat_ < boundingMin.lat_) {
boundingMin.lat_ = p->lat_;
}
if(p.lon_ < boundingMin.lon_) {
boundingMin.lon_ = p.lon_;
if(p->lon_ < boundingMin.lon_) {
boundingMin.lon_ = p->lon_;
}
if(p.lat_ > boundingMax.lat_) {
boundingMax.lat_ = p.lat_;
if(p->lat_ > boundingMax.lat_) {
boundingMax.lat_ = p->lat_;
}
if(p.lon_ > boundingMax.lon_) {
boundingMax.lon_ = p.lon_;
if(p->lon_ > boundingMax.lon_) {
boundingMax.lon_ = p->lon_;
}
/* Don't encode duplicate points */
if(lastPoint_ == p){
return;
}
lastPoint_ = p;
points_.push_back(p);
}
PolygonData(unsigned long id):
boundingMin(Inf, Inf),
boundingMax(-Inf, -Inf),
boundingMin(INT64_MAX, INT64_MAX),
boundingMax(INT64_MIN, INT64_MIN),
metadataId_(id)
{
}
/*
 * Pack a latitude/longitude pair into a single 64-bit value by interleaving bits.
 *
 * Both inputs are first mapped to unsigned values with encodeSignedToUnsigned().
 * Bit i of the latitude ends up in bit 2*i of the result and bit i of the
 * longitude in bit 2*i+1, so small coordinates produce small encoded values.
 *
 * lat, lon: signed fixed point values; each must fit in 32 bits once converted
 *           to unsigned, and they may not both be zero (the pair 0,0 would
 *           encode to 0 — presumably reserved, since the stream uses a zero
 *           value as a marker; confirm against the decoder).
 * Returns the interleaved value.
 */
uint64_t encodePointTo64(int64_t lat, int64_t lon){
    assert(lat || lon, "Tried to encode 0,0. This is not allowed");

    uint64_t latu=encodeSignedToUnsigned(lat);
    uint64_t lonu=encodeSignedToUnsigned(lon);

    assert(latu < (uint64_t)1<<32, "Unsigned lat overflow");
    assert(lonu < (uint64_t)1<<32, "Unsigned lon overflow");

    uint64_t point = 0;
    /* Walk from the most significant bit down so that it lands highest in the result */
    for(int bit=31; bit>=0; bit--){
        /* Build the mask in 64 bits; a plain int 1<<31 would shift into the sign bit */
        const uint64_t mask = (uint64_t)1 << bit;

        point <<= 2;
        if(latu & mask){
            point |= 1;
        }
        if(lonu & mask){
            point |= 2;
        }
    }

    return point;
}
struct LineSegment {
std::vector<Point*> points_;
Point* prevPoint_;
PolygonData* parent_;
bool sameDirection(int64_t x1, int64_t y1, int64_t x2, int64_t y2){
if((x1 > 0 && x2 < 0) || (x1 < 0 && x2 > 0)){
return false;
}
if((y1 > 0 && y2 < 0) || (y1 < 0 && y2 > 0)){
return false;
/*
 * Tell whether the delta (x1, y1) points in exactly the same direction as the
 * delta (x2, y2), i.e. whether the two can be merged into one longer delta
 * without moving the line.
 *
 * Returns false when (x2, y2) is the zero vector, when the two deltas differ
 * in sign on either axis, or when they are not parallel.
 *
 * Parallelism is tested with the cross product (x1*y2 == y1*x2) rather than
 * with an integer division: y1*x2/x1 truncates, which made non-parallel pairs
 * such as (2,1) and (3,1) compare as equal.
 * The products are computed in int64_t; this assumes the deltas are small
 * enough for them not to overflow.
 */
bool sameDirection(int64_t x1, int64_t y1, int64_t x2, int64_t y2){
    /* A zero vector has no direction to compare against */
    if(!x2 && !y2){
        return false;
    }

    /* Opposite signs on either axis can never be the same direction */
    if((x1 > 0 && x2 < 0) || (x1 < 0 && x2 > 0)){
        return false;
    }
    if((y1 > 0 && y2 < 0) || (y1 < 0 && y2 > 0)){
        return false;
    }

    /* No movement along x: the other delta must not move along x either */
    if(x1 == 0){
        return x2 == 0;
    }

    /* Parallel exactly when the cross product is zero */
    return x1*y2 == y1*x2;
}
if(x1 == 0){
return x2 == 0;
}
void encodeDelta(std::vector<uint8_t>& output, PolygonData* mark = nullptr, int start = 0, int end = -1){
if(end < 0){
end = points_.size()-1;
}
return y2 == (y1*x2/x1);
}
int64_t accDiffLat = 0, accDiffLon = 0;
int64_t prevDiffLat = 0, prevDiffLon = 0;
int64_t prevLat, prevLon;
Point* prevPoint = prevPoint_;
if(start > 0){
prevPoint = points_[start-1];
}
std::tie(prevLat, prevLon) = prevPoint->value();
auto encodePoint = [&](){
/* Encode accumulator.
* After this the position is equal to that of the previous point */
if(accDiffLat || accDiffLon){
encodeVariableLength(output, encodePointTo64(accDiffLat, accDiffLon), false);
}
/* Mark points as encoded if we mark and we are the parent */
if(mark && prevPoint->parent_ == mark){
prevPoint->encoded_ = true;
prevPoint->encodedOffset_ = output.size();
}
/* Reset accumulator */
accDiffLat = 0;
accDiffLon = 0;
};
for(int i = start; i<=end; i++){
Point* point = points_[i];
int64_t lat, lon;
std::tie(lat, lon) = point->value();
/* Calculate difference */
int64_t diffLat = lat - prevLat;
int64_t diffLon = lon - prevLon;
/* Encode delta */
if(!sameDirection(diffLat, diffLon, prevDiffLat, prevDiffLon)){
encodePoint();
}
accDiffLat += diffLat;
accDiffLon += diffLon;
/* Store previous values */
prevDiffLat = diffLat;
prevDiffLon = diffLon;
prevLat = lat;
prevLon = lon;
prevPoint = point;
}
/* Encode remainder if needed */
encodePoint();
}
/*
 * Encode this segment by referring to data that was already written for its
 * points, instead of repeating the deltas.
 *
 * The points before the first already-encoded point (and that point itself)
 * are delta encoded, then a reference marker (0, 1, start offset, signed
 * distance to the end offset) is written when the first and last encoded
 * points differ, and finally everything after the last encoded point is
 * delta encoded again.
 *
 * Returns false without writing anything when none of the points has been
 * encoded yet, true otherwise.
 */
bool encodeReference(std::vector<uint8_t>& output){
    const int count = points_.size();

    /* Index of the first point that has already been encoded */
    int firstEncoded = -1;
    for(int idx = 0; idx < count && firstEncoded < 0; idx++){
        if(points_[idx]->encoded_){
            firstEncoded = idx;
        }
    }

    /* Index of the last point that has already been encoded */
    int lastEncoded = -1;
    for(int idx = count - 1; idx >= 0 && lastEncoded < 0; idx--){
        if(points_[idx]->encoded_){
            lastEncoded = idx;
        }
    }

    /* Nothing to refer to: the caller has to delta encode the segment itself */
    if(firstEncoded < 0 || lastEncoded < 0){
        return false;
    }

    /* Plain deltas up to and including the first point we can refer to */
    encodeDelta(output, nullptr, 0, firstEncoded);

    /* A reference only makes sense when it spans more than one point */
    if(firstEncoded != lastEncoded){
        const uint64_t startRef = points_[firstEncoded]->encodedOffset_;
        const uint64_t endRef = points_[lastEncoded]->encodedOffset_;

        /* Marker: a zero value followed by type 1 */
        output.push_back(0);
        output.push_back(1);
        encodeVariableLength(output, startRef, false);
        encodeVariableLength(output, endRef - startRef, true);
    }

    /* Plain deltas for whatever follows the referenced range */
    encodeDelta(output, nullptr, lastEncoded + 1);

    return true;
}
};
long encodeBinaryData(std::vector<uint8_t>& output)
{
bool first = true;
int64_t latFixedPoint = 0, lonFixedPoint = 0;
int64_t latFixedPointPrev, lonFixedPointPrev;
std::vector<LineSegment*> lines_;
PolygonData* currentParent = nullptr;
LineSegment* segment = nullptr;
int64_t diffLatAcc = 0, diffLonAcc = 0, diffLatPrev = 0, diffLonPrev = 0;
/* Step 1: Encode first point */
Point* prevPoint = points_[0];
encodeVariableLength(output, prevPoint->key_, false);
for(Point point: points_){
/* The points should first be rounded, and then the integer value is differentiated */
latFixedPointPrev = latFixedPoint;
lonFixedPointPrev = lonFixedPoint;
std::tie(latFixedPoint, lonFixedPoint) = point.value();
int direction = 0;
/* Step 2: Go through the list of points and check which ones already exist.
* We skip the first and last one since the first one is already encoded
* and the last one is identical to the first */
for(int i=1; i<points_.size()-1; i++){
Point* point = points_[i];
int64_t diffLat = latFixedPoint - latFixedPointPrev;
int64_t diffLon = lonFixedPoint - lonFixedPointPrev;
if(!point->parent_){
point->parent_ = this;
point->index_ = i;
}
bool newSegment = false;
if(first) {
/* First point is always encoded */
encodeVariableLength(output, encodePointTo64(latFixedPoint, lonFixedPoint), false);
first = false;
} else {
if(!sameDirection(diffLat, diffLon, diffLatPrev, diffLonPrev)) {
/* Encode accumulator */
if(diffLatAcc || diffLonAcc){
encodeVariableLength(output, encodePointTo64(diffLatAcc, diffLonAcc), false);
diffLatAcc = 0;
diffLonAcc = 0;
if(point->parent_ == currentParent){
if(direction == 0){
direction = point->index_ - prevPoint->index_;
if(direction > 1 || direction < -1){
newSegment = true;
}
}else{
if(point->index_ != prevPoint->index_ + direction){
newSegment = true;
}
}
diffLatAcc += diffLat;
diffLonAcc += diffLon;
}
diffLatPrev = diffLat;
diffLonPrev = diffLon;
if(point->parent_ != currentParent || newSegment){
if(segment){
lines_.push_back(segment);
}
currentParent = point->parent_;
segment = new LineSegment();
segment->prevPoint_ = prevPoint;
segment->parent_ = currentParent;
direction = 0;
}
segment->points_.push_back(point);
prevPoint = point;
}
/* Encode final point if needed */
if(diffLonAcc || diffLatAcc) {
encodeVariableLength(output, encodePointTo64(diffLatAcc, diffLonAcc), false);
if(segment){
lines_.push_back(segment);
}
/* Encode stop marker */
/* Step 3: Encode segments */
for(LineSegment* segment: lines_){
if(segment->parent_ == this){
/* If we are the parent of the segment we must encode and mark it */
segment->encodeDelta(output, this);
}else{
/* We are not the parent, we can encode it or refer to it, depending on
* which takes less bytes. In any case we should not mark it. */
std::vector<uint8_t> delta;
segment->encodeDelta(delta);
std::vector<uint8_t> reference;
bool possible = segment->encodeReference(reference);
if(!possible || delta.size() <= reference.size()){
output.insert(std::end(output), std::begin(delta), std::end(delta));
}else{
output.insert(std::end(output), std::begin(reference), std::end(reference));
}
}
}
/* Step 4: Write end marker */
output.push_back(0);
output.push_back(0);
@ -520,7 +698,7 @@ int main(int argc, char ** argv )
}
}
Point p(shapeObject->padfY[j], shapeObject->padfX[j], precision);
Point* p = Point::GetPoint(shapeObject->padfY[j], shapeObject->padfX[j], precision);
polygonData->processPoint(p);
}

View file

@ -8,14 +8,14 @@ mkdir -p naturalearth; cd naturalearth
#wget https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries_lakes.zip
#unzip ne_10m_admin_0_countries_lakes.zip
cd ..
#./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country16.bin 16 "Made with Natural Earth, placed in the Public Domain."
#./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country21.bin 21 "Made with Natural Earth, placed in the Public Domain."
./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country16.bin 16 "Made with Natural Earth, placed in the Public Domain."
./builder C naturalearth/ne_10m_admin_0_countries_lakes ./out/country21.bin 21 "Made with Natural Earth, placed in the Public Domain."
mkdir timezone; cd timezone
#wget https://github.com/evansiroky/timezone-boundary-builder/releases/download/2018i/timezones.shapefile.zip
#unzip timezones.shapefile.zip
cd ..
#./builder T timezone/dist/combined-shapefile ./out/timezone16.bin 16 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)."
./builder T timezone/dist/combined-shapefile ./out/timezone16.bin 16 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)."
./builder T timezone/dist/combined-shapefile ./out/timezone21.bin 21 "Contains data from Natural Earth, placed in the Public Domain. Contains information from https://github.com/evansiroky/timezone-boundary-builder, which is made available here under the Open Database License (ODbL)."
#rm -rf naturalearth
#zip db.zip out/*

View file

@ -138,6 +138,34 @@ static unsigned int ZDDecodeVariableLengthUnsigned(const ZoneDetect *library, ui
return i;
}
/*
 * Decode the variable length value that precedes the position in *index.
 *
 * On entry *index must be the offset of a byte without the continuation bit
 * (0x80); otherwise a message is printed and 0 is returned. The function then
 * walks backwards to the previous byte without the continuation bit, stores
 * that offset in *index, and decodes forward from the byte after it with
 * ZDDecodeVariableLengthUnsigned().
 *
 * Returns the result of that forward decode, or 0 when the start of the
 * mapping is reached while walking backwards.
 *
 * NOTE(review): nothing in this function checks *index against the size of
 * the mapping — confirm that callers guarantee it is in range.
 */
static unsigned int ZDDecodeVariableLengthUnsignedReverse(const ZoneDetect *library, uint32_t *index, uint64_t *result){
    uint32_t pos = *index;

    if(library->mapping[pos] & UINT8_C(0x80)){
        printf("BUG, reverse mapping final byte is not the end of stream\n");
        return 0;
    }

    /* Step back at least one byte, then keep going while the continuation bit is set */
    do{
        if(pos == 0){
            return 0;
        }
        pos--;
    }while(library->mapping[pos] & UINT8_C(0x80));

    /* The next reverse read starts from the byte we stopped on */
    *index = pos;

    /* The value itself starts right after that byte */
    uint32_t decodeFrom = pos + 1;
    return ZDDecodeVariableLengthUnsigned(library, &decodeFrom, result);
}
/*
 * Undo the signed-to-unsigned mapping: the lowest bit carries the sign and
 * the remaining bits carry the magnitude.
 */
static int64_t ZDDecodeUnsignedToSigned(uint64_t value){
    const int64_t magnitude = (int64_t)(value >> 1);

    if(value & 1){
        return -magnitude;
    }
    return magnitude;
}
@ -306,20 +334,64 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg
int prevQuadrant = 0, winding = 0;
uint8_t done = 0, first = 1;
uint32_t referenceStart=0, referenceEnd=0;
int32_t referenceDirection = 0;
do{
uint64_t point;
if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &point)) return ZD_LOOKUP_PARSE_ERROR;
uint8_t referenceDone = 0;
if(!referenceDirection){
if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &point)) return ZD_LOOKUP_PARSE_ERROR;
}else{
if(referenceDirection > 0){
/* Read reference forward */
if(!ZDDecodeVariableLengthUnsigned(library, &referenceStart, &point)) return ZD_LOOKUP_PARSE_ERROR;
if(referenceStart >= referenceEnd){
referenceDone = 1;
}
}else if(referenceDirection < 0){
/* Read reference backwards */
//TODO: This code is wrong (doh)
if(!ZDDecodeVariableLengthUnsignedReverse(library, &referenceStart, &point)) return ZD_LOOKUP_PARSE_ERROR;
if(referenceStart <= referenceEnd){
referenceDone = 1;
}
}
}
//TODO: special marker during reference mode is an error
if(!point){
/* This is a special marker */
if(referenceDirection){
printf("BUG, marker in reference mode?\n");
exit(10);
}
uint64_t value;
if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, &value)) return ZD_LOOKUP_PARSE_ERROR;
if(value == 0){
done = 1;
}else if(value == 1){
int32_t diff;
int64_t start;
if(!ZDDecodeVariableLengthUnsigned(library, &polygonIndex, (uint64_t*)&start)) return ZD_LOOKUP_PARSE_ERROR;
if(!ZDDecodeVariableLengthSigned(library, &polygonIndex, &diff)) return ZD_LOOKUP_PARSE_ERROR;
referenceStart = library->dataOffset+(uint32_t)start;
referenceEnd = library->dataOffset+(uint32_t)(start + diff);
referenceDirection = diff;
if(diff < 0){
referenceStart--;
}
continue;
}
}else{
ZDDecodePoint(point, &diffLat, &diffLon);
if(referenceDirection < 0){
diffLat = -diffLat;
diffLon = -diffLon;
}
}
if(!done){
@ -330,7 +402,7 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg
firstLon = pointLon;
}
} else {
/* The polygons should be closed, but just in case */
/* Close the polygon (the closing point is not encoded) */
pointLat = firstLat;
pointLon = firstLon;
}
@ -451,6 +523,10 @@ static ZDLookupResult ZDPointInPolygon(const ZoneDetect *library, uint32_t polyg
if(first){
first = 0;
}
if(referenceDone){
referenceDirection = 0;
}
}while(!done);
if(winding == -4) {