diff --git a/database/README.md b/database/README.md new file mode 100644 index 0000000..5309a92 --- /dev/null +++ b/database/README.md @@ -0,0 +1,3 @@ +# Database files + +Please download the database files [here](https://files.bertold.org/zd/db.zip) or create them using the builder. diff --git a/database/builder/builder.cpp b/database/builder/builder.cpp new file mode 100644 index 0000000..af6e65e --- /dev/null +++ b/database/builder/builder.cpp @@ -0,0 +1,566 @@ +/* + * Copyright (c) 2018, Bertold Van den Bergh (vandenbergh@bertold.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the author nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR DISTRIBUTOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const double Inf = std::numeric_limits::infinity(); + +std::unordered_map alpha2ToName; +std::unordered_map tzidToAlpha2; + +void errorFatal(std::string what) +{ + std::cerr<& output, int64_t valueIn, bool handleNeg = true) +{ + uint64_t value = valueIn * 2; + if(valueIn < 0) { + value = -valueIn * 2 + 1; + } + + if(!handleNeg) { + value = valueIn; + } + + int bytesUsed = 0; + do { + uint8_t byteOut = value & 0x7F; + if(value >= 128) { + byteOut |= 0x80; + } + output.push_back(byteOut); + bytesUsed ++; + value >>= 7; + } while(value); + + return bytesUsed; +} + +int64_t doubleToFixedPoint(double input, double scale, unsigned int precision = 32) +{ + double inputScaled = input / scale; + return inputScaled * pow(2, precision-1); + +} + +struct Point { + Point(double lat = 0, double lon = 0) + { + lat_ = lat; + lon_ = lon; + } + + Point operator-(const Point& p) + { + Point result(lat_ - p.lat_, lon_ - p.lon_); + return result; + } + + std::tuple toFixedPoint(unsigned int precision = 32) + { + int64_t latFixedPoint = doubleToFixedPoint(lat_, 90, precision); + int64_t lonFixedPoint = doubleToFixedPoint(lon_, 180, precision); + + return std::make_tuple(latFixedPoint, lonFixedPoint); + } + + int encodePointBinary(std::vector& output, unsigned int precision = 32) + { + int64_t latFixedPoint, lonFixedPoint; + std::tie(latFixedPoint, lonFixedPoint) = toFixedPoint(precision); + + int bytesUsed = encodeVariableLength(output, latFixedPoint); + bytesUsed += encodeVariableLength(output, lonFixedPoint); + + return bytesUsed; + } + + double lat_; + double lon_; +}; + +struct PolygonData { + Point boundingMin; + Point boundingMax; + std::vector points_; + unsigned long fileIndex_ = 0; + unsigned long metadataId_; + + void processPoint(const Point& p) + { + if(p.lat_ < boundingMin.lat_) { + boundingMin.lat_ = p.lat_; + } + if(p.lon_ < boundingMin.lon_) { + boundingMin.lon_ = p.lon_; + } + if(p.lat_ > boundingMax.lat_) { + boundingMax.lat_ = p.lat_; + } + if(p.lon_ > boundingMax.lon_) { + boundingMax.lon_ = p.lon_; + } + + points_.push_back(p); + } + + PolygonData(unsigned long id): + boundingMin(Inf, Inf), + boundingMax(-Inf, -Inf), + metadataId_(id) + { + } + + long encodeBinaryData(std::vector& output, unsigned int precision = 20) + { + long bytesEncoded = 0; + bool first = true; + int64_t latFixedPoint = 0, lonFixedPoint = 0; + int64_t latFixedPointPrev, lonFixedPointPrev; + uint64_t vertices = 0; + + std::vector tmp; + + int64_t diffLatAcc = 0, diffLonAcc = 0, diffLatPrev = 0, diffLonPrev = 0; + + for(Point& point: points_) { + /* The points should first be rounded, and then the integer value is differentiated */ + latFixedPointPrev = latFixedPoint; + lonFixedPointPrev = lonFixedPoint; + std::tie(latFixedPoint, lonFixedPoint) = point.toFixedPoint(precision); + + int64_t diffLat = latFixedPoint - latFixedPointPrev; + int64_t diffLon = lonFixedPoint - lonFixedPointPrev; + + if(first) { + /* First point is always encoded */ + vertices++; + encodeVariableLength(tmp, latFixedPoint); + encodeVariableLength(tmp, lonFixedPoint); + first = false; + } else { + /* Ignore points that are not different */ + if(!diffLon && !diffLat) { + continue; + } + + if(diffLat != diffLatPrev || diffLon != diffLonPrev) { + /* Encode accumulator */ + vertices++; + encodeVariableLength(tmp, diffLatAcc); + encodeVariableLength(tmp, diffLonAcc); + + diffLatAcc = 0; + diffLonAcc = 0; + } + + diffLatAcc += diffLat; + diffLonAcc += diffLon; + } + + diffLatPrev = diffLat; + diffLonPrev = diffLon; + } + + /* Encode final point */ + vertices++; + encodeVariableLength(tmp, diffLatAcc); + encodeVariableLength(tmp, diffLonAcc); + + encodeVariableLength(output, vertices, false); + std::copy(tmp.begin(), tmp.end(), std::back_inserter(output)); + + return bytesEncoded; + } +}; + +void encodeStringToBinary(std::vector& output, std::string& input) +{ + encodeVariableLength(output, input.size(), false); + for(unsigned int i=0; i usedStrings_; + +struct MetaData { + void encodeBinaryData(std::vector& output) + { + for(std::string& str: data_) { + if(str.length() >= 256) { + std::cout << "Metadata string is too long\n"; + exit(1); + } + + if(!usedStrings_.count(str)) { + usedStrings_[str] = output.size(); + encodeStringToBinary(output, str); + } else { + encodeVariableLength(output, usedStrings_[str] + 256, false); + } + } + } + + std::vector data_; + + unsigned long fileIndex_; +}; + + +std::vector polygons_; +std::vector metadata_; +std::vector fieldNames_; + + +unsigned int decodeVariableLength(uint8_t* buffer, int64_t* result, bool handleNeg = true) +{ + int64_t value = 0; + unsigned int i=0, shift = 0; + + do { + value |= (buffer[i] & 0x7F) << shift; + shift += 7; + } while(buffer[i++] & 0x80); + + if(!handleNeg) { + *result = value; + } else { + *result = (value & 1)?-(value/2):(value/2); + } + return i; +} + +void readMetaDataTimezone(DBFHandle dataHandle) +{ + /* Specify field names */ + fieldNames_.push_back("TimezoneIdPrefix"); + fieldNames_.push_back("TimezoneId"); + fieldNames_.push_back("CountryAlpha2"); + fieldNames_.push_back("CountryName"); + + /* Parse attribute names */ + for(int i = 0; i < DBFGetRecordCount(dataHandle); i++) { + metadata_[i].data_.resize(4); + for(int j = 0; j < DBFGetFieldCount(dataHandle); j++) { + char fieldTitle[12]; + int fieldWidth, fieldDecimals; + DBFFieldType eType = DBFGetFieldInfo(dataHandle, j, fieldTitle, &fieldWidth, &fieldDecimals); + + fieldTitle[11] = 0; + std::string fieldTitleStr(fieldTitle); + + if( eType == FTString ) { + if(fieldTitleStr == "tzid") { + std::string data = DBFReadStringAttribute(dataHandle, i, j); + size_t pos = data.find('/'); + if (pos == std::string::npos) { + metadata_[i].data_.at(0) = data; + } else { + metadata_[i].data_.at(0) = data.substr(0, pos) + "/"; + metadata_[i].data_.at(1) = data.substr(pos + 1, std::string::npos); + } + if(tzidToAlpha2.count(data)) { + metadata_[i].data_.at(2) = tzidToAlpha2[data]; + if(alpha2ToName.count(metadata_[i].data_.at(2))) { + metadata_[i].data_.at(3) = alpha2ToName[metadata_[i].data_.at(2)]; + } else { + std::cout< parseAlpha2ToName(DBFHandle dataHandle) +{ + std::unordered_map result; + + for(int i = 0; i < DBFGetRecordCount(dataHandle); i++) { + std::string alpha2, name; + for(int j = 0; j < DBFGetFieldCount(dataHandle); j++) { + char fieldTitle[12]; + int fieldWidth, fieldDecimals; + DBFFieldType eType = DBFGetFieldInfo(dataHandle, j, fieldTitle, &fieldWidth, &fieldDecimals); + + fieldTitle[11] = 0; + std::string fieldTitleStr(fieldTitle); + + if( eType == FTString ) { + if(fieldTitleStr == "ISO_A2" || fieldTitleStr == "WB_A2") { + std::string tmp = DBFReadStringAttribute(dataHandle, i, j); + if(tmp != "-99" && alpha2 == "") { + alpha2 = tmp; + } + } else if(fieldTitleStr == "NAME_LONG") { + name = DBFReadStringAttribute(dataHandle, i, j); + } + } + } + if(alpha2 != "") { + result[alpha2]=name; + } + } + + result["GF"]="French Guiana"; + result["GP"]="Guadeloupe"; + result["BQ"]="Bonaire"; + result["MQ"]="Martinique"; + result["SJ"]="Svalbard and Jan Mayen Islands"; + result["NO"]="Norway"; + result["CX"]="Christmas Island"; + result["CC"]="Cocos Islands"; + result["YT"]="Mayotte"; + result["RE"]="Réunion"; + result["TK"]="Tokelau"; + + return result; +} + +std::unordered_map parseTimezoneToAlpha2(std::string path) +{ + std::unordered_map result; + //TODO: Clean solution... +#include "zoneToAlpha.h" + + return result; +} + +int main(int argc, char ** argv ) +{ + if(argc != 6) { + std::cout << "Wrong number of parameters\n"; + return 1; + } + + tzidToAlpha2 = parseTimezoneToAlpha2("TODO"); + + char tableType = argv[1][0]; + std::string path = argv[2]; + std::string outPath = argv[3]; + unsigned int precision = strtol(argv[4], NULL, 10); + std::string notice = argv[5]; + + DBFHandle dataHandle = DBFOpen("naturalearth/ne_10m_admin_0_countries_lakes", "rb" ); + alpha2ToName = parseAlpha2ToName(dataHandle); + DBFClose(dataHandle); + + dataHandle = DBFOpen(path.c_str(), "rb" ); + if( dataHandle == NULL ) { + errorFatal("Could not open attribute file\n"); + } + + metadata_.resize(DBFGetRecordCount(dataHandle)); + std::cout << "Reading "<nSHPType != 3 && shapeObject->nSHPType != 5 && + shapeObject->nSHPType != 13 && shapeObject->nSHPType != 15) { + std::cout<<"Unsupported shape object ("<< SHPTypeName(shapeObject->nSHPType) <<")\n"; + continue; + } + + int partIndex = 0; + + PolygonData* polygonData = nullptr; + + for(int j = 0; j < shapeObject->nVertices; j++ ) { + if(j == 0 || j == shapeObject->panPartStart[partIndex]) { + totalPolygons++; + + if(polygonData) { + /* Commit it */ + polygons_.push_back(polygonData); + } + polygonData = new PolygonData(i); + + if(partIndex + 1 < shapeObject->nParts) { + partIndex++; + } + } + + Point p(shapeObject->padfY[j], shapeObject->padfX[j]); + polygonData->processPoint(p); + + } + + if(polygonData) { + /* Commit it */ + polygons_.push_back(polygonData); + } + + SHPDestroyObject(shapeObject); + } + } + + SHPClose(shapeHandle); + + std::cout<<"Parsed "<boundingMin.lat_ < b->boundingMin.lat_; + }); + + /* Encode data section and store pointers */ + std::vector outputData; + for(PolygonData* polygon: polygons_) { + polygon->fileIndex_ = outputData.size(); + polygon->encodeBinaryData(outputData, precision); + } + std::cout << "Encoded data section into "< outputMeta; + for(MetaData& metadata: metadata_) { + metadata.fileIndex_ = outputMeta.size(); + metadata.encodeBinaryData(outputMeta); + } + std::cout << "Encoded metadata into "< outputBBox; + int64_t prevFileIndex = 0; + int64_t prevMetaIndex = 0; + for(PolygonData* polygon: polygons_) { + polygon->boundingMin.encodePointBinary(outputBBox, precision); + polygon->boundingMax.encodePointBinary(outputBBox, precision); + + encodeVariableLength(outputBBox, metadata_.at(polygon->metadataId_).fileIndex_ - prevMetaIndex); + prevMetaIndex = metadata_[polygon->metadataId_].fileIndex_; + + encodeVariableLength(outputBBox, polygon->fileIndex_ - prevFileIndex, false); + prevFileIndex = polygon->fileIndex_; + } + std::cout << "Encoded bounding box section into "< outputHeader; + outputHeader.push_back('P'); + outputHeader.push_back('L'); + outputHeader.push_back('B'); + outputHeader.push_back(tableType); + outputHeader.push_back(0); + outputHeader.push_back(precision); + outputHeader.push_back(fieldNames_.size()); + for(unsigned int i=0; i