Source code for voxcity.downloader.mbfp
"""
Module for downloading and processing Microsoft Building Footprints data.
This module provides functionality to download building footprint data from Microsoft's
open dataset, which contains building polygons extracted from satellite imagery using
AI. It handles downloading quadkey-based data files and converting them to GeoJSON format.
The data is organized using quadkeys, which are hierarchical spatial indexing strings
that identify tiles on the map at different zoom levels. Each quadkey corresponds to
a specific geographic area and zoom level.
Key Features:
- Downloads building footprint data from Microsoft's global buildings dataset
- Handles quadkey-based spatial queries
- Converts compressed data files to GeoJSON format
- Supports rectangular region queries using vertex coordinates
"""
import pandas as pd
import os
from .utils import download_file
from ..geoprocessor.utils import tile_from_lat_lon, quadkey_to_tile
from ..geoprocessor.io import load_gdf_from_multiple_gz
# Public API of this module: only get_mbfp_gdf is exported by a star-import.
# get_geojson_links and find_row_for_location remain importable by name.
__all__ = ["get_mbfp_gdf"]
def get_geojson_links(output_dir):
    """Fetch the master dataset-links CSV and return it as a DataFrame.

    The CSV is requested from Microsoft's global-buildings endpoint through
    ``download_file`` and written to ``<output_dir>/dataset-links.csv``; that
    file is then parsed with pandas.

    Args:
        output_dir (str): Directory in which ``dataset-links.csv`` is stored.

    Returns:
        pandas.DataFrame: Parsed contents of the CSV. The columns
        ``Location``, ``QuadKey``, ``Url`` and ``Size`` are read as strings
        so that quadkeys keep their leading zeros.

    Note:
        ``download_file`` is invoked on every call. Whether an already
        existing local copy is reused depends on that helper -- confirm
        there before relying on caching.
    """
    links_url = "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv"
    csv_path = os.path.join(output_dir, "dataset-links.csv")

    # Pull the index file to disk before parsing it.
    download_file(links_url, csv_path)

    # Force every known column to text; QuadKey in particular must not be
    # parsed as an integer.
    column_types = dict.fromkeys(("Location", "QuadKey", "Url", "Size"), "str")
    return pd.read_csv(csv_path, dtype=column_types)
def find_row_for_location(df, lon, lat):
    """Find the dataset row whose quadkey tile contains a lon/lat coordinate.

    Rows are scanned in order. For each row the quadkey's length is used as
    the zoom level: the coordinate is converted to tile indices at that zoom
    with ``tile_from_lat_lon`` and compared against the tile indices decoded
    from the quadkey with ``quadkey_to_tile``. The first row whose tile
    matches is returned.

    Args:
        df (pandas.DataFrame): Dataset links table; must have a ``QuadKey``
            column (as produced by ``get_geojson_links``).
        lon (float): Longitude of the point to locate.
        lat (float): Latitude of the point to locate.

    Returns:
        pandas.Series or None: The first matching row, or ``None`` when no
        row's tile contains the point.

    Note:
        Rows with a missing or empty quadkey are skipped. Any exception
        raised while converting a row is reported with ``print`` and the
        scan continues with the next row (best-effort search).
    """
    for index, row in df.iterrows():
        raw_quadkey = row['QuadKey']

        # Test for a missing value BEFORE converting to str: str(NaN) is the
        # non-empty string "nan", which would slip past an emptiness check
        # and be handed to quadkey_to_tile as if it were a real quadkey.
        if pd.isna(raw_quadkey):
            continue

        # str() keeps support for frames whose QuadKey column is numeric.
        quadkey = str(raw_quadkey)
        if not quadkey:
            continue

        try:
            # Convert lon/lat to tile coordinates at the quadkey's zoom level
            # (zoom level == number of digits in the quadkey).
            loc_tile_x, loc_tile_y = tile_from_lat_lon(lat, lon, len(quadkey))
            qk_tile_x, qk_tile_y, _ = quadkey_to_tile(quadkey)
            # Return row if tile coordinates match
            if loc_tile_x == qk_tile_x and loc_tile_y == qk_tile_y:
                return row
        except Exception as e:
            # Deliberate best-effort: one malformed row must not abort the
            # search over the remaining rows.
            print(f"Error processing row {index}: {e}")
    return None
[docs]
def get_mbfp_gdf(output_dir, rectangle_vertices):
    """Download the footprint tiles under each vertex and load them as one GeoDataFrame.

    Steps performed:
        1. Load the dataset-links table via ``get_geojson_links``.
        2. For every vertex, look up the tile row with
           ``find_row_for_location`` and download its ``.gz`` file to
           ``<output_dir>/<Location>_<QuadKey>.gz``.
        3. Load all downloaded files with ``load_gdf_from_multiple_gz``.
        4. Overwrite the ``id`` column with the frame's index.

    Args:
        output_dir (str): Directory where the links CSV and tile files are
            written.
        rectangle_vertices (list): Iterable of ``(lon, lat)`` pairs; each
            pair is looked up independently.

    Returns:
        geopandas.GeoDataFrame: The frame returned by
        ``load_gdf_from_multiple_gz`` with ``id`` set to its index.
        NOTE(review): whether those ids are sequential depends on the index
        the loader produces -- confirm there.

    Note:
        - Only tiles that contain a vertex are fetched; a tile lying
          entirely between vertices is not considered.
        - A tile shared by several vertices is downloaded once per call
          (de-duplicated by target path). This function itself does not
          check whether the file already exists on disk.
        - Vertices without a matching tile are reported and skipped.
    """
    print("Downloading geojson files")
    links = get_geojson_links(output_dir)

    # Target paths of the tiles fetched so far, in first-seen order.
    tile_paths = []
    for lon, lat in rectangle_vertices:
        match = find_row_for_location(links, lon, lat)
        if match is None:
            print("No matching row found.")
            continue

        region = match["Location"]
        qk = match["QuadKey"]
        tile_path = os.path.join(output_dir, f"{region}_{qk}.gz")

        # Several vertices usually fall into the same tile; fetch it once.
        if tile_path in tile_paths:
            continue
        tile_paths.append(tile_path)
        download_file(match["Url"], tile_path)

    # Merge every fetched tile into a single GeoDataFrame.
    buildings = load_gdf_from_multiple_gz(tile_paths)

    # Replace whatever ids came with the data by the frame's own index.
    buildings['id'] = buildings.index
    return buildings