Source code for voxcity.geoprocessor.merge_utils
"""
Utilities to merge GeoDataFrames while resolving ID conflicts.
"""
import pandas as pd
def _merge_gdfs_with_missing_columns(gdf_1, gdf_2):
    """
    Concatenate two GeoDataFrames after aligning them to a shared column set.

    Any column present in only one input is added to the other and filled with
    ``None``. Both frames are then reordered to the sorted union of the column
    names and stacked row-wise (``gdf_1`` rows first) under a fresh
    ``0..n-1`` index.

    The inputs themselves are left untouched: missing columns are added to a
    shallow working copy, so callers keep their original column sets.

    Args:
        gdf_1: First frame; its rows come first in the result.
        gdf_2: Second frame; its rows are appended after those of ``gdf_1``.

    Returns:
        The concatenated frame containing every column from both inputs.
    """
    columns_1 = set(gdf_1.columns)
    columns_2 = set(gdf_2.columns)
    all_columns = sorted(columns_1 | columns_2)

    def _aligned(gdf, missing):
        # Only copy when something has to be added. A shallow copy is enough
        # because adding a column never writes into the shared existing data.
        if missing:
            gdf = gdf.copy(deep=False)
            for col in sorted(missing):
                gdf[col] = None
        return gdf[all_columns]

    aligned_1 = _aligned(gdf_1, columns_2 - columns_1)
    aligned_2 = _aligned(gdf_2, columns_1 - columns_2)
    return pd.concat([aligned_1, aligned_2], ignore_index=True)
[docs]
def _numeric_id_max(ids):
    """Return the largest value of a numeric ID column, or ``None`` if it has none."""
    dtype = ids.dtype
    # Booleans count as "numeric" for pandas but are not meaningful ID values.
    if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
        return None
    highest = ids.max()
    # max() of an empty or all-missing column is NaN/NA, i.e. no usable maximum.
    return None if pd.isna(highest) else highest


def merge_gdfs_with_id_conflict_resolution(gdf_1, gdf_2, id_columns=['id', 'building_id']):
    """
    Merge two GeoDataFrames while resolving ID conflicts by modifying IDs in the second GeoDataFrame.

    A row of ``gdf_2`` conflicts when its value in any usable ID column also
    occurs in the same column of ``gdf_1``. Every conflicting row receives a
    fresh value in *all* usable ID columns. Fresh values are consecutive and
    start above the largest numeric ID found in either input, so they collide
    neither with ``gdf_1`` nor with untouched rows of ``gdf_2``. When the
    primary column has no numeric maximum (non-numeric, empty or all-missing),
    numbering is based on the number of rows in ``gdf_1`` instead.

    Missing IDs (NaN/None) are never treated as conflicts. Neither input is
    modified; the work happens on copies. Progress and warnings are reported
    with ``print``.

    Args:
        gdf_1: Primary frame; its IDs are kept as they are.
        gdf_2: Secondary frame; conflicting rows get new IDs.
        id_columns: Names of the ID columns to check. Columns missing from
            either input are reported and ignored. The default list is only
            read, never mutated.

    Returns:
        The merged frame with the rows of ``gdf_1`` followed by the (possibly
        re-numbered) rows of ``gdf_2`` and the union of both column sets.
    """
    gdf_primary = gdf_1.copy()
    gdf_secondary = gdf_2.copy()

    missing_columns = []
    for col in id_columns:
        if col not in gdf_primary.columns:
            missing_columns.append(f"'{col}' missing from gdf_1")
        if col not in gdf_secondary.columns:
            missing_columns.append(f"'{col}' missing from gdf_2")
    if missing_columns:
        print(f"Warning: Missing ID columns: {', '.join(missing_columns)}")

    # Only columns present on both sides can be compared.
    id_columns = [
        col for col in id_columns
        if col in gdf_primary.columns and col in gdf_secondary.columns
    ]
    if not id_columns:
        print("Warning: No valid ID columns found. Merging without ID conflict resolution.")
        return _merge_gdfs_with_missing_columns(gdf_primary, gdf_secondary)

    # First free ID per column: one past the largest numeric ID seen in either
    # frame, so replacements cannot clash with rows that keep their IDs.
    next_ids = {}
    for col in id_columns:
        highest = _numeric_id_max(gdf_primary[col])
        if highest is None:
            highest = len(gdf_primary)
        secondary_max = _numeric_id_max(gdf_secondary[col])
        if secondary_max is not None and secondary_max > highest:
            highest = secondary_max
        next_ids[col] = highest + 1

    # Positional boolean mask of conflicting secondary rows. A positional mask
    # stays correct even when the secondary index contains duplicate labels.
    conflict_mask = None
    for col in id_columns:
        hits = gdf_secondary[col].isin(gdf_primary[col].dropna()).to_numpy()
        conflict_mask = hits if conflict_mask is None else (conflict_mask | hits)

    modified_buildings = int(conflict_mask.sum())
    if modified_buildings:
        for col in id_columns:
            first_new_id = next_ids[col]
            gdf_secondary.loc[conflict_mask, col] = [
                first_new_id + offset for offset in range(modified_buildings)
            ]

    merged_gdf = _merge_gdfs_with_missing_columns(gdf_primary, gdf_secondary)

    total_buildings = len(merged_gdf)
    primary_buildings = len(gdf_primary)
    secondary_buildings = len(gdf_secondary)
    print(f"Merged {primary_buildings} buildings from primary dataset with {secondary_buildings} buildings from secondary dataset.")
    print(f"Total buildings in merged dataset: {total_buildings}")
    if modified_buildings > 0:
        print(f"Modified IDs for {modified_buildings} buildings in secondary dataset to resolve conflicts.")
    return merged_gdf