Files
geo/geo_app/cluster_index.py
Ruslan Bakiev 7efa753092
Some checks failed
Build Docker Image / build (push) Failing after 2m14s
Add server-side clustering with pysupercluster
2026-01-14 10:12:39 +07:00

176 lines
5.0 KiB
Python

"""
Cached SuperCluster index for server-side map clustering.
Uses pysupercluster for fast geospatial point clustering.
Index is lazily initialized on first request and cached in memory.
"""
import logging
import threading
import numpy as np
# Module-level logger, tagged with this module's import path.
logger = logging.getLogger(__name__)
# Process-wide cache mapping keys like "nodes:<transport_type>" to built
# cluster indices; all access must hold _cache_lock for thread safety.
_cluster_cache = {}
_cache_lock = threading.Lock()
def _build_index(nodes, transport_type=None):
"""
Build SuperCluster index from node list.
Args:
nodes: List of node dicts with latitude, longitude, _key, name
transport_type: Optional filter for transport type
Returns:
Tuple of (SuperCluster index, node_data dict keyed by index)
"""
try:
import pysupercluster
except ImportError:
logger.error("pysupercluster not installed")
return None, {}
# Filter nodes with valid coordinates
valid_nodes = []
for node in nodes:
lat = node.get('latitude')
lon = node.get('longitude')
if lat is not None and lon is not None:
# Filter by transport type if specified
if transport_type:
types = node.get('transport_types') or []
if transport_type not in types:
continue
valid_nodes.append(node)
if not valid_nodes:
logger.warning("No valid nodes for clustering")
return None, {}
# Build numpy array of coordinates (lon, lat)
coords = np.array([
(node['longitude'], node['latitude'])
for node in valid_nodes
])
# Build node data lookup by index
node_data = {
i: {
'uuid': node.get('_key'),
'name': node.get('name'),
'latitude': node.get('latitude'),
'longitude': node.get('longitude'),
}
for i, node in enumerate(valid_nodes)
}
# Create SuperCluster index
# min_zoom=0, max_zoom=16 covers typical map zoom range
# radius=60 pixels for clustering
index = pysupercluster.SuperCluster(
coords,
min_zoom=0,
max_zoom=16,
radius=60,
extent=512,
)
logger.info("Built cluster index with %d points", len(valid_nodes))
return index, node_data
def get_clustered_nodes(db, west, south, east, north, zoom, transport_type=None):
    """
    Get clustered nodes for the given bounding box and zoom level.

    Args:
        db: ArangoDB connection (must expose db.aql.execute).
        west, south, east, north: Bounding box coordinates in degrees.
        zoom: Map zoom level (integer-like; coerced with int()).
        transport_type: Optional transport-type filter.

    Returns:
        List of cluster/point dicts with id, latitude, longitude, count,
        expansion_zoom, name. Empty list when the index could not be built
        or clustering fails.
    """
    cache_key = f"nodes:{transport_type or 'all'}"

    with _cache_lock:
        if cache_key not in _cluster_cache:
            # Cache miss: load all candidate nodes and build the index.
            aql = """
            FOR node IN nodes
                FILTER node.node_type == 'logistics' OR node.node_type == null
                FILTER node.latitude != null AND node.longitude != null
                RETURN node
            """
            cursor = db.aql.execute(aql)
            all_nodes = list(cursor)
            index, node_data = _build_index(all_nodes, transport_type)
            # Cache only what later requests actually read; the raw node
            # list was previously stored too but never used again, which
            # wasted memory.
            _cluster_cache[cache_key] = (index, node_data)
        index, node_data = _cluster_cache[cache_key]

    if index is None:
        return []

    # Clustering runs outside the lock so concurrent readers don't
    # serialize on it. pysupercluster takes top_left=(lon, lat) and
    # bottom_right=(lon, lat).
    try:
        clusters = index.getClusters(
            top_left=(west, north),
            bottom_right=(east, south),
            zoom=int(zoom),
        )
    except Exception as e:
        logger.error("getClusters failed: %s", e)
        return []

    results = []
    for cluster in clusters:
        cluster_id = cluster.get('id')
        count = cluster.get('count', 1)
        expansion_zoom = cluster.get('expansion_zoom')

        # Single points map back to the original node; multi-point
        # clusters stay anonymous.
        name = None
        uuid = None
        if count == 1 and cluster_id is not None and cluster_id in node_data:
            node_info = node_data[cluster_id]
            name = node_info.get('name')
            uuid = node_info.get('uuid')

        results.append({
            'id': uuid or f"cluster-{cluster_id}",
            'latitude': cluster.get('latitude'),
            'longitude': cluster.get('longitude'),
            'count': count,
            'expansion_zoom': expansion_zoom,
            'name': name,
        })

    logger.info("Returning %d clusters/points for zoom=%d", len(results), zoom)
    return results
def invalidate_cache(transport_type=None):
    """
    Drop cached cluster indices after nodes change in the database.

    When transport_type is given, only that type's cache entry is removed;
    otherwise every cached index is discarded.
    """
    with _cache_lock:
        if transport_type:
            # pop() with a default is a no-op when the key is absent.
            _cluster_cache.pop(f"nodes:{transport_type}", None)
        else:
            _cluster_cache.clear()
        logger.info("Cluster cache invalidated")